Log in to prostor.ba before crawling.
This commit is contained in:
@@ -62,15 +62,16 @@ class ProstorCrawler {
|
|||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
//We need session cookie to use login privileges
|
||||||
|
const prostorCookie = await this.getCookies();
|
||||||
//New tag to check if crawler loged in
|
//New tag to check if crawler loged in
|
||||||
const login = await this.loginForScraping(PROSTOR_LOGIN);
|
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
|
||||||
const newRealEstates = [];
|
const newRealEstates = [];
|
||||||
//
|
//Crawl only if login was successful
|
||||||
console.log("login before crawl:", login);
|
|
||||||
if (crawlAdCategories && login) {
|
if (crawlAdCategories && login) {
|
||||||
const indexGenerators = [];
|
const indexGenerators = [];
|
||||||
for (const adCategory of crawlAdCategories) {
|
for (const adCategory of crawlAdCategories) {
|
||||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
|
||||||
}
|
}
|
||||||
|
|
||||||
let done = false;
|
let done = false;
|
||||||
@@ -123,13 +124,14 @@ class ProstorCrawler {
|
|||||||
return newRealEstates;
|
return newRealEstates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *categoryIndexer(adCategory) {
|
async *categoryIndexer(adCategory, prostorCookie) {
|
||||||
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
||||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
||||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
|
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
|
||||||
const listOfAllRealEstates = await this.extractRealEstates(
|
const listOfAllRealEstates = await this.extractRealEstates(
|
||||||
urlPageToCrawl
|
urlPageToCrawl,
|
||||||
|
prostorCookie
|
||||||
);
|
);
|
||||||
|
|
||||||
let elementToStartIndexFrom = 0;
|
let elementToStartIndexFrom = 0;
|
||||||
@@ -143,7 +145,8 @@ class ProstorCrawler {
|
|||||||
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
||||||
|
|
||||||
const singlePageResults = await this.indexSinglePage(
|
const singlePageResults = await this.indexSinglePage(
|
||||||
realEstatesForSinglePage
|
realEstatesForSinglePage,
|
||||||
|
prostorCookie
|
||||||
);
|
);
|
||||||
|
|
||||||
const filteredSinglePageResults = singlePageResults.filter(
|
const filteredSinglePageResults = singlePageResults.filter(
|
||||||
@@ -167,10 +170,10 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexSinglePage(realEstatesList) {
|
async indexSinglePage(realEstatesList, prostorCookie) {
|
||||||
const asyncActions = [];
|
const asyncActions = [];
|
||||||
for (const realEstate of realEstatesList) {
|
for (const realEstate of realEstatesList) {
|
||||||
asyncActions.push(this.scrapeAd(realEstate));
|
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -184,7 +187,7 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(realEstate) {
|
async scrapeAd(realEstate, prostorCookie) {
|
||||||
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
||||||
|
|
||||||
//Status information is given already in realestate list
|
//Status information is given already in realestate list
|
||||||
@@ -200,7 +203,9 @@ class ProstorCrawler {
|
|||||||
|
|
||||||
// console.log("[PROSTOR] Scraping : ", url);
|
// console.log("[PROSTOR] Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url, {
|
||||||
|
headers: { Cookie: prostorCookie }
|
||||||
|
});
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
@@ -422,13 +427,15 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async extractRealEstates(url) {
|
async extractRealEstates(url, prostorCookie) {
|
||||||
if (PRINT_CRAWLER_DEBUG) {
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
console.log("[PROSTOR] Index page : ", url);
|
console.log("[PROSTOR] Index page : ", url);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
const res = await fetch(url, {
|
||||||
|
headers: { Cookie: prostorCookie }
|
||||||
|
});
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
@@ -585,28 +592,21 @@ class ProstorCrawler {
|
|||||||
return savers[0].save(results);
|
return savers[0].save(results);
|
||||||
//so that we can use some sequelize options and information when data is inserted
|
//so that we can use some sequelize options and information when data is inserted
|
||||||
}
|
}
|
||||||
async loginForScraping(PROSTOR_LOGIN) {
|
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
|
||||||
console.log("PROSTOR_LOGIN", PROSTOR_LOGIN);
|
|
||||||
const prostorCookie = await this.getCookies();
|
|
||||||
console.log("prostor cookie", prostorCookie);
|
|
||||||
let formData = new FormData();
|
let formData = new FormData();
|
||||||
formData.append("email", PROSTOR_LOGIN.EMAIL);
|
formData.append("email", PROSTOR_LOGIN.EMAIL);
|
||||||
formData.append("password", PROSTOR_LOGIN.PASSWORD);
|
formData.append("password", PROSTOR_LOGIN.PASSWORD);
|
||||||
//When once loged in it stays loged in with same credentials.
|
|
||||||
//Do we need to log out ??
|
|
||||||
return fetch("https://prostor.ba/moj-prostor/prijava", {
|
return fetch("https://prostor.ba/moj-prostor/prijava", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
body: formData,
|
body: formData,
|
||||||
headers: { Cookie: prostorCookie }
|
headers: { Cookie: prostorCookie }
|
||||||
})
|
})
|
||||||
.then(page => {
|
.then(page => {
|
||||||
//
|
|
||||||
console.log("headers: ", page.headers.raw()["set-cookie"]);
|
|
||||||
return page.text();
|
return page.text();
|
||||||
})
|
})
|
||||||
.then(resp => {
|
.then(resp => {
|
||||||
const $ = cheerio.load(resp);
|
const $ = cheerio.load(resp);
|
||||||
console.log("$ ", $("h1").text());
|
|
||||||
if (
|
if (
|
||||||
$("h1")
|
$("h1")
|
||||||
.text()
|
.text()
|
||||||
|
|||||||
Reference in New Issue
Block a user