Log in to prostor.ba before crawling.

This commit is contained in:
Naida Vatric
2020-01-13 12:05:33 +01:00
parent ba43fa0713
commit 511b290096

View File

@@ -62,15 +62,16 @@ class ProstorCrawler {
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
//We need session cookie to use login privileges
const prostorCookie = await this.getCookies();
//New tag to check whether the crawler is logged in
const login = await this.loginForScraping(PROSTOR_LOGIN);
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
const newRealEstates = [];
//
console.log("login before crawl:", login);
//Crawl only if login was successful
if (crawlAdCategories && login) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory));
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
}
let done = false;
@@ -123,13 +124,14 @@ class ProstorCrawler {
return newRealEstates;
}
async *categoryIndexer(adCategory) {
async *categoryIndexer(adCategory, prostorCookie) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl
urlPageToCrawl,
prostorCookie
);
let elementToStartIndexFrom = 0;
@@ -143,7 +145,8 @@ class ProstorCrawler {
elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage
realEstatesForSinglePage,
prostorCookie
);
const filteredSinglePageResults = singlePageResults.filter(
@@ -167,10 +170,10 @@ class ProstorCrawler {
}
}
async indexSinglePage(realEstatesList) {
async indexSinglePage(realEstatesList, prostorCookie) {
const asyncActions = [];
for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate));
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
}
try {
@@ -184,7 +187,7 @@ class ProstorCrawler {
}
}
async scrapeAd(realEstate) {
async scrapeAd(realEstate, prostorCookie) {
const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list
@@ -200,7 +203,9 @@ class ProstorCrawler {
// console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const adPageSource = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await adPageSource.text();
const $ = cheerio.load(body);
@@ -422,13 +427,15 @@ class ProstorCrawler {
}
}
async extractRealEstates(url) {
async extractRealEstates(url, prostorCookie) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
}
try {
const res = await fetch(url);
const res = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await res.text();
const $ = cheerio.load(body);
@@ -585,28 +592,21 @@ class ProstorCrawler {
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
async loginForScraping(PROSTOR_LOGIN) {
console.log("PROSTOR_LOGIN", PROSTOR_LOGIN);
const prostorCookie = await this.getCookies();
console.log("prostor cookie", prostorCookie);
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
let formData = new FormData();
formData.append("email", PROSTOR_LOGIN.EMAIL);
formData.append("password", PROSTOR_LOGIN.PASSWORD);
//Once logged in, the session stays logged in with the same credentials.
//Do we need to log out?
return fetch("https://prostor.ba/moj-prostor/prijava", {
method: "POST",
body: formData,
headers: { Cookie: prostorCookie }
})
.then(page => {
//
console.log("headers: ", page.headers.raw()["set-cookie"]);
return page.text();
})
.then(resp => {
const $ = cheerio.load(resp);
console.log("$ ", $("h1").text());
if (
$("h1")
.text()