Login to prostor.ba before crawl.

This commit is contained in:
Naida Vatric
2020-01-13 12:05:33 +01:00
parent ba43fa0713
commit 511b290096

View File

@@ -62,15 +62,16 @@ class ProstorCrawler {
async crawl() { async crawl() {
const crawlAdCategories = this.crawlerAdCategories; const crawlAdCategories = this.crawlerAdCategories;
//We need session cookie to use login privileges
const prostorCookie = await this.getCookies();
//New tag to check if crawler loged in //New tag to check if crawler loged in
const login = await this.loginForScraping(PROSTOR_LOGIN); const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
const newRealEstates = []; const newRealEstates = [];
// //Crawl only if login was successful
console.log("login before crawl:", login);
if (crawlAdCategories && login) { if (crawlAdCategories && login) {
const indexGenerators = []; const indexGenerators = [];
for (const adCategory of crawlAdCategories) { for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory)); indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
} }
let done = false; let done = false;
@@ -123,13 +124,14 @@ class ProstorCrawler {
return newRealEstates; return newRealEstates;
} }
async *categoryIndexer(adCategory) { async *categoryIndexer(adCategory, prostorCookie) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`; const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates( const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl urlPageToCrawl,
prostorCookie
); );
let elementToStartIndexFrom = 0; let elementToStartIndexFrom = 0;
@@ -143,7 +145,8 @@ class ProstorCrawler {
elementToStartIndexFrom += realEstatesForSinglePage.length; elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage( const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage realEstatesForSinglePage,
prostorCookie
); );
const filteredSinglePageResults = singlePageResults.filter( const filteredSinglePageResults = singlePageResults.filter(
@@ -167,10 +170,10 @@ class ProstorCrawler {
} }
} }
async indexSinglePage(realEstatesList) { async indexSinglePage(realEstatesList, prostorCookie) {
const asyncActions = []; const asyncActions = [];
for (const realEstate of realEstatesList) { for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate)); asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
} }
try { try {
@@ -184,7 +187,7 @@ class ProstorCrawler {
} }
} }
async scrapeAd(realEstate) { async scrapeAd(realEstate, prostorCookie) {
const { lat, lng, property_name, price, size, link, status } = realEstate; const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list //Status information is given already in realestate list
@@ -200,7 +203,9 @@ class ProstorCrawler {
// console.log("[PROSTOR] Scraping : ", url); // console.log("[PROSTOR] Scraping : ", url);
try { try {
const adPageSource = await fetch(url); const adPageSource = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
@@ -422,13 +427,15 @@ class ProstorCrawler {
} }
} }
async extractRealEstates(url) { async extractRealEstates(url, prostorCookie) {
if (PRINT_CRAWLER_DEBUG) { if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url); console.log("[PROSTOR] Index page : ", url);
} }
try { try {
const res = await fetch(url); const res = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
@@ -585,28 +592,21 @@ class ProstorCrawler {
return savers[0].save(results); return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted //so that we can use some sequelize options and information when data is inserted
} }
async loginForScraping(PROSTOR_LOGIN) { async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
console.log("PROSTOR_LOGIN", PROSTOR_LOGIN);
const prostorCookie = await this.getCookies();
console.log("prostor cookie", prostorCookie);
let formData = new FormData(); let formData = new FormData();
formData.append("email", PROSTOR_LOGIN.EMAIL); formData.append("email", PROSTOR_LOGIN.EMAIL);
formData.append("password", PROSTOR_LOGIN.PASSWORD); formData.append("password", PROSTOR_LOGIN.PASSWORD);
//When once loged in it stays loged in with same credentials.
//Do we need to log out ??
return fetch("https://prostor.ba/moj-prostor/prijava", { return fetch("https://prostor.ba/moj-prostor/prijava", {
method: "POST", method: "POST",
body: formData, body: formData,
headers: { Cookie: prostorCookie } headers: { Cookie: prostorCookie }
}) })
.then(page => { .then(page => {
//
console.log("headers: ", page.headers.raw()["set-cookie"]);
return page.text(); return page.text();
}) })
.then(resp => { .then(resp => {
const $ = cheerio.load(resp); const $ = cheerio.load(resp);
console.log("$ ", $("h1").text());
if ( if (
$("h1") $("h1")
.text() .text()