From 511b2900961a470c80ba22d64bbab3ec57193ee9 Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Mon, 13 Jan 2020 12:05:33 +0100 Subject: [PATCH] Login to prostor.ba before crawl. --- app/crawler/specificCrawlers/prostor.js | 44 ++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index 01e2402..04be5f3 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -62,15 +62,16 @@ class ProstorCrawler { async crawl() { const crawlAdCategories = this.crawlerAdCategories; + //We need session cookie to use login privileges + const prostorCookie = await this.getCookies(); //New tag to check if crawler loged in - const login = await this.loginForScraping(PROSTOR_LOGIN); + const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie); const newRealEstates = []; - // - console.log("login before crawl:", login); + //Crawl only if login was successful if (crawlAdCategories && login) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { - indexGenerators.push(this.categoryIndexer(adCategory)); + indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie)); } let done = false; @@ -123,13 +124,14 @@ class ProstorCrawler { return newRealEstates; } - async *categoryIndexer(adCategory) { + async *categoryIndexer(adCategory, prostorCookie) { const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`; const listOfAllRealEstates = await this.extractRealEstates( - urlPageToCrawl + urlPageToCrawl, + prostorCookie ); let elementToStartIndexFrom = 0; @@ -143,7 +145,8 @@ class ProstorCrawler { elementToStartIndexFrom += realEstatesForSinglePage.length; 
const singlePageResults = await this.indexSinglePage( - realEstatesForSinglePage + realEstatesForSinglePage, + prostorCookie ); const filteredSinglePageResults = singlePageResults.filter( @@ -167,10 +170,10 @@ class ProstorCrawler { } } - async indexSinglePage(realEstatesList) { + async indexSinglePage(realEstatesList, prostorCookie) { const asyncActions = []; for (const realEstate of realEstatesList) { - asyncActions.push(this.scrapeAd(realEstate)); + asyncActions.push(this.scrapeAd(realEstate, prostorCookie)); } try { @@ -184,7 +187,7 @@ class ProstorCrawler { } } - async scrapeAd(realEstate) { + async scrapeAd(realEstate, prostorCookie) { const { lat, lng, property_name, price, size, link, status } = realEstate; //Status information is given already in realestate list @@ -200,7 +203,9 @@ class ProstorCrawler { // console.log("[PROSTOR] Scraping : ", url); try { - const adPageSource = await fetch(url); + const adPageSource = await fetch(url, { + headers: { Cookie: prostorCookie } + }); const body = await adPageSource.text(); const $ = cheerio.load(body); @@ -422,13 +427,15 @@ class ProstorCrawler { } } - async extractRealEstates(url) { + async extractRealEstates(url, prostorCookie) { if (PRINT_CRAWLER_DEBUG) { console.log("[PROSTOR] Index page : ", url); } try { - const res = await fetch(url); + const res = await fetch(url, { + headers: { Cookie: prostorCookie } + }); const body = await res.text(); const $ = cheerio.load(body); @@ -585,28 +592,21 @@ class ProstorCrawler { return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } - async loginForScraping(PROSTOR_LOGIN) { - console.log("PROSTOR_LOGIN", PROSTOR_LOGIN); - const prostorCookie = await this.getCookies(); - console.log("prostor cookie", prostorCookie); + async loginForScraping(PROSTOR_LOGIN, prostorCookie) { let formData = new FormData(); formData.append("email", PROSTOR_LOGIN.EMAIL); formData.append("password", PROSTOR_LOGIN.PASSWORD); - //When 
once loged in it stays loged in with same credentials. - //Do we need to log out ?? + return fetch("https://prostor.ba/moj-prostor/prijava", { method: "POST", body: formData, headers: { Cookie: prostorCookie } }) .then(page => { - // - console.log("headers: ", page.headers.raw()["set-cookie"]); return page.text(); }) .then(resp => { const $ = cheerio.load(resp); - console.log("$ ", $("h1").text()); if ( $("h1") .text()