Fixed olx scraper api bug.

2020-03-04 22:56:06 +01:00
parent f5f8fa276c
commit b3708cf842
5 changed files with 58 additions and 46 deletions
--- a/app/config/appConfig.js
+++ b/app/config/appConfig.js
@@ -48,9 +48,7 @@ const USER_AGENT =
 const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
 const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
 const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
-  process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API || 10;
-const DELAY_BETWEEN_REQ_SCRAPER_API =
-  process.env.DELAY_BETWEEN_REQ_SCRAPER_API || 1000;
+  parseInt(process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API) || 10;

 module.exports = {
  APP_PORT,
@@ -69,6 +67,5 @@ module.exports = {
  USER_AGENT,
  USE_SCRAPER_API,
  SCRAPER_API_KEY,
-  NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
-  DELAY_BETWEEN_REQ_SCRAPER_API
+  NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
 };
--- a/app/crawler/specificCrawlers/olx.js
+++ b/app/crawler/specificCrawlers/olx.js
@@ -19,8 +19,7 @@ const {
 const {
  DEFAULT_TIMEZONE,
  PRINT_CRAWLER_DEBUG,
-  NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
-  DELAY_BETWEEN_REQ_SCRAPER_API
+  NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
 } = require("../../config/appConfig");

 const OLX_ENUMS = {
@@ -195,21 +194,20 @@ class OlxCrawler {
      const scrapedData = [];
      for (
        let i = 0;
-        i < actualNoOfResults;
-        i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
+        i <= actualNoOfResults;
+        i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
      ) {
        const concurrentUrlsToScrape = hrefs.slice(
          i,
-          NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
+          i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
        );

        const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
          this.scrapeAd(url)
        );
        const concurrentReqData = await Promise.all(concurrentReqScraperApi);
-        scrapedData.push(concurrentReqData);

-        this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
+        concurrentReqData.forEach(reqData => scrapedData.push(reqData));
      }

      const filteredScrapedData = scrapedData.filter(adData => !!adData);
@@ -221,13 +219,9 @@ class OlxCrawler {
  }

  async scrapeAd(url) {
-    console.log("Scraping : ", url);
-    //  let hasParseErrors = false;
-    //let numberOfParseErrors = 0;
-    //   do {
-    try {
-      // await this.sleep(this.delayBetweenAds);
+    // console.log("Scraping : ", url);

+    try {
      const adPageSource = await fetch(url);
      const body = await adPageSource.text();
      const $ = cheerio.load(body);
@@ -269,13 +263,13 @@ class OlxCrawler {
      )
        .text()
        .trim();
+      // For ads where the price is shown as a discount, which needs parsing different from the default
+      const discountPriceValue = $(
+        "#artikal_glavni_div > div.artikal_lijevo > div.op.pop > p"
+      )
+        .text()
+        .trim();

-      //Debug
-      //console.log("Title:", title);
-      //console.log("Url scraped:", url);
-      // console.log("Normal price value:", normalPriceValue);
-      // console.log("Urgent price value:", urgentPriceValue);
-      //
      if (normalPriceValue && normalPriceValue.length > 0) {
        normalPrice = normalPriceValue
          .replace(/\r\n|\n|\r/gm, "")
@@ -290,10 +284,11 @@ class OlxCrawler {
        } else {
          status = AD_STATUS.STATUS_NORMAL;
        }
+      } else if (discountPriceValue && discountPriceValue.length > 0) {
+        status = AD_STATUS.STATUS_URGENT;
+        const priceValues = discountPriceValue.split("KM");
+        normalPrice = priceValues[0].trim();
      } else {
-        //
-        console.log("Body:", body);
-        //
        throw { message: "Can't find normal price" };
      }
      if (urgentPriceValue && urgentPriceValue.length > 0) {
@@ -306,6 +301,10 @@ class OlxCrawler {
          } else {
            urgentPrice = priceValues[0].trim();
          }
+        } else if (discountPriceValue && discountPriceValue.length > 0) {
+          status = AD_STATUS.STATUS_URGENT;
+          const priceValues = discountPriceValue.split("KM");
+          urgentPrice = priceValues[1].trim();
        } else {
          throw { message: "Can't find urgent price" };
        }
@@ -692,20 +691,11 @@ class OlxCrawler {
        distanceToRiver,
        numberOfViewsAgency
      };
-      //
-      //console.log("Scraped data:", data);
-
-      //Delay between real estate ads to avoid error from Scraper API
-      // await this.sleep(this.delayBetweenAds);

      return data;
    } catch (e) {
-      // hasParseErrors = true;
-      // numberOfParseErrors++;
      console.error("Exception caught: " + e.message, "\r\nURL:", url);
    }
-    //  } while (hasParseErrors && numberOfParseErrors <= 1);
-    // await this.sleep(this.delayBetweenAds);

    return null;
  }
--- a/app/crawler/specificCrawlers/saljic.js
+++ b/app/crawler/specificCrawlers/saljic.js
@@ -16,7 +16,8 @@ const {

 const {
  PRINT_CRAWLER_DEBUG,
-  DEFAULT_TIMEZONE
+  DEFAULT_TIMEZONE,
+  NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
 } = require("../../config/appConfig");
 const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");

@@ -204,13 +205,32 @@ class SaljicCrawler {
          ? hrefsAbs.length
          : maxResultsPerPage;

-      const asyncScraping = [];
-      for (let i = 0; i < actualNoOfResults; i++) {
-        asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
+      const scrapedData = [];
+      for (
+        let i = 0;
+        i <= actualNoOfResults;
+        i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
+      ) {
+        const concurrentUrlsToScrape = hrefsAbs.slice(
+          i,
+          i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
+        );
+
+        const concurrentAdTypesOfReq = adTypes.slice(
+          i,
+          i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
+        );
+
+        const concurrentReqScraperApi = concurrentUrlsToScrape.map(
+          (url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index])
+        );
+        const concurrentReqData = await Promise.all(concurrentReqScraperApi);
+
+        concurrentReqData.forEach(reqData => scrapedData.push(reqData));
      }

-      const scrapedData = await Promise.all(asyncScraping);
      const filteredScrapedData = scrapedData.filter(adData => !!adData);
+
      return filteredScrapedData;
    } catch (e) {
      console.error("[SALJIC] Exception caught:" + e);
@@ -225,6 +245,10 @@ class SaljicCrawler {
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

+      // Throws an error if the request to the Scraper API proxy wasn't successful and it responded with an error
+      if (body.indexOf("<html>") === -1) {
+        throw { message: "Scraper API server error." };
+      }
      // No information for status ex. PRODAN
      const status = AD_STATUS.STATUS_NORMAL;
      //Extracting agency ID from url
@@ -508,6 +532,11 @@ class SaljicCrawler {
      const region = "";
      const entity = "";
      const country = "";
+      // Throws an error if realEstateType is null (i.e. it was not read). Still don't know why this happens.
+      if (realEstateType === null) {
+        console.log("Body:", body);
+        throw { message: "Couldn't read real estate type." };
+      }

      const data = {
        url,
@@ -573,13 +602,10 @@ class SaljicCrawler {
        numberOfViewsKivi
      };

-      await this.sleep(1000);
-
      return data;
    } catch (e) {
      console.error("Exception caught: " + e.message, "\r\nURL:", url);
    }
-    await this.sleep(1000);

    return null;
  }
--- a/app/helpers/fetchWrapper.js
+++ b/app/helpers/fetchWrapper.js
@@ -16,7 +16,7 @@ const fetch = async (url, options = {}) => {
    : url;

  //
-  console.log("Url for scraping:", urlAdaptedForScraping);
+  // console.log("Url for scraping:", urlAdaptedForScraping);

  return nodeFetch(urlAdaptedForScraping, newOptions);
 };
--- a/development.env
+++ b/development.env
@@ -26,7 +26,6 @@ API_MAP_KEY=(your-key-here)
 USE_SCRAPER_API= To turn it on (1) or off (0)
 SCRAPER_API_KEY= Key for Scraper api 
 NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Srcaper API proxy
-DELAY_BETWEEN_REQ_SCRAPER_API= time in miliseconds to wait before sending next req  bulk to awoid server errors with Scraper API

 #=============== AWS SDK EMAIL SETTINGS =======#
 AWS_KEY_ID=(your-key-here)