diff --git a/app/config/appConfig.js b/app/config/appConfig.js index e1864a8..b8fe9c8 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -48,9 +48,7 @@ const USER_AGENT = const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || ""; const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API = - process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API || 10; -const DELAY_BETWEEN_REQ_SCRAPER_API = - process.env.DELAY_BETWEEN_REQ_SCRAPER_API || 1000; + parseInt(process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API) || 10; module.exports = { APP_PORT, @@ -69,6 +67,5 @@ module.exports = { USER_AGENT, USE_SCRAPER_API, SCRAPER_API_KEY, - NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, - DELAY_BETWEEN_REQ_SCRAPER_API + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API }; diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 2801a55..1786cb7 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -19,8 +19,7 @@ const { const { DEFAULT_TIMEZONE, PRINT_CRAWLER_DEBUG, - NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, - DELAY_BETWEEN_REQ_SCRAPER_API + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API } = require("../../config/appConfig"); const OLX_ENUMS = { @@ -195,21 +194,20 @@ class OlxCrawler { const scrapedData = []; for ( let i = 0; - i < actualNoOfResults; - i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + i <= actualNoOfResults; + i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API ) { const concurrentUrlsToScrape = hrefs.slice( i, - NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API ); const concurrentReqScraperApi = concurrentUrlsToScrape.map(url => this.scrapeAd(url) ); const concurrentReqData = await Promise.all(concurrentReqScraperApi); - scrapedData.push(concurrentReqData); - this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API); + concurrentReqData.forEach(reqData => scrapedData.push(reqData)); } const filteredScrapedData = scrapedData.filter(adData => !!adData); @@ -221,13 +219,9 @@ class OlxCrawler { } async scrapeAd(url) { - console.log("Scraping : ", url); - // let hasParseErrors = false; - //let numberOfParseErrors = 0; - // do { - try { - // await this.sleep(this.delayBetweenAds); + // console.log("Scraping : ", url); + try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); @@ -269,13 +263,13 @@ class OlxCrawler { ) .text() .trim(); + //For cases where price is given in discount manner - different from default parsing + const discountPriceValue = $( + "#artikal_glavni_div > div.artikal_lijevo > div.op.pop > p" + ) + .text() + .trim(); - //Debug - //console.log("Title:", title); - //console.log("Url scraped:", url); - // console.log("Normal price value:", normalPriceValue); - // console.log("Urgent price value:", urgentPriceValue); - // if (normalPriceValue && normalPriceValue.length > 0) { normalPrice = normalPriceValue .replace(/\r\n|\n|\r/gm, "") @@ -290,10 +284,11 @@ class OlxCrawler { } else { status = AD_STATUS.STATUS_NORMAL; } + } else if (discountPriceValue && discountPriceValue.length > 0) { + status = AD_STATUS.STATUS_URGENT; + const priceValues = discountPriceValue.split("KM"); + normalPrice = priceValues[0].trim(); } else { - // - console.log("Body:", body); - // throw { message: "Can't find normal price" }; } if (urgentPriceValue && urgentPriceValue.length > 0) { @@ -306,6 +301,10 @@ class OlxCrawler { } else { urgentPrice = priceValues[0].trim(); } + } else if (discountPriceValue && discountPriceValue.length > 0) { + status = AD_STATUS.STATUS_URGENT; + const priceValues = discountPriceValue.split("KM"); + urgentPrice = priceValues[1].trim(); } else { throw { message: "Can't find urgent price" }; } @@ -692,20 +691,11 @@ class OlxCrawler { distanceToRiver, numberOfViewsAgency }; - // - //console.log("Scraped data:", data); - - //Delay between real estate ads to avoid error from Scraper API - // await this.sleep(this.delayBetweenAds); return data; } catch (e) { - // hasParseErrors = true; - // numberOfParseErrors++; console.error("Exception caught: " + e.message, "\r\nURL:", url); } - // } while (hasParseErrors && numberOfParseErrors <= 1); - // await this.sleep(this.delayBetweenAds); return null; } diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 3e3b371..221da05 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -16,7 +16,8 @@ const { const { PRINT_CRAWLER_DEBUG, - DEFAULT_TIMEZONE + DEFAULT_TIMEZONE, + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API } = require("../../config/appConfig"); const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic"); @@ -204,13 +205,32 @@ class SaljicCrawler { ? hrefsAbs.length : maxResultsPerPage; - const asyncScraping = []; - for (let i = 0; i < actualNoOfResults; i++) { - asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i])); + const scrapedData = []; + for ( + let i = 0; + i <= actualNoOfResults; + i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ) { + const concurrentUrlsToScrape = hrefsAbs.slice( + i, + i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ); + + const concurrentAdTypesOfReq = adTypes.slice( + i, + i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ); + + const concurrentReqScraperApi = concurrentUrlsToScrape.map( + (url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index]) + ); + const concurrentReqData = await Promise.all(concurrentReqScraperApi); + + concurrentReqData.forEach(reqData => scrapedData.push(reqData)); } - const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); + return filteredScrapedData; } catch (e) { console.error("[SALJIC] Exception caught:" + e); @@ -225,6 +245,10 @@ class SaljicCrawler { const body = await adPageSource.text(); const $ = cheerio.load(body); + //Throws error if req to Scraper API proxy wasn't succesful and responds with error + if (body.indexOf("") === -1) { + throw { message: "Scraper API server error." }; + } // No information for status ex. PRODAN const status = AD_STATUS.STATUS_NORMAL; //Extracting agency ID from url @@ -508,6 +532,11 @@ class SaljicCrawler { const region = ""; const entity = ""; const country = ""; + //Throws error if realEstateType is null - not read. Still dont know why? + if (realEstateType === null) { + console.log("Body:", body); + throw { message: "Couldn't read real estate type." }; + } const data = { url, @@ -573,13 +602,10 @@ class SaljicCrawler { numberOfViewsKivi }; - await this.sleep(1000); - return data; } catch (e) { console.error("Exception caught: " + e.message, "\r\nURL:", url); } - await this.sleep(1000); return null; } diff --git a/app/helpers/fetchWrapper.js b/app/helpers/fetchWrapper.js index 6091053..f7d2cc3 100644 --- a/app/helpers/fetchWrapper.js +++ b/app/helpers/fetchWrapper.js @@ -16,7 +16,7 @@ const fetch = async (url, options = {}) => { : url; // - console.log("Url for scraping:", urlAdaptedForScraping); + // console.log("Url for scraping:", urlAdaptedForScraping); return nodeFetch(urlAdaptedForScraping, newOptions); }; diff --git a/development.env b/development.env index 9548ae7..2dbe922 100644 --- a/development.env +++ b/development.env @@ -26,7 +26,6 @@ API_MAP_KEY=(your-key-here) USE_SCRAPER_API= To turn it on (1) or off (0) SCRAPER_API_KEY= Key for Scraper api NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Srcaper API proxy -DELAY_BETWEEN_REQ_SCRAPER_API= time in miliseconds to wait before sending next req bulk to awoid server errors with Scraper API #=============== AWS SDK EMAIL SETTINGS =======# AWS_KEY_ID=(your-key-here)