From f5f8fa276c1fc8143aeefef0e0a17e7f3702009e Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Tue, 3 Mar 2020 15:21:51 +0100 Subject: [PATCH] Created non-concurrent request groups. --- app/config/appConfig.js | 8 +++- app/crawler/specificConfigs/olx.js | 1 - app/crawler/specificCrawlers/olx.js | 63 +++++++++++++++++++++-------- app/helpers/fetchWrapper.js | 3 ++ development.env | 4 +- 5 files changed, 60 insertions(+), 19 deletions(-) diff --git a/app/config/appConfig.js b/app/config/appConfig.js index 0ff991a..e1864a8 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -47,6 +47,10 @@ const USER_AGENT = const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || ""; +const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API = + process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API || 10; +const DELAY_BETWEEN_REQ_SCRAPER_API = + process.env.DELAY_BETWEEN_REQ_SCRAPER_API || 1000; module.exports = { APP_PORT, @@ -64,5 +68,7 @@ module.exports = { PROSTOR_LOGIN, USER_AGENT, USE_SCRAPER_API, - SCRAPER_API_KEY + SCRAPER_API_KEY, + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, + DELAY_BETWEEN_REQ_SCRAPER_API }; diff --git a/app/crawler/specificConfigs/olx.js b/app/crawler/specificConfigs/olx.js index f64c251..150ec16 100644 --- a/app/crawler/specificConfigs/olx.js +++ b/app/crawler/specificConfigs/olx.js @@ -35,6 +35,5 @@ module.exports = { OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000, - OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000, OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL) }; diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index eed5050..2801a55 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -18,7 +18,9 @@ const { const { DEFAULT_TIMEZONE, - PRINT_CRAWLER_DEBUG + PRINT_CRAWLER_DEBUG, + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, + DELAY_BETWEEN_REQ_SCRAPER_API } = require("../../config/appConfig"); const OLX_ENUMS = { @@ -42,10 +44,7 @@ const OLX_ENUMS = { OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; -const { - OLX_FORCE_CRAWL, - OLX_DELAY_BETWEEN_ADS -} = require("../specificConfigs/olx"); +const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); class OlxCrawler { constructor( @@ -55,8 +54,7 @@ class OlxCrawler { maxPages = 1000, maxResultsPerPage = 100, ignoredUsernames = [], - delayBetweenPages = 1000, - delayBetweenAds = OLX_DELAY_BETWEEN_ADS + delayBetweenPages = 1000 ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; @@ -66,7 +64,6 @@ class OlxCrawler { this.maxResultsPerPage = maxResultsPerPage; this.ignoredUsernames = ignoredUsernames; this.delayBetweenPages = delayBetweenPages; - this.delayBetweenAds = delayBetweenAds; } async crawl() { @@ -195,14 +192,26 @@ class OlxCrawler { let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; - const asyncScraping = []; - for (let i = 0; i < actualNoOfResults; i++) { - asyncScraping.push(this.scrapeAd(hrefs[i])); - //Delaying next scrape ad request to avoid ScraperAPI server error - asyncScraping.push(this.sleep(this.delayBetweenAds)); + const scrapedData = []; + for ( + let i = 0; + i < actualNoOfResults; + i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ) { + const concurrentUrlsToScrape = hrefs.slice( + i, + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ); + + const concurrentReqScraperApi = concurrentUrlsToScrape.map(url => + this.scrapeAd(url) + ); + const concurrentReqData = await Promise.all(concurrentReqScraperApi); + scrapedData.push(concurrentReqData); + + this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API); } - const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); return filteredScrapedData; } catch (e) { @@ -217,7 +226,7 @@ class OlxCrawler { //let numberOfParseErrors = 0; // do { try { - await this.sleep(this.delayBetweenAds); + // await this.sleep(this.delayBetweenAds); const adPageSource = await fetch(url); const body = await adPageSource.text(); @@ -696,7 +705,7 @@ class OlxCrawler { console.error("Exception caught: " + e.message, "\r\nURL:", url); } // } while (hasParseErrors && numberOfParseErrors <= 1); - await this.sleep(this.delayBetweenAds); + // await this.sleep(this.delayBetweenAds); return null; } @@ -913,6 +922,28 @@ class OlxCrawler { console.log("sprat = NEPOZNATO [", floorText, "]"); return null; } + /* + async consecutiveRequestSending(requestsToScraperApi) { + let dataFromAllRequests = []; + + for ( + const i = 0; + i <= requestsToScraperApi.length; + i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ) { + const concurrentRequestsToScraperApi = requestsToScraperApi.slice( + i, + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + ); + const dataFromConcurrentRequest = await Promise.all( + concurrentRequestsToScraperApi + ); + dataFromAllRequests.push(dataFromConcurrentRequest); + this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API); + } + + return dataFromAllRequests; + }*/ async sleep(ms) { // console.log("Sleep for:", ms); diff --git a/app/helpers/fetchWrapper.js b/app/helpers/fetchWrapper.js index 3af2817..6091053 100644 --- a/app/helpers/fetchWrapper.js +++ b/app/helpers/fetchWrapper.js @@ -15,6 +15,9 @@ const fetch = async (url, options = {}) => { ? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}` : url; + // + console.log("Url for scraping:", urlAdaptedForScraping); + return nodeFetch(urlAdaptedForScraping, newOptions); }; diff --git a/development.env b/development.env index acd2339..9548ae7 100644 --- a/development.env +++ b/development.env @@ -25,6 +25,8 @@ API_MAP_KEY=(your-key-here) #=============== SCRAPER API SUPORT =============# USE_SCRAPER_API= To turn it on (1) or off (0) SCRAPER_API_KEY= Key for Scraper api +NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Srcaper API proxy +DELAY_BETWEEN_REQ_SCRAPER_API= time in miliseconds to wait before sending next req bulk to awoid server errors with Scraper API #=============== AWS SDK EMAIL SETTINGS =======# AWS_KEY_ID=(your-key-here) @@ -36,6 +38,7 @@ SOURCE_EMAIL=info@saburly.com CRAWLER_INTERVAL=Interval to run cralwer(s), in seconds STOP_CRAWLER=Non-zero value will skip crawler execution PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console + #==OLX== OLX_MAX_PAGES=Restrict crawler to this number of pages OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved @@ -43,7 +46,6 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore OLX_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page -OLX_DELAY_BETWEEN_ADS = time in miliseconds to wait before scraping next add to awoid server errors with Scraper API OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found #==RENTAL==