From e4775158fc528c45c535eed539e58dbe9013cc7f Mon Sep 17 00:00:00 2001 From: Senad Uka Date: Thu, 10 Sep 2020 19:39:13 +0200 Subject: [PATCH] Promise pool --- app/crawler/specificCrawlers/olx.js | 9 +++++++++ app/helpers/fetchWrapper.js | 2 -- package-lock.json | 15 +++++++++++++++ package.json | 1 + 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index ecaf7eb..03d63d8 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -4,6 +4,7 @@ const fetch = require("../../helpers/fetchWrapper"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); +const PromisePool = require('@supercharge/promise-pool'); const { AD_TYPE, @@ -224,6 +225,14 @@ class OlxCrawler { asyncScraping.push(this.scrapeAd(hrefs[i])); } + + const { scrapedData, errors } = await PromisePool + .withConcurrency(2) + .for(asyncScraping) + .process(async data => { + return await data + }) + const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); return filteredScrapedData; diff --git a/app/helpers/fetchWrapper.js b/app/helpers/fetchWrapper.js index 8c0c457..f439a47 100644 --- a/app/helpers/fetchWrapper.js +++ b/app/helpers/fetchWrapper.js @@ -25,8 +25,6 @@ const fetch = async (url, options = {}) => { const urlAdaptedForScraping = USE_SCRAPER_API ? `${SCRAPER_API_BASE_URL}?api_key=${SCRAPER_API_KEY}&url=${urlToFetchThroughAPI}` : url; - const randomPauseMS = Math.floor(Math.random() * Math.floor(1500)) + 500; - await timeout(randomPauseMS); return nodeFetch(urlAdaptedForScraping, newOptions); }; diff --git a/package-lock.json b/package-lock.json index 4626180..f45e4f8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -40,6 +40,21 @@ "@sendgrid/helpers": "^6.3.0" } }, + "@supercharge/goodies": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@supercharge/goodies/-/goodies-1.4.0.tgz", + "integrity": "sha512-Np6u2qjRwiA3wTgzz4n2yduydIjSXqtJWP5cOnNqjdlCR/EUAK86LAOhEcU+YW211D1ksugns3GqpARJDoXQ7g==", + "dev": true + }, + "@supercharge/promise-pool": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@supercharge/promise-pool/-/promise-pool-1.3.0.tgz", + "integrity": "sha512-9/EVrJevSPEqI4i/gRH8Dt7C+FQT65wRRYuu0MDaGmSLZ2aTel0jOGu8Ae84fPiQ+Ah0B80RPFUxk+K+Cz48DA==", + "dev": true, + "requires": { + "@supercharge/goodies": "~1.4.0" + } + }, "@types/caseless": { "version": "0.12.2", "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz", diff --git a/package.json b/package.json index 1dd28bd..5feed79 100644 --- a/package.json +++ b/package.json @@ -55,6 +55,7 @@ "sequelize-cli": "^5.5.0" }, "devDependencies": { + "@supercharge/promise-pool": "^1.3.0", "nodemon": "^1.19.0" } }