diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index d4c335e..86f894c 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -59,16 +59,17 @@ async function crawlAll() { AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE, AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES, AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES - ), - new SaljicCrawler( - [postgresSaver], - SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE, - SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES, - SALJIC_CONFIG.SALJIC_MAX_PAGES, - SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE, - SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES, - SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES ) + //, + //new SaljicCrawler( + //[postgresSaver], + //SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE, + //SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES, + //SALJIC_CONFIG.SALJIC_MAX_PAGES, + //SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE, + //SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES, + //SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES + //) ]; const newRealEstates = []; diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 92da55c..3722f5e 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -1,6 +1,7 @@ "use strict"; const fetch = require("../../helpers/fetchWrapper"); +const { logDebug } = require("../../helpers/log"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); @@ -45,6 +46,16 @@ const OLX_ENUMS = { const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); +const chunk = (array, size = 10) => { + let i, j ,temparray; + const result = [] + for (i=0,j=array.length; i { - const result = await this.scrapeAd(data) - await this.sleep(this.delayBetweenPages); - dataResults.push(result) - return result; //TODO: this does not work, scrapedData is null, dataResults works - }) + for (let i = 0; i < allChunks.length; i++) { + const singleChunk = allChunks[i]; + const promises = singleChunk.map(c => this.scrapeAd(c)) + const chunkResults = await Promise.all(promises); + await this.sleep(this.delayBetweenPages); + dataResults.push(...chunkResults); + logDebug("Chunk results len:", chunkResults.length); + } + const filteredScrapedData = dataResults.filter(adData => !!adData); + logDebug("Filtered scraped data length: ", filteredScrapedData.length); + return filteredScrapedData; } catch (e) { console.error("Exception caught:" + e); @@ -248,7 +262,7 @@ class OlxCrawler { } async scrapeAd(url) { - //console.log("Scraping : ", url); + logDebug("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); diff --git a/app/helpers/log.js b/app/helpers/log.js new file mode 100644 index 0000000..f75ac66 --- /dev/null +++ b/app/helpers/log.js @@ -0,0 +1,13 @@ +const { + PRINT_CRAWLER_DEBUG +} = require("../config/appConfig"); + +const logDebug = (...args) => { + if (PRINT_CRAWLER_DEBUG) { + console.log(...args); + } +} + +module.exports = { + logDebug +}; diff --git a/index.js b/index.js index 0669e7b..c32882b 100644 --- a/index.js +++ b/index.js @@ -4,6 +4,7 @@ const bodyParser = require("body-parser"); const layout = require("express-layout"); const compression = require("compression"); const forceSSL = require("./app/helpers/forceSSL"); +const { logDebug } = require("./app/helpers/log"); const { APP_PORT, @@ -38,9 +39,11 @@ app.listen(APP_PORT, () => let crawlerRunning = STOP_CRAWLER; const crawl = () => { + logDebug("Crawl start"); if (!crawlerRunning) { crawlerRunning = true; crawlAll().then(newRealEstates => { + logDebug("crawlAll done, new real estate len: ", newRealEstates.length); crawlerRunning = false; notifyForNewRealEstates(newRealEstates); });