From c9a959f8be0b3136659cbc9d4b05e1fa574dd49d Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 25 Sep 2019 08:54:33 +0200 Subject: [PATCH] stop crawling when existing, not renewed ad is found --- app/crawler/crawl.js | 11 +++-- app/crawler/specific/olx.js | 87 ++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 44 deletions(-) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index d37cb9c..77d4fc9 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -18,18 +18,23 @@ const crawlers = [ OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, OLX_CONFIG.OLX_MAX_PAGES, OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, - OLX_CONFIG.OLX_IGNORED_USERNAMES + OLX_CONFIG.OLX_IGNORED_USERNAMES, + OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES ) ]; async function crawlAll() { for (let crawler of crawlers) { try { - await crawler.crawl(); + const newRealEstates = await crawler.crawl(); + + console.log("Number of new real estates : ", newRealEstates.length); } catch (e) { console.log("Error crawling. Trying next crawler! ", e); } } } -crawlAll(); +(async () => { + await crawlAll(); +})(); diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 1f6ea1c..75f9578 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -44,7 +44,8 @@ class OlxCrawler { ], maxPages = 1000, maxResultsPerPage = 100, - ignoredUsernames = [] + ignoredUsernames = [], + delayBetweenPages = 1000 ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; @@ -53,14 +54,14 @@ class OlxCrawler { this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; this.ignoredUsernames = ignoredUsernames; + this.delayBetweenPages = delayBetweenPages; } async crawl() { console.log("[OLX] Crawler started"); const crawlAdCategories = this.crawlerAdCategories; - const savedRealEstates = []; - const asyncSaveActions = []; + const newRealEstates = []; if (crawlAdCategories) { const indexGenerators = []; @@ -71,58 +72,62 @@ class OlxCrawler { let done = false; while (!done) { const categoryIndexerPromises = []; + const generatorsToRemove = []; for (const indexGenerator of indexGenerators) { categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); } - Promise.all(categoryIndexerPromises).then(singlePageResults => { - const entries = singlePageResults.entries(); - for (const [index, { value: singlePageResult }] of entries) { - if (singlePageResult) { - const savePromise = this.saveCrawledResults(singlePageResult) - .then(({ exist, savedRecords }) => { - if (exist) { - indexGenerators.splice(index, 1); - if (indexGenerators.length === 0) { - done = true; - } - } + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); - for (const savedRecord of savedRecords) { - const { createdAt, updatedAt } = savedRecord; + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords, existingRecords } = saveResults; - console.log("Comparing ", createdAt, " <> ", updatedAt); + newRealEstates.push(...newRecords); - const createdAtMoment = moment.utc(createdAt); - const updatedAtMoment = moment.utc(updatedAt); + for (const existingRecord of existingRecords) { + const { publishedDate, renewedDate } = existingRecord; - if (createdAtMoment.isSame(updatedAtMoment, "second")) { - console.log("\tEqual !"); - savedRealEstates.push(savedRecord); - } - } - }) - .catch(error => - console.log("[POSTGRES Saver] Error saving results : ", error) - ); - asyncSaveActions.push(savePromise); - } else { - //Generator returned undefined, no more pages - indexGenerators.splice(index, 1); - if (indexGenerators.length === 0) { - done = true; + const publishedDateMoment = moment.utc(publishedDate); + const renewedDateMoment = moment.utc(renewedDate); + + const stopCrawlingThisCategory = publishedDateMoment.isSame( + renewedDateMoment, + "minute" + ); + + if (stopCrawlingThisCategory) { + generatorsToRemove[index] = true; + // console.log("\tGenerator ", index + 1, "has no more new ads"); + break; } } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); } - }); + } - await this.sleep(5000); + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + // await this.sleep(this.delayBetweenPages); } } - console.log("[OLX] Waiting for async save actions ..."); - await Promise.all(asyncSaveActions); console.log("[OLX] Crawler finished"); - return savedRealEstates; + return newRealEstates; } async *categoryIndexer(adCategory) { @@ -191,7 +196,7 @@ class OlxCrawler { } async scrapeAd(url) { - console.log("Scraping : ", url); + //console.log("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text();