Created non-concurrent request groups.

This commit is contained in:
Naida Vatric
2020-03-03 15:21:51 +01:00
parent ccea5fe2aa
commit f5f8fa276c
5 changed files with 60 additions and 19 deletions

View File

@@ -35,6 +35,5 @@ module.exports = {
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES:
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
};

View File

@@ -18,7 +18,9 @@ const {
const {
DEFAULT_TIMEZONE,
PRINT_CRAWLER_DEBUG
PRINT_CRAWLER_DEBUG,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
} = require("../../config/appConfig");
const OLX_ENUMS = {
@@ -42,10 +44,7 @@ const OLX_ENUMS = {
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
};
const {
OLX_FORCE_CRAWL,
OLX_DELAY_BETWEEN_ADS
} = require("../specificConfigs/olx");
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
class OlxCrawler {
constructor(
@@ -55,8 +54,7 @@ class OlxCrawler {
maxPages = 1000,
maxResultsPerPage = 100,
ignoredUsernames = [],
delayBetweenPages = 1000,
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
delayBetweenPages = 1000
) {
this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -66,7 +64,6 @@ class OlxCrawler {
this.maxResultsPerPage = maxResultsPerPage;
this.ignoredUsernames = ignoredUsernames;
this.delayBetweenPages = delayBetweenPages;
this.delayBetweenAds = delayBetweenAds;
}
async crawl() {
@@ -195,14 +192,26 @@ class OlxCrawler {
let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefs[i]));
//Delaying next scrape ad request to avoid ScraperAPI server error
asyncScraping.push(this.sleep(this.delayBetweenAds));
// Scrape the ads in batches: requests inside one batch run concurrently,
// batches run one after another with a pause in between, so ScraperAPI never
// sees more than `batchSize` simultaneous requests from this crawler.
// Result: `scrapedData` is a FLAT array with one entry per scraped URL
// (whatever scrapeAd resolved to, including null for failed ads).
const scrapedData = [];
// Guard against a missing/zero/NaN config value, which would otherwise make
// the loop below spin forever (i += 0) or stop after a single oversized batch.
const batchSize = Math.max(
  1,
  Number(NUMBER_OF_CONCURRENT_REQ_SCRAPER_API) || 1
);
for (let i = 0; i < actualNoOfResults; i += batchSize) {
  // Array.prototype.slice takes an END INDEX, not a length. Cap the end at
  // actualNoOfResults so the maxResultsPerPage limit is still honoured.
  const concurrentUrlsToScrape = hrefs.slice(
    i,
    Math.min(i + batchSize, actualNoOfResults)
  );
  const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
    this.scrapeAd(url)
  );
  const concurrentReqData = await Promise.all(concurrentReqScraperApi);
  // Spread the batch so scrapedData stays flat; pushing the array itself
  // would nest it and defeat the null-filtering done on the result afterwards.
  scrapedData.push(...concurrentReqData);
  // The delay only throttles anything if it is awaited.
  await this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
}
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) {
@@ -217,7 +226,7 @@ class OlxCrawler {
//let numberOfParseErrors = 0;
// do {
try {
await this.sleep(this.delayBetweenAds);
// await this.sleep(this.delayBetweenAds);
const adPageSource = await fetch(url);
const body = await adPageSource.text();
@@ -696,7 +705,7 @@ class OlxCrawler {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
// } while (hasParseErrors && numberOfParseErrors <= 1);
await this.sleep(this.delayBetweenAds);
// await this.sleep(this.delayBetweenAds);
return null;
}
@@ -913,6 +922,28 @@ class OlxCrawler {
console.log("sprat = NEPOZNATO [", floorText, "]");
return null;
}
/*
async consecutiveRequestSending(requestsToScraperApi) {
let dataFromAllRequests = [];
for (
const i = 0;
i <= requestsToScraperApi.length;
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentRequestsToScraperApi = requestsToScraperApi.slice(
i,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const dataFromConcurrentRequest = await Promise.all(
concurrentRequestsToScraperApi
);
dataFromAllRequests.push(dataFromConcurrentRequest);
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
}
return dataFromAllRequests;
}*/
async sleep(ms) {
// console.log("Sleep for:", ms);