Created non-concurrent request groups.
This commit is contained in:
@@ -47,6 +47,10 @@ const USER_AGENT =
|
||||
|
||||
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
|
||||
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
|
||||
/**
 * Converts a raw environment-variable string into an integer setting.
 *
 * Environment variables are always strings, so using them directly in
 * arithmetic (e.g. `i += value`) concatenates instead of adding.
 *
 * @param {string|undefined} rawValue - Value read from process.env.
 * @param {number} fallback - Returned when rawValue is missing, not an
 *   integer, or below `minimum`.
 * @param {number} [minimum=0] - Smallest accepted value.
 * @returns {number} The parsed integer, or `fallback`.
 */
const parseIntegerSetting = (rawValue, fallback, minimum = 0) => {
  const parsed = Number.parseInt(rawValue, 10);
  return Number.isNaN(parsed) || parsed < minimum ? fallback : parsed;
};

// Number of ScraperAPI requests sent together in one batch. It is used as a
// loop step by the crawler, so it must be a number of at least 1.
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API = parseIntegerSetting(
  process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
  10,
  1
);
// Pause, in milliseconds, between two batches of ScraperAPI requests.
const DELAY_BETWEEN_REQ_SCRAPER_API = parseIntegerSetting(
  process.env.DELAY_BETWEEN_REQ_SCRAPER_API,
  1000
);
|
||||
|
||||
module.exports = {
|
||||
APP_PORT,
|
||||
@@ -64,5 +68,7 @@ module.exports = {
|
||||
PROSTOR_LOGIN,
|
||||
USER_AGENT,
|
||||
USE_SCRAPER_API,
|
||||
SCRAPER_API_KEY
|
||||
SCRAPER_API_KEY,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
};
|
||||
|
||||
@@ -35,6 +35,5 @@ module.exports = {
|
||||
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
||||
OLX_DELAY_BETWEEN_PAGES:
|
||||
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
|
||||
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
|
||||
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
|
||||
};
|
||||
|
||||
@@ -18,7 +18,9 @@ const {
|
||||
|
||||
const {
|
||||
DEFAULT_TIMEZONE,
|
||||
PRINT_CRAWLER_DEBUG
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
@@ -42,10 +44,7 @@ const OLX_ENUMS = {
|
||||
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||
};
|
||||
|
||||
const {
|
||||
OLX_FORCE_CRAWL,
|
||||
OLX_DELAY_BETWEEN_ADS
|
||||
} = require("../specificConfigs/olx");
|
||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
@@ -55,8 +54,7 @@ class OlxCrawler {
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
ignoredUsernames = [],
|
||||
delayBetweenPages = 1000,
|
||||
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
|
||||
delayBetweenPages = 1000
|
||||
) {
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
@@ -66,7 +64,6 @@ class OlxCrawler {
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.ignoredUsernames = ignoredUsernames;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
this.delayBetweenAds = delayBetweenAds;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
@@ -195,14 +192,26 @@ class OlxCrawler {
|
||||
let actualNoOfResults =
|
||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||
//Delaying next scrape ad request to avoid ScraperAPI server error
|
||||
asyncScraping.push(this.sleep(this.delayBetweenAds));
|
||||
// Scrape the ads in sequential batches: the requests inside one batch run
// concurrently, and the next batch starts only after the previous one has
// finished and the configured delay has elapsed.
const scrapedData = [];
// The configured value may arrive as a string (environment variable); a
// numeric step of at least 1 guarantees the loop counter really advances.
const batchSize = Math.max(
  1,
  Number.parseInt(NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, 10) || 1
);
for (let i = 0; i < actualNoOfResults; i += batchSize) {
  // slice() takes an exclusive END index, not a length. The end is capped
  // so that no more than actualNoOfResults ads are scraped.
  const concurrentUrlsToScrape = hrefs.slice(
    i,
    Math.min(i + batchSize, actualNoOfResults)
  );

  const concurrentReqData = await Promise.all(
    concurrentUrlsToScrape.map(url => this.scrapeAd(url))
  );
  // Spread the batch so scrapedData stays a flat list of ad results and
  // the falsy-result filter applied afterwards works per ad.
  scrapedData.push(...concurrentReqData);

  // sleep() is async; without await there is no pause between batches.
  await this.sleep(Number(DELAY_BETWEEN_REQ_SCRAPER_API) || 0);
}
|
||||
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
@@ -217,7 +226,7 @@ class OlxCrawler {
|
||||
//let numberOfParseErrors = 0;
|
||||
// do {
|
||||
try {
|
||||
await this.sleep(this.delayBetweenAds);
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
@@ -696,7 +705,7 @@ class OlxCrawler {
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
||||
await this.sleep(this.delayBetweenAds);
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
return null;
|
||||
}
|
||||
@@ -913,6 +922,28 @@ class OlxCrawler {
|
||||
console.log("sprat = NEPOZNATO [", floorText, "]");
|
||||
return null;
|
||||
}
|
||||
/*
|
||||
async consecutiveRequestSending(requestsToScraperApi) {
|
||||
let dataFromAllRequests = [];
|
||||
|
||||
for (
|
||||
const i = 0;
|
||||
i <= requestsToScraperApi.length;
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentRequestsToScraperApi = requestsToScraperApi.slice(
|
||||
i,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
const dataFromConcurrentRequest = await Promise.all(
|
||||
concurrentRequestsToScraperApi
|
||||
);
|
||||
dataFromAllRequests.push(dataFromConcurrentRequest);
|
||||
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
|
||||
}
|
||||
|
||||
return dataFromAllRequests;
|
||||
}*/
|
||||
|
||||
async sleep(ms) {
|
||||
// console.log("Sleep for:", ms);
|
||||
|
||||
@@ -15,6 +15,9 @@ const fetch = async (url, options = {}) => {
|
||||
? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}`
|
||||
: url;
|
||||
|
||||
//
|
||||
console.log("Url for scraping:", urlAdaptedForScraping);
|
||||
|
||||
return nodeFetch(urlAdaptedForScraping, newOptions);
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user