Created non-concurrent request groups.

This commit is contained in:
Naida Vatric
2020-03-03 15:21:51 +01:00
parent ccea5fe2aa
commit f5f8fa276c
5 changed files with 60 additions and 19 deletions

View File

@@ -47,6 +47,10 @@ const USER_AGENT =
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API || 10;
const DELAY_BETWEEN_REQ_SCRAPER_API =
process.env.DELAY_BETWEEN_REQ_SCRAPER_API || 1000;
module.exports = {
APP_PORT,
@@ -64,5 +68,7 @@ module.exports = {
PROSTOR_LOGIN,
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY
SCRAPER_API_KEY,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
};

View File

@@ -35,6 +35,5 @@ module.exports = {
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES:
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
};

View File

@@ -18,7 +18,9 @@ const {
const {
DEFAULT_TIMEZONE,
PRINT_CRAWLER_DEBUG
PRINT_CRAWLER_DEBUG,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
} = require("../../config/appConfig");
const OLX_ENUMS = {
@@ -42,10 +44,7 @@ const OLX_ENUMS = {
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
};
const {
OLX_FORCE_CRAWL,
OLX_DELAY_BETWEEN_ADS
} = require("../specificConfigs/olx");
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
class OlxCrawler {
constructor(
@@ -55,8 +54,7 @@ class OlxCrawler {
maxPages = 1000,
maxResultsPerPage = 100,
ignoredUsernames = [],
delayBetweenPages = 1000,
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
delayBetweenPages = 1000
) {
this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -66,7 +64,6 @@ class OlxCrawler {
this.maxResultsPerPage = maxResultsPerPage;
this.ignoredUsernames = ignoredUsernames;
this.delayBetweenPages = delayBetweenPages;
this.delayBetweenAds = delayBetweenAds;
}
async crawl() {
@@ -195,14 +192,26 @@ class OlxCrawler {
let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefs[i]));
//Delaying next scrape ad request to avoid ScraperAPI server error
asyncScraping.push(this.sleep(this.delayBetweenAds));
const scrapedData = [];
// Scrape the ads in consecutive batches: every request inside a batch runs
// concurrently, and the next batch starts only after the previous one has
// settled and the configured delay has elapsed.
//
// The batch size may arrive as a string when it is read from process.env, so
// it is parsed once here; otherwise `i += size` would concatenate strings.
// It is clamped to >= 1 so the loop is guaranteed to advance.
const concurrentBatchSize = Math.max(
  1,
  Number.parseInt(NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, 10) || 1
);
for (let i = 0; i < actualNoOfResults; i += concurrentBatchSize) {
  // Array.prototype.slice takes an END INDEX (exclusive), not a length.
  // The end is capped at actualNoOfResults so no more ads are scraped than
  // the per-page limit computed above allows.
  const concurrentUrlsToScrape = hrefs.slice(
    i,
    Math.min(i + concurrentBatchSize, actualNoOfResults)
  );
  const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
    this.scrapeAd(url)
  );
  const concurrentReqData = await Promise.all(concurrentReqScraperApi);
  // Spread the batch so scrapedData stays a flat list of ad results rather
  // than a list of per-batch arrays.
  scrapedData.push(...concurrentReqData);
  // sleep() is async; without `await` the delay would not hold back the
  // next batch of requests.
  await this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
}
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) {
@@ -217,7 +226,7 @@ class OlxCrawler {
//let numberOfParseErrors = 0;
// do {
try {
await this.sleep(this.delayBetweenAds);
// await this.sleep(this.delayBetweenAds);
const adPageSource = await fetch(url);
const body = await adPageSource.text();
@@ -696,7 +705,7 @@ class OlxCrawler {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
// } while (hasParseErrors && numberOfParseErrors <= 1);
await this.sleep(this.delayBetweenAds);
// await this.sleep(this.delayBetweenAds);
return null;
}
@@ -913,6 +922,28 @@ class OlxCrawler {
console.log("sprat = NEPOZNATO [", floorText, "]");
return null;
}
/*
async consecutiveRequestSending(requestsToScraperApi) {
let dataFromAllRequests = [];
for (
const i = 0;
i <= requestsToScraperApi.length;
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentRequestsToScraperApi = requestsToScraperApi.slice(
i,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const dataFromConcurrentRequest = await Promise.all(
concurrentRequestsToScraperApi
);
dataFromAllRequests.push(dataFromConcurrentRequest);
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
}
return dataFromAllRequests;
}*/
async sleep(ms) {
// console.log("Sleep for:", ms);

View File

@@ -15,6 +15,9 @@ const fetch = async (url, options = {}) => {
? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}`
: url;
//
console.log("Url for scraping:", urlAdaptedForScraping);
return nodeFetch(urlAdaptedForScraping, newOptions);
};

View File

@@ -25,6 +25,8 @@ API_MAP_KEY=(your-key-here)
#=============== SCRAPER API SUPPORT =============#
USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Scraper API proxy
DELAY_BETWEEN_REQ_SCRAPER_API= time in milliseconds to wait before sending the next batch of requests, to avoid server errors with Scraper API
#=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here)
@@ -36,6 +38,7 @@ SOURCE_EMAIL=info@saburly.com
CRAWLER_INTERVAL=Interval to run crawler(s), in seconds
STOP_CRAWLER=Non-zero value will skip crawler execution
PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console
#==OLX==
OLX_MAX_PAGES=Restrict crawler to this number of pages
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
@@ -43,7 +46,6 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
OLX_DELAY_BETWEEN_ADS = time in milliseconds to wait before scraping next ad to avoid server errors with Scraper API
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==RENTAL==