Created non-concurrent request groups.

This commit is contained in:
Naida Vatric
2020-03-03 15:21:51 +01:00
parent ccea5fe2aa
commit f5f8fa276c
5 changed files with 60 additions and 19 deletions

View File

@@ -47,6 +47,10 @@ const USER_AGENT =
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || ""; const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
// Size of each batch of ad requests sent concurrently through Scraper API.
// Environment variables are always strings, so the value is parsed to an
// integer; otherwise arithmetic such as `i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API`
// would concatenate strings. Missing, non-numeric or zero values fall back to 10.
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
  parseInt(process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, 10) || 10;
// Pause, in milliseconds, between two consecutive batches of requests.
// Missing, non-numeric or zero values fall back to 1000.
const DELAY_BETWEEN_REQ_SCRAPER_API =
  parseInt(process.env.DELAY_BETWEEN_REQ_SCRAPER_API, 10) || 1000;
module.exports = { module.exports = {
APP_PORT, APP_PORT,
@@ -64,5 +68,7 @@ module.exports = {
PROSTOR_LOGIN, PROSTOR_LOGIN,
USER_AGENT, USER_AGENT,
USE_SCRAPER_API, USE_SCRAPER_API,
SCRAPER_API_KEY SCRAPER_API_KEY,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
}; };

View File

@@ -35,6 +35,5 @@ module.exports = {
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES: OLX_DELAY_BETWEEN_PAGES:
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000, parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL) OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
}; };

View File

@@ -18,7 +18,9 @@ const {
const { const {
DEFAULT_TIMEZONE, DEFAULT_TIMEZONE,
PRINT_CRAWLER_DEBUG PRINT_CRAWLER_DEBUG,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
} = require("../../config/appConfig"); } = require("../../config/appConfig");
const OLX_ENUMS = { const OLX_ENUMS = {
@@ -42,10 +44,7 @@ const OLX_ENUMS = {
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
}; };
const { const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
OLX_FORCE_CRAWL,
OLX_DELAY_BETWEEN_ADS
} = require("../specificConfigs/olx");
class OlxCrawler { class OlxCrawler {
constructor( constructor(
@@ -55,8 +54,7 @@ class OlxCrawler {
maxPages = 1000, maxPages = 1000,
maxResultsPerPage = 100, maxResultsPerPage = 100,
ignoredUsernames = [], ignoredUsernames = [],
delayBetweenPages = 1000, delayBetweenPages = 1000
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
) { ) {
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -66,7 +64,6 @@ class OlxCrawler {
this.maxResultsPerPage = maxResultsPerPage; this.maxResultsPerPage = maxResultsPerPage;
this.ignoredUsernames = ignoredUsernames; this.ignoredUsernames = ignoredUsernames;
this.delayBetweenPages = delayBetweenPages; this.delayBetweenPages = delayBetweenPages;
this.delayBetweenAds = delayBetweenAds;
} }
async crawl() { async crawl() {
@@ -195,14 +192,26 @@ class OlxCrawler {
let actualNoOfResults = let actualNoOfResults =
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
const asyncScraping = []; const scrapedData = [];
for (let i = 0; i < actualNoOfResults; i++) { for (
asyncScraping.push(this.scrapeAd(hrefs[i])); let i = 0;
//Delaying next scrape ad request to avoid ScraperAPI server error i < actualNoOfResults;
asyncScraping.push(this.sleep(this.delayBetweenAds)); i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentUrlsToScrape = hrefs.slice(
i,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
this.scrapeAd(url)
);
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
scrapedData.push(concurrentReqData);
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
} }
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData); const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData; return filteredScrapedData;
} catch (e) { } catch (e) {
@@ -217,7 +226,7 @@ class OlxCrawler {
//let numberOfParseErrors = 0; //let numberOfParseErrors = 0;
// do { // do {
try { try {
await this.sleep(this.delayBetweenAds); // await this.sleep(this.delayBetweenAds);
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
@@ -696,7 +705,7 @@ class OlxCrawler {
console.error("Exception caught: " + e.message, "\r\nURL:", url); console.error("Exception caught: " + e.message, "\r\nURL:", url);
} }
// } while (hasParseErrors && numberOfParseErrors <= 1); // } while (hasParseErrors && numberOfParseErrors <= 1);
await this.sleep(this.delayBetweenAds); // await this.sleep(this.delayBetweenAds);
return null; return null;
} }
@@ -913,6 +922,28 @@ class OlxCrawler {
console.log("sprat = NEPOZNATO [", floorText, "]"); console.log("sprat = NEPOZNATO [", floorText, "]");
return null; return null;
} }
/*
async consecutiveRequestSending(requestsToScraperApi) {
let dataFromAllRequests = [];
for (
const i = 0;
i <= requestsToScraperApi.length;
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentRequestsToScraperApi = requestsToScraperApi.slice(
i,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const dataFromConcurrentRequest = await Promise.all(
concurrentRequestsToScraperApi
);
dataFromAllRequests.push(dataFromConcurrentRequest);
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
}
return dataFromAllRequests;
}*/
async sleep(ms) { async sleep(ms) {
// console.log("Sleep for:", ms); // console.log("Sleep for:", ms);

View File

@@ -15,6 +15,9 @@ const fetch = async (url, options = {}) => {
? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}` ? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}`
: url; : url;
//
console.log("Url for scraping:", urlAdaptedForScraping);
return nodeFetch(urlAdaptedForScraping, newOptions); return nodeFetch(urlAdaptedForScraping, newOptions);
}; };

View File

@@ -25,6 +25,8 @@ API_MAP_KEY=(your-key-here)
#=============== SCRAPER API SUPORT =============# #=============== SCRAPER API SUPORT =============#
USE_SCRAPER_API= To turn it on (1) or off (0) USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api SCRAPER_API_KEY= Key for Scraper api
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Scraper API proxy
DELAY_BETWEEN_REQ_SCRAPER_API= time in milliseconds to wait before sending the next batch of requests, to avoid server errors with Scraper API
#=============== AWS SDK EMAIL SETTINGS =======# #=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here) AWS_KEY_ID=(your-key-here)
@@ -36,6 +38,7 @@ SOURCE_EMAIL=info@saburly.com
CRAWLER_INTERVAL=Interval to run cralwer(s), in seconds CRAWLER_INTERVAL=Interval to run cralwer(s), in seconds
STOP_CRAWLER=Non-zero value will skip crawler execution STOP_CRAWLER=Non-zero value will skip crawler execution
PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console
#==OLX== #==OLX==
OLX_MAX_PAGES=Restrict crawler to this number of pages OLX_MAX_PAGES=Restrict crawler to this number of pages
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
@@ -43,7 +46,6 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
OLX_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page OLX_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page
OLX_DELAY_BETWEEN_ADS = time in miliseconds to wait before scraping next add to awoid server errors with Scraper API
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==RENTAL== #==RENTAL==