Created non-concurrent request groups.
This commit is contained in:
@@ -47,6 +47,10 @@ const USER_AGENT =
|
||||
|
||||
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
|
||||
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
|
||||
// Environment variables always arrive as strings. Both settings below are
// consumed as numbers (the batch size is used as a loop step and slice bound),
// so they are parsed here once instead of leaking strings to callers.

// How many requests are sent to Scraper API concurrently in one batch.
// Must be at least 1: a zero or negative step would make a batching loop
// spin forever, so invalid values fall back to the default of 10.
const parsedConcurrentReqScraperApi = Number.parseInt(
  process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
  10
);
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
  parsedConcurrentReqScraperApi > 0 ? parsedConcurrentReqScraperApi : 10;

// Pause, in milliseconds, between two consecutive request batches.
// Zero is a valid value (no pause); unset or non-numeric values fall back
// to the default of 1000.
const parsedDelayBetweenReqScraperApi = Number.parseInt(
  process.env.DELAY_BETWEEN_REQ_SCRAPER_API,
  10
);
const DELAY_BETWEEN_REQ_SCRAPER_API =
  parsedDelayBetweenReqScraperApi >= 0 ? parsedDelayBetweenReqScraperApi : 1000;
|
||||
|
||||
module.exports = {
|
||||
APP_PORT,
|
||||
@@ -64,5 +68,7 @@ module.exports = {
|
||||
PROSTOR_LOGIN,
|
||||
USER_AGENT,
|
||||
USE_SCRAPER_API,
|
||||
SCRAPER_API_KEY
|
||||
SCRAPER_API_KEY,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
};
|
||||
|
||||
@@ -35,6 +35,5 @@ module.exports = {
|
||||
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
||||
OLX_DELAY_BETWEEN_PAGES:
|
||||
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
|
||||
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
|
||||
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
|
||||
};
|
||||
|
||||
@@ -18,7 +18,9 @@ const {
|
||||
|
||||
const {
|
||||
DEFAULT_TIMEZONE,
|
||||
PRINT_CRAWLER_DEBUG
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
@@ -42,10 +44,7 @@ const OLX_ENUMS = {
|
||||
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||
};
|
||||
|
||||
const {
|
||||
OLX_FORCE_CRAWL,
|
||||
OLX_DELAY_BETWEEN_ADS
|
||||
} = require("../specificConfigs/olx");
|
||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
@@ -55,8 +54,7 @@ class OlxCrawler {
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
ignoredUsernames = [],
|
||||
delayBetweenPages = 1000,
|
||||
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
|
||||
delayBetweenPages = 1000
|
||||
) {
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
@@ -66,7 +64,6 @@ class OlxCrawler {
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.ignoredUsernames = ignoredUsernames;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
this.delayBetweenAds = delayBetweenAds;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
@@ -195,14 +192,26 @@ class OlxCrawler {
|
||||
let actualNoOfResults =
|
||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||
//Delaying next scrape ad request to avoid ScraperAPI server error
|
||||
asyncScraping.push(this.sleep(this.delayBetweenAds));
|
||||
// Scrape the ads in sequential batches: requests inside one batch run
// concurrently, and batches are separated by a pause so Scraper API is not
// flooded with requests.
const scrapedData = [];

// Coerce defensively: a value read straight from process.env is a string,
// and `i += "10"` would concatenate instead of add. Never step by less
// than 1, otherwise the loop would not advance.
const batchSize = Math.max(
  1,
  Number.parseInt(NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, 10) || 1
);

for (let i = 0; i < actualNoOfResults; i += batchSize) {
  // slice() takes an END index, not a length. The end is capped at
  // actualNoOfResults so no more than the allowed number of ads is scraped.
  const concurrentUrlsToScrape = hrefs.slice(
    i,
    Math.min(i + batchSize, actualNoOfResults)
  );

  const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
    this.scrapeAd(url)
  );
  const concurrentReqData = await Promise.all(concurrentReqScraperApi);

  // Spread so scrapedData stays a flat list of per-ad results; pushing the
  // batch array itself would nest the results and defeat the falsy filter
  // applied to scrapedData afterwards.
  scrapedData.push(...concurrentReqData);

  // Must be awaited, otherwise the next batch starts immediately.
  await this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
}
|
||||
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
@@ -217,7 +226,7 @@ class OlxCrawler {
|
||||
//let numberOfParseErrors = 0;
|
||||
// do {
|
||||
try {
|
||||
await this.sleep(this.delayBetweenAds);
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
@@ -696,7 +705,7 @@ class OlxCrawler {
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
||||
await this.sleep(this.delayBetweenAds);
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
return null;
|
||||
}
|
||||
@@ -913,6 +922,28 @@ class OlxCrawler {
|
||||
console.log("sprat = NEPOZNATO [", floorText, "]");
|
||||
return null;
|
||||
}
|
||||
/*
|
||||
async consecutiveRequestSending(requestsToScraperApi) {
|
||||
let dataFromAllRequests = [];
|
||||
|
||||
for (
|
||||
const i = 0;
|
||||
i <= requestsToScraperApi.length;
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentRequestsToScraperApi = requestsToScraperApi.slice(
|
||||
i,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
const dataFromConcurrentRequest = await Promise.all(
|
||||
concurrentRequestsToScraperApi
|
||||
);
|
||||
dataFromAllRequests.push(dataFromConcurrentRequest);
|
||||
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
|
||||
}
|
||||
|
||||
return dataFromAllRequests;
|
||||
}*/
|
||||
|
||||
async sleep(ms) {
|
||||
// console.log("Sleep for:", ms);
|
||||
|
||||
@@ -15,6 +15,9 @@ const fetch = async (url, options = {}) => {
|
||||
? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}`
|
||||
: url;
|
||||
|
||||
//
|
||||
console.log("Url for scraping:", urlAdaptedForScraping);
|
||||
|
||||
return nodeFetch(urlAdaptedForScraping, newOptions);
|
||||
};
|
||||
|
||||
|
||||
@@ -25,6 +25,8 @@ API_MAP_KEY=(your-key-here)
|
||||
#=============== SCRAPER API SUPPORT =============#
|
||||
USE_SCRAPER_API= To turn it on (1) or off (0)
|
||||
SCRAPER_API_KEY= Key for Scraper api
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to the Scraper API proxy
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API= Time in milliseconds to wait before sending the next batch of requests, to avoid server errors with Scraper API
|
||||
|
||||
#=============== AWS SDK EMAIL SETTINGS =======#
|
||||
AWS_KEY_ID=(your-key-here)
|
||||
@@ -36,6 +38,7 @@ SOURCE_EMAIL=info@saburly.com
|
||||
CRAWLER_INTERVAL=Interval to run crawler(s), in seconds
|
||||
STOP_CRAWLER=Non-zero value will skip crawler execution
|
||||
PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console
|
||||
|
||||
#==OLX==
|
||||
OLX_MAX_PAGES=Restrict crawler to this number of pages
|
||||
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||
@@ -43,7 +46,6 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo
|
||||
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
|
||||
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||
OLX_DELAY_BETWEEN_ADS = time in miliseconds to wait before scraping next add to awoid server errors with Scraper API
|
||||
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||
|
||||
#==RENTAL==
|
||||
|
||||
Reference in New Issue
Block a user