Added delay between ads.

This commit is contained in:
Naida Vatric
2020-02-29 18:52:36 +01:00
parent 034106d87a
commit 97c09a6da1
3 changed files with 475 additions and 470 deletions

View File

@@ -35,5 +35,6 @@ module.exports = {
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES: OLX_DELAY_BETWEEN_PAGES:
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000, parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL) OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
}; };

View File

@@ -42,7 +42,10 @@ const OLX_ENUMS = {
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
}; };
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); const {
OLX_FORCE_CRAWL,
OLX_DELAY_BETWEEN_ADS
} = require("../specificConfigs/olx");
class OlxCrawler { class OlxCrawler {
constructor( constructor(
@@ -52,7 +55,8 @@ class OlxCrawler {
maxPages = 1000, maxPages = 1000,
maxResultsPerPage = 100, maxResultsPerPage = 100,
ignoredUsernames = [], ignoredUsernames = [],
delayBetweenPages = 1000 delayBetweenPages = 1000,
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
) { ) {
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -62,6 +66,7 @@ class OlxCrawler {
this.maxResultsPerPage = maxResultsPerPage; this.maxResultsPerPage = maxResultsPerPage;
this.ignoredUsernames = ignoredUsernames; this.ignoredUsernames = ignoredUsernames;
this.delayBetweenPages = delayBetweenPages; this.delayBetweenPages = delayBetweenPages;
this.delayBetweenAds = delayBetweenAds;
} }
async crawl() { async crawl() {
@@ -193,6 +198,8 @@ class OlxCrawler {
const asyncScraping = []; const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) { for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefs[i])); asyncScraping.push(this.scrapeAd(hrefs[i]));
//Delaying next scrape ad request to avoid ScraperAPI server error
asyncScraping.push(this.sleep(this.delayBetweenAds));
} }
const scrapedData = await Promise.all(asyncScraping); const scrapedData = await Promise.all(asyncScraping);
@@ -206,15 +213,17 @@ class OlxCrawler {
async scrapeAd(url) { async scrapeAd(url) {
console.log("Scraping : ", url); console.log("Scraping : ", url);
let hasParseErrors = false; // let hasParseErrors = false;
let numberOfParseErrors = 0; //let numberOfParseErrors = 0;
do { // do {
try { try {
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let status = AD_STATUS.STATUS_NORMAL; let status = AD_STATUS.STATUS_NORMAL;
//
console.log("Body:", body);
//
const propertySelectors = { const propertySelectors = {
username: username:
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
@@ -276,9 +285,7 @@ class OlxCrawler {
throw { message: "Can't find normal price" }; throw { message: "Can't find normal price" };
} }
if (urgentPriceValue && urgentPriceValue.length > 0) { if (urgentPriceValue && urgentPriceValue.length > 0) {
const priceValues = urgentPriceValue const priceValues = urgentPriceValue.replace("Cijena", "").split("KM");
.replace("Cijena", "")
.split("KM");
//priceValues will contain values like ["100000", "90000", ...], second element is urgent price //priceValues will contain values like ["100000", "90000", ...], second element is urgent price
if (priceValues.length > 0) { if (priceValues.length > 0) {
if (priceValues[0].trim().indexOf("Hitno") != -1) { if (priceValues[0].trim().indexOf("Hitno") != -1) {
@@ -448,16 +455,10 @@ class OlxCrawler {
gardenSize = fieldValue; gardenSize = fieldValue;
break; break;
case "broj soba": case "broj soba":
numberOfRooms = this.parseNumberOfRooms( numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
fieldValue,
parsedCategory
);
break; break;
case "broj prostorija": case "broj prostorija":
numberOfRooms = this.parseNumberOfRooms( numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
fieldValue,
parsedCategory
);
break; break;
case "broj spratova": case "broj spratova":
numberOfFloors = this.parseNumberOfFloors( numberOfFloors = this.parseNumberOfFloors(
@@ -584,10 +585,7 @@ class OlxCrawler {
break; break;
} }
if ( if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS ||
fieldTitle === ""
) {
break; break;
} }
} while (true); } while (true);
@@ -685,13 +683,16 @@ class OlxCrawler {
// //
//console.log("Scraped data:", data); //console.log("Scraped data:", data);
//Delay between real estate ads to avoid error from Scraper API
// await this.sleep(this.delayBetweenAds);
return data; return data;
} catch (e) { } catch (e) {
hasParseErrors = true; // hasParseErrors = true;
numberOfParseErrors++; // numberOfParseErrors++;
console.error("Exception caught: " + e.message, "\r\nURL:", url); console.error("Exception caught: " + e.message, "\r\nURL:", url);
} }
} while (hasParseErrors && numberOfParseErrors <= 1); // } while (hasParseErrors && numberOfParseErrors <= 1);
return null; return null;
} }
@@ -909,6 +910,7 @@ class OlxCrawler {
} }
async sleep(ms) { async sleep(ms) {
// console.log("Sleep for:", ms);
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }

View File

@@ -43,7 +43,9 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
OLX_DELAY_BETWEEN_ADS = time in milliseconds to wait before scraping next ad to avoid server errors with Scraper API
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==RENTAL== #==RENTAL==
RENTAL_MAX_PAGES=Restrict crawler to this number of pages RENTAL_MAX_PAGES=Restrict crawler to this number of pages
RENTAL_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved RENTAL_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved