Added delay between ads.
This commit is contained in:
@@ -35,5 +35,6 @@ module.exports = {
|
|||||||
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
||||||
OLX_DELAY_BETWEEN_PAGES:
|
OLX_DELAY_BETWEEN_PAGES:
|
||||||
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
|
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
|
||||||
|
OLX_DELAY_BETWEEN_ADS: parseInt(process.env.OLX_DELAY_BETWEEN_ADS) || 1000,
|
||||||
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
|
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -42,7 +42,10 @@ const OLX_ENUMS = {
|
|||||||
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||||
};
|
};
|
||||||
|
|
||||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
const {
|
||||||
|
OLX_FORCE_CRAWL,
|
||||||
|
OLX_DELAY_BETWEEN_ADS
|
||||||
|
} = require("../specificConfigs/olx");
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
@@ -52,7 +55,8 @@ class OlxCrawler {
|
|||||||
maxPages = 1000,
|
maxPages = 1000,
|
||||||
maxResultsPerPage = 100,
|
maxResultsPerPage = 100,
|
||||||
ignoredUsernames = [],
|
ignoredUsernames = [],
|
||||||
delayBetweenPages = 1000
|
delayBetweenPages = 1000,
|
||||||
|
delayBetweenAds = OLX_DELAY_BETWEEN_ADS
|
||||||
) {
|
) {
|
||||||
this.savers = savers;
|
this.savers = savers;
|
||||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||||
@@ -62,6 +66,7 @@ class OlxCrawler {
|
|||||||
this.maxResultsPerPage = maxResultsPerPage;
|
this.maxResultsPerPage = maxResultsPerPage;
|
||||||
this.ignoredUsernames = ignoredUsernames;
|
this.ignoredUsernames = ignoredUsernames;
|
||||||
this.delayBetweenPages = delayBetweenPages;
|
this.delayBetweenPages = delayBetweenPages;
|
||||||
|
this.delayBetweenAds = delayBetweenAds;
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
@@ -193,6 +198,8 @@ class OlxCrawler {
|
|||||||
const asyncScraping = [];
|
const asyncScraping = [];
|
||||||
for (let i = 0; i < actualNoOfResults; i++) {
|
for (let i = 0; i < actualNoOfResults; i++) {
|
||||||
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||||
|
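//Intended to delay the next scrape-ad request to avoid ScraperAPI server errors.
//NOTE(review): scrapeAd() above has already been started without await and this sleep promise is only collected for Promise.all, so requests are not actually spaced out; the sleep results also end up in the Promise.all output - confirm and await the sleep inside the loop if spacing is required.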
|
||||||
|
asyncScraping.push(this.sleep(this.delayBetweenAds));
|
||||||
}
|
}
|
||||||
|
|
||||||
const scrapedData = await Promise.all(asyncScraping);
|
const scrapedData = await Promise.all(asyncScraping);
|
||||||
@@ -206,15 +213,17 @@ class OlxCrawler {
|
|||||||
|
|
||||||
async scrapeAd(url) {
|
async scrapeAd(url) {
|
||||||
console.log("Scraping : ", url);
|
console.log("Scraping : ", url);
|
||||||
let hasParseErrors = false;
|
// let hasParseErrors = false;
|
||||||
let numberOfParseErrors = 0;
|
//let numberOfParseErrors = 0;
|
||||||
do {
|
// do {
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let status = AD_STATUS.STATUS_NORMAL;
|
let status = AD_STATUS.STATUS_NORMAL;
|
||||||
|
//
|
||||||
|
console.log("Body:", body);
|
||||||
|
//
|
||||||
const propertySelectors = {
|
const propertySelectors = {
|
||||||
username:
|
username:
|
||||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||||
@@ -276,9 +285,7 @@ class OlxCrawler {
|
|||||||
throw { message: "Can't find normal price" };
|
throw { message: "Can't find normal price" };
|
||||||
}
|
}
|
||||||
if (urgentPriceValue && urgentPriceValue.length > 0) {
|
if (urgentPriceValue && urgentPriceValue.length > 0) {
|
||||||
const priceValues = urgentPriceValue
|
const priceValues = urgentPriceValue.replace("Cijena", "").split("KM");
|
||||||
.replace("Cijena", "")
|
|
||||||
.split("KM");
|
|
||||||
//priceValues will contain values like ["100000", "90000", ...], second element is urgent price
|
//priceValues will contain values like ["100000", "90000", ...], second element is urgent price
|
||||||
if (priceValues.length > 0) {
|
if (priceValues.length > 0) {
|
||||||
if (priceValues[0].trim().indexOf("Hitno") != -1) {
|
if (priceValues[0].trim().indexOf("Hitno") != -1) {
|
||||||
@@ -448,16 +455,10 @@ class OlxCrawler {
|
|||||||
gardenSize = fieldValue;
|
gardenSize = fieldValue;
|
||||||
break;
|
break;
|
||||||
case "broj soba":
|
case "broj soba":
|
||||||
numberOfRooms = this.parseNumberOfRooms(
|
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
|
||||||
fieldValue,
|
|
||||||
parsedCategory
|
|
||||||
);
|
|
||||||
break;
|
break;
|
||||||
case "broj prostorija":
|
case "broj prostorija":
|
||||||
numberOfRooms = this.parseNumberOfRooms(
|
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
|
||||||
fieldValue,
|
|
||||||
parsedCategory
|
|
||||||
);
|
|
||||||
break;
|
break;
|
||||||
case "broj spratova":
|
case "broj spratova":
|
||||||
numberOfFloors = this.parseNumberOfFloors(
|
numberOfFloors = this.parseNumberOfFloors(
|
||||||
@@ -584,10 +585,7 @@ class OlxCrawler {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
|
||||||
++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS ||
|
|
||||||
fieldTitle === ""
|
|
||||||
) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} while (true);
|
} while (true);
|
||||||
@@ -685,13 +683,16 @@ class OlxCrawler {
|
|||||||
//
|
//
|
||||||
//console.log("Scraped data:", data);
|
//console.log("Scraped data:", data);
|
||||||
|
|
||||||
|
//Delay between real estate ads to avoid error from Scraper API
|
||||||
|
// await this.sleep(this.delayBetweenAds);
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
hasParseErrors = true;
|
// hasParseErrors = true;
|
||||||
numberOfParseErrors++;
|
// numberOfParseErrors++;
|
||||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||||
}
|
}
|
||||||
} while (hasParseErrors && numberOfParseErrors <= 1);
|
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -909,6 +910,7 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async sleep(ms) {
|
async sleep(ms) {
|
||||||
|
// console.log("Sleep for:", ms);
|
||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,7 +43,9 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo
|
|||||||
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||||
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
|
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
|
||||||
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||||
|
OLX_DELAY_BETWEEN_ADS=time in milliseconds to wait before scraping the next ad, to avoid server errors with Scraper API
|
||||||
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
|
|
||||||
#==RENTAL==
|
#==RENTAL==
|
||||||
RENTAL_MAX_PAGES=Restrict crawler to this number of pages
|
RENTAL_MAX_PAGES=Restrict crawler to this number of pages
|
||||||
RENTAL_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
RENTAL_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||||
|
|||||||
Reference in New Issue
Block a user