Fixed OLX Scraper API bug.
This commit is contained in:
@@ -19,8 +19,7 @@ const {
|
||||
const {
|
||||
DEFAULT_TIMEZONE,
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
@@ -195,21 +194,20 @@ class OlxCrawler {
|
||||
const scrapedData = [];
|
||||
for (
|
||||
let i = 0;
|
||||
i < actualNoOfResults;
|
||||
i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
i <= actualNoOfResults;
|
||||
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentUrlsToScrape = hrefs.slice(
|
||||
i,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
|
||||
this.scrapeAd(url)
|
||||
);
|
||||
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
|
||||
scrapedData.push(concurrentReqData);
|
||||
|
||||
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
|
||||
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
|
||||
}
|
||||
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
@@ -221,13 +219,9 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
console.log("Scraping : ", url);
|
||||
// let hasParseErrors = false;
|
||||
//let numberOfParseErrors = 0;
|
||||
// do {
|
||||
try {
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
// console.log("Scraping : ", url);
|
||||
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
@@ -269,13 +263,13 @@ class OlxCrawler {
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
//For cases where price is given in discount manner - different from default parsing
|
||||
const discountPriceValue = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div.op.pop > p"
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
//Debug
|
||||
//console.log("Title:", title);
|
||||
//console.log("Url scraped:", url);
|
||||
// console.log("Normal price value:", normalPriceValue);
|
||||
// console.log("Urgent price value:", urgentPriceValue);
|
||||
//
|
||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||
normalPrice = normalPriceValue
|
||||
.replace(/\r\n|\n|\r/gm, "")
|
||||
@@ -290,10 +284,11 @@ class OlxCrawler {
|
||||
} else {
|
||||
status = AD_STATUS.STATUS_NORMAL;
|
||||
}
|
||||
} else if (discountPriceValue && discountPriceValue.length > 0) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
const priceValues = discountPriceValue.split("KM");
|
||||
normalPrice = priceValues[0].trim();
|
||||
} else {
|
||||
//
|
||||
console.log("Body:", body);
|
||||
//
|
||||
throw { message: "Can't find normal price" };
|
||||
}
|
||||
if (urgentPriceValue && urgentPriceValue.length > 0) {
|
||||
@@ -306,6 +301,10 @@ class OlxCrawler {
|
||||
} else {
|
||||
urgentPrice = priceValues[0].trim();
|
||||
}
|
||||
} else if (discountPriceValue && discountPriceValue.length > 0) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
const priceValues = discountPriceValue.split("KM");
|
||||
urgentPrice = priceValues[1].trim();
|
||||
} else {
|
||||
throw { message: "Can't find urgent price" };
|
||||
}
|
||||
@@ -692,20 +691,11 @@ class OlxCrawler {
|
||||
distanceToRiver,
|
||||
numberOfViewsAgency
|
||||
};
|
||||
//
|
||||
//console.log("Scraped data:", data);
|
||||
|
||||
//Delay between real estate ads to avoid error from Scraper API
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
// hasParseErrors = true;
|
||||
// numberOfParseErrors++;
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user