Fixed olx scraper api bug.

This commit is contained in:
Naida Vatric
2020-03-04 22:56:06 +01:00
parent f5f8fa276c
commit b3708cf842
5 changed files with 58 additions and 46 deletions

View File

@@ -16,7 +16,8 @@ const {
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
DEFAULT_TIMEZONE,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
@@ -204,13 +205,32 @@ class SaljicCrawler {
? hrefsAbs.length
: maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
const scrapedData = [];
for (
let i = 0;
i <= actualNoOfResults;
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentUrlsToScrape = hrefsAbs.slice(
i,
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const concurrentAdTypesOfReq = adTypes.slice(
i,
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const concurrentReqScraperApi = concurrentUrlsToScrape.map(
(url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index])
);
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
}
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) {
console.error("[SALJIC] Exception caught:" + e);
@@ -225,6 +245,10 @@ class SaljicCrawler {
const body = await adPageSource.text();
const $ = cheerio.load(body);
//Throws an error if the request to the Scraper API proxy wasn't successful and it responded with an error
if (body.indexOf("<html>") === -1) {
throw { message: "Scraper API server error." };
}
// No information for status ex. PRODAN
const status = AD_STATUS.STATUS_NORMAL;
//Extracting agency ID from url
@@ -508,6 +532,11 @@ class SaljicCrawler {
const region = "";
const entity = "";
const country = "";
//Throws an error if realEstateType is null, i.e. it could not be read. The cause is still unknown.
if (realEstateType === null) {
console.log("Body:", body);
throw { message: "Couldn't read real estate type." };
}
const data = {
url,
@@ -573,13 +602,10 @@ class SaljicCrawler {
numberOfViewsKivi
};
await this.sleep(1000);
return data;
} catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
await this.sleep(1000);
return null;
}