Fixed olx scraper api bug.
This commit is contained in:
@@ -16,7 +16,8 @@ const {
|
||||
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
DEFAULT_TIMEZONE
|
||||
DEFAULT_TIMEZONE,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
||||
|
||||
@@ -204,13 +205,32 @@ class SaljicCrawler {
|
||||
? hrefsAbs.length
|
||||
: maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
|
||||
const scrapedData = [];
|
||||
for (
|
||||
let i = 0;
|
||||
i <= actualNoOfResults;
|
||||
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentUrlsToScrape = hrefsAbs.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentAdTypesOfReq = adTypes.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentReqScraperApi = concurrentUrlsToScrape.map(
|
||||
(url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index])
|
||||
);
|
||||
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
|
||||
|
||||
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
|
||||
}
|
||||
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("[SALJIC] Exception caught:" + e);
|
||||
@@ -225,6 +245,10 @@ class SaljicCrawler {
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
//Throws an error if the request to the Scraper API proxy was not successful and the proxy responded with an error
|
||||
if (body.indexOf("<html>") === -1) {
|
||||
throw { message: "Scraper API server error." };
|
||||
}
|
||||
// No information for status ex. PRODAN
|
||||
const status = AD_STATUS.STATUS_NORMAL;
|
||||
//Extracting agency ID from url
|
||||
@@ -508,6 +532,11 @@ class SaljicCrawler {
|
||||
const region = "";
|
||||
const entity = "";
|
||||
const country = "";
|
||||
//Throws an error if realEstateType is null, i.e. it could not be read. The cause is still unknown.
|
||||
if (realEstateType === null) {
|
||||
console.log("Body:", body);
|
||||
throw { message: "Couldn't read real estate type." };
|
||||
}
|
||||
|
||||
const data = {
|
||||
url,
|
||||
@@ -573,13 +602,10 @@ class SaljicCrawler {
|
||||
numberOfViewsKivi
|
||||
};
|
||||
|
||||
await this.sleep(1000);
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
await this.sleep(1000);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user