Fixed olx scraper api bug.

This commit is contained in:
Naida Vatric
2020-03-04 22:56:06 +01:00
parent f5f8fa276c
commit b3708cf842
5 changed files with 58 additions and 46 deletions

View File

@@ -48,9 +48,7 @@ const USER_AGENT =
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API || 10;
const DELAY_BETWEEN_REQ_SCRAPER_API =
process.env.DELAY_BETWEEN_REQ_SCRAPER_API || 1000;
parseInt(process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API) || 10;
module.exports = {
APP_PORT,
@@ -69,6 +67,5 @@ module.exports = {
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
};

View File

@@ -19,8 +19,7 @@ const {
const {
DEFAULT_TIMEZONE,
PRINT_CRAWLER_DEBUG,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
DELAY_BETWEEN_REQ_SCRAPER_API
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
} = require("../../config/appConfig");
const OLX_ENUMS = {
@@ -195,21 +194,20 @@ class OlxCrawler {
const scrapedData = [];
for (
let i = 0;
i < actualNoOfResults;
i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
i <= actualNoOfResults;
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentUrlsToScrape = hrefs.slice(
i,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
this.scrapeAd(url)
);
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
scrapedData.push(concurrentReqData);
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
}
const filteredScrapedData = scrapedData.filter(adData => !!adData);
@@ -221,13 +219,9 @@ class OlxCrawler {
}
async scrapeAd(url) {
console.log("Scraping : ", url);
// let hasParseErrors = false;
//let numberOfParseErrors = 0;
// do {
try {
// await this.sleep(this.delayBetweenAds);
// console.log("Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
@@ -269,13 +263,13 @@ class OlxCrawler {
)
.text()
.trim();
//For ads where the price is shown as a discount, which is parsed differently from the default price
const discountPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div.op.pop > p"
)
.text()
.trim();
//Debug
//console.log("Title:", title);
//console.log("Url scraped:", url);
// console.log("Normal price value:", normalPriceValue);
// console.log("Urgent price value:", urgentPriceValue);
//
if (normalPriceValue && normalPriceValue.length > 0) {
normalPrice = normalPriceValue
.replace(/\r\n|\n|\r/gm, "")
@@ -290,10 +284,11 @@ class OlxCrawler {
} else {
status = AD_STATUS.STATUS_NORMAL;
}
} else if (discountPriceValue && discountPriceValue.length > 0) {
status = AD_STATUS.STATUS_URGENT;
const priceValues = discountPriceValue.split("KM");
normalPrice = priceValues[0].trim();
} else {
//
console.log("Body:", body);
//
throw { message: "Can't find normal price" };
}
if (urgentPriceValue && urgentPriceValue.length > 0) {
@@ -306,6 +301,10 @@ class OlxCrawler {
} else {
urgentPrice = priceValues[0].trim();
}
} else if (discountPriceValue && discountPriceValue.length > 0) {
status = AD_STATUS.STATUS_URGENT;
const priceValues = discountPriceValue.split("KM");
urgentPrice = priceValues[1].trim();
} else {
throw { message: "Can't find urgent price" };
}
@@ -692,20 +691,11 @@ class OlxCrawler {
distanceToRiver,
numberOfViewsAgency
};
//
//console.log("Scraped data:", data);
//Delay between real estate ads to avoid error from Scraper API
// await this.sleep(this.delayBetweenAds);
return data;
} catch (e) {
// hasParseErrors = true;
// numberOfParseErrors++;
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
// } while (hasParseErrors && numberOfParseErrors <= 1);
// await this.sleep(this.delayBetweenAds);
return null;
}

View File

@@ -16,7 +16,8 @@ const {
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
DEFAULT_TIMEZONE,
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
@@ -204,13 +205,32 @@ class SaljicCrawler {
? hrefsAbs.length
: maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
const scrapedData = [];
for (
let i = 0;
i <= actualNoOfResults;
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
) {
const concurrentUrlsToScrape = hrefsAbs.slice(
i,
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const concurrentAdTypesOfReq = adTypes.slice(
i,
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
);
const concurrentReqScraperApi = concurrentUrlsToScrape.map(
(url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index])
);
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
}
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) {
console.error("[SALJIC] Exception caught:" + e);
@@ -225,6 +245,10 @@ class SaljicCrawler {
const body = await adPageSource.text();
const $ = cheerio.load(body);
//Throws an error if the request to the Scraper API proxy wasn't successful and it responded with an error
if (body.indexOf("<html>") === -1) {
throw { message: "Scraper API server error." };
}
// No information for status ex. PRODAN
const status = AD_STATUS.STATUS_NORMAL;
//Extracting agency ID from url
@@ -508,6 +532,11 @@ class SaljicCrawler {
const region = "";
const entity = "";
const country = "";
//Throws an error if realEstateType is null (i.e. it was not read). Still don't know why this happens.
if (realEstateType === null) {
console.log("Body:", body);
throw { message: "Couldn't read real estate type." };
}
const data = {
url,
@@ -573,13 +602,10 @@ class SaljicCrawler {
numberOfViewsKivi
};
await this.sleep(1000);
return data;
} catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
await this.sleep(1000);
return null;
}

View File

@@ -16,7 +16,7 @@ const fetch = async (url, options = {}) => {
: url;
//
console.log("Url for scraping:", urlAdaptedForScraping);
// console.log("Url for scraping:", urlAdaptedForScraping);
return nodeFetch(urlAdaptedForScraping, newOptions);
};

View File

@@ -26,7 +26,6 @@ API_MAP_KEY=(your-key-here)
USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to the Scraper API proxy
DELAY_BETWEEN_REQ_SCRAPER_API= Time in milliseconds to wait before sending the next batch of requests, to avoid server errors with Scraper API
#=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here)