Fixed olx scraper api bug.
This commit is contained in:
@@ -48,9 +48,7 @@ const USER_AGENT =
|
||||
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
|
||||
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
|
||||
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
|
||||
process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API || 10;
|
||||
const DELAY_BETWEEN_REQ_SCRAPER_API =
|
||||
process.env.DELAY_BETWEEN_REQ_SCRAPER_API || 1000;
|
||||
parseInt(process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API) || 10;
|
||||
|
||||
module.exports = {
|
||||
APP_PORT,
|
||||
@@ -69,6 +67,5 @@ module.exports = {
|
||||
USER_AGENT,
|
||||
USE_SCRAPER_API,
|
||||
SCRAPER_API_KEY,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
};
|
||||
|
||||
@@ -19,8 +19,7 @@ const {
|
||||
const {
|
||||
DEFAULT_TIMEZONE,
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
@@ -195,21 +194,20 @@ class OlxCrawler {
|
||||
const scrapedData = [];
|
||||
for (
|
||||
let i = 0;
|
||||
i < actualNoOfResults;
|
||||
i += NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
i <= actualNoOfResults;
|
||||
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentUrlsToScrape = hrefs.slice(
|
||||
i,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
|
||||
this.scrapeAd(url)
|
||||
);
|
||||
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
|
||||
scrapedData.push(concurrentReqData);
|
||||
|
||||
this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API);
|
||||
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
|
||||
}
|
||||
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
@@ -221,13 +219,9 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
console.log("Scraping : ", url);
|
||||
// let hasParseErrors = false;
|
||||
//let numberOfParseErrors = 0;
|
||||
// do {
|
||||
try {
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
// console.log("Scraping : ", url);
|
||||
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
@@ -269,13 +263,13 @@ class OlxCrawler {
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
//For cases where price is given in discount manner - different from default parsing
|
||||
const discountPriceValue = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div.op.pop > p"
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
//Debug
|
||||
//console.log("Title:", title);
|
||||
//console.log("Url scraped:", url);
|
||||
// console.log("Normal price value:", normalPriceValue);
|
||||
// console.log("Urgent price value:", urgentPriceValue);
|
||||
//
|
||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||
normalPrice = normalPriceValue
|
||||
.replace(/\r\n|\n|\r/gm, "")
|
||||
@@ -290,10 +284,11 @@ class OlxCrawler {
|
||||
} else {
|
||||
status = AD_STATUS.STATUS_NORMAL;
|
||||
}
|
||||
} else if (discountPriceValue && discountPriceValue.length > 0) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
const priceValues = discountPriceValue.split("KM");
|
||||
normalPrice = priceValues[0].trim();
|
||||
} else {
|
||||
//
|
||||
console.log("Body:", body);
|
||||
//
|
||||
throw { message: "Can't find normal price" };
|
||||
}
|
||||
if (urgentPriceValue && urgentPriceValue.length > 0) {
|
||||
@@ -306,6 +301,10 @@ class OlxCrawler {
|
||||
} else {
|
||||
urgentPrice = priceValues[0].trim();
|
||||
}
|
||||
} else if (discountPriceValue && discountPriceValue.length > 0) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
const priceValues = discountPriceValue.split("KM");
|
||||
urgentPrice = priceValues[1].trim();
|
||||
} else {
|
||||
throw { message: "Can't find urgent price" };
|
||||
}
|
||||
@@ -692,20 +691,11 @@ class OlxCrawler {
|
||||
distanceToRiver,
|
||||
numberOfViewsAgency
|
||||
};
|
||||
//
|
||||
//console.log("Scraped data:", data);
|
||||
|
||||
//Delay between real estate ads to avoid error from Scraper API
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
// hasParseErrors = true;
|
||||
// numberOfParseErrors++;
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
||||
// await this.sleep(this.delayBetweenAds);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,8 @@ const {
|
||||
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
DEFAULT_TIMEZONE
|
||||
DEFAULT_TIMEZONE,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
||||
|
||||
@@ -204,13 +205,32 @@ class SaljicCrawler {
|
||||
? hrefsAbs.length
|
||||
: maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
|
||||
const scrapedData = [];
|
||||
for (
|
||||
let i = 0;
|
||||
i <= actualNoOfResults;
|
||||
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentUrlsToScrape = hrefsAbs.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentAdTypesOfReq = adTypes.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentReqScraperApi = concurrentUrlsToScrape.map(
|
||||
(url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index])
|
||||
);
|
||||
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
|
||||
|
||||
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
|
||||
}
|
||||
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("[SALJIC] Exception caught:" + e);
|
||||
@@ -225,6 +245,10 @@ class SaljicCrawler {
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
//Throws an error if the request to the Scraper API proxy wasn't successful and it responded with an error
|
||||
if (body.indexOf("<html>") === -1) {
|
||||
throw { message: "Scraper API server error." };
|
||||
}
|
||||
// No information for status ex. PRODAN
|
||||
const status = AD_STATUS.STATUS_NORMAL;
|
||||
//Extracting agency ID from url
|
||||
@@ -508,6 +532,11 @@ class SaljicCrawler {
|
||||
const region = "";
|
||||
const entity = "";
|
||||
const country = "";
|
||||
//Throws an error if realEstateType is null (could not be read). Still don't know why this happens.
|
||||
if (realEstateType === null) {
|
||||
console.log("Body:", body);
|
||||
throw { message: "Couldn't read real estate type." };
|
||||
}
|
||||
|
||||
const data = {
|
||||
url,
|
||||
@@ -573,13 +602,10 @@ class SaljicCrawler {
|
||||
numberOfViewsKivi
|
||||
};
|
||||
|
||||
await this.sleep(1000);
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
await this.sleep(1000);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@ const fetch = async (url, options = {}) => {
|
||||
: url;
|
||||
|
||||
//
|
||||
console.log("Url for scraping:", urlAdaptedForScraping);
|
||||
// console.log("Url for scraping:", urlAdaptedForScraping);
|
||||
|
||||
return nodeFetch(urlAdaptedForScraping, newOptions);
|
||||
};
|
||||
|
||||
@@ -26,7 +26,6 @@ API_MAP_KEY=(your-key-here)
|
||||
USE_SCRAPER_API= To turn it on (1) or off (0)
|
||||
SCRAPER_API_KEY= Key for Scraper api
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Scraper API proxy
|
||||
DELAY_BETWEEN_REQ_SCRAPER_API= Time in milliseconds to wait before sending the next bulk of requests, to avoid server errors with Scraper API
|
||||
|
||||
#=============== AWS SDK EMAIL SETTINGS =======#
|
||||
AWS_KEY_ID=(your-key-here)
|
||||
|
||||
Reference in New Issue
Block a user