From 65068932ad5bf760d1a4ae0a2f48c734eedc652b Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 18 May 2020 03:43:49 +0200 Subject: [PATCH 1/2] handle failed page fetch; detect discounted price --- app/crawler/specificCrawlers/olx.js | 70 +++++++++-------------------- 1 file changed, 22 insertions(+), 48 deletions(-) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 548d5ac..bd8afd6 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -205,13 +205,17 @@ class OlxCrawler { } async scrapeAd(url) { - console.log("Scraping : ", url); + // console.log("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; + if (body.indexOf(' div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", @@ -238,56 +242,26 @@ class OlxCrawler { //====== PRICE DETECTION AND EXTRACTION ===== let price = null; - let normalPrice = null; - let urgentPrice = null; - const normalPriceValue = $("#pc > p:nth-child(2)") - .text() - .trim(); - const urgentPriceValue = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" - ) - .text() - .trim(); - //Debug - //console.log("Title:", title); - //console.log("Url scraped:", url); - // console.log("Normal price value:", normalPriceValue); - // console.log("Urgent price value:", urgentPriceValue); - // - if (normalPriceValue && normalPriceValue.length > 0) { - normalPrice = normalPriceValue - .replace(/\r\n|\n|\r/gm, "") - .replace("KM", "") - .trim(); - if ( - $("#pc > p.n") - .text() - .indexOf("Hitna") !== -1 - ) { - status = AD_STATUS.STATUS_URGENT; - } else { - status = AD_STATUS.STATUS_NORMAL; - } - } else { - throw { message: "Can't find normal price" }; - } - if (urgentPriceValue && urgentPriceValue.length > 0) { - const priceValues = urgentPriceValue.replace("Cijena", "").split("KM"); - //priceValues will contain values like ["100000", "90000", ...], second element is urgent price - if (priceValues.length > 0) { - if (priceValues[0].trim().indexOf("Hitno") != -1) { - urgentPrice = priceValues[0].replace("Hitno", "").trim(); - status = AD_STATUS.STATUS_URGENT; - } else { - urgentPrice = priceValues[0].trim(); - } - } else { - throw { message: "Can't find urgent price" }; - } + const priceHeader = $("#pc > p.n").text().trim(); + const priceValue = $("#pc > p:nth-child(2)").text().trim(); + price = priceValue; + + if (priceHeader.indexOf('Hitn') !== -1) { + // Urgent price + status = AD_STATUS.STATUS_URGENT; } - price = status === AD_STATUS.STATUS_URGENT ? urgentPrice : normalPrice; + const discountPriceTag = $("#artikal_glavni_div > div.artikal_lijevo > p:nth-child(4)").text().trim(); + if (discountPriceTag.indexOf('Akcij') !== -1) { + status = AD_STATUS.STATUS_DISCOUNTED; + const discountPriceValues = $("#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p").text().trim(); + // discountPriceValues contain string like "10.000 KM 7.500 KM" + // First price is regular, second is currently active (discounted) price + const bothPrices = discountPriceValues.split('KM'); + // Now, currently active price is second element of bothPrices array + price = bothPrices[1] ? bothPrices[1].trim() : null; + } //====== OTHER AD INFORMATION =============== let adType = null; -- 2.47.3 From 159fedbc2d9d76d7619e46b2f8f2b96e59576774 Mon Sep 17 00:00:00 2001 From: Bilal Date: Mon, 18 May 2020 03:53:08 +0200 Subject: [PATCH 2/2] handle failed page fetch --- app/crawler/specificCrawlers/aktido.js | 4 ++++ app/crawler/specificCrawlers/prostor.js | 4 ++++ app/crawler/specificCrawlers/rental.js | 4 ++++ app/crawler/specificCrawlers/saljic.js | 6 +++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index 9755eb7..ac4dcc6 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -202,6 +202,10 @@ class AktidoCrawler { const body = await adPageSource.text(); const $ = cheerio.load(body); + if (body.indexOf('