diff --git a/app/config/appConfig.js b/app/config/appConfig.js index b4144cc..5b06652 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -6,7 +6,10 @@ const APP_URL = ? process.env.APP_URL || "http://market-alarm" : process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`; +const DEFAULT_TIMEZONE = "Europe/Sarajevo"; + module.exports = { APP_PORT, - APP_URL + APP_URL, + DEFAULT_TIMEZONE }; diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 37af9e6..34a949a 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -3,6 +3,7 @@ const fetch = require("node-fetch"); const cheerio = require("cheerio"); const Promise = require("bluebird"); +const moment = require("moment-timezone"); const { AD_TYPE, @@ -13,6 +14,8 @@ const { CRAWLER_AD_TYPE } = require("../../common/enums"); +const { DEFAULT_TIMEZONE } = require("../../config/appConfig"); + const OLX_ENUMS = { OLX_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "", @@ -27,7 +30,9 @@ const OLX_ENUMS = { [AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27", [AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30" }, - MAX_DETAIL_FIELDS: 30 + MAX_DETAIL_FIELDS: 30, + OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm", + OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; class OlxCrawler { @@ -246,6 +251,35 @@ class OlxCrawler { } const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; + const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; + const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`; + + const publishedDate = $(publishedDateValueSelector) + .text() + .trim(); + + const publishedDateMoment = moment.tz( + publishedDate, + OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + + if (!publishedDateMoment.isValid()) { + throw { message: "Invalid published date ! Check parsing format" }; + } + + const renewedDate = $(renewedDateValueSelector) + .text() + .trim(); + + const renewedDateMoment = this.parseRenewedDate(renewedDate); + + if (!renewedDateMoment) { + throw { + message: + "Invalid renewed date ! Check how parser parsed renewed date text" + }; + } adType = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` @@ -358,7 +392,9 @@ class OlxCrawler { country: "", locationLat, locationLong, - adStatus: status + adStatus: status, + publishedDate: publishedDateMoment.toISOString(), + renewedDate: renewedDateMoment.toISOString() }; return data; @@ -416,6 +452,56 @@ class OlxCrawler { return parseFloat(formattedPriceText); } + parseRenewedDate(renewedDateText) { + const currentMoment = moment.tz(DEFAULT_TIMEZONE); + + if (renewedDateText.includes("Prije mjesec dana")) { + return currentMoment.add(-1, "month"); + } + + const dayVariations = ["dan", "dana"]; + for (const dayVariation of dayVariations) { + if (renewedDateText.includes(dayVariation)) { + // format for this case should be "Prije N dana" or "Prije N dan" + const dateParts = renewedDateText.split(" "); + if (dateParts[0] === "Prije") { + const numberOfDays = parseInt(dateParts[1]); + return currentMoment.add(-1 * numberOfDays, "days"); + } else { + return undefined; + } + } + } + + if (renewedDateText.includes("Jučer")) { + return currentMoment.add(-1, "day"); + } + + const todayVariations = [ + "sat", + "sati", + "sata", + "min", + "sekunde", + "sekundi", + "sekundu", + "maloprije" + ]; + for (const todayVariation of todayVariations) { + if (renewedDateText.includes(todayVariation)) { + return currentMoment; + } + } + + const renewedDateMoment = moment.tz( + renewedDateText, + OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + + return renewedDateMoment.isValid() ? renewedDateMoment : undefined; + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); }