parse and save published and renewed dates

This commit is contained in:
Bilal Catic
2019-09-23 21:19:28 +02:00
parent c7184be5fc
commit 63eb64b0f6
2 changed files with 92 additions and 3 deletions

View File

@@ -6,7 +6,10 @@ const APP_URL =
? process.env.APP_URL || "http://market-alarm"
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
// Time zone name exported for the crawler, which passes it to moment-timezone
// when parsing the published/renewed dates scraped from ads.
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
module.exports = {
APP_PORT,
APP_URL
APP_URL,
DEFAULT_TIMEZONE
};

View File

@@ -3,6 +3,7 @@
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
const {
AD_TYPE,
@@ -13,6 +14,8 @@ const {
CRAWLER_AD_TYPE
} = require("../../common/enums");
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
const OLX_ENUMS = {
OLX_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "",
@@ -27,7 +30,9 @@ const OLX_ENUMS = {
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
},
MAX_DETAIL_FIELDS: 30
MAX_DETAIL_FIELDS: 30,
OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm",
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
};
class OlxCrawler {
@@ -246,6 +251,35 @@ class OlxCrawler {
}
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`;
const publishedDate = $(publishedDateValueSelector)
.text()
.trim();
const publishedDateMoment = moment.tz(
publishedDate,
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!publishedDateMoment.isValid()) {
throw { message: "Invalid published date ! Check parsing format" };
}
const renewedDate = $(renewedDateValueSelector)
.text()
.trim();
const renewedDateMoment = this.parseRenewedDate(renewedDate);
if (!renewedDateMoment) {
throw {
message:
"Invalid renewed date ! Check how parser parsed renewed date text"
};
}
adType = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
@@ -358,7 +392,9 @@ class OlxCrawler {
country: "",
locationLat,
locationLong,
adStatus: status
adStatus: status,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString()
};
return data;
@@ -416,6 +452,56 @@ class OlxCrawler {
return parseFloat(formattedPriceText);
}
parseRenewedDate(renewedDateText) {
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
if (renewedDateText.includes("Prije mjesec dana")) {
return currentMoment.add(-1, "month");
}
const dayVariations = ["dan", "dana"];
for (const dayVariation of dayVariations) {
if (renewedDateText.includes(dayVariation)) {
// format for this case should be "Prije N dana" or "Prije N dan"
const dateParts = renewedDateText.split(" ");
if (dateParts[0] === "Prije") {
const numberOfDays = parseInt(dateParts[1]);
return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
}
}
}
if (renewedDateText.includes("Jučer")) {
return currentMoment.add(-1, "day");
}
const todayVariations = [
"sat",
"sati",
"sata",
"min",
"sekunde",
"sekundi",
"sekundu",
"maloprije"
];
for (const todayVariation of todayVariations) {
if (renewedDateText.includes(todayVariation)) {
return currentMoment;
}
}
const renewedDateMoment = moment.tz(
renewedDateText,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}