parse and save published and renewed dates
This commit is contained in:
@@ -6,7 +6,10 @@ const APP_URL =
|
||||
? process.env.APP_URL || "http://market-alarm"
|
||||
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
||||
|
||||
// Timezone name handed to moment.tz() wherever the crawler parses or creates dates.
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
|
||||
|
||||
module.exports = {
|
||||
APP_PORT,
|
||||
APP_URL
|
||||
APP_URL,
|
||||
DEFAULT_TIMEZONE
|
||||
};
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
const fetch = require("node-fetch");
|
||||
const cheerio = require("cheerio");
|
||||
const Promise = require("bluebird");
|
||||
const moment = require("moment-timezone");
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
@@ -13,6 +14,8 @@ const {
|
||||
CRAWLER_AD_TYPE
|
||||
} = require("../../common/enums");
|
||||
|
||||
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
OLX_AD_TYPE: {
|
||||
[CRAWLER_AD_TYPE.ALL]: "",
|
||||
@@ -27,7 +30,9 @@ const OLX_ENUMS = {
|
||||
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
||||
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
||||
},
|
||||
MAX_DETAIL_FIELDS: 30
|
||||
MAX_DETAIL_FIELDS: 30,
|
||||
OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm",
|
||||
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||
};
|
||||
|
||||
class OlxCrawler {
|
||||
@@ -246,6 +251,35 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`;
|
||||
|
||||
const publishedDate = $(publishedDateValueSelector)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
const publishedDateMoment = moment.tz(
|
||||
publishedDate,
|
||||
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
if (!publishedDateMoment.isValid()) {
|
||||
throw { message: "Invalid published date ! Check parsing format" };
|
||||
}
|
||||
|
||||
const renewedDate = $(renewedDateValueSelector)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
const renewedDateMoment = this.parseRenewedDate(renewedDate);
|
||||
|
||||
if (!renewedDateMoment) {
|
||||
throw {
|
||||
message:
|
||||
"Invalid renewed date ! Check how parser parsed renewed date text"
|
||||
};
|
||||
}
|
||||
|
||||
adType = $(
|
||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
||||
@@ -358,7 +392,9 @@ class OlxCrawler {
|
||||
country: "",
|
||||
locationLat,
|
||||
locationLong,
|
||||
adStatus: status
|
||||
adStatus: status,
|
||||
publishedDate: publishedDateMoment.toISOString(),
|
||||
renewedDate: renewedDateMoment.toISOString()
|
||||
};
|
||||
|
||||
return data;
|
||||
@@ -416,6 +452,56 @@ class OlxCrawler {
|
||||
return parseFloat(formattedPriceText);
|
||||
}
|
||||
|
||||
parseRenewedDate(renewedDateText) {
|
||||
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||
|
||||
if (renewedDateText.includes("Prije mjesec dana")) {
|
||||
return currentMoment.add(-1, "month");
|
||||
}
|
||||
|
||||
const dayVariations = ["dan", "dana"];
|
||||
for (const dayVariation of dayVariations) {
|
||||
if (renewedDateText.includes(dayVariation)) {
|
||||
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||
const dateParts = renewedDateText.split(" ");
|
||||
if (dateParts[0] === "Prije") {
|
||||
const numberOfDays = parseInt(dateParts[1]);
|
||||
return currentMoment.add(-1 * numberOfDays, "days");
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("Jučer")) {
|
||||
return currentMoment.add(-1, "day");
|
||||
}
|
||||
|
||||
const todayVariations = [
|
||||
"sat",
|
||||
"sati",
|
||||
"sata",
|
||||
"min",
|
||||
"sekunde",
|
||||
"sekundi",
|
||||
"sekundu",
|
||||
"maloprije"
|
||||
];
|
||||
for (const todayVariation of todayVariations) {
|
||||
if (renewedDateText.includes(todayVariation)) {
|
||||
return currentMoment;
|
||||
}
|
||||
}
|
||||
|
||||
const renewedDateMoment = moment.tz(
|
||||
renewedDateText,
|
||||
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user