parse and save published and renewed dates
This commit is contained in:
@@ -6,7 +6,10 @@ const APP_URL =
|
|||||||
? process.env.APP_URL || "http://market-alarm"
|
? process.env.APP_URL || "http://market-alarm"
|
||||||
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
||||||
|
|
||||||
|
// Timezone name used as the default when date strings are parsed with
// moment-timezone (the crawler passes it to moment.tz()).
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
APP_PORT,
|
APP_PORT,
|
||||||
APP_URL
|
APP_URL,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
const fetch = require("node-fetch");
|
const fetch = require("node-fetch");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const Promise = require("bluebird");
|
const Promise = require("bluebird");
|
||||||
|
const moment = require("moment-timezone");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
@@ -13,6 +14,8 @@ const {
|
|||||||
CRAWLER_AD_TYPE
|
CRAWLER_AD_TYPE
|
||||||
} = require("../../common/enums");
|
} = require("../../common/enums");
|
||||||
|
|
||||||
|
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
|
||||||
|
|
||||||
const OLX_ENUMS = {
|
const OLX_ENUMS = {
|
||||||
OLX_AD_TYPE: {
|
OLX_AD_TYPE: {
|
||||||
[CRAWLER_AD_TYPE.ALL]: "",
|
[CRAWLER_AD_TYPE.ALL]: "",
|
||||||
@@ -27,7 +30,9 @@ const OLX_ENUMS = {
|
|||||||
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
||||||
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
||||||
},
|
},
|
||||||
MAX_DETAIL_FIELDS: 30
|
MAX_DETAIL_FIELDS: 30,
|
||||||
|
OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm",
|
||||||
|
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||||
};
|
};
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
@@ -246,6 +251,35 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||||
|
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||||
|
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`;
|
||||||
|
|
||||||
|
const publishedDate = $(publishedDateValueSelector)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
const publishedDateMoment = moment.tz(
|
||||||
|
publishedDate,
|
||||||
|
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!publishedDateMoment.isValid()) {
|
||||||
|
throw { message: "Invalid published date ! Check parsing format" };
|
||||||
|
}
|
||||||
|
|
||||||
|
const renewedDate = $(renewedDateValueSelector)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
const renewedDateMoment = this.parseRenewedDate(renewedDate);
|
||||||
|
|
||||||
|
if (!renewedDateMoment) {
|
||||||
|
throw {
|
||||||
|
message:
|
||||||
|
"Invalid renewed date ! Check how parser parsed renewed date text"
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
adType = $(
|
adType = $(
|
||||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
||||||
@@ -358,7 +392,9 @@ class OlxCrawler {
|
|||||||
country: "",
|
country: "",
|
||||||
locationLat,
|
locationLat,
|
||||||
locationLong,
|
locationLong,
|
||||||
adStatus: status
|
adStatus: status,
|
||||||
|
publishedDate: publishedDateMoment.toISOString(),
|
||||||
|
renewedDate: renewedDateMoment.toISOString()
|
||||||
};
|
};
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
@@ -416,6 +452,56 @@ class OlxCrawler {
|
|||||||
return parseFloat(formattedPriceText);
|
return parseFloat(formattedPriceText);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
parseRenewedDate(renewedDateText) {
|
||||||
|
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||||
|
|
||||||
|
if (renewedDateText.includes("Prije mjesec dana")) {
|
||||||
|
return currentMoment.add(-1, "month");
|
||||||
|
}
|
||||||
|
|
||||||
|
const dayVariations = ["dan", "dana"];
|
||||||
|
for (const dayVariation of dayVariations) {
|
||||||
|
if (renewedDateText.includes(dayVariation)) {
|
||||||
|
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||||
|
const dateParts = renewedDateText.split(" ");
|
||||||
|
if (dateParts[0] === "Prije") {
|
||||||
|
const numberOfDays = parseInt(dateParts[1]);
|
||||||
|
return currentMoment.add(-1 * numberOfDays, "days");
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (renewedDateText.includes("Jučer")) {
|
||||||
|
return currentMoment.add(-1, "day");
|
||||||
|
}
|
||||||
|
|
||||||
|
const todayVariations = [
|
||||||
|
"sat",
|
||||||
|
"sati",
|
||||||
|
"sata",
|
||||||
|
"min",
|
||||||
|
"sekunde",
|
||||||
|
"sekundi",
|
||||||
|
"sekundu",
|
||||||
|
"maloprije"
|
||||||
|
];
|
||||||
|
for (const todayVariation of todayVariations) {
|
||||||
|
if (renewedDateText.includes(todayVariation)) {
|
||||||
|
return currentMoment;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const renewedDateMoment = moment.tz(
|
||||||
|
renewedDateText,
|
||||||
|
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
);
|
||||||
|
|
||||||
|
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
|
||||||
|
}
|
||||||
|
|
||||||
async sleep(ms) {
|
async sleep(ms) {
|
||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user