Merge branch 'make-crawler-smarter' into 'master'
Make crawler smarter See merge request saburly/marketalarm/web!33
This commit was merged in pull request #33.
This commit is contained in:
@@ -12,8 +12,6 @@ const AD_CATEGORY = {
|
||||
CATEGORY_GARAGE: "GARAGE"
|
||||
};
|
||||
|
||||
const IGNORED_USERNAMES = [];
|
||||
|
||||
const AD_STATUS = {
|
||||
STATUS_NORMAL: 1,
|
||||
STATUS_RESERVED: 2,
|
||||
@@ -36,7 +34,6 @@ const CRAWLER_AD_TYPE = {
|
||||
|
||||
module.exports = {
|
||||
AD_TYPE,
|
||||
IGNORED_USERNAMES,
|
||||
AD_CATEGORY,
|
||||
AD_STATUS,
|
||||
AD_AGENCY,
|
||||
|
||||
@@ -6,7 +6,10 @@ const APP_URL =
|
||||
? process.env.APP_URL || "http://market-alarm"
|
||||
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
||||
|
||||
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
|
||||
|
||||
module.exports = {
|
||||
APP_PORT,
|
||||
APP_URL
|
||||
APP_URL,
|
||||
DEFAULT_TIMEZONE
|
||||
};
|
||||
|
||||
@@ -13,23 +13,28 @@ const PostgresSaver = require("./savers/postgres");
|
||||
|
||||
const crawlers = [
|
||||
new OlxCrawler(
|
||||
OLX_CONFIG.OLX_START_PAGE,
|
||||
OLX_CONFIG.OLX_END_PAGE,
|
||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||
[new PostgresSaver()],
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||
OLX_CONFIG.OLX_MAX_PAGES,
|
||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||
OLX_CONFIG.OLX_IGNORED_USERNAMES,
|
||||
OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES
|
||||
)
|
||||
];
|
||||
|
||||
/**
 * Runs every configured crawler one after another and logs how many new
 * real estates each of them reported.
 *
 * A crawler that throws is logged and skipped so the remaining crawlers
 * still get their turn.
 */
async function crawlAll() {
  for (const activeCrawler of crawlers) {
    try {
      const { length: newCount } = await activeCrawler.crawl();

      console.log("Number of new real estates : ", newCount);
    } catch (crawlError) {
      console.log("Error crawling. Trying next crawler! ", crawlError);
    }
  }
}
|
||||
|
||||
crawlAll();
|
||||
(async () => {
|
||||
await crawlAll();
|
||||
})();
|
||||
|
||||
@@ -2,29 +2,37 @@
|
||||
require("dotenv").config({ path: "../../.env" });
|
||||
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
|
||||
|
||||
const crawlerAdType =
|
||||
const olxCrawlerAdType =
|
||||
process.env.OLX_CRAWLER_AD_TYPE !== undefined
|
||||
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
|
||||
: null;
|
||||
|
||||
const parsedCrawlerAdCategories =
|
||||
const olxParsedCrawlerAdCategories =
|
||||
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
|
||||
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||
category.trim()
|
||||
)
|
||||
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
|
||||
|
||||
const transformedCrawlerAdCategories = parsedCrawlerAdCategories
|
||||
const olxIgnoredUsernames =
|
||||
process.env.OLX_IGNORED_USERNAMES !== undefined
|
||||
? process.env.OLX_IGNORED_USERNAMES.split(",").map(username =>
|
||||
username.trim()
|
||||
)
|
||||
: [];
|
||||
|
||||
const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
|
||||
.map(categoryName => AD_CATEGORY[categoryName])
|
||||
.filter(category => !!category);
|
||||
|
||||
const OLX_CONFIG = {
|
||||
OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1,
|
||||
OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10,
|
||||
OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500,
|
||||
OLX_MAX_RESULTS_PER_PAGE:
|
||||
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
|
||||
OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories
|
||||
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
|
||||
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
||||
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
const moment = require("moment");
|
||||
|
||||
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
|
||||
|
||||
class PostgresSaver {
|
||||
@@ -9,7 +11,33 @@ class PostgresSaver {
|
||||
|
||||
async save(results) {
|
||||
console.log("[POSTGRES] Saving...");
|
||||
await bulkUpsertRealEstates(results);
|
||||
|
||||
const savedRecords = await bulkUpsertRealEstates(results);
|
||||
|
||||
if (Array.isArray(savedRecords)) {
|
||||
const newRealEstates = [];
|
||||
const existingRealEstates = [];
|
||||
|
||||
for (const savedRecord of savedRecords) {
|
||||
const { createdAt, updatedAt } = savedRecord;
|
||||
|
||||
const createdAtMoment = moment.utc(createdAt);
|
||||
const updatedAtMoment = moment.utc(updatedAt);
|
||||
|
||||
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
|
||||
newRealEstates.push(savedRecord);
|
||||
} else {
|
||||
existingRealEstates.push(savedRecord);
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
newRecords: newRealEstates,
|
||||
existingRecords: existingRealEstates
|
||||
};
|
||||
} else {
|
||||
throw { message: "[POSTGRES] Failed to save records" };
|
||||
}
|
||||
}
|
||||
|
||||
close() {
|
||||
|
||||
@@ -1,102 +1,170 @@
|
||||
"use strict";
|
||||
|
||||
let fetch = require("node-fetch");
|
||||
let cheerio = require("cheerio");
|
||||
const fetch = require("node-fetch");
|
||||
const cheerio = require("cheerio");
|
||||
const Promise = require("bluebird");
|
||||
const moment = require("moment-timezone");
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
AD_CATEGORY,
|
||||
IGNORED_USERNAMES,
|
||||
AD_AGENCY,
|
||||
AD_STATUS,
|
||||
CRAWLER_AD_TYPE
|
||||
} = require("../../common/enums");
|
||||
|
||||
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
OLX_AD_TYPE: {},
|
||||
OLX_AD_CATEGORY: {},
|
||||
MAX_DETAIL_FIELDS: 30
|
||||
OLX_AD_TYPE: {
|
||||
[CRAWLER_AD_TYPE.ALL]: "",
|
||||
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
|
||||
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
|
||||
},
|
||||
OLX_AD_CATEGORY: {
|
||||
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
|
||||
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
|
||||
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
|
||||
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
|
||||
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
||||
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
||||
},
|
||||
MAX_DETAIL_FIELDS: 30,
|
||||
OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm",
|
||||
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||
};
|
||||
|
||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
|
||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
|
||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
|
||||
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
|
||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
fromPage = 1,
|
||||
toPage = 10,
|
||||
maxResults = 1000,
|
||||
savers = [],
|
||||
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
||||
crawlerAdCategories = [
|
||||
AD_CATEGORY.CATEGORY_FLAT,
|
||||
AD_CATEGORY.CATEGORY_HOUSE
|
||||
]
|
||||
],
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
ignoredUsernames = [],
|
||||
delayBetweenPages = 1000
|
||||
) {
|
||||
this.fromPage = fromPage;
|
||||
this.toPage = toPage;
|
||||
this.maxResults = maxResults;
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
this.crawlerAdTypes = crawlerAdTypes;
|
||||
this.crawlerAdCategories = crawlerAdCategories;
|
||||
this.maxPages = maxPages;
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.ignoredUsernames = ignoredUsernames;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
console.log("[OLX] Crawler started");
|
||||
const crawlAdTypes = this.crawlerAdTypes;
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
|
||||
const newRealEstates = [];
|
||||
|
||||
if (crawlAdCategories && crawlAdTypes) {
|
||||
const asyncPagesIndexingByCategory = [];
|
||||
if (crawlAdCategories) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
asyncPagesIndexingByCategory.push(
|
||||
this.indexPages(
|
||||
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
|
||||
)
|
||||
);
|
||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||
}
|
||||
|
||||
await Promise.all(asyncPagesIndexingByCategory);
|
||||
let done = false;
|
||||
while (!done) {
|
||||
const categoryIndexerPromises = [];
|
||||
const generatorsToRemove = [];
|
||||
for (const indexGenerator of indexGenerators) {
|
||||
categoryIndexerPromises.push(indexGenerator.next());
|
||||
generatorsToRemove.push(false);
|
||||
}
|
||||
|
||||
const singlePageResults = await Promise.all(categoryIndexerPromises);
|
||||
const entries = singlePageResults.entries();
|
||||
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
const { newRecords, existingRecords } = saveResults;
|
||||
|
||||
newRealEstates.push(...newRecords);
|
||||
|
||||
for (const existingRecord of existingRecords) {
|
||||
const { publishedDate, renewedDate } = existingRecord;
|
||||
|
||||
const publishedDateMoment = moment.utc(publishedDate);
|
||||
const renewedDateMoment = moment.utc(renewedDate);
|
||||
|
||||
const stopCrawlingThisCategory = publishedDateMoment.isSame(
|
||||
renewedDateMoment,
|
||||
"minute"
|
||||
);
|
||||
|
||||
if (stopCrawlingThisCategory) {
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//Generator returned undefined, remove this generator from array
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("Generator ", index + 1, "has no more pages");
|
||||
}
|
||||
}
|
||||
|
||||
// console.log("Generators state : ", generatorsToRemove);
|
||||
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
|
||||
if (generatorsToRemove[i]) {
|
||||
// console.log("\tRemove generator ", i + 1);
|
||||
indexGenerators.splice(i, 1);
|
||||
}
|
||||
}
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
}
|
||||
}
|
||||
console.log("[OLX] Crawler finished");
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async indexPages(url) {
|
||||
const startPage = this.fromPage;
|
||||
const endPage = this.toPage;
|
||||
const maxResultsPerPage = this.maxResults;
|
||||
async *categoryIndexer(adCategory) {
|
||||
let pageToIndex = 1;
|
||||
|
||||
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
url,
|
||||
pageNumber,
|
||||
maxResultsPerPage
|
||||
);
|
||||
await this.saveCrawledResults(singlePageResults);
|
||||
await this.sleep(5000);
|
||||
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart && urlCategoryPart) {
|
||||
while (true) {
|
||||
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
urlPageToCrawl,
|
||||
this.maxResultsPerPage
|
||||
);
|
||||
|
||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||
yield singlePageResults;
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
++pageToIndex;
|
||||
if (pageToIndex === this.maxPages) {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) {
|
||||
async indexSinglePage(url, maxResultsPerPage) {
|
||||
try {
|
||||
const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`;
|
||||
|
||||
const res = await fetch(url);
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
let hrefs = [];
|
||||
const singlePageResults = [];
|
||||
|
||||
$("#rezultatipretrage")
|
||||
.find(".listitem")
|
||||
@@ -113,50 +181,60 @@ class OlxCrawler {
|
||||
let actualNoOfResults =
|
||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
console.log(`Scraping : ${hrefs[i]}`);
|
||||
|
||||
const adData = await this.scrapeAd(hrefs[i]);
|
||||
|
||||
if (adData) {
|
||||
singlePageResults.push(adData);
|
||||
}
|
||||
await this.sleep(500);
|
||||
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||
}
|
||||
|
||||
return singlePageResults;
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("Exception caught:" + e);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
//console.log("Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
let status = AD_STATUS.STATUS_NORMAL;
|
||||
|
||||
const username = $(
|
||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
||||
).text();
|
||||
const propertySelectors = {
|
||||
username:
|
||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||
title: "#naslovartikla",
|
||||
descriptions: ".artikal_detaljniopis_tekst",
|
||||
category:
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||
};
|
||||
|
||||
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
||||
const username = $(propertySelectors.username)
|
||||
.text()
|
||||
.trim();
|
||||
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const title = $("#naslovartikla").text();
|
||||
const descriptions = $(".artikal_detaljniopis_tekst");
|
||||
const category = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||
).text();
|
||||
const title = $(propertySelectors.title)
|
||||
.text()
|
||||
.trim();
|
||||
const descriptions = $(propertySelectors.descriptions);
|
||||
const category = $(propertySelectors.category)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
//====== PRICE DETECTION AND EXTRACTION =====
|
||||
let price = null;
|
||||
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
||||
const urgentPriceValue = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
||||
).text();
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||
price = normalPriceValue;
|
||||
@@ -208,6 +286,39 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
|
||||
|
||||
const publishedDate = $(publishedDateValueSelector)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
const publishedDateMoment = moment.tz(
|
||||
publishedDate,
|
||||
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
if (!publishedDateMoment.isValid()) {
|
||||
throw { message: "Invalid published date ! Check parsing format" };
|
||||
}
|
||||
|
||||
const renewedDate = $(renewedDateFullValueSelector)
|
||||
.data("content")
|
||||
.trim();
|
||||
|
||||
const renewedDateMoment = moment.tz(
|
||||
renewedDate,
|
||||
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
if (!renewedDateMoment) {
|
||||
throw {
|
||||
message:
|
||||
"Invalid renewed date ! Check how parser parsed renewed date text"
|
||||
};
|
||||
}
|
||||
|
||||
adType = $(
|
||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
||||
@@ -262,7 +373,9 @@ class OlxCrawler {
|
||||
const time = $("time").attr("datetime");
|
||||
const numberOfViews = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
|
||||
).text();
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
//===========================================
|
||||
|
||||
//=========================================
|
||||
@@ -300,8 +413,14 @@ class OlxCrawler {
|
||||
price: parsedPrice,
|
||||
area: parsedArea,
|
||||
gardenSize: parsedGardenSize,
|
||||
shortDescription: descriptions.first().text(),
|
||||
longDescription: descriptions.last().text(),
|
||||
shortDescription: descriptions
|
||||
.first()
|
||||
.text()
|
||||
.trim(),
|
||||
longDescription: descriptions
|
||||
.last()
|
||||
.text()
|
||||
.trim(),
|
||||
streetNumber: 0,
|
||||
streetName: "",
|
||||
locality: "",
|
||||
@@ -312,7 +431,9 @@ class OlxCrawler {
|
||||
country: "",
|
||||
locationLat,
|
||||
locationLong,
|
||||
adStatus: status
|
||||
adStatus: status,
|
||||
publishedDate: publishedDateMoment.toISOString(),
|
||||
renewedDate: renewedDateMoment.toISOString()
|
||||
};
|
||||
|
||||
return data;
|
||||
@@ -334,6 +455,8 @@ class OlxCrawler {
|
||||
return AD_CATEGORY.CATEGORY_HOUSE;
|
||||
case "Poslovni prostori":
|
||||
return AD_CATEGORY.CATEGORY_OFFICE;
|
||||
case "Apartmani":
|
||||
return AD_CATEGORY.CATEGORY_APARTMENT;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
@@ -370,6 +493,58 @@ class OlxCrawler {
|
||||
return parseFloat(formattedPriceText);
|
||||
}
|
||||
|
||||
parseRenewedDate(renewedDateText) {
|
||||
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||
|
||||
if (renewedDateText.includes("Prije mjesec dana")) {
|
||||
return currentMoment.add(-1, "month");
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("Jučer")) {
|
||||
return currentMoment.add(-1, "day");
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("Prije sat")) {
|
||||
return currentMoment.add(-1, "hour");
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("dan")) {
|
||||
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||
const dateParts = renewedDateText.split(" ");
|
||||
if (dateParts[0] === "Prije") {
|
||||
const numberOfDays = parseInt(dateParts[1]);
|
||||
return currentMoment.add(-1 * numberOfDays, "days");
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("sat")) {
|
||||
const dateParts = renewedDateText.split(" ");
|
||||
const parsedHours =
|
||||
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
|
||||
if (!parsedHours) {
|
||||
return undefined;
|
||||
}
|
||||
return currentMoment.add(-1 * parsedHours, "hours");
|
||||
}
|
||||
|
||||
const todayVariations = ["min", "sekund", "maloprije"];
|
||||
for (const todayVariation of todayVariations) {
|
||||
if (renewedDateText.includes(todayVariation)) {
|
||||
return currentMoment;
|
||||
}
|
||||
}
|
||||
|
||||
const renewedDateMoment = moment.tz(
|
||||
renewedDateText,
|
||||
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
@@ -377,9 +552,13 @@ class OlxCrawler {
|
||||
async saveCrawledResults(results) {
|
||||
const savers = this.savers;
|
||||
|
||||
for (const saver of savers) {
|
||||
await saver.save(results);
|
||||
}
|
||||
// for (const saver of savers) {
|
||||
// await saver.save(results);
|
||||
// }
|
||||
|
||||
//For now, we use only Postgres saver, so ...
|
||||
return await savers[0].save(results);
|
||||
//so that we can use some sequelize options and information when data is inserted
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,10 +23,13 @@ const bulkUpsertRealEstates = async realEstateData => {
|
||||
"longDescription",
|
||||
"gardenSize",
|
||||
"adStatus",
|
||||
"updatedAt"
|
||||
"updatedAt",
|
||||
"renewedDate"
|
||||
];
|
||||
|
||||
return await db.RealEstate.bulkCreate(realEstateData, {
|
||||
updateOnDuplicate: fieldsToUpdateIfDuplicate
|
||||
updateOnDuplicate: fieldsToUpdateIfDuplicate,
|
||||
returning: true
|
||||
});
|
||||
} catch (e) {
|
||||
console.log("Error bulk upserting realEstates : ", e);
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
"use strict";
|
||||
|
||||
module.exports = {
|
||||
up: (queryInterface, Sequelize) => {
|
||||
return Promise.all([
|
||||
queryInterface.addColumn("RealEstates", "publishedDate", {
|
||||
type: Sequelize.DATE
|
||||
}),
|
||||
queryInterface.addColumn("RealEstates", "renewedDate", {
|
||||
type: Sequelize.DATE
|
||||
})
|
||||
]);
|
||||
},
|
||||
|
||||
down: (queryInterface, Sequelize) => {
|
||||
return Promise.all([
|
||||
queryInterface.removeColumn("RealEstates", "renewedDate"),
|
||||
queryInterface.removeColumn("RealEstates", "publishedDate")
|
||||
]);
|
||||
}
|
||||
};
|
||||
@@ -43,14 +43,12 @@ module.exports = (sequelize, DataTypes) => {
|
||||
country: DataTypes.TEXT,
|
||||
locationLat: DataTypes.REAL,
|
||||
locationLong: DataTypes.REAL,
|
||||
lastTimeCrawled: {
|
||||
type: DataTypes.DATE,
|
||||
allowNull: false
|
||||
},
|
||||
title: DataTypes.TEXT,
|
||||
shortDescription: DataTypes.TEXT,
|
||||
longDescription: DataTypes.TEXT,
|
||||
adStatus: DataTypes.INTEGER
|
||||
adStatus: DataTypes.INTEGER,
|
||||
publishedDate: DataTypes.DATE,
|
||||
renewedDate: DataTypes.DATE
|
||||
});
|
||||
|
||||
RealEstate.associate = models => {
|
||||
|
||||
@@ -16,8 +16,9 @@ SOURCE_EMAIL=info@saburly.com
|
||||
|
||||
#=============== CRAWLER SETTINGS===============#
|
||||
#==OLX==
|
||||
OLX_START_PAGE=Crawler starts from this page
|
||||
OLX_END_PAGE=Crawler ends with this page (including this page)
|
||||
OLX_MAX_PAGES=Restrict crawler to this number of pages
|
||||
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
|
||||
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||
|
||||
6
package-lock.json
generated
6
package-lock.json
generated
@@ -2697,9 +2697,9 @@
|
||||
"integrity": "sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg=="
|
||||
},
|
||||
"moment-timezone": {
|
||||
"version": "0.5.25",
|
||||
"resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.25.tgz",
|
||||
"integrity": "sha512-DgEaTyN/z0HFaVcVbSyVCUU6HeFdnNC3vE4c9cgu2dgMTvjBUBdBzWfasTBmAW45u5OIMeCJtU8yNjM22DHucw==",
|
||||
"version": "0.5.26",
|
||||
"resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz",
|
||||
"integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==",
|
||||
"requires": {
|
||||
"moment": ">= 2.9.0"
|
||||
}
|
||||
|
||||
@@ -34,6 +34,8 @@
|
||||
"express": "^4.16.4",
|
||||
"express-ejs-layouts": "^2.5.0",
|
||||
"express-layout": "^0.1.0",
|
||||
"moment": "^2.24.0",
|
||||
"moment-timezone": "^0.5.26",
|
||||
"node-fetch": "^2.3.0",
|
||||
"node-schedule": "^1.3.2",
|
||||
"pg": "^7.10.0",
|
||||
|
||||
Reference in New Issue
Block a user