Merge branch 'make-crawler-smarter' into 'master'
Make crawler smarter. See merge request saburly/marketalarm/web!33
This commit was merged in pull request #33.
This commit is contained in:
@@ -12,8 +12,6 @@ const AD_CATEGORY = {
|
|||||||
CATEGORY_GARAGE: "GARAGE"
|
CATEGORY_GARAGE: "GARAGE"
|
||||||
};
|
};
|
||||||
|
|
||||||
const IGNORED_USERNAMES = [];
|
|
||||||
|
|
||||||
const AD_STATUS = {
|
const AD_STATUS = {
|
||||||
STATUS_NORMAL: 1,
|
STATUS_NORMAL: 1,
|
||||||
STATUS_RESERVED: 2,
|
STATUS_RESERVED: 2,
|
||||||
@@ -36,7 +34,6 @@ const CRAWLER_AD_TYPE = {
|
|||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
IGNORED_USERNAMES,
|
|
||||||
AD_CATEGORY,
|
AD_CATEGORY,
|
||||||
AD_STATUS,
|
AD_STATUS,
|
||||||
AD_AGENCY,
|
AD_AGENCY,
|
||||||
|
|||||||
@@ -6,7 +6,10 @@ const APP_URL =
|
|||||||
? process.env.APP_URL || "http://market-alarm"
|
? process.env.APP_URL || "http://market-alarm"
|
||||||
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
||||||
|
|
||||||
|
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
APP_PORT,
|
APP_PORT,
|
||||||
APP_URL
|
APP_URL,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -13,23 +13,28 @@ const PostgresSaver = require("./savers/postgres");
|
|||||||
|
|
||||||
const crawlers = [
|
const crawlers = [
|
||||||
new OlxCrawler(
|
new OlxCrawler(
|
||||||
OLX_CONFIG.OLX_START_PAGE,
|
|
||||||
OLX_CONFIG.OLX_END_PAGE,
|
|
||||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
|
||||||
[new PostgresSaver()],
|
[new PostgresSaver()],
|
||||||
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
|
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
|
||||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES
|
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||||
|
OLX_CONFIG.OLX_MAX_PAGES,
|
||||||
|
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||||
|
OLX_CONFIG.OLX_IGNORED_USERNAMES,
|
||||||
|
OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES
|
||||||
)
|
)
|
||||||
];
|
];
|
||||||
|
|
||||||
async function crawlAll() {
|
async function crawlAll() {
|
||||||
for (let crawler of crawlers) {
|
for (let crawler of crawlers) {
|
||||||
try {
|
try {
|
||||||
await crawler.crawl();
|
const newRealEstates = await crawler.crawl();
|
||||||
|
|
||||||
|
console.log("Number of new real estates : ", newRealEstates.length);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.log("Error crawling. Trying next crawler! ", e);
|
console.log("Error crawling. Trying next crawler! ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
crawlAll();
|
(async () => {
|
||||||
|
await crawlAll();
|
||||||
|
})();
|
||||||
|
|||||||
@@ -2,29 +2,37 @@
|
|||||||
require("dotenv").config({ path: "../../.env" });
|
require("dotenv").config({ path: "../../.env" });
|
||||||
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
|
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
|
||||||
|
|
||||||
const crawlerAdType =
|
const olxCrawlerAdType =
|
||||||
process.env.OLX_CRAWLER_AD_TYPE !== undefined
|
process.env.OLX_CRAWLER_AD_TYPE !== undefined
|
||||||
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
|
? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
const parsedCrawlerAdCategories =
|
const olxParsedCrawlerAdCategories =
|
||||||
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
|
process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
|
||||||
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||||
category.trim()
|
category.trim()
|
||||||
)
|
)
|
||||||
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
|
: ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
|
||||||
|
|
||||||
const transformedCrawlerAdCategories = parsedCrawlerAdCategories
|
const olxIgnoredUsernames =
|
||||||
|
process.env.OLX_IGNORED_USERNAMES !== undefined
|
||||||
|
? process.env.OLX_IGNORED_USERNAMES.split(",").map(username =>
|
||||||
|
username.trim()
|
||||||
|
)
|
||||||
|
: [];
|
||||||
|
|
||||||
|
const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories
|
||||||
.map(categoryName => AD_CATEGORY[categoryName])
|
.map(categoryName => AD_CATEGORY[categoryName])
|
||||||
.filter(category => !!category);
|
.filter(category => !!category);
|
||||||
|
|
||||||
const OLX_CONFIG = {
|
const OLX_CONFIG = {
|
||||||
OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1,
|
OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500,
|
||||||
OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10,
|
|
||||||
OLX_MAX_RESULTS_PER_PAGE:
|
OLX_MAX_RESULTS_PER_PAGE:
|
||||||
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
|
parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50,
|
||||||
OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE,
|
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||||
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories
|
OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories,
|
||||||
|
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
||||||
|
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
const moment = require("moment");
|
||||||
|
|
||||||
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
|
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
|
||||||
|
|
||||||
class PostgresSaver {
|
class PostgresSaver {
|
||||||
@@ -9,7 +11,33 @@ class PostgresSaver {
|
|||||||
|
|
||||||
async save(results) {
|
async save(results) {
|
||||||
console.log("[POSTGRES] Saving...");
|
console.log("[POSTGRES] Saving...");
|
||||||
await bulkUpsertRealEstates(results);
|
|
||||||
|
const savedRecords = await bulkUpsertRealEstates(results);
|
||||||
|
|
||||||
|
if (Array.isArray(savedRecords)) {
|
||||||
|
const newRealEstates = [];
|
||||||
|
const existingRealEstates = [];
|
||||||
|
|
||||||
|
for (const savedRecord of savedRecords) {
|
||||||
|
const { createdAt, updatedAt } = savedRecord;
|
||||||
|
|
||||||
|
const createdAtMoment = moment.utc(createdAt);
|
||||||
|
const updatedAtMoment = moment.utc(updatedAt);
|
||||||
|
|
||||||
|
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
|
||||||
|
newRealEstates.push(savedRecord);
|
||||||
|
} else {
|
||||||
|
existingRealEstates.push(savedRecord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
newRecords: newRealEstates,
|
||||||
|
existingRecords: existingRealEstates
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
throw { message: "[POSTGRES] Failed to save records" };
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
close() {
|
close() {
|
||||||
|
|||||||
@@ -1,102 +1,170 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
let fetch = require("node-fetch");
|
const fetch = require("node-fetch");
|
||||||
let cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
|
const Promise = require("bluebird");
|
||||||
|
const moment = require("moment-timezone");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
AD_CATEGORY,
|
AD_CATEGORY,
|
||||||
IGNORED_USERNAMES,
|
|
||||||
AD_AGENCY,
|
AD_AGENCY,
|
||||||
AD_STATUS,
|
AD_STATUS,
|
||||||
CRAWLER_AD_TYPE
|
CRAWLER_AD_TYPE
|
||||||
} = require("../../common/enums");
|
} = require("../../common/enums");
|
||||||
|
|
||||||
|
const { DEFAULT_TIMEZONE } = require("../../config/appConfig");
|
||||||
|
|
||||||
const OLX_ENUMS = {
|
const OLX_ENUMS = {
|
||||||
OLX_AD_TYPE: {},
|
OLX_AD_TYPE: {
|
||||||
OLX_AD_CATEGORY: {},
|
[CRAWLER_AD_TYPE.ALL]: "",
|
||||||
MAX_DETAIL_FIELDS: 30
|
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
|
||||||
|
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
|
||||||
|
},
|
||||||
|
OLX_AD_CATEGORY: {
|
||||||
|
[AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23",
|
||||||
|
[AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24",
|
||||||
|
[AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29",
|
||||||
|
[AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25",
|
||||||
|
[AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27",
|
||||||
|
[AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30"
|
||||||
|
},
|
||||||
|
MAX_DETAIL_FIELDS: 30,
|
||||||
|
OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm",
|
||||||
|
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||||
};
|
};
|
||||||
|
|
||||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
|
|
||||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
|
|
||||||
OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
|
|
||||||
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
|
|
||||||
OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
|
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
fromPage = 1,
|
|
||||||
toPage = 10,
|
|
||||||
maxResults = 1000,
|
|
||||||
savers = [],
|
savers = [],
|
||||||
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
||||||
crawlerAdCategories = [
|
crawlerAdCategories = [
|
||||||
AD_CATEGORY.CATEGORY_FLAT,
|
AD_CATEGORY.CATEGORY_FLAT,
|
||||||
AD_CATEGORY.CATEGORY_HOUSE
|
AD_CATEGORY.CATEGORY_HOUSE
|
||||||
]
|
],
|
||||||
|
maxPages = 1000,
|
||||||
|
maxResultsPerPage = 100,
|
||||||
|
ignoredUsernames = [],
|
||||||
|
delayBetweenPages = 1000
|
||||||
) {
|
) {
|
||||||
this.fromPage = fromPage;
|
|
||||||
this.toPage = toPage;
|
|
||||||
this.maxResults = maxResults;
|
|
||||||
this.savers = savers;
|
this.savers = savers;
|
||||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||||
this.crawlerAdTypes = crawlerAdTypes;
|
this.crawlerAdTypes = crawlerAdTypes;
|
||||||
this.crawlerAdCategories = crawlerAdCategories;
|
this.crawlerAdCategories = crawlerAdCategories;
|
||||||
|
this.maxPages = maxPages;
|
||||||
|
this.maxResultsPerPage = maxResultsPerPage;
|
||||||
|
this.ignoredUsernames = ignoredUsernames;
|
||||||
|
this.delayBetweenPages = delayBetweenPages;
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
console.log("[OLX] Crawler started");
|
console.log("[OLX] Crawler started");
|
||||||
const crawlAdTypes = this.crawlerAdTypes;
|
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
|
||||||
const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
|
const newRealEstates = [];
|
||||||
|
|
||||||
if (crawlAdCategories && crawlAdTypes) {
|
if (crawlAdCategories) {
|
||||||
const asyncPagesIndexingByCategory = [];
|
const indexGenerators = [];
|
||||||
for (const adCategory of crawlAdCategories) {
|
for (const adCategory of crawlAdCategories) {
|
||||||
asyncPagesIndexingByCategory.push(
|
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||||
this.indexPages(
|
|
||||||
`${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await Promise.all(asyncPagesIndexingByCategory);
|
let done = false;
|
||||||
|
while (!done) {
|
||||||
|
const categoryIndexerPromises = [];
|
||||||
|
const generatorsToRemove = [];
|
||||||
|
for (const indexGenerator of indexGenerators) {
|
||||||
|
categoryIndexerPromises.push(indexGenerator.next());
|
||||||
|
generatorsToRemove.push(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
const singlePageResults = await Promise.all(categoryIndexerPromises);
|
||||||
|
const entries = singlePageResults.entries();
|
||||||
|
|
||||||
|
for (const [index, { value: singlePageResult }] of entries) {
|
||||||
|
if (singlePageResult) {
|
||||||
|
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||||
|
const { newRecords, existingRecords } = saveResults;
|
||||||
|
|
||||||
|
newRealEstates.push(...newRecords);
|
||||||
|
|
||||||
|
for (const existingRecord of existingRecords) {
|
||||||
|
const { publishedDate, renewedDate } = existingRecord;
|
||||||
|
|
||||||
|
const publishedDateMoment = moment.utc(publishedDate);
|
||||||
|
const renewedDateMoment = moment.utc(renewedDate);
|
||||||
|
|
||||||
|
const stopCrawlingThisCategory = publishedDateMoment.isSame(
|
||||||
|
renewedDateMoment,
|
||||||
|
"minute"
|
||||||
|
);
|
||||||
|
|
||||||
|
if (stopCrawlingThisCategory) {
|
||||||
|
generatorsToRemove[index] = true;
|
||||||
|
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//Generator returned undefined, remove this generator from array
|
||||||
|
generatorsToRemove[index] = true;
|
||||||
|
// console.log("Generator ", index + 1, "has no more pages");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// console.log("Generators state : ", generatorsToRemove);
|
||||||
|
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
|
||||||
|
if (generatorsToRemove[i]) {
|
||||||
|
// console.log("\tRemove generator ", i + 1);
|
||||||
|
indexGenerators.splice(i, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (indexGenerators.length === 0) {
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.sleep(this.delayBetweenPages);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
console.log("[OLX] Crawler finished");
|
console.log("[OLX] Crawler finished");
|
||||||
|
return newRealEstates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexPages(url) {
|
async *categoryIndexer(adCategory) {
|
||||||
const startPage = this.fromPage;
|
let pageToIndex = 1;
|
||||||
const endPage = this.toPage;
|
|
||||||
const maxResultsPerPage = this.maxResults;
|
|
||||||
|
|
||||||
for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
|
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||||
const singlePageResults = await this.indexSinglePage(
|
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||||
url,
|
if (urlAdTypePart && urlCategoryPart) {
|
||||||
pageNumber,
|
while (true) {
|
||||||
maxResultsPerPage
|
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||||
);
|
const singlePageResults = await this.indexSinglePage(
|
||||||
await this.saveCrawledResults(singlePageResults);
|
urlPageToCrawl,
|
||||||
await this.sleep(5000);
|
this.maxResultsPerPage
|
||||||
|
);
|
||||||
|
|
||||||
|
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||||
|
yield singlePageResults;
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
++pageToIndex;
|
||||||
|
if (pageToIndex === this.maxPages) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) {
|
async indexSinglePage(url, maxResultsPerPage) {
|
||||||
try {
|
try {
|
||||||
const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`;
|
|
||||||
|
|
||||||
const res = await fetch(url);
|
const res = await fetch(url);
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let hrefs = [];
|
let hrefs = [];
|
||||||
const singlePageResults = [];
|
|
||||||
|
|
||||||
$("#rezultatipretrage")
|
$("#rezultatipretrage")
|
||||||
.find(".listitem")
|
.find(".listitem")
|
||||||
@@ -113,50 +181,60 @@ class OlxCrawler {
|
|||||||
let actualNoOfResults =
|
let actualNoOfResults =
|
||||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||||
|
|
||||||
|
const asyncScraping = [];
|
||||||
for (let i = 0; i < actualNoOfResults; i++) {
|
for (let i = 0; i < actualNoOfResults; i++) {
|
||||||
console.log(`Scraping : ${hrefs[i]}`);
|
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
||||||
|
|
||||||
const adData = await this.scrapeAd(hrefs[i]);
|
|
||||||
|
|
||||||
if (adData) {
|
|
||||||
singlePageResults.push(adData);
|
|
||||||
}
|
|
||||||
await this.sleep(500);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return singlePageResults;
|
const scrapedData = await Promise.all(asyncScraping);
|
||||||
|
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||||
|
return filteredScrapedData;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught:" + e);
|
console.error("Exception caught:" + e);
|
||||||
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(url) {
|
async scrapeAd(url) {
|
||||||
|
//console.log("Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let status = AD_STATUS.STATUS_NORMAL;
|
let status = AD_STATUS.STATUS_NORMAL;
|
||||||
|
|
||||||
const username = $(
|
const propertySelectors = {
|
||||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
username:
|
||||||
).text();
|
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||||
|
title: "#naslovartikla",
|
||||||
|
descriptions: ".artikal_detaljniopis_tekst",
|
||||||
|
category:
|
||||||
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||||
|
};
|
||||||
|
|
||||||
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
const username = $(propertySelectors.username)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const title = $("#naslovartikla").text();
|
const title = $(propertySelectors.title)
|
||||||
const descriptions = $(".artikal_detaljniopis_tekst");
|
.text()
|
||||||
const category = $(
|
.trim();
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
const descriptions = $(propertySelectors.descriptions);
|
||||||
).text();
|
const category = $(propertySelectors.category)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
//====== PRICE DETECTION AND EXTRACTION =====
|
//====== PRICE DETECTION AND EXTRACTION =====
|
||||||
let price = null;
|
let price = null;
|
||||||
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
||||||
const urgentPriceValue = $(
|
const urgentPriceValue = $(
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
||||||
).text();
|
)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||||
price = normalPriceValue;
|
price = normalPriceValue;
|
||||||
@@ -208,6 +286,39 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||||
|
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||||
|
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
|
||||||
|
|
||||||
|
const publishedDate = $(publishedDateValueSelector)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
const publishedDateMoment = moment.tz(
|
||||||
|
publishedDate,
|
||||||
|
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!publishedDateMoment.isValid()) {
|
||||||
|
throw { message: "Invalid published date ! Check parsing format" };
|
||||||
|
}
|
||||||
|
|
||||||
|
const renewedDate = $(renewedDateFullValueSelector)
|
||||||
|
.data("content")
|
||||||
|
.trim();
|
||||||
|
|
||||||
|
const renewedDateMoment = moment.tz(
|
||||||
|
renewedDate,
|
||||||
|
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
);
|
||||||
|
|
||||||
|
if (!renewedDateMoment) {
|
||||||
|
throw {
|
||||||
|
message:
|
||||||
|
"Invalid renewed date ! Check how parser parsed renewed date text"
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
adType = $(
|
adType = $(
|
||||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
||||||
@@ -262,7 +373,9 @@ class OlxCrawler {
|
|||||||
const time = $("time").attr("datetime");
|
const time = $("time").attr("datetime");
|
||||||
const numberOfViews = $(
|
const numberOfViews = $(
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
|
||||||
).text();
|
)
|
||||||
|
.text()
|
||||||
|
.trim();
|
||||||
//===========================================
|
//===========================================
|
||||||
|
|
||||||
//=========================================
|
//=========================================
|
||||||
@@ -300,8 +413,14 @@ class OlxCrawler {
|
|||||||
price: parsedPrice,
|
price: parsedPrice,
|
||||||
area: parsedArea,
|
area: parsedArea,
|
||||||
gardenSize: parsedGardenSize,
|
gardenSize: parsedGardenSize,
|
||||||
shortDescription: descriptions.first().text(),
|
shortDescription: descriptions
|
||||||
longDescription: descriptions.last().text(),
|
.first()
|
||||||
|
.text()
|
||||||
|
.trim(),
|
||||||
|
longDescription: descriptions
|
||||||
|
.last()
|
||||||
|
.text()
|
||||||
|
.trim(),
|
||||||
streetNumber: 0,
|
streetNumber: 0,
|
||||||
streetName: "",
|
streetName: "",
|
||||||
locality: "",
|
locality: "",
|
||||||
@@ -312,7 +431,9 @@ class OlxCrawler {
|
|||||||
country: "",
|
country: "",
|
||||||
locationLat,
|
locationLat,
|
||||||
locationLong,
|
locationLong,
|
||||||
adStatus: status
|
adStatus: status,
|
||||||
|
publishedDate: publishedDateMoment.toISOString(),
|
||||||
|
renewedDate: renewedDateMoment.toISOString()
|
||||||
};
|
};
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
@@ -334,6 +455,8 @@ class OlxCrawler {
|
|||||||
return AD_CATEGORY.CATEGORY_HOUSE;
|
return AD_CATEGORY.CATEGORY_HOUSE;
|
||||||
case "Poslovni prostori":
|
case "Poslovni prostori":
|
||||||
return AD_CATEGORY.CATEGORY_OFFICE;
|
return AD_CATEGORY.CATEGORY_OFFICE;
|
||||||
|
case "Apartmani":
|
||||||
|
return AD_CATEGORY.CATEGORY_APARTMENT;
|
||||||
default:
|
default:
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
@@ -370,6 +493,58 @@ class OlxCrawler {
|
|||||||
return parseFloat(formattedPriceText);
|
return parseFloat(formattedPriceText);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
parseRenewedDate(renewedDateText) {
|
||||||
|
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||||
|
|
||||||
|
if (renewedDateText.includes("Prije mjesec dana")) {
|
||||||
|
return currentMoment.add(-1, "month");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (renewedDateText.includes("Jučer")) {
|
||||||
|
return currentMoment.add(-1, "day");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (renewedDateText.includes("Prije sat")) {
|
||||||
|
return currentMoment.add(-1, "hour");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (renewedDateText.includes("dan")) {
|
||||||
|
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||||
|
const dateParts = renewedDateText.split(" ");
|
||||||
|
if (dateParts[0] === "Prije") {
|
||||||
|
const numberOfDays = parseInt(dateParts[1]);
|
||||||
|
return currentMoment.add(-1 * numberOfDays, "days");
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (renewedDateText.includes("sat")) {
|
||||||
|
const dateParts = renewedDateText.split(" ");
|
||||||
|
const parsedHours =
|
||||||
|
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
|
||||||
|
if (!parsedHours) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
return currentMoment.add(-1 * parsedHours, "hours");
|
||||||
|
}
|
||||||
|
|
||||||
|
const todayVariations = ["min", "sekund", "maloprije"];
|
||||||
|
for (const todayVariation of todayVariations) {
|
||||||
|
if (renewedDateText.includes(todayVariation)) {
|
||||||
|
return currentMoment;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const renewedDateMoment = moment.tz(
|
||||||
|
renewedDateText,
|
||||||
|
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
);
|
||||||
|
|
||||||
|
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
|
||||||
|
}
|
||||||
|
|
||||||
async sleep(ms) {
|
async sleep(ms) {
|
||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
@@ -377,9 +552,13 @@ class OlxCrawler {
|
|||||||
async saveCrawledResults(results) {
|
async saveCrawledResults(results) {
|
||||||
const savers = this.savers;
|
const savers = this.savers;
|
||||||
|
|
||||||
for (const saver of savers) {
|
// for (const saver of savers) {
|
||||||
await saver.save(results);
|
// await saver.save(results);
|
||||||
}
|
// }
|
||||||
|
|
||||||
|
//For now, we use only Postgres saver, so ...
|
||||||
|
return await savers[0].save(results);
|
||||||
|
//so that we can use some sequelize options and information when data is inserted
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -23,10 +23,13 @@ const bulkUpsertRealEstates = async realEstateData => {
|
|||||||
"longDescription",
|
"longDescription",
|
||||||
"gardenSize",
|
"gardenSize",
|
||||||
"adStatus",
|
"adStatus",
|
||||||
"updatedAt"
|
"updatedAt",
|
||||||
|
"renewedDate"
|
||||||
];
|
];
|
||||||
|
|
||||||
return await db.RealEstate.bulkCreate(realEstateData, {
|
return await db.RealEstate.bulkCreate(realEstateData, {
|
||||||
updateOnDuplicate: fieldsToUpdateIfDuplicate
|
updateOnDuplicate: fieldsToUpdateIfDuplicate,
|
||||||
|
returning: true
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.log("Error bulk upserting realEstates : ", e);
|
console.log("Error bulk upserting realEstates : ", e);
|
||||||
|
|||||||
@@ -0,0 +1,21 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
up: (queryInterface, Sequelize) => {
|
||||||
|
return Promise.all([
|
||||||
|
queryInterface.addColumn("RealEstates", "publishedDate", {
|
||||||
|
type: Sequelize.DATE
|
||||||
|
}),
|
||||||
|
queryInterface.addColumn("RealEstates", "renewedDate", {
|
||||||
|
type: Sequelize.DATE
|
||||||
|
})
|
||||||
|
]);
|
||||||
|
},
|
||||||
|
|
||||||
|
down: (queryInterface, Sequelize) => {
|
||||||
|
return Promise.all([
|
||||||
|
queryInterface.removeColumn("RealEstates", "renewedDate"),
|
||||||
|
queryInterface.removeColumn("RealEstates", "publishedDate")
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
};
|
||||||
@@ -43,14 +43,12 @@ module.exports = (sequelize, DataTypes) => {
|
|||||||
country: DataTypes.TEXT,
|
country: DataTypes.TEXT,
|
||||||
locationLat: DataTypes.REAL,
|
locationLat: DataTypes.REAL,
|
||||||
locationLong: DataTypes.REAL,
|
locationLong: DataTypes.REAL,
|
||||||
lastTimeCrawled: {
|
|
||||||
type: DataTypes.DATE,
|
|
||||||
allowNull: false
|
|
||||||
},
|
|
||||||
title: DataTypes.TEXT,
|
title: DataTypes.TEXT,
|
||||||
shortDescription: DataTypes.TEXT,
|
shortDescription: DataTypes.TEXT,
|
||||||
longDescription: DataTypes.TEXT,
|
longDescription: DataTypes.TEXT,
|
||||||
adStatus: DataTypes.INTEGER
|
adStatus: DataTypes.INTEGER,
|
||||||
|
publishedDate: DataTypes.DATE,
|
||||||
|
renewedDate: DataTypes.DATE
|
||||||
});
|
});
|
||||||
|
|
||||||
RealEstate.associate = models => {
|
RealEstate.associate = models => {
|
||||||
|
|||||||
@@ -16,8 +16,9 @@ SOURCE_EMAIL=info@saburly.com
|
|||||||
|
|
||||||
#=============== CRAWLER SETTINGS===============#
|
#=============== CRAWLER SETTINGS===============#
|
||||||
#==OLX==
|
#==OLX==
|
||||||
OLX_START_PAGE=Crawler starts from this page
|
OLX_MAX_PAGES=Restrict crawler to this number of pages
|
||||||
OLX_END_PAGE=Crawler ends with this page (including this page)
|
|
||||||
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||||
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||||
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||||
|
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
|
||||||
|
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||||
|
|||||||
6
package-lock.json
generated
6
package-lock.json
generated
@@ -2697,9 +2697,9 @@
|
|||||||
"integrity": "sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg=="
|
"integrity": "sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg=="
|
||||||
},
|
},
|
||||||
"moment-timezone": {
|
"moment-timezone": {
|
||||||
"version": "0.5.25",
|
"version": "0.5.26",
|
||||||
"resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.25.tgz",
|
"resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz",
|
||||||
"integrity": "sha512-DgEaTyN/z0HFaVcVbSyVCUU6HeFdnNC3vE4c9cgu2dgMTvjBUBdBzWfasTBmAW45u5OIMeCJtU8yNjM22DHucw==",
|
"integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"moment": ">= 2.9.0"
|
"moment": ">= 2.9.0"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,8 @@
|
|||||||
"express": "^4.16.4",
|
"express": "^4.16.4",
|
||||||
"express-ejs-layouts": "^2.5.0",
|
"express-ejs-layouts": "^2.5.0",
|
||||||
"express-layout": "^0.1.0",
|
"express-layout": "^0.1.0",
|
||||||
|
"moment": "^2.24.0",
|
||||||
|
"moment-timezone": "^0.5.26",
|
||||||
"node-fetch": "^2.3.0",
|
"node-fetch": "^2.3.0",
|
||||||
"node-schedule": "^1.3.2",
|
"node-schedule": "^1.3.2",
|
||||||
"pg": "^7.10.0",
|
"pg": "^7.10.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user