Stop crawling when an existing, non-renewed ad is found
This commit is contained in:
@@ -18,7 +18,7 @@ const crawlers = [
|
||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||
OLX_CONFIG.OLX_MAX_PAGES,
|
||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||
OLX_CONFIG.OLX_MAX_AGE
|
||||
OLX_CONFIG.OLX_IGNORED_USERNAMES
|
||||
)
|
||||
];
|
||||
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
|
||||
const moment = require("moment");
|
||||
|
||||
const {
|
||||
bulkUpsertRealEstates,
|
||||
checkIfAlreadyExist
|
||||
} = require("../../helpers/db/realEstate");
|
||||
|
||||
class PostgresSaver {
|
||||
connect() {
|
||||
@@ -7,9 +12,29 @@ class PostgresSaver {
|
||||
return true;
|
||||
}
|
||||
|
||||
async save(results, maxAge) {
|
||||
async save(results) {
|
||||
console.log("[POSTGRES] Saving...");
|
||||
await bulkUpsertRealEstates(results, maxAge);
|
||||
const resultsWithPublishedAndRenewedDateSame = results.filter(
|
||||
realEstate => {
|
||||
const { publishedDate, renewedDate } = realEstate;
|
||||
|
||||
const publishedMomentDate = moment.utc(publishedDate);
|
||||
const renewedMomentDate = moment.utc(renewedDate);
|
||||
|
||||
return publishedMomentDate.isSame(renewedMomentDate, "minute");
|
||||
}
|
||||
);
|
||||
|
||||
const exist =
|
||||
resultsWithPublishedAndRenewedDateSame.length > 0
|
||||
? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame)
|
||||
: false;
|
||||
const savedRecords = await bulkUpsertRealEstates(results);
|
||||
|
||||
return {
|
||||
exist,
|
||||
savedRecords
|
||||
};
|
||||
}
|
||||
|
||||
close() {
|
||||
|
||||
@@ -8,7 +8,6 @@ const moment = require("moment-timezone");
|
||||
const {
|
||||
AD_TYPE,
|
||||
AD_CATEGORY,
|
||||
IGNORED_USERNAMES,
|
||||
AD_AGENCY,
|
||||
AD_STATUS,
|
||||
CRAWLER_AD_TYPE
|
||||
@@ -45,7 +44,7 @@ class OlxCrawler {
|
||||
],
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
maxAge = 30
|
||||
ignoredUsernames = []
|
||||
) {
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
@@ -53,13 +52,16 @@ class OlxCrawler {
|
||||
this.crawlerAdCategories = crawlerAdCategories;
|
||||
this.maxPages = maxPages;
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.maxAge = maxAge;
|
||||
this.ignoredUsernames = ignoredUsernames;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
console.log("[OLX] Crawler started");
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const savedRealEstates = [];
|
||||
const asyncSaveActions = [];
|
||||
|
||||
if (crawlAdCategories) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
@@ -77,11 +79,33 @@ class OlxCrawler {
|
||||
const entries = singlePageResults.entries();
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
this.saveCrawledResults(singlePageResult, this.maxAge)
|
||||
.then(numberOfSaved => {})
|
||||
const savePromise = this.saveCrawledResults(singlePageResult)
|
||||
.then(({ exist, savedRecords }) => {
|
||||
if (exist) {
|
||||
indexGenerators.splice(index, 1);
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
||||
for (const savedRecord of savedRecords) {
|
||||
const { createdAt, updatedAt } = savedRecord;
|
||||
|
||||
console.log("Comparing ", createdAt, " <> ", updatedAt);
|
||||
|
||||
const createdAtMoment = moment.utc(createdAt);
|
||||
const updatedAtMoment = moment.utc(updatedAt);
|
||||
|
||||
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
|
||||
console.log("\tEqual !");
|
||||
savedRealEstates.push(savedRecord);
|
||||
}
|
||||
}
|
||||
})
|
||||
.catch(error =>
|
||||
console.log("[POSTGRES Saver] Error saving results : ", error)
|
||||
);
|
||||
asyncSaveActions.push(savePromise);
|
||||
} else {
|
||||
//Generator returned undefined, no more pages
|
||||
indexGenerators.splice(index, 1);
|
||||
@@ -92,11 +116,13 @@ class OlxCrawler {
|
||||
}
|
||||
});
|
||||
|
||||
await this.sleep(500);
|
||||
await this.sleep(5000);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("[OLX] Waiting for async save actions ...");
|
||||
await Promise.all(asyncSaveActions);
|
||||
console.log("[OLX] Crawler finished");
|
||||
return savedRealEstates;
|
||||
}
|
||||
|
||||
async *categoryIndexer(adCategory) {
|
||||
@@ -111,7 +137,6 @@ class OlxCrawler {
|
||||
urlPageToCrawl,
|
||||
this.maxResultsPerPage
|
||||
);
|
||||
console.log("indexing ", adCategory, " page : ", pageToIndex);
|
||||
|
||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||
yield singlePageResults;
|
||||
@@ -135,7 +160,6 @@ class OlxCrawler {
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
let hrefs = [];
|
||||
const singlePageResults = [];
|
||||
|
||||
$("#rezultatipretrage")
|
||||
.find(".listitem")
|
||||
@@ -158,7 +182,8 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
return scrapedData;
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("Exception caught:" + e);
|
||||
return [];
|
||||
@@ -166,29 +191,34 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
console.log("Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
let status = AD_STATUS.STATUS_NORMAL;
|
||||
|
||||
const username = $(
|
||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
||||
)
|
||||
const propertySelectors = {
|
||||
username:
|
||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||
title: "#naslovartikla",
|
||||
descriptions: ".artikal_detaljniopis_tekst",
|
||||
category:
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||
};
|
||||
|
||||
const username = $(propertySelectors.username)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
||||
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const title = $("#naslovartikla")
|
||||
const title = $(propertySelectors.title)
|
||||
.text()
|
||||
.trim();
|
||||
const descriptions = $(".artikal_detaljniopis_tekst");
|
||||
const category = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||
)
|
||||
const descriptions = $(propertySelectors.descriptions);
|
||||
const category = $(propertySelectors.category)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
@@ -252,7 +282,7 @@ class OlxCrawler {
|
||||
|
||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`;
|
||||
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
|
||||
|
||||
const publishedDate = $(publishedDateValueSelector)
|
||||
.text()
|
||||
@@ -268,11 +298,15 @@ class OlxCrawler {
|
||||
throw { message: "Invalid published date ! Check parsing format" };
|
||||
}
|
||||
|
||||
const renewedDate = $(renewedDateValueSelector)
|
||||
.text()
|
||||
const renewedDate = $(renewedDateFullValueSelector)
|
||||
.data("content")
|
||||
.trim();
|
||||
|
||||
const renewedDateMoment = this.parseRenewedDate(renewedDate);
|
||||
const renewedDateMoment = moment.tz(
|
||||
renewedDate,
|
||||
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
if (!renewedDateMoment) {
|
||||
throw {
|
||||
@@ -416,6 +450,8 @@ class OlxCrawler {
|
||||
return AD_CATEGORY.CATEGORY_HOUSE;
|
||||
case "Poslovni prostori":
|
||||
return AD_CATEGORY.CATEGORY_OFFICE;
|
||||
case "Apartmani":
|
||||
return AD_CATEGORY.CATEGORY_APARTMENT;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
@@ -459,34 +495,36 @@ class OlxCrawler {
|
||||
return currentMoment.add(-1, "month");
|
||||
}
|
||||
|
||||
const dayVariations = ["dan", "dana"];
|
||||
for (const dayVariation of dayVariations) {
|
||||
if (renewedDateText.includes(dayVariation)) {
|
||||
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||
const dateParts = renewedDateText.split(" ");
|
||||
if (dateParts[0] === "Prije") {
|
||||
const numberOfDays = parseInt(dateParts[1]);
|
||||
return currentMoment.add(-1 * numberOfDays, "days");
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("Jučer")) {
|
||||
return currentMoment.add(-1, "day");
|
||||
}
|
||||
|
||||
const todayVariations = [
|
||||
"sat",
|
||||
"sati",
|
||||
"sata",
|
||||
"min",
|
||||
"sekunde",
|
||||
"sekundi",
|
||||
"sekundu",
|
||||
"maloprije"
|
||||
];
|
||||
if (renewedDateText.includes("Prije sat")) {
|
||||
return currentMoment.add(-1, "hour");
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("dan")) {
|
||||
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||
const dateParts = renewedDateText.split(" ");
|
||||
if (dateParts[0] === "Prije") {
|
||||
const numberOfDays = parseInt(dateParts[1]);
|
||||
return currentMoment.add(-1 * numberOfDays, "days");
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
if (renewedDateText.includes("sat")) {
|
||||
const dateParts = renewedDateText.split(" ");
|
||||
const parsedHours =
|
||||
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
|
||||
if (!parsedHours) {
|
||||
return undefined;
|
||||
}
|
||||
return currentMoment.add(-1 * parsedHours, "hours");
|
||||
}
|
||||
|
||||
const todayVariations = ["min", "sekund", "maloprije"];
|
||||
for (const todayVariation of todayVariations) {
|
||||
if (renewedDateText.includes(todayVariation)) {
|
||||
return currentMoment;
|
||||
@@ -506,12 +544,16 @@ class OlxCrawler {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
async saveCrawledResults(results, maxAge) {
|
||||
async saveCrawledResults(results) {
|
||||
const savers = this.savers;
|
||||
|
||||
for (const saver of savers) {
|
||||
await saver.save(results, maxAge);
|
||||
}
|
||||
// for (const saver of savers) {
|
||||
// await saver.save(results);
|
||||
// }
|
||||
|
||||
//For now, we use only Postgres saver, so ...
|
||||
return await savers[0].save(results);
|
||||
//so that we can use some sequelize options and information when data is inserted
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
"use strict";
|
||||
const db = require("../../models/index");
|
||||
const sequelize = require("sequelize");
|
||||
|
||||
const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
|
||||
const bulkUpsertRealEstates = async realEstateData => {
|
||||
try {
|
||||
const fieldsToUpdateIfDuplicate = [
|
||||
"realEstateType",
|
||||
@@ -23,16 +24,42 @@ const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
|
||||
"longDescription",
|
||||
"gardenSize",
|
||||
"adStatus",
|
||||
"updatedAt"
|
||||
"updatedAt",
|
||||
"renewedDate"
|
||||
];
|
||||
|
||||
return await db.RealEstate.bulkCreate(realEstateData, {
|
||||
updateOnDuplicate: fieldsToUpdateIfDuplicate
|
||||
updateOnDuplicate: fieldsToUpdateIfDuplicate,
|
||||
returning: true
|
||||
});
|
||||
} catch (e) {
|
||||
console.log("Error bulk upserting realEstates : ", e);
|
||||
}
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
bulkUpsertRealEstates
|
||||
/**
 * Checks whether at least one of the given real estates is already stored.
 *
 * Every record is reduced to its (agencyObjectId, originAgencyName) pair and
 * the pairs are OR-ed together into a single COUNT query on db.RealEstate.
 *
 * @param {Iterable<{agencyObjectId: *, originAgencyName: *}>} realEstateData
 *   Records to look up. null/undefined or an empty collection resolves to
 *   false without querying the database.
 * @returns {Promise<boolean>} true when the number of matching rows is > 0.
 */
const checkIfAlreadyExist = async realEstateData => {
  // Nothing was passed in, so there is nothing that could already exist.
  if (realEstateData == null) {
    return false;
  }

  // Keep only the two identifying fields of each record for the lookup.
  const orQueryPart = Array.from(
    realEstateData,
    ({ agencyObjectId, originAgencyName }) => ({
      agencyObjectId,
      originAgencyName
    })
  );

  // No candidates to look up: skip the database round trip entirely.
  if (orQueryPart.length === 0) {
    return false;
  }

  const query = {
    [sequelize.Op.or]: orQueryPart
  };

  const result = await db.RealEstate.count({ where: query });
  return result > 0;
};
|
||||
|
||||
module.exports = {
|
||||
bulkUpsertRealEstates,
|
||||
checkIfAlreadyExist
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user