stop crawling when existing, non-renewed ad is found
This commit is contained in:
@@ -18,7 +18,7 @@ const crawlers = [
|
|||||||
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
|
||||||
OLX_CONFIG.OLX_MAX_PAGES,
|
OLX_CONFIG.OLX_MAX_PAGES,
|
||||||
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
|
||||||
OLX_CONFIG.OLX_MAX_AGE
|
OLX_CONFIG.OLX_IGNORED_USERNAMES
|
||||||
)
|
)
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,9 @@
|
|||||||
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
|
const moment = require("moment");
|
||||||
|
|
||||||
|
const {
|
||||||
|
bulkUpsertRealEstates,
|
||||||
|
checkIfAlreadyExist
|
||||||
|
} = require("../../helpers/db/realEstate");
|
||||||
|
|
||||||
class PostgresSaver {
|
class PostgresSaver {
|
||||||
connect() {
|
connect() {
|
||||||
@@ -7,9 +12,29 @@ class PostgresSaver {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
async save(results, maxAge) {
|
async save(results) {
|
||||||
console.log("[POSTGRES] Saving...");
|
console.log("[POSTGRES] Saving...");
|
||||||
await bulkUpsertRealEstates(results, maxAge);
|
const resultsWithPublishedAndRenewedDateSame = results.filter(
|
||||||
|
realEstate => {
|
||||||
|
const { publishedDate, renewedDate } = realEstate;
|
||||||
|
|
||||||
|
const publishedMomentDate = moment.utc(publishedDate);
|
||||||
|
const renewedMomentDate = moment.utc(renewedDate);
|
||||||
|
|
||||||
|
return publishedMomentDate.isSame(renewedMomentDate, "minute");
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
const exist =
|
||||||
|
resultsWithPublishedAndRenewedDateSame.length > 0
|
||||||
|
? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame)
|
||||||
|
: false;
|
||||||
|
const savedRecords = await bulkUpsertRealEstates(results);
|
||||||
|
|
||||||
|
return {
|
||||||
|
exist,
|
||||||
|
savedRecords
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
close() {
|
close() {
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ const moment = require("moment-timezone");
|
|||||||
const {
|
const {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
AD_CATEGORY,
|
AD_CATEGORY,
|
||||||
IGNORED_USERNAMES,
|
|
||||||
AD_AGENCY,
|
AD_AGENCY,
|
||||||
AD_STATUS,
|
AD_STATUS,
|
||||||
CRAWLER_AD_TYPE
|
CRAWLER_AD_TYPE
|
||||||
@@ -45,7 +44,7 @@ class OlxCrawler {
|
|||||||
],
|
],
|
||||||
maxPages = 1000,
|
maxPages = 1000,
|
||||||
maxResultsPerPage = 100,
|
maxResultsPerPage = 100,
|
||||||
maxAge = 30
|
ignoredUsernames = []
|
||||||
) {
|
) {
|
||||||
this.savers = savers;
|
this.savers = savers;
|
||||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||||
@@ -53,13 +52,16 @@ class OlxCrawler {
|
|||||||
this.crawlerAdCategories = crawlerAdCategories;
|
this.crawlerAdCategories = crawlerAdCategories;
|
||||||
this.maxPages = maxPages;
|
this.maxPages = maxPages;
|
||||||
this.maxResultsPerPage = maxResultsPerPage;
|
this.maxResultsPerPage = maxResultsPerPage;
|
||||||
this.maxAge = maxAge;
|
this.ignoredUsernames = ignoredUsernames;
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
console.log("[OLX] Crawler started");
|
console.log("[OLX] Crawler started");
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
|
||||||
|
const savedRealEstates = [];
|
||||||
|
const asyncSaveActions = [];
|
||||||
|
|
||||||
if (crawlAdCategories) {
|
if (crawlAdCategories) {
|
||||||
const indexGenerators = [];
|
const indexGenerators = [];
|
||||||
for (const adCategory of crawlAdCategories) {
|
for (const adCategory of crawlAdCategories) {
|
||||||
@@ -77,11 +79,33 @@ class OlxCrawler {
|
|||||||
const entries = singlePageResults.entries();
|
const entries = singlePageResults.entries();
|
||||||
for (const [index, { value: singlePageResult }] of entries) {
|
for (const [index, { value: singlePageResult }] of entries) {
|
||||||
if (singlePageResult) {
|
if (singlePageResult) {
|
||||||
this.saveCrawledResults(singlePageResult, this.maxAge)
|
const savePromise = this.saveCrawledResults(singlePageResult)
|
||||||
.then(numberOfSaved => {})
|
.then(({ exist, savedRecords }) => {
|
||||||
|
if (exist) {
|
||||||
|
indexGenerators.splice(index, 1);
|
||||||
|
if (indexGenerators.length === 0) {
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const savedRecord of savedRecords) {
|
||||||
|
const { createdAt, updatedAt } = savedRecord;
|
||||||
|
|
||||||
|
console.log("Comparing ", createdAt, " <> ", updatedAt);
|
||||||
|
|
||||||
|
const createdAtMoment = moment.utc(createdAt);
|
||||||
|
const updatedAtMoment = moment.utc(updatedAt);
|
||||||
|
|
||||||
|
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
|
||||||
|
console.log("\tEqual !");
|
||||||
|
savedRealEstates.push(savedRecord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
.catch(error =>
|
.catch(error =>
|
||||||
console.log("[POSTGRES Saver] Error saving results : ", error)
|
console.log("[POSTGRES Saver] Error saving results : ", error)
|
||||||
);
|
);
|
||||||
|
asyncSaveActions.push(savePromise);
|
||||||
} else {
|
} else {
|
||||||
//Generator returned undefined, no more pages
|
//Generator returned undefined, no more pages
|
||||||
indexGenerators.splice(index, 1);
|
indexGenerators.splice(index, 1);
|
||||||
@@ -92,11 +116,13 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
await this.sleep(500);
|
await this.sleep(5000);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
console.log("[OLX] Waiting for async save actions ...");
|
||||||
|
await Promise.all(asyncSaveActions);
|
||||||
console.log("[OLX] Crawler finished");
|
console.log("[OLX] Crawler finished");
|
||||||
|
return savedRealEstates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *categoryIndexer(adCategory) {
|
async *categoryIndexer(adCategory) {
|
||||||
@@ -111,7 +137,6 @@ class OlxCrawler {
|
|||||||
urlPageToCrawl,
|
urlPageToCrawl,
|
||||||
this.maxResultsPerPage
|
this.maxResultsPerPage
|
||||||
);
|
);
|
||||||
console.log("indexing ", adCategory, " page : ", pageToIndex);
|
|
||||||
|
|
||||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||||
yield singlePageResults;
|
yield singlePageResults;
|
||||||
@@ -135,7 +160,6 @@ class OlxCrawler {
|
|||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let hrefs = [];
|
let hrefs = [];
|
||||||
const singlePageResults = [];
|
|
||||||
|
|
||||||
$("#rezultatipretrage")
|
$("#rezultatipretrage")
|
||||||
.find(".listitem")
|
.find(".listitem")
|
||||||
@@ -158,7 +182,8 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const scrapedData = await Promise.all(asyncScraping);
|
const scrapedData = await Promise.all(asyncScraping);
|
||||||
return scrapedData;
|
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||||
|
return filteredScrapedData;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught:" + e);
|
console.error("Exception caught:" + e);
|
||||||
return [];
|
return [];
|
||||||
@@ -166,29 +191,34 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(url) {
|
async scrapeAd(url) {
|
||||||
|
console.log("Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let status = AD_STATUS.STATUS_NORMAL;
|
let status = AD_STATUS.STATUS_NORMAL;
|
||||||
|
|
||||||
const username = $(
|
const propertySelectors = {
|
||||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
|
username:
|
||||||
)
|
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||||
|
title: "#naslovartikla",
|
||||||
|
descriptions: ".artikal_detaljniopis_tekst",
|
||||||
|
category:
|
||||||
|
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
||||||
|
};
|
||||||
|
|
||||||
|
const username = $(propertySelectors.username)
|
||||||
.text()
|
.text()
|
||||||
.trim();
|
.trim();
|
||||||
|
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
|
||||||
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const title = $("#naslovartikla")
|
const title = $(propertySelectors.title)
|
||||||
.text()
|
.text()
|
||||||
.trim();
|
.trim();
|
||||||
const descriptions = $(".artikal_detaljniopis_tekst");
|
const descriptions = $(propertySelectors.descriptions);
|
||||||
const category = $(
|
const category = $(propertySelectors.category)
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
|
||||||
)
|
|
||||||
.text()
|
.text()
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
@@ -252,7 +282,7 @@ class OlxCrawler {
|
|||||||
|
|
||||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||||
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||||
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`;
|
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
|
||||||
|
|
||||||
const publishedDate = $(publishedDateValueSelector)
|
const publishedDate = $(publishedDateValueSelector)
|
||||||
.text()
|
.text()
|
||||||
@@ -268,11 +298,15 @@ class OlxCrawler {
|
|||||||
throw { message: "Invalid published date ! Check parsing format" };
|
throw { message: "Invalid published date ! Check parsing format" };
|
||||||
}
|
}
|
||||||
|
|
||||||
const renewedDate = $(renewedDateValueSelector)
|
const renewedDate = $(renewedDateFullValueSelector)
|
||||||
.text()
|
.data("content")
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
const renewedDateMoment = this.parseRenewedDate(renewedDate);
|
const renewedDateMoment = moment.tz(
|
||||||
|
renewedDate,
|
||||||
|
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
);
|
||||||
|
|
||||||
if (!renewedDateMoment) {
|
if (!renewedDateMoment) {
|
||||||
throw {
|
throw {
|
||||||
@@ -416,6 +450,8 @@ class OlxCrawler {
|
|||||||
return AD_CATEGORY.CATEGORY_HOUSE;
|
return AD_CATEGORY.CATEGORY_HOUSE;
|
||||||
case "Poslovni prostori":
|
case "Poslovni prostori":
|
||||||
return AD_CATEGORY.CATEGORY_OFFICE;
|
return AD_CATEGORY.CATEGORY_OFFICE;
|
||||||
|
case "Apartmani":
|
||||||
|
return AD_CATEGORY.CATEGORY_APARTMENT;
|
||||||
default:
|
default:
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
@@ -459,34 +495,36 @@ class OlxCrawler {
|
|||||||
return currentMoment.add(-1, "month");
|
return currentMoment.add(-1, "month");
|
||||||
}
|
}
|
||||||
|
|
||||||
const dayVariations = ["dan", "dana"];
|
|
||||||
for (const dayVariation of dayVariations) {
|
|
||||||
if (renewedDateText.includes(dayVariation)) {
|
|
||||||
// format for this case should be "Prije N dana" or "Prije N dan"
|
|
||||||
const dateParts = renewedDateText.split(" ");
|
|
||||||
if (dateParts[0] === "Prije") {
|
|
||||||
const numberOfDays = parseInt(dateParts[1]);
|
|
||||||
return currentMoment.add(-1 * numberOfDays, "days");
|
|
||||||
} else {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (renewedDateText.includes("Jučer")) {
|
if (renewedDateText.includes("Jučer")) {
|
||||||
return currentMoment.add(-1, "day");
|
return currentMoment.add(-1, "day");
|
||||||
}
|
}
|
||||||
|
|
||||||
const todayVariations = [
|
if (renewedDateText.includes("Prije sat")) {
|
||||||
"sat",
|
return currentMoment.add(-1, "hour");
|
||||||
"sati",
|
}
|
||||||
"sata",
|
|
||||||
"min",
|
if (renewedDateText.includes("dan")) {
|
||||||
"sekunde",
|
// format for this case should be "Prije N dana" or "Prije N dan"
|
||||||
"sekundi",
|
const dateParts = renewedDateText.split(" ");
|
||||||
"sekundu",
|
if (dateParts[0] === "Prije") {
|
||||||
"maloprije"
|
const numberOfDays = parseInt(dateParts[1]);
|
||||||
];
|
return currentMoment.add(-1 * numberOfDays, "days");
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (renewedDateText.includes("sat")) {
|
||||||
|
const dateParts = renewedDateText.split(" ");
|
||||||
|
const parsedHours =
|
||||||
|
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
|
||||||
|
if (!parsedHours) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
return currentMoment.add(-1 * parsedHours, "hours");
|
||||||
|
}
|
||||||
|
|
||||||
|
const todayVariations = ["min", "sekund", "maloprije"];
|
||||||
for (const todayVariation of todayVariations) {
|
for (const todayVariation of todayVariations) {
|
||||||
if (renewedDateText.includes(todayVariation)) {
|
if (renewedDateText.includes(todayVariation)) {
|
||||||
return currentMoment;
|
return currentMoment;
|
||||||
@@ -506,12 +544,16 @@ class OlxCrawler {
|
|||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|
||||||
async saveCrawledResults(results, maxAge) {
|
async saveCrawledResults(results) {
|
||||||
const savers = this.savers;
|
const savers = this.savers;
|
||||||
|
|
||||||
for (const saver of savers) {
|
// for (const saver of savers) {
|
||||||
await saver.save(results, maxAge);
|
// await saver.save(results);
|
||||||
}
|
// }
|
||||||
|
|
||||||
|
//For now, we use only Postgres saver, so ...
|
||||||
|
return await savers[0].save(results);
|
||||||
|
//so that we can use some sequelize options and information when data is inserted
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
const db = require("../../models/index");
|
const db = require("../../models/index");
|
||||||
|
const sequelize = require("sequelize");
|
||||||
|
|
||||||
const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
|
const bulkUpsertRealEstates = async realEstateData => {
|
||||||
try {
|
try {
|
||||||
const fieldsToUpdateIfDuplicate = [
|
const fieldsToUpdateIfDuplicate = [
|
||||||
"realEstateType",
|
"realEstateType",
|
||||||
@@ -23,16 +24,42 @@ const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
|
|||||||
"longDescription",
|
"longDescription",
|
||||||
"gardenSize",
|
"gardenSize",
|
||||||
"adStatus",
|
"adStatus",
|
||||||
"updatedAt"
|
"updatedAt",
|
||||||
|
"renewedDate"
|
||||||
];
|
];
|
||||||
|
|
||||||
return await db.RealEstate.bulkCreate(realEstateData, {
|
return await db.RealEstate.bulkCreate(realEstateData, {
|
||||||
updateOnDuplicate: fieldsToUpdateIfDuplicate
|
updateOnDuplicate: fieldsToUpdateIfDuplicate,
|
||||||
|
returning: true
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.log("Error bulk upserting realEstates : ", e);
|
console.log("Error bulk upserting realEstates : ", e);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
const checkIfAlreadyExist = async realEstateData => {
|
||||||
bulkUpsertRealEstates
|
const orQueryPart = [];
|
||||||
|
|
||||||
|
for (const realEstate of realEstateData) {
|
||||||
|
const { agencyObjectId, originAgencyName } = realEstate;
|
||||||
|
|
||||||
|
const singleRealEstateQueryPart = {
|
||||||
|
agencyObjectId,
|
||||||
|
originAgencyName
|
||||||
|
};
|
||||||
|
|
||||||
|
orQueryPart.push(singleRealEstateQueryPart);
|
||||||
|
}
|
||||||
|
|
||||||
|
const query = {
|
||||||
|
[sequelize.Op.or]: orQueryPart
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await db.RealEstate.count({ where: query });
|
||||||
|
return result > 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
bulkUpsertRealEstates,
|
||||||
|
checkIfAlreadyExist
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user