stop crawling when existing, non-renewed ad is found

This commit is contained in:
Bilal Catic
2019-09-24 23:23:09 +02:00
parent 746732f30b
commit 90bc57edb6
4 changed files with 155 additions and 61 deletions

View File

@@ -18,7 +18,7 @@ const crawlers = [
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES, OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_MAX_AGE OLX_CONFIG.OLX_IGNORED_USERNAMES
) )
]; ];

View File

@@ -1,4 +1,9 @@
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate"); const moment = require("moment");
const {
bulkUpsertRealEstates,
checkIfAlreadyExist
} = require("../../helpers/db/realEstate");
class PostgresSaver { class PostgresSaver {
connect() { connect() {
@@ -7,9 +12,29 @@ class PostgresSaver {
return true; return true;
} }
async save(results, maxAge) { async save(results) {
console.log("[POSTGRES] Saving..."); console.log("[POSTGRES] Saving...");
await bulkUpsertRealEstates(results, maxAge); const resultsWithPublishedAndRenewedDateSame = results.filter(
realEstate => {
const { publishedDate, renewedDate } = realEstate;
const publishedMomentDate = moment.utc(publishedDate);
const renewedMomentDate = moment.utc(renewedDate);
return publishedMomentDate.isSame(renewedMomentDate, "minute");
}
);
const exist =
resultsWithPublishedAndRenewedDateSame.length > 0
? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame)
: false;
const savedRecords = await bulkUpsertRealEstates(results);
return {
exist,
savedRecords
};
} }
close() { close() {

View File

@@ -8,7 +8,6 @@ const moment = require("moment-timezone");
const { const {
AD_TYPE, AD_TYPE,
AD_CATEGORY, AD_CATEGORY,
IGNORED_USERNAMES,
AD_AGENCY, AD_AGENCY,
AD_STATUS, AD_STATUS,
CRAWLER_AD_TYPE CRAWLER_AD_TYPE
@@ -45,7 +44,7 @@ class OlxCrawler {
], ],
maxPages = 1000, maxPages = 1000,
maxResultsPerPage = 100, maxResultsPerPage = 100,
maxAge = 30 ignoredUsernames = []
) { ) {
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -53,13 +52,16 @@ class OlxCrawler {
this.crawlerAdCategories = crawlerAdCategories; this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages; this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage; this.maxResultsPerPage = maxResultsPerPage;
this.maxAge = maxAge; this.ignoredUsernames = ignoredUsernames;
} }
async crawl() { async crawl() {
console.log("[OLX] Crawler started"); console.log("[OLX] Crawler started");
const crawlAdCategories = this.crawlerAdCategories; const crawlAdCategories = this.crawlerAdCategories;
const savedRealEstates = [];
const asyncSaveActions = [];
if (crawlAdCategories) { if (crawlAdCategories) {
const indexGenerators = []; const indexGenerators = [];
for (const adCategory of crawlAdCategories) { for (const adCategory of crawlAdCategories) {
@@ -77,11 +79,33 @@ class OlxCrawler {
const entries = singlePageResults.entries(); const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) { for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) { if (singlePageResult) {
this.saveCrawledResults(singlePageResult, this.maxAge) const savePromise = this.saveCrawledResults(singlePageResult)
.then(numberOfSaved => {}) .then(({ exist, savedRecords }) => {
if (exist) {
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
}
}
for (const savedRecord of savedRecords) {
const { createdAt, updatedAt } = savedRecord;
console.log("Comparing ", createdAt, " <> ", updatedAt);
const createdAtMoment = moment.utc(createdAt);
const updatedAtMoment = moment.utc(updatedAt);
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
console.log("\tEqual !");
savedRealEstates.push(savedRecord);
}
}
})
.catch(error => .catch(error =>
console.log("[POSTGRES Saver] Error saving results : ", error) console.log("[POSTGRES Saver] Error saving results : ", error)
); );
asyncSaveActions.push(savePromise);
} else { } else {
//Generator returned undefined, no more pages //Generator returned undefined, no more pages
indexGenerators.splice(index, 1); indexGenerators.splice(index, 1);
@@ -92,11 +116,13 @@ class OlxCrawler {
} }
}); });
await this.sleep(500); await this.sleep(5000);
} }
} }
console.log("[OLX] Waiting for async save actions ...");
await Promise.all(asyncSaveActions);
console.log("[OLX] Crawler finished"); console.log("[OLX] Crawler finished");
return savedRealEstates;
} }
async *categoryIndexer(adCategory) { async *categoryIndexer(adCategory) {
@@ -111,7 +137,6 @@ class OlxCrawler {
urlPageToCrawl, urlPageToCrawl,
this.maxResultsPerPage this.maxResultsPerPage
); );
console.log("indexing ", adCategory, " page : ", pageToIndex);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults; yield singlePageResults;
@@ -135,7 +160,6 @@ class OlxCrawler {
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let hrefs = []; let hrefs = [];
const singlePageResults = [];
$("#rezultatipretrage") $("#rezultatipretrage")
.find(".listitem") .find(".listitem")
@@ -158,7 +182,8 @@ class OlxCrawler {
} }
const scrapedData = await Promise.all(asyncScraping); const scrapedData = await Promise.all(asyncScraping);
return scrapedData; const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) { } catch (e) {
console.error("Exception caught:" + e); console.error("Exception caught:" + e);
return []; return [];
@@ -166,29 +191,34 @@ class OlxCrawler {
} }
async scrapeAd(url) { async scrapeAd(url) {
console.log("Scraping : ", url);
try { try {
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let status = AD_STATUS.STATUS_NORMAL; let status = AD_STATUS.STATUS_NORMAL;
const username = $( const propertySelectors = {
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" username:
) "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
title: "#naslovartikla",
descriptions: ".artikal_detaljniopis_tekst",
category:
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
};
const username = $(propertySelectors.username)
.text() .text()
.trim(); .trim();
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
return null; return null;
} }
const title = $("#naslovartikla") const title = $(propertySelectors.title)
.text() .text()
.trim(); .trim();
const descriptions = $(".artikal_detaljniopis_tekst"); const descriptions = $(propertySelectors.descriptions);
const category = $( const category = $(propertySelectors.category)
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
)
.text() .text()
.trim(); .trim();
@@ -252,7 +282,7 @@ class OlxCrawler {
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`; const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
const publishedDate = $(publishedDateValueSelector) const publishedDate = $(publishedDateValueSelector)
.text() .text()
@@ -268,11 +298,15 @@ class OlxCrawler {
throw { message: "Invalid published date ! Check parsing format" }; throw { message: "Invalid published date ! Check parsing format" };
} }
const renewedDate = $(renewedDateValueSelector) const renewedDate = $(renewedDateFullValueSelector)
.text() .data("content")
.trim(); .trim();
const renewedDateMoment = this.parseRenewedDate(renewedDate); const renewedDateMoment = moment.tz(
renewedDate,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!renewedDateMoment) { if (!renewedDateMoment) {
throw { throw {
@@ -416,6 +450,8 @@ class OlxCrawler {
return AD_CATEGORY.CATEGORY_HOUSE; return AD_CATEGORY.CATEGORY_HOUSE;
case "Poslovni prostori": case "Poslovni prostori":
return AD_CATEGORY.CATEGORY_OFFICE; return AD_CATEGORY.CATEGORY_OFFICE;
case "Apartmani":
return AD_CATEGORY.CATEGORY_APARTMENT;
default: default:
return undefined; return undefined;
} }
@@ -459,34 +495,36 @@ class OlxCrawler {
return currentMoment.add(-1, "month"); return currentMoment.add(-1, "month");
} }
const dayVariations = ["dan", "dana"];
for (const dayVariation of dayVariations) {
if (renewedDateText.includes(dayVariation)) {
// format for this case should be "Prije N dana" or "Prije N dan"
const dateParts = renewedDateText.split(" ");
if (dateParts[0] === "Prije") {
const numberOfDays = parseInt(dateParts[1]);
return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
}
}
}
if (renewedDateText.includes("Jučer")) { if (renewedDateText.includes("Jučer")) {
return currentMoment.add(-1, "day"); return currentMoment.add(-1, "day");
} }
const todayVariations = [ if (renewedDateText.includes("Prije sat")) {
"sat", return currentMoment.add(-1, "hour");
"sati", }
"sata",
"min", if (renewedDateText.includes("dan")) {
"sekunde", // format for this case should be "Prije N dana" or "Prije N dan"
"sekundi", const dateParts = renewedDateText.split(" ");
"sekundu", if (dateParts[0] === "Prije") {
"maloprije" const numberOfDays = parseInt(dateParts[1]);
]; return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
}
}
if (renewedDateText.includes("sat")) {
const dateParts = renewedDateText.split(" ");
const parsedHours =
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
if (!parsedHours) {
return undefined;
}
return currentMoment.add(-1 * parsedHours, "hours");
}
const todayVariations = ["min", "sekund", "maloprije"];
for (const todayVariation of todayVariations) { for (const todayVariation of todayVariations) {
if (renewedDateText.includes(todayVariation)) { if (renewedDateText.includes(todayVariation)) {
return currentMoment; return currentMoment;
@@ -506,12 +544,16 @@ class OlxCrawler {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }
async saveCrawledResults(results, maxAge) { async saveCrawledResults(results) {
const savers = this.savers; const savers = this.savers;
for (const saver of savers) { // for (const saver of savers) {
await saver.save(results, maxAge); // await saver.save(results);
} // }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
} }
} }

View File

@@ -1,7 +1,8 @@
"use strict"; "use strict";
const db = require("../../models/index"); const db = require("../../models/index");
const sequelize = require("sequelize");
const bulkUpsertRealEstates = async (realEstateData, maxAge) => { const bulkUpsertRealEstates = async realEstateData => {
try { try {
const fieldsToUpdateIfDuplicate = [ const fieldsToUpdateIfDuplicate = [
"realEstateType", "realEstateType",
@@ -23,16 +24,42 @@ const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
"longDescription", "longDescription",
"gardenSize", "gardenSize",
"adStatus", "adStatus",
"updatedAt" "updatedAt",
"renewedDate"
]; ];
return await db.RealEstate.bulkCreate(realEstateData, { return await db.RealEstate.bulkCreate(realEstateData, {
updateOnDuplicate: fieldsToUpdateIfDuplicate updateOnDuplicate: fieldsToUpdateIfDuplicate,
returning: true
}); });
} catch (e) { } catch (e) {
console.log("Error bulk upserting realEstates : ", e); console.log("Error bulk upserting realEstates : ", e);
} }
}; };
module.exports = { const checkIfAlreadyExist = async realEstateData => {
bulkUpsertRealEstates const orQueryPart = [];
for (const realEstate of realEstateData) {
const { agencyObjectId, originAgencyName } = realEstate;
const singleRealEstateQueryPart = {
agencyObjectId,
originAgencyName
};
orQueryPart.push(singleRealEstateQueryPart);
}
const query = {
[sequelize.Op.or]: orQueryPart
};
const result = await db.RealEstate.count({ where: query });
return result > 0;
};
module.exports = {
bulkUpsertRealEstates,
checkIfAlreadyExist
}; };