stop crawling when existing, non-renewed ad is found

This commit is contained in:
Bilal Catic
2019-09-24 23:23:09 +02:00
parent 746732f30b
commit 90bc57edb6
4 changed files with 155 additions and 61 deletions

View File

@@ -18,7 +18,7 @@ const crawlers = [
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_MAX_AGE
OLX_CONFIG.OLX_IGNORED_USERNAMES
)
];

View File

@@ -1,4 +1,9 @@
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
const moment = require("moment");
const {
bulkUpsertRealEstates,
checkIfAlreadyExist
} = require("../../helpers/db/realEstate");
class PostgresSaver {
connect() {
@@ -7,9 +12,29 @@ class PostgresSaver {
return true;
}
async save(results, maxAge) {
async save(results) {
console.log("[POSTGRES] Saving...");
await bulkUpsertRealEstates(results, maxAge);
const resultsWithPublishedAndRenewedDateSame = results.filter(
realEstate => {
const { publishedDate, renewedDate } = realEstate;
const publishedMomentDate = moment.utc(publishedDate);
const renewedMomentDate = moment.utc(renewedDate);
return publishedMomentDate.isSame(renewedMomentDate, "minute");
}
);
const exist =
resultsWithPublishedAndRenewedDateSame.length > 0
? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame)
: false;
const savedRecords = await bulkUpsertRealEstates(results);
return {
exist,
savedRecords
};
}
close() {

View File

@@ -8,7 +8,6 @@ const moment = require("moment-timezone");
const {
AD_TYPE,
AD_CATEGORY,
IGNORED_USERNAMES,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
@@ -45,7 +44,7 @@ class OlxCrawler {
],
maxPages = 1000,
maxResultsPerPage = 100,
maxAge = 30
ignoredUsernames = []
) {
this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -53,13 +52,16 @@ class OlxCrawler {
this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages;
this.maxResultsPerPage = maxResultsPerPage;
this.maxAge = maxAge;
this.ignoredUsernames = ignoredUsernames;
}
async crawl() {
console.log("[OLX] Crawler started");
const crawlAdCategories = this.crawlerAdCategories;
const savedRealEstates = [];
const asyncSaveActions = [];
if (crawlAdCategories) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
@@ -77,11 +79,33 @@ class OlxCrawler {
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
this.saveCrawledResults(singlePageResult, this.maxAge)
.then(numberOfSaved => {})
const savePromise = this.saveCrawledResults(singlePageResult)
.then(({ exist, savedRecords }) => {
if (exist) {
indexGenerators.splice(index, 1);
if (indexGenerators.length === 0) {
done = true;
}
}
for (const savedRecord of savedRecords) {
const { createdAt, updatedAt } = savedRecord;
console.log("Comparing ", createdAt, " <> ", updatedAt);
const createdAtMoment = moment.utc(createdAt);
const updatedAtMoment = moment.utc(updatedAt);
if (createdAtMoment.isSame(updatedAtMoment, "second")) {
console.log("\tEqual !");
savedRealEstates.push(savedRecord);
}
}
})
.catch(error =>
console.log("[POSTGRES Saver] Error saving results : ", error)
);
asyncSaveActions.push(savePromise);
} else {
//Generator returned undefined, no more pages
indexGenerators.splice(index, 1);
@@ -92,11 +116,13 @@ class OlxCrawler {
}
});
await this.sleep(500);
await this.sleep(5000);
}
}
console.log("[OLX] Waiting for async save actions ...");
await Promise.all(asyncSaveActions);
console.log("[OLX] Crawler finished");
return savedRealEstates;
}
async *categoryIndexer(adCategory) {
@@ -111,7 +137,6 @@ class OlxCrawler {
urlPageToCrawl,
this.maxResultsPerPage
);
console.log("indexing ", adCategory, " page : ", pageToIndex);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
@@ -135,7 +160,6 @@ class OlxCrawler {
const body = await res.text();
const $ = cheerio.load(body);
let hrefs = [];
const singlePageResults = [];
$("#rezultatipretrage")
.find(".listitem")
@@ -158,7 +182,8 @@ class OlxCrawler {
}
const scrapedData = await Promise.all(asyncScraping);
return scrapedData;
const filteredScrapedData = scrapedData.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) {
console.error("Exception caught:" + e);
return [];
@@ -166,29 +191,34 @@ class OlxCrawler {
}
async scrapeAd(url) {
console.log("Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
let status = AD_STATUS.STATUS_NORMAL;
const username = $(
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
)
const propertySelectors = {
username:
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
title: "#naslovartikla",
descriptions: ".artikal_detaljniopis_tekst",
category:
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
};
const username = $(propertySelectors.username)
.text()
.trim();
if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
if (this.ignoredUsernames.includes((username || "").toLowerCase())) {
return null;
}
const title = $("#naslovartikla")
const title = $(propertySelectors.title)
.text()
.trim();
const descriptions = $(".artikal_detaljniopis_tekst");
const category = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
)
const descriptions = $(propertySelectors.descriptions);
const category = $(propertySelectors.category)
.text()
.trim();
@@ -252,7 +282,7 @@ class OlxCrawler {
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`;
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
const publishedDate = $(publishedDateValueSelector)
.text()
@@ -268,11 +298,15 @@ class OlxCrawler {
throw { message: "Invalid published date ! Check parsing format" };
}
const renewedDate = $(renewedDateValueSelector)
.text()
const renewedDate = $(renewedDateFullValueSelector)
.data("content")
.trim();
const renewedDateMoment = this.parseRenewedDate(renewedDate);
const renewedDateMoment = moment.tz(
renewedDate,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!renewedDateMoment) {
throw {
@@ -416,6 +450,8 @@ class OlxCrawler {
return AD_CATEGORY.CATEGORY_HOUSE;
case "Poslovni prostori":
return AD_CATEGORY.CATEGORY_OFFICE;
case "Apartmani":
return AD_CATEGORY.CATEGORY_APARTMENT;
default:
return undefined;
}
@@ -459,34 +495,36 @@ class OlxCrawler {
return currentMoment.add(-1, "month");
}
const dayVariations = ["dan", "dana"];
for (const dayVariation of dayVariations) {
if (renewedDateText.includes(dayVariation)) {
// format for this case should be "Prije N dana" or "Prije N dan"
const dateParts = renewedDateText.split(" ");
if (dateParts[0] === "Prije") {
const numberOfDays = parseInt(dateParts[1]);
return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
}
}
}
if (renewedDateText.includes("Jučer")) {
return currentMoment.add(-1, "day");
}
const todayVariations = [
"sat",
"sati",
"sata",
"min",
"sekunde",
"sekundi",
"sekundu",
"maloprije"
];
if (renewedDateText.includes("Prije sat")) {
return currentMoment.add(-1, "hour");
}
if (renewedDateText.includes("dan")) {
// format for this case should be "Prije N dana" or "Prije N dan"
const dateParts = renewedDateText.split(" ");
if (dateParts[0] === "Prije") {
const numberOfDays = parseInt(dateParts[1]);
return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
}
}
if (renewedDateText.includes("sat")) {
const dateParts = renewedDateText.split(" ");
const parsedHours =
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
if (!parsedHours) {
return undefined;
}
return currentMoment.add(-1 * parsedHours, "hours");
}
const todayVariations = ["min", "sekund", "maloprije"];
for (const todayVariation of todayVariations) {
if (renewedDateText.includes(todayVariation)) {
return currentMoment;
@@ -506,12 +544,16 @@ class OlxCrawler {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results, maxAge) {
async saveCrawledResults(results) {
const savers = this.savers;
for (const saver of savers) {
await saver.save(results, maxAge);
}
// for (const saver of savers) {
// await saver.save(results);
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}

View File

@@ -1,7 +1,8 @@
"use strict";
const db = require("../../models/index");
const sequelize = require("sequelize");
const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
const bulkUpsertRealEstates = async realEstateData => {
try {
const fieldsToUpdateIfDuplicate = [
"realEstateType",
@@ -23,16 +24,42 @@ const bulkUpsertRealEstates = async (realEstateData, maxAge) => {
"longDescription",
"gardenSize",
"adStatus",
"updatedAt"
"updatedAt",
"renewedDate"
];
return await db.RealEstate.bulkCreate(realEstateData, {
updateOnDuplicate: fieldsToUpdateIfDuplicate
updateOnDuplicate: fieldsToUpdateIfDuplicate,
returning: true
});
} catch (e) {
console.log("Error bulk upserting realEstates : ", e);
}
};
module.exports = {
bulkUpsertRealEstates
/**
 * Checks whether at least one of the given real estates is already stored.
 *
 * Each record is matched on its (agencyObjectId, originAgencyName) pair; the
 * pairs of all given records are OR-ed together into a single COUNT query.
 *
 * @param {Iterable<Object>} realEstateData - Records to look up. Only
 *   `agencyObjectId` and `originAgencyName` are read from each record.
 * @returns {Promise<boolean>} `true` when the count of matching rows is
 *   greater than zero; `false` otherwise, including when there is nothing
 *   to look up (missing or empty input).
 */
const checkIfAlreadyExist = async realEstateData => {
  // Missing input means there is nothing that could already exist.
  if (realEstateData == null) {
    return false;
  }

  // Keep only the two identifying fields of every record.
  const orQueryPart = Array.from(
    realEstateData,
    ({ agencyObjectId, originAgencyName }) => ({
      agencyObjectId,
      originAgencyName
    })
  );

  // An empty OR list cannot match anything, so skip the database round trip.
  if (orQueryPart.length === 0) {
    return false;
  }

  const query = {
    [sequelize.Op.or]: orQueryPart
  };
  const result = await db.RealEstate.count({ where: query });
  return result > 0;
};
// Public API of this module: the bulk upsert helper and the existence check.
module.exports = {
bulkUpsertRealEstates,
checkIfAlreadyExist
};