From 90bc57edb6ac84ff7c36d8ffd2b89b81eb6eeaa6 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Tue, 24 Sep 2019 23:23:09 +0200 Subject: [PATCH] stop crawling when existing, non-renewed ad is found --- app/crawler/crawl.js | 2 +- app/crawler/savers/postgres.js | 31 ++++++- app/crawler/specific/olx.js | 146 +++++++++++++++++++++------------ app/helpers/db/realEstate.js | 37 +++++++-- 4 files changed, 155 insertions(+), 61 deletions(-) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 2ff5118..d37cb9c 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -18,7 +18,7 @@ const crawlers = [ OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, OLX_CONFIG.OLX_MAX_PAGES, OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, - OLX_CONFIG.OLX_MAX_AGE + OLX_CONFIG.OLX_IGNORED_USERNAMES ) ]; diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js index 4664be7..28fc635 100644 --- a/app/crawler/savers/postgres.js +++ b/app/crawler/savers/postgres.js @@ -1,4 +1,9 @@ -const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate"); +const moment = require("moment"); + +const { + bulkUpsertRealEstates, + checkIfAlreadyExist +} = require("../../helpers/db/realEstate"); class PostgresSaver { connect() { @@ -7,9 +12,29 @@ class PostgresSaver { return true; } - async save(results, maxAge) { + async save(results) { console.log("[POSTGRES] Saving..."); - await bulkUpsertRealEstates(results, maxAge); + const resultsWithPublishedAndRenewedDateSame = results.filter( + realEstate => { + const { publishedDate, renewedDate } = realEstate; + + const publishedMomentDate = moment.utc(publishedDate); + const renewedMomentDate = moment.utc(renewedDate); + + return publishedMomentDate.isSame(renewedMomentDate, "minute"); + } + ); + + const exist = + resultsWithPublishedAndRenewedDateSame.length > 0 + ? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame) + : false; + const savedRecords = await bulkUpsertRealEstates(results); + + return { + exist, + savedRecords + }; } close() { diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 34a949a..1f6ea1c 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -8,7 +8,6 @@ const moment = require("moment-timezone"); const { AD_TYPE, AD_CATEGORY, - IGNORED_USERNAMES, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE @@ -45,7 +44,7 @@ class OlxCrawler { ], maxPages = 1000, maxResultsPerPage = 100, - maxAge = 30 + ignoredUsernames = [] ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; @@ -53,13 +52,16 @@ class OlxCrawler { this.crawlerAdCategories = crawlerAdCategories; this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; - this.maxAge = maxAge; + this.ignoredUsernames = ignoredUsernames; } async crawl() { console.log("[OLX] Crawler started"); const crawlAdCategories = this.crawlerAdCategories; + const savedRealEstates = []; + const asyncSaveActions = []; + if (crawlAdCategories) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { @@ -77,11 +79,33 @@ class OlxCrawler { const entries = singlePageResults.entries(); for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { - this.saveCrawledResults(singlePageResult, this.maxAge) - .then(numberOfSaved => {}) + const savePromise = this.saveCrawledResults(singlePageResult) + .then(({ exist, savedRecords }) => { + if (exist) { + indexGenerators.splice(index, 1); + if (indexGenerators.length === 0) { + done = true; + } + } + + for (const savedRecord of savedRecords) { + const { createdAt, updatedAt } = savedRecord; + + console.log("Comparing ", createdAt, " <> ", updatedAt); + + const createdAtMoment = moment.utc(createdAt); + const updatedAtMoment = moment.utc(updatedAt); + + if (createdAtMoment.isSame(updatedAtMoment, "second")) { + console.log("\tEqual !"); + savedRealEstates.push(savedRecord); + } + } + }) .catch(error => console.log("[POSTGRES Saver] Error saving results : ", error) ); + asyncSaveActions.push(savePromise); } else { //Generator returned undefined, no more pages indexGenerators.splice(index, 1); @@ -92,11 +116,13 @@ class OlxCrawler { } }); - await this.sleep(500); + await this.sleep(5000); } } - + console.log("[OLX] Waiting for async save actions ..."); + await Promise.all(asyncSaveActions); console.log("[OLX] Crawler finished"); + return savedRealEstates; } async *categoryIndexer(adCategory) { @@ -111,7 +137,6 @@ class OlxCrawler { urlPageToCrawl, this.maxResultsPerPage ); - console.log("indexing ", adCategory, " page : ", pageToIndex); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; @@ -135,7 +160,6 @@ class OlxCrawler { const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; - const singlePageResults = []; $("#rezultatipretrage") .find(".listitem") @@ -158,7 +182,8 @@ class OlxCrawler { } const scrapedData = await Promise.all(asyncScraping); - return scrapedData; + const filteredScrapedData = scrapedData.filter(adData => !!adData); + return filteredScrapedData; } catch (e) { console.error("Exception caught:" + e); return []; @@ -166,29 +191,34 @@ class OlxCrawler { } async scrapeAd(url) { + console.log("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; - const username = $( - "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" - ) + const propertySelectors = { + username: + "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", + title: "#naslovartikla", + descriptions: ".artikal_detaljniopis_tekst", + category: + "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" + }; + + const username = $(propertySelectors.username) .text() .trim(); - - if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { + if (this.ignoredUsernames.includes((username || "").toLowerCase())) { return null; } - const title = $("#naslovartikla") + const title = $(propertySelectors.title) .text() .trim(); - const descriptions = $(".artikal_detaljniopis_tekst"); - const category = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" - ) + const descriptions = $(propertySelectors.descriptions); + const category = $(propertySelectors.category) .text() .trim(); @@ -252,7 +282,7 @@ class OlxCrawler { const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; - const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`; + const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) .text() @@ -268,11 +298,15 @@ class OlxCrawler { throw { message: "Invalid published date ! Check parsing format" }; } - const renewedDate = $(renewedDateValueSelector) - .text() + const renewedDate = $(renewedDateFullValueSelector) + .data("content") .trim(); - const renewedDateMoment = this.parseRenewedDate(renewedDate); + const renewedDateMoment = moment.tz( + renewedDate, + OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); if (!renewedDateMoment) { throw { @@ -416,6 +450,8 @@ class OlxCrawler { return AD_CATEGORY.CATEGORY_HOUSE; case "Poslovni prostori": return AD_CATEGORY.CATEGORY_OFFICE; + case "Apartmani": + return AD_CATEGORY.CATEGORY_APARTMENT; default: return undefined; } @@ -459,34 +495,36 @@ class OlxCrawler { return currentMoment.add(-1, "month"); } - const dayVariations = ["dan", "dana"]; - for (const dayVariation of dayVariations) { - if (renewedDateText.includes(dayVariation)) { - // format for this case should be "Prije N dana" or "Prije N dan" - const dateParts = renewedDateText.split(" "); - if (dateParts[0] === "Prije") { - const numberOfDays = parseInt(dateParts[1]); - return currentMoment.add(-1 * numberOfDays, "days"); - } else { - return undefined; - } - } - } - if (renewedDateText.includes("Jučer")) { return currentMoment.add(-1, "day"); } - const todayVariations = [ - "sat", - "sati", - "sata", - "min", - "sekunde", - "sekundi", - "sekundu", - "maloprije" - ]; + if (renewedDateText.includes("Prije sat")) { + return currentMoment.add(-1, "hour"); + } + + if (renewedDateText.includes("dan")) { + // format for this case should be "Prije N dana" or "Prije N dan" + const dateParts = renewedDateText.split(" "); + if (dateParts[0] === "Prije") { + const numberOfDays = parseInt(dateParts[1]); + return currentMoment.add(-1 * numberOfDays, "days"); + } else { + return undefined; + } + } + + if (renewedDateText.includes("sat")) { + const dateParts = renewedDateText.split(" "); + const parsedHours = + dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined; + if (!parsedHours) { + return undefined; + } + return currentMoment.add(-1 * parsedHours, "hours"); + } + + const todayVariations = ["min", "sekund", "maloprije"]; for (const todayVariation of todayVariations) { if (renewedDateText.includes(todayVariation)) { return currentMoment; @@ -506,12 +544,16 @@ class OlxCrawler { return new Promise(resolve => setTimeout(resolve, ms)); } - async saveCrawledResults(results, maxAge) { + async saveCrawledResults(results) { const savers = this.savers; - for (const saver of savers) { - await saver.save(results, maxAge); - } + // for (const saver of savers) { + // await saver.save(results); + // } + + //For now, we use only Postgres saver, so ... + return await savers[0].save(results); + //so that we can use some sequelize options and information when data is inserted } } diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index aea9d29..967478a 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -1,7 +1,8 @@ "use strict"; const db = require("../../models/index"); +const sequelize = require("sequelize"); -const bulkUpsertRealEstates = async (realEstateData, maxAge) => { +const bulkUpsertRealEstates = async realEstateData => { try { const fieldsToUpdateIfDuplicate = [ "realEstateType", @@ -23,16 +24,42 @@ const bulkUpsertRealEstates = async (realEstateData, maxAge) => { "longDescription", "gardenSize", "adStatus", - "updatedAt" + "updatedAt", + "renewedDate" ]; + return await db.RealEstate.bulkCreate(realEstateData, { - updateOnDuplicate: fieldsToUpdateIfDuplicate + updateOnDuplicate: fieldsToUpdateIfDuplicate, + returning: true }); } catch (e) { console.log("Error bulk upserting realEstates : ", e); } }; -module.exports = { - bulkUpsertRealEstates +const checkIfAlreadyExist = async realEstateData => { + const orQueryPart = []; + + for (const realEstate of realEstateData) { + const { agencyObjectId, originAgencyName } = realEstate; + + const singleRealEstateQueryPart = { + agencyObjectId, + originAgencyName + }; + + orQueryPart.push(singleRealEstateQueryPart); + } + + const query = { + [sequelize.Op.or]: orQueryPart + }; + + const result = await db.RealEstate.count({ where: query }); + return result > 0; +}; + +module.exports = { + bulkUpsertRealEstates, + checkIfAlreadyExist };