From c4f6c6e1c3d13868ea3fdb0d404ed0c30ae6fb51 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Sat, 21 Sep 2019 15:45:48 +0200 Subject: [PATCH 01/12] construct crawling url before indexing single page --- app/crawler/specific/olx.js | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 0bf0a35..9723c74 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -78,9 +78,9 @@ class OlxCrawler { const maxResultsPerPage = this.maxResults; for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { + const pageUrl = `${url}&stranica=${pageNumber}`; const singlePageResults = await this.indexSinglePage( - url, - pageNumber, + pageUrl, maxResultsPerPage ); await this.saveCrawledResults(singlePageResults); @@ -88,10 +88,8 @@ class OlxCrawler { } } - async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) { + async indexSinglePage(url, maxResultsPerPage) { try { - const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`; - const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); @@ -114,8 +112,6 @@ class OlxCrawler { hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; for (let i = 0; i < actualNoOfResults; i++) { - console.log(`Scraping : ${hrefs[i]}`); - const adData = await this.scrapeAd(hrefs[i]); if (adData) { From 3140fdf0c04beab3e4073636d99fa972315b09c3 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 23 Sep 2019 10:46:31 +0200 Subject: [PATCH 02/12] use function generator to index pages; crawl in parallel --- app/crawler/crawl.js | 8 +- app/crawler/crawlerConfig.js | 14 +-- app/crawler/savers/postgres.js | 4 +- app/crawler/specific/olx.js | 172 +++++++++++++++++++++------------ app/helpers/db/realEstate.js | 2 +- development.env | 4 +- 6 files changed, 127 insertions(+), 77 deletions(-) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 0f9dcc9..2ff5118 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -13,12 +13,12 @@ const PostgresSaver = require("./savers/postgres"); const crawlers = [ new OlxCrawler( - OLX_CONFIG.OLX_START_PAGE, - OLX_CONFIG.OLX_END_PAGE, - OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, [new PostgresSaver()], OLX_CONFIG.OLX_CRAWLER_AD_TYPE, - OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES + OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, + OLX_CONFIG.OLX_MAX_PAGES, + OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, + OLX_CONFIG.OLX_MAX_AGE ) ]; diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index d524fce..a602ad8 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -2,29 +2,29 @@ require("dotenv").config({ path: "../../.env" }); const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); -const crawlerAdType = +const olxCrawlerAdType = process.env.OLX_CRAWLER_AD_TYPE !== undefined ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] : null; -const parsedCrawlerAdCategories = +const olxParsedCrawlerAdCategories = process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => category.trim() ) : ["CATEGORY_FLAT", "CATEGORY_HOUSE"]; -const transformedCrawlerAdCategories = parsedCrawlerAdCategories +const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories .map(categoryName => AD_CATEGORY[categoryName]) .filter(category => !!category); const OLX_CONFIG = { - OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1, - OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10, + OLX_MAX_PAGES: parseInt(process.env.MAX_PAGES) || 500, OLX_MAX_RESULTS_PER_PAGE: parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, - OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE, - OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories + OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, + OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories, + OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE) || 30 }; module.exports = { diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js index 5aa0c3a..4664be7 100644 --- a/app/crawler/savers/postgres.js +++ b/app/crawler/savers/postgres.js @@ -7,9 +7,9 @@ class PostgresSaver { return true; } - async save(results) { + async save(results, maxAge) { console.log("[POSTGRES] Saving..."); - await bulkUpsertRealEstates(results); + await bulkUpsertRealEstates(results, maxAge); } close() { diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 9723c74..37af9e6 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -1,7 +1,8 @@ "use strict"; -let fetch = require("node-fetch"); -let cheerio = require("cheerio"); +const fetch = require("node-fetch"); +const cheerio = require("cheerio"); +const Promise = require("bluebird"); const { AD_TYPE, @@ -13,78 +14,113 @@ const { } = require("../../common/enums"); const OLX_ENUMS = { - OLX_AD_TYPE: {}, - OLX_AD_CATEGORY: {}, + OLX_AD_TYPE: { + [CRAWLER_AD_TYPE.ALL]: "", + [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", + [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje" + }, + OLX_AD_CATEGORY: { + [AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23", + [AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24", + [AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29", + [AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25", + [AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27", + [AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30" + }, MAX_DETAIL_FIELDS: 30 }; -OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = ""; -OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja"; -OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje"; - -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30"; - class OlxCrawler { constructor( - fromPage = 1, - toPage = 10, - maxResults = 1000, savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [ AD_CATEGORY.CATEGORY_FLAT, AD_CATEGORY.CATEGORY_HOUSE - ] + ], + maxPages = 1000, + maxResultsPerPage = 100, + maxAge = 30 ) { - this.fromPage = fromPage; - this.toPage = toPage; - this.maxResults = maxResults; this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; + this.maxPages = maxPages; + this.maxResultsPerPage = maxResultsPerPage; + this.maxAge = maxAge; } async crawl() { console.log("[OLX] Crawler started"); - const crawlAdTypes = this.crawlerAdTypes; const crawlAdCategories = this.crawlerAdCategories; - const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`; - - if (crawlAdCategories && crawlAdTypes) { - const asyncPagesIndexingByCategory = []; + if (crawlAdCategories) { + const indexGenerators = []; for (const adCategory of crawlAdCategories) { - asyncPagesIndexingByCategory.push( - this.indexPages( - `${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}` - ) - ); + indexGenerators.push(this.categoryIndexer(adCategory)); } - await Promise.all(asyncPagesIndexingByCategory); + let done = false; + while (!done) { + const categoryIndexerPromises = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + } + + Promise.all(categoryIndexerPromises).then(singlePageResults => { + const entries = singlePageResults.entries(); + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + this.saveCrawledResults(singlePageResult, this.maxAge) + .then(numberOfSaved => {}) + .catch(error => + console.log("[POSTGRES Saver] Error saving results : ", error) + ); + } else { + //Generator returned undefined, no more pages + indexGenerators.splice(index, 1); + if (indexGenerators.length === 0) { + done = true; + } + } + } + }); + + await this.sleep(500); + } } + console.log("[OLX] Crawler finished"); } - async indexPages(url) { - const startPage = this.fromPage; - const endPage = this.toPage; - const maxResultsPerPage = this.maxResults; + async *categoryIndexer(adCategory) { + let pageToIndex = 1; - for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { - const pageUrl = `${url}&stranica=${pageNumber}`; - const singlePageResults = await this.indexSinglePage( - pageUrl, - maxResultsPerPage - ); - await this.saveCrawledResults(singlePageResults); - await this.sleep(5000); + const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; + if (urlAdTypePart && urlCategoryPart) { + while (true) { + const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; + const singlePageResults = await this.indexSinglePage( + urlPageToCrawl, + this.maxResultsPerPage + ); + console.log("indexing ", adCategory, " page : ", pageToIndex); + + if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { + yield singlePageResults; + } else { + return undefined; + } + + ++pageToIndex; + if (pageToIndex === this.maxPages) { + return undefined; + } + } + } else { + return undefined; } } @@ -111,18 +147,16 @@ class OlxCrawler { let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; + const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { - const adData = await this.scrapeAd(hrefs[i]); - - if (adData) { - singlePageResults.push(adData); - } - await this.sleep(500); + asyncScraping.push(this.scrapeAd(hrefs[i])); } - return singlePageResults; + const scrapedData = await Promise.all(asyncScraping); + return scrapedData; } catch (e) { console.error("Exception caught:" + e); + return []; } } @@ -135,24 +169,32 @@ class OlxCrawler { const username = $( "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" - ).text(); + ) + .text() + .trim(); if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { return null; } - const title = $("#naslovartikla").text(); + const title = $("#naslovartikla") + .text() + .trim(); const descriptions = $(".artikal_detaljniopis_tekst"); const category = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" - ).text(); + ) + .text() + .trim(); //====== PRICE DETECTION AND EXTRACTION ===== let price = null; const normalPriceValue = $("#pc > p:nth-child(2)").text(); const urgentPriceValue = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" - ).text(); + ) + .text() + .trim(); if (normalPriceValue && normalPriceValue.length > 0) { price = normalPriceValue; @@ -258,7 +300,9 @@ class OlxCrawler { const time = $("time").attr("datetime"); const numberOfViews = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" - ).text(); + ) + .text() + .trim(); //=========================================== //========================================= @@ -296,8 +340,14 @@ class OlxCrawler { price: parsedPrice, area: parsedArea, gardenSize: parsedGardenSize, - shortDescription: descriptions.first().text(), - longDescription: descriptions.last().text(), + shortDescription: descriptions + .first() + .text() + .trim(), + longDescription: descriptions + .last() + .text() + .trim(), streetNumber: 0, streetName: "", locality: "", @@ -370,11 +420,11 @@ class OlxCrawler { return new Promise(resolve => setTimeout(resolve, ms)); } - async saveCrawledResults(results) { + async saveCrawledResults(results, maxAge) { const savers = this.savers; for (const saver of savers) { - await saver.save(results); + await saver.save(results, maxAge); } } } diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index 3f32b7c..aea9d29 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -1,7 +1,7 @@ "use strict"; const db = require("../../models/index"); -const bulkUpsertRealEstates = async realEstateData => { +const bulkUpsertRealEstates = async (realEstateData, maxAge) => { try { const fieldsToUpdateIfDuplicate = [ "realEstateType", diff --git a/development.env b/development.env index f998e9d..e96b363 100644 --- a/development.env +++ b/development.env @@ -16,8 +16,8 @@ SOURCE_EMAIL=info@saburly.com #=============== CRAWLER SETTINGS===============# #==OLX== -OLX_START_PAGE=Crawler starts from this page -OLX_END_PAGE=Crawler ends with this page (including this page) +OLX_MAX_PAGES=Restrict crawler to this number of pages OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values +OLX_MAX_AGE=[in days] if ad is crawled before this number of days, it will be re-crawled From 18db554ea8d1a28193733f735cef280233180a5e Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 23 Sep 2019 19:02:00 +0200 Subject: [PATCH 03/12] add published and renewed date columns to the RealEstates table --- ...-published-renewed-dates-to-realEstates.js | 21 +++++++++++++++++++ app/models/realEstate.js | 4 +++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 app/migrations/20190923185802-add-published-renewed-dates-to-realEstates.js diff --git a/app/migrations/20190923185802-add-published-renewed-dates-to-realEstates.js b/app/migrations/20190923185802-add-published-renewed-dates-to-realEstates.js new file mode 100644 index 0000000..88ae358 --- /dev/null +++ b/app/migrations/20190923185802-add-published-renewed-dates-to-realEstates.js @@ -0,0 +1,21 @@ +"use strict"; + +module.exports = { + up: (queryInterface, Sequelize) => { + return Promise.all([ + queryInterface.addColumn("RealEstates", "publishedDate", { + type: Sequelize.DATE + }), + queryInterface.addColumn("RealEstates", "renewedDate", { + type: Sequelize.DATE + }) + ]); + }, + + down: (queryInterface, Sequelize) => { + return Promise.all([ + queryInterface.removeColumn("RealEstates", "renewedDate"), + queryInterface.removeColumn("RealEstates", "publishedDate") + ]); + } +}; diff --git a/app/models/realEstate.js b/app/models/realEstate.js index 72b76ec..8a93892 100644 --- a/app/models/realEstate.js +++ b/app/models/realEstate.js @@ -50,7 +50,9 @@ module.exports = (sequelize, DataTypes) => { title: DataTypes.TEXT, shortDescription: DataTypes.TEXT, longDescription: DataTypes.TEXT, - adStatus: DataTypes.INTEGER + adStatus: DataTypes.INTEGER, + publishedDate: DataTypes.DATE, + renewedDate: DataTypes.DATE }); RealEstate.associate = models => { From c7184be5fc80010a1b3ae91fc88bee0142559c9d Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 23 Sep 2019 21:18:48 +0200 Subject: [PATCH 04/12] install moment and moment-timezone packages --- package-lock.json | 6 +++--- package.json | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index cac150b..ad66538 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2697,9 +2697,9 @@ "integrity": "sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg==" }, "moment-timezone": { - "version": "0.5.25", - "resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.25.tgz", - "integrity": "sha512-DgEaTyN/z0HFaVcVbSyVCUU6HeFdnNC3vE4c9cgu2dgMTvjBUBdBzWfasTBmAW45u5OIMeCJtU8yNjM22DHucw==", + "version": "0.5.26", + "resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz", + "integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==", "requires": { "moment": ">= 2.9.0" } diff --git a/package.json b/package.json index a1447c3..f55a068 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,8 @@ "express": "^4.16.4", "express-ejs-layouts": "^2.5.0", "express-layout": "^0.1.0", + "moment": "^2.24.0", + "moment-timezone": "^0.5.26", "node-fetch": "^2.3.0", "node-schedule": "^1.3.2", "pg": "^7.10.0", From 63eb64b0f6a9c16ea65541ddcebdffc2c058d16e Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 23 Sep 2019 21:19:28 +0200 Subject: [PATCH 05/12] parse and save published and renewed dates --- app/config/appConfig.js | 5 ++- app/crawler/specific/olx.js | 90 ++++++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 3 deletions(-) diff --git a/app/config/appConfig.js b/app/config/appConfig.js index b4144cc..5b06652 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -6,7 +6,10 @@ const APP_URL = ? process.env.APP_URL || "http://market-alarm" : process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`; +const DEFAULT_TIMEZONE = "Europe/Sarajevo"; + module.exports = { APP_PORT, - APP_URL + APP_URL, + DEFAULT_TIMEZONE }; diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 37af9e6..34a949a 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -3,6 +3,7 @@ const fetch = require("node-fetch"); const cheerio = require("cheerio"); const Promise = require("bluebird"); +const moment = require("moment-timezone"); const { AD_TYPE, @@ -13,6 +14,8 @@ const { CRAWLER_AD_TYPE } = require("../../common/enums"); +const { DEFAULT_TIMEZONE } = require("../../config/appConfig"); + const OLX_ENUMS = { OLX_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "", @@ -27,7 +30,9 @@ const OLX_ENUMS = { [AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27", [AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30" }, - MAX_DETAIL_FIELDS: 30 + MAX_DETAIL_FIELDS: 30, + OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm", + OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; class OlxCrawler { @@ -246,6 +251,35 @@ class OlxCrawler { } const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; + const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; + const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`; + + const publishedDate = $(publishedDateValueSelector) + .text() + .trim(); + + const publishedDateMoment = moment.tz( + publishedDate, + OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + + if (!publishedDateMoment.isValid()) { + throw { message: "Invalid published date ! Check parsing format" }; + } + + const renewedDate = $(renewedDateValueSelector) + .text() + .trim(); + + const renewedDateMoment = this.parseRenewedDate(renewedDate); + + if (!renewedDateMoment) { + throw { + message: + "Invalid renewed date ! Check how parser parsed renewed date text" + }; + } adType = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` @@ -358,7 +392,9 @@ class OlxCrawler { country: "", locationLat, locationLong, - adStatus: status + adStatus: status, + publishedDate: publishedDateMoment.toISOString(), + renewedDate: renewedDateMoment.toISOString() }; return data; @@ -416,6 +452,56 @@ class OlxCrawler { return parseFloat(formattedPriceText); } + parseRenewedDate(renewedDateText) { + const currentMoment = moment.tz(DEFAULT_TIMEZONE); + + if (renewedDateText.includes("Prije mjesec dana")) { + return currentMoment.add(-1, "month"); + } + + const dayVariations = ["dan", "dana"]; + for (const dayVariation of dayVariations) { + if (renewedDateText.includes(dayVariation)) { + // format for this case should be "Prije N dana" or "Prije N dan" + const dateParts = renewedDateText.split(" "); + if (dateParts[0] === "Prije") { + const numberOfDays = parseInt(dateParts[1]); + return currentMoment.add(-1 * numberOfDays, "days"); + } else { + return undefined; + } + } + } + + if (renewedDateText.includes("Jučer")) { + return currentMoment.add(-1, "day"); + } + + const todayVariations = [ + "sat", + "sati", + "sata", + "min", + "sekunde", + "sekundi", + "sekundu", + "maloprije" + ]; + for (const todayVariation of todayVariations) { + if (renewedDateText.includes(todayVariation)) { + return currentMoment; + } + } + + const renewedDateMoment = moment.tz( + renewedDateText, + OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + + return renewedDateMoment.isValid() ? renewedDateMoment : undefined; + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } From 06d35fcb4b959c618b4c570046477fdcb817840c Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Tue, 24 Sep 2019 23:21:06 +0200 Subject: [PATCH 06/12] move ignored usernames config to crawler specific config --- app/common/enums.js | 2 -- app/crawler/crawlerConfig.js | 11 +++++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/app/common/enums.js b/app/common/enums.js index d047240..a23a6a0 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -12,8 +12,6 @@ const AD_CATEGORY = { CATEGORY_GARAGE: "GARAGE" }; -const IGNORED_USERNAMES = []; - const AD_STATUS = { STATUS_NORMAL: 1, STATUS_RESERVED: 2, diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index a602ad8..2b457bc 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -14,17 +14,24 @@ const olxParsedCrawlerAdCategories = ) : ["CATEGORY_FLAT", "CATEGORY_HOUSE"]; +const olxIgnoredUsernames = + process.env.OLX_IGNORED_USERNAMES !== undefined + ? process.env.OLX_IGNORED_USERNAMES.split(",").map(username => + username.trim() + ) + : []; + const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories .map(categoryName => AD_CATEGORY[categoryName]) .filter(category => !!category); const OLX_CONFIG = { - OLX_MAX_PAGES: parseInt(process.env.MAX_PAGES) || 500, + OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500, OLX_MAX_RESULTS_PER_PAGE: parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories, - OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE) || 30 + OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [] }; module.exports = { From 746732f30b3d185fca87612bc44ce96fd131b46c Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Tue, 24 Sep 2019 23:21:22 +0200 Subject: [PATCH 07/12] remove deleted column from RealEstate model --- app/models/realEstate.js | 4 ---- 1 file changed, 4 deletions(-) diff --git a/app/models/realEstate.js b/app/models/realEstate.js index 8a93892..48c85d2 100644 --- a/app/models/realEstate.js +++ b/app/models/realEstate.js @@ -43,10 +43,6 @@ module.exports = (sequelize, DataTypes) => { country: DataTypes.TEXT, locationLat: DataTypes.REAL, locationLong: DataTypes.REAL, - lastTimeCrawled: { - type: DataTypes.DATE, - allowNull: false - }, title: DataTypes.TEXT, shortDescription: DataTypes.TEXT, longDescription: DataTypes.TEXT, From 90bc57edb6ac84ff7c36d8ffd2b89b81eb6eeaa6 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Tue, 24 Sep 2019 23:23:09 +0200 Subject: [PATCH 08/12] stop crawling when existing, non-renewed ad is found --- app/crawler/crawl.js | 2 +- app/crawler/savers/postgres.js | 31 ++++++- app/crawler/specific/olx.js | 146 +++++++++++++++++++++------------ app/helpers/db/realEstate.js | 37 +++++++-- 4 files changed, 155 insertions(+), 61 deletions(-) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 2ff5118..d37cb9c 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -18,7 +18,7 @@ const crawlers = [ OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, OLX_CONFIG.OLX_MAX_PAGES, OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, - OLX_CONFIG.OLX_MAX_AGE + OLX_CONFIG.OLX_IGNORED_USERNAMES ) ]; diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js index 4664be7..28fc635 100644 --- a/app/crawler/savers/postgres.js +++ b/app/crawler/savers/postgres.js @@ -1,4 +1,9 @@ -const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate"); +const moment = require("moment"); + +const { + bulkUpsertRealEstates, + checkIfAlreadyExist +} = require("../../helpers/db/realEstate"); class PostgresSaver { connect() { @@ -7,9 +12,29 @@ class PostgresSaver { return true; } - async save(results, maxAge) { + async save(results) { console.log("[POSTGRES] Saving..."); - await bulkUpsertRealEstates(results, maxAge); + const resultsWithPublishedAndRenewedDateSame = results.filter( + realEstate => { + const { publishedDate, renewedDate } = realEstate; + + const publishedMomentDate = moment.utc(publishedDate); + const renewedMomentDate = moment.utc(renewedDate); + + return publishedMomentDate.isSame(renewedMomentDate, "minute"); + } + ); + + const exist = + resultsWithPublishedAndRenewedDateSame.length > 0 + ? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame) + : false; + const savedRecords = await bulkUpsertRealEstates(results); + + return { + exist, + savedRecords + }; } close() { diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 34a949a..1f6ea1c 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -8,7 +8,6 @@ const moment = require("moment-timezone"); const { AD_TYPE, AD_CATEGORY, - IGNORED_USERNAMES, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE @@ -45,7 +44,7 @@ class OlxCrawler { ], maxPages = 1000, maxResultsPerPage = 100, - maxAge = 30 + ignoredUsernames = [] ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; @@ -53,13 +52,16 @@ class OlxCrawler { this.crawlerAdCategories = crawlerAdCategories; this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; - this.maxAge = maxAge; + this.ignoredUsernames = ignoredUsernames; } async crawl() { console.log("[OLX] Crawler started"); const crawlAdCategories = this.crawlerAdCategories; + const savedRealEstates = []; + const asyncSaveActions = []; + if (crawlAdCategories) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { @@ -77,11 +79,33 @@ class OlxCrawler { const entries = singlePageResults.entries(); for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { - this.saveCrawledResults(singlePageResult, this.maxAge) - .then(numberOfSaved => {}) + const savePromise = this.saveCrawledResults(singlePageResult) + .then(({ exist, savedRecords }) => { + if (exist) { + indexGenerators.splice(index, 1); + if (indexGenerators.length === 0) { + done = true; + } + } + + for (const savedRecord of savedRecords) { + const { createdAt, updatedAt } = savedRecord; + + console.log("Comparing ", createdAt, " <> ", updatedAt); + + const createdAtMoment = moment.utc(createdAt); + const updatedAtMoment = moment.utc(updatedAt); + + if (createdAtMoment.isSame(updatedAtMoment, "second")) { + console.log("\tEqual !"); + savedRealEstates.push(savedRecord); + } + } + }) .catch(error => console.log("[POSTGRES Saver] Error saving results : ", error) ); + asyncSaveActions.push(savePromise); } else { //Generator returned undefined, no more pages indexGenerators.splice(index, 1); @@ -92,11 +116,13 @@ class OlxCrawler { } }); - await this.sleep(500); + await this.sleep(5000); } } - + console.log("[OLX] Waiting for async save actions ..."); + await Promise.all(asyncSaveActions); console.log("[OLX] Crawler finished"); + return savedRealEstates; } async *categoryIndexer(adCategory) { @@ -111,7 +137,6 @@ class OlxCrawler { urlPageToCrawl, this.maxResultsPerPage ); - console.log("indexing ", adCategory, " page : ", pageToIndex); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; @@ -135,7 +160,6 @@ class OlxCrawler { const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; - const singlePageResults = []; $("#rezultatipretrage") .find(".listitem") @@ -158,7 +182,8 @@ class OlxCrawler { } const scrapedData = await Promise.all(asyncScraping); - return scrapedData; + const filteredScrapedData = scrapedData.filter(adData => !!adData); + return filteredScrapedData; } catch (e) { console.error("Exception caught:" + e); return []; @@ -166,29 +191,34 @@ class OlxCrawler { } async scrapeAd(url) { + console.log("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; - const username = $( - "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" - ) + const propertySelectors = { + username: + "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", + title: "#naslovartikla", + descriptions: ".artikal_detaljniopis_tekst", + category: + "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" + }; + + const username = $(propertySelectors.username) .text() .trim(); - - if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { + if (this.ignoredUsernames.includes((username || "").toLowerCase())) { return null; } - const title = $("#naslovartikla") + const title = $(propertySelectors.title) .text() .trim(); - const descriptions = $(".artikal_detaljniopis_tekst"); - const category = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" - ) + const descriptions = $(propertySelectors.descriptions); + const category = $(propertySelectors.category) .text() .trim(); @@ -252,7 +282,7 @@ class OlxCrawler { const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; - const renewedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(5) > div.df2`; + const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) .text() @@ -268,11 +298,15 @@ class OlxCrawler { throw { message: "Invalid published date ! Check parsing format" }; } - const renewedDate = $(renewedDateValueSelector) - .text() + const renewedDate = $(renewedDateFullValueSelector) + .data("content") .trim(); - const renewedDateMoment = this.parseRenewedDate(renewedDate); + const renewedDateMoment = moment.tz( + renewedDate, + OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); if (!renewedDateMoment) { throw { @@ -416,6 +450,8 @@ class OlxCrawler { return AD_CATEGORY.CATEGORY_HOUSE; case "Poslovni prostori": return AD_CATEGORY.CATEGORY_OFFICE; + case "Apartmani": + return AD_CATEGORY.CATEGORY_APARTMENT; default: return undefined; } @@ -459,34 +495,36 @@ class OlxCrawler { return currentMoment.add(-1, "month"); } - const dayVariations = ["dan", "dana"]; - for (const dayVariation of dayVariations) { - if (renewedDateText.includes(dayVariation)) { - // format for this case should be "Prije N dana" or "Prije N dan" - const dateParts = renewedDateText.split(" "); - if (dateParts[0] === "Prije") { - const numberOfDays = parseInt(dateParts[1]); - return currentMoment.add(-1 * numberOfDays, "days"); - } else { - return undefined; - } - } - } - if (renewedDateText.includes("Jučer")) { return currentMoment.add(-1, "day"); } - const todayVariations = [ - "sat", - "sati", - "sata", - "min", - "sekunde", - "sekundi", - "sekundu", - "maloprije" - ]; + if (renewedDateText.includes("Prije sat")) { + return currentMoment.add(-1, "hour"); + } + + if (renewedDateText.includes("dan")) { + // format for this case should be "Prije N dana" or "Prije N dan" + const dateParts = renewedDateText.split(" "); + if (dateParts[0] === "Prije") { + const numberOfDays = parseInt(dateParts[1]); + return currentMoment.add(-1 * numberOfDays, "days"); + } else { + return undefined; + } + } + + if (renewedDateText.includes("sat")) { + const dateParts = renewedDateText.split(" "); + const parsedHours = + dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined; + if (!parsedHours) { + return undefined; + } + return currentMoment.add(-1 * parsedHours, "hours"); + } + + const todayVariations = ["min", "sekund", "maloprije"]; for (const todayVariation of todayVariations) { if (renewedDateText.includes(todayVariation)) { return currentMoment; @@ -506,12 +544,16 @@ class OlxCrawler { return new Promise(resolve => setTimeout(resolve, ms)); } - async saveCrawledResults(results, maxAge) { + async saveCrawledResults(results) { const savers = this.savers; - for (const saver of savers) { - await saver.save(results, maxAge); - } + // for (const saver of savers) { + // await saver.save(results); + // } + + //For now, we use only Postgres saver, so ... + return await savers[0].save(results); + //so that we can use some sequelize options and information when data is inserted } } diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index aea9d29..967478a 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -1,7 +1,8 @@ "use strict"; const db = require("../../models/index"); +const sequelize = require("sequelize"); -const bulkUpsertRealEstates = async (realEstateData, maxAge) => { +const bulkUpsertRealEstates = async realEstateData => { try { const fieldsToUpdateIfDuplicate = [ "realEstateType", @@ -23,16 +24,42 @@ const bulkUpsertRealEstates = async (realEstateData, maxAge) => { "longDescription", "gardenSize", "adStatus", - "updatedAt" + "updatedAt", + "renewedDate" ]; + return await db.RealEstate.bulkCreate(realEstateData, { - updateOnDuplicate: fieldsToUpdateIfDuplicate + updateOnDuplicate: fieldsToUpdateIfDuplicate, + returning: true }); } catch (e) { console.log("Error bulk upserting realEstates : ", e); } }; -module.exports = { - bulkUpsertRealEstates +const checkIfAlreadyExist = async realEstateData => { + const orQueryPart = []; + + for (const realEstate of realEstateData) { + const { agencyObjectId, originAgencyName } = realEstate; + + const singleRealEstateQueryPart = { + agencyObjectId, + originAgencyName + }; + + orQueryPart.push(singleRealEstateQueryPart); + } + + const query = { + [sequelize.Op.or]: orQueryPart + }; + + const result = await db.RealEstate.count({ where: query }); + return result > 0; +}; + +module.exports = { + bulkUpsertRealEstates, + checkIfAlreadyExist }; From f93d0e738f8d83772cb5fa8defa9fc453ae39695 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 25 Sep 2019 08:31:37 +0200 Subject: [PATCH 09/12] add delay between pages config variable --- app/common/enums.js | 1 - app/crawler/crawlerConfig.js | 3 ++- development.env | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/app/common/enums.js b/app/common/enums.js index a23a6a0..b7a650c 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -34,7 +34,6 @@ const CRAWLER_AD_TYPE = { module.exports = { AD_TYPE, - IGNORED_USERNAMES, AD_CATEGORY, AD_STATUS, AD_AGENCY, diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index 2b457bc..eb9133a 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -31,7 +31,8 @@ const OLX_CONFIG = { parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories, - OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [] + OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], + OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 }; module.exports = { diff --git a/development.env b/development.env index e96b363..2f2fdc0 100644 --- a/development.env +++ b/development.env @@ -20,4 +20,5 @@ OLX_MAX_PAGES=Restrict crawler to this number of pages OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values -OLX_MAX_AGE=[in days] if ad is crawled before this number of days, it will be re-crawled +OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore +OLX_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page From b3fcc6ba9a61d494b40afa21363448ed36a9565e Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 25 Sep 2019 08:32:37 +0200 Subject: [PATCH 10/12] return new and existing real estates when saving results --- app/crawler/savers/postgres.js | 47 ++++++++++++++++++---------------- app/helpers/db/realEstate.js | 26 +------------------ 2 files changed, 26 insertions(+), 47 deletions(-) diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js index 28fc635..9ba3391 100644 --- a/app/crawler/savers/postgres.js +++ b/app/crawler/savers/postgres.js @@ -1,9 +1,6 @@ const moment = require("moment"); -const { - bulkUpsertRealEstates, - checkIfAlreadyExist -} = require("../../helpers/db/realEstate"); +const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate"); class PostgresSaver { connect() { @@ -14,27 +11,33 @@ class PostgresSaver { async save(results) { console.log("[POSTGRES] Saving..."); - const resultsWithPublishedAndRenewedDateSame = results.filter( - realEstate => { - const { publishedDate, renewedDate } = realEstate; - const publishedMomentDate = moment.utc(publishedDate); - const renewedMomentDate = moment.utc(renewedDate); - - return publishedMomentDate.isSame(renewedMomentDate, "minute"); - } - ); - - const exist = - resultsWithPublishedAndRenewedDateSame.length > 0 - ? await checkIfAlreadyExist(resultsWithPublishedAndRenewedDateSame) - : false; const savedRecords = await bulkUpsertRealEstates(results); - return { - exist, - savedRecords - }; + if (Array.isArray(savedRecords)) { + const newRealEstates = []; + const existingRealEstates = []; + + for (const savedRecord of savedRecords) { + const { createdAt, updatedAt } = savedRecord; + + const createdAtMoment = moment.utc(createdAt); + const updatedAtMoment = moment.utc(updatedAt); + + if (createdAtMoment.isSame(updatedAtMoment, "second")) { + newRealEstates.push(savedRecord); + } else { + existingRealEstates.push(savedRecord); + } + } + + return { + newRecords: newRealEstates, + existingRecords: existingRealEstates + }; + } else { + throw { message: "[POSTGRES] Failed to save records" }; + } } close() { diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index 967478a..2443fbb 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -1,6 +1,5 @@ "use strict"; const db = require("../../models/index"); -const sequelize = require("sequelize"); const bulkUpsertRealEstates = async realEstateData => { try { @@ -37,29 +36,6 @@ const bulkUpsertRealEstates = async realEstateData => { } }; -const checkIfAlreadyExist = async realEstateData => { - const orQueryPart = []; - - for (const realEstate of realEstateData) { - const { agencyObjectId, originAgencyName } = realEstate; - - const singleRealEstateQueryPart = { - agencyObjectId, - originAgencyName - }; - - orQueryPart.push(singleRealEstateQueryPart); - } - - const query = { - [sequelize.Op.or]: orQueryPart - }; - - const result = await db.RealEstate.count({ where: query }); - return result > 0; -}; - module.exports = { - bulkUpsertRealEstates, - checkIfAlreadyExist + bulkUpsertRealEstates }; From c9a959f8be0b3136659cbc9d4b05e1fa574dd49d Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 25 Sep 2019 08:54:33 +0200 Subject: [PATCH 11/12] stop crawling when existing, not renewed ad is found --- app/crawler/crawl.js | 11 +++-- app/crawler/specific/olx.js | 87 ++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 44 deletions(-) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index d37cb9c..77d4fc9 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -18,18 +18,23 @@ const crawlers = [ OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, OLX_CONFIG.OLX_MAX_PAGES, OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, - OLX_CONFIG.OLX_IGNORED_USERNAMES + OLX_CONFIG.OLX_IGNORED_USERNAMES, + OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES ) ]; async function crawlAll() { for (let crawler of crawlers) { try { - await crawler.crawl(); + const newRealEstates = await crawler.crawl(); + + console.log("Number of new real estates : ", newRealEstates.length); } catch (e) { console.log("Error crawling. Trying next crawler! ", e); } } } -crawlAll(); +(async () => { + await crawlAll(); +})(); diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 1f6ea1c..75f9578 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -44,7 +44,8 @@ class OlxCrawler { ], maxPages = 1000, maxResultsPerPage = 100, - ignoredUsernames = [] + ignoredUsernames = [], + delayBetweenPages = 1000 ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; @@ -53,14 +54,14 @@ class OlxCrawler { this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; this.ignoredUsernames = ignoredUsernames; + this.delayBetweenPages = delayBetweenPages; } async crawl() { console.log("[OLX] Crawler started"); const crawlAdCategories = this.crawlerAdCategories; - const savedRealEstates = []; - const asyncSaveActions = []; + const newRealEstates = []; if (crawlAdCategories) { const indexGenerators = []; @@ -71,58 +72,62 @@ class OlxCrawler { let done = false; while (!done) { const categoryIndexerPromises = []; + const generatorsToRemove = []; for (const indexGenerator of indexGenerators) { categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); } - Promise.all(categoryIndexerPromises).then(singlePageResults => { - const entries = singlePageResults.entries(); - for (const [index, { value: singlePageResult }] of entries) { - if (singlePageResult) { - const savePromise = this.saveCrawledResults(singlePageResult) - .then(({ exist, savedRecords }) => { - if (exist) { - indexGenerators.splice(index, 1); - if (indexGenerators.length === 0) { - done = true; - } - } + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); - for (const savedRecord of savedRecords) { - const { createdAt, updatedAt } = savedRecord; + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords, existingRecords } = saveResults; - console.log("Comparing ", createdAt, " <> ", updatedAt); + newRealEstates.push(...newRecords); - const createdAtMoment = moment.utc(createdAt); - const updatedAtMoment = moment.utc(updatedAt); + for (const existingRecord of existingRecords) { + const { publishedDate, renewedDate } = existingRecord; - if (createdAtMoment.isSame(updatedAtMoment, "second")) { - console.log("\tEqual !"); - savedRealEstates.push(savedRecord); - } - } - }) - .catch(error => - console.log("[POSTGRES Saver] Error saving results : ", error) - ); - asyncSaveActions.push(savePromise); - } else { - //Generator returned undefined, no more pages - indexGenerators.splice(index, 1); - if (indexGenerators.length === 0) { - done = true; + const publishedDateMoment = moment.utc(publishedDate); + const renewedDateMoment = moment.utc(renewedDate); + + const stopCrawlingThisCategory = publishedDateMoment.isSame( + renewedDateMoment, + "minute" + ); + + if (stopCrawlingThisCategory) { + generatorsToRemove[index] = true; + // console.log("\tGenerator ", index + 1, "has no more new ads"); + break; } } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); } - }); + } - await this.sleep(5000); + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + // await this.sleep(this.delayBetweenPages); } } - console.log("[OLX] Waiting for async save actions ..."); - await Promise.all(asyncSaveActions); console.log("[OLX] Crawler finished"); - return savedRealEstates; + return newRealEstates; } async *categoryIndexer(adCategory) { @@ -191,7 +196,7 @@ class OlxCrawler { } async scrapeAd(url) { - console.log("Scraping : ", url); + //console.log("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); From 3d203df988401fc6a84f36ac7ad64985c48013b6 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 25 Sep 2019 10:00:42 +0000 Subject: [PATCH 12/12] remove comment from delay between indexing pages --- app/crawler/specific/olx.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 75f9578..b10f55a 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -123,7 +123,7 @@ class OlxCrawler { done = true; } - // await this.sleep(this.delayBetweenPages); + await this.sleep(this.delayBetweenPages); } } console.log("[OLX] Crawler finished");