From 5829de64e07c667798d845125c51dcbfbbfb48e7 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Wed, 10 Jul 2019 12:27:30 +0200 Subject: [PATCH 1/2] Added hrefs to global variable --- app/services/crawlerService.js | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 3ac23f5..243c38d 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -20,7 +20,22 @@ async function crawlAll() { try { const marketAlertsFromDb = await allMarketAlerts(true); - console.log("CRAWLER SERVICE: number of existing MarketAlerts from db: " + marketAlertsFromDb.length); + const hrefs = []; + const subscribedMakretAlerts = marketAlertsFromDb.filter(marketAlert => { + return marketAlert.subscribed; + }); + marketAlertsFromDb.map(marketAlert => { + if (hrefs[marketAlert.request] === undefined) { + hrefs[marketAlert.request] = [] + } + + hrefs[marketAlert.request].push(marketAlert.url); + }) + + global.hrefs = hrefs; + console.log(global.hrefs); + + console.log("CRAWLER SERVICE: number of existing MarketAlerts from db: " + subscribedMakretAlerts.length); const marketAlerts = []; const mergedResults = [].concat.apply([], results); @@ -47,7 +62,7 @@ async function crawlAll() { try { - const filteredMarketAlerts = marketAlerts.filter((elem) => !marketAlertsFromDb.find(({ url }) => elem.url === url)); + const filteredMarketAlerts = marketAlerts.filter((elem) => !subscribedMakretAlerts.find(({ url }) => elem.url === url)); console.log("CRAWLER SERVICE: Number of new crawler results: " + filteredMarketAlerts.length); await db.MarketAlert.bulkCreate(filteredMarketAlerts); -- 2.47.3 From 33f9e37d934d047a6f7a4f67bd6a2d5817b7cd69 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Wed, 10 Jul 2019 15:21:46 +0200 Subject: [PATCH 2/2] Filter data by geolocation now sets hasLocation boolean instead of excluding results --- app/helpers/crawlers/olxClawler.js | 15 +++++++ 
app/helpers/db/dbHelper.js | 5 ++- ...141356-add-has-location-to-marketalerts.js | 20 +++++++++ app/models/marketalert.js | 1 + app/services/crawlerService.js | 43 +++++++++++-------- 5 files changed, 64 insertions(+), 20 deletions(-) create mode 100644 app/migrations/20190710141356-add-has-location-to-marketalerts.js diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index c68db55..e94daa7 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -45,7 +45,12 @@ module.exports = class OlxCrawler { if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); + if (pointInsideBoundingBox[0].length !== 0) { + finalResult.hasLocation = true + filteredResults.push(finalResult); + } else { + finalResult.hasLocation = false filteredResults.push(finalResult); } } @@ -270,6 +275,16 @@ class Indexer { if (this.olxUrl.url === undefined) { return {} } + + if (global.hrefs) { + + if (global.hrefs[this.olxUrl.uuid] && global.hrefs[this.olxUrl.uuid].includes(this.olxUrl.url)) { + + console.log("We found duplicate URL"); + return null + } + } + const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); diff --git a/app/helpers/db/dbHelper.js b/app/helpers/db/dbHelper.js index 234aff5..d964d54 100644 --- a/app/helpers/db/dbHelper.js +++ b/app/helpers/db/dbHelper.js @@ -26,7 +26,7 @@ const allRERequestByUiid = async (requestArray) => { } /** - * Find all , or all depending on notified bolean marketalerts, and order them by email + * Find all , or all depending on notified bolean marketalerts, that the hasLocation is true, and order them by email * * @param fechAll bolean * @param notified bolean @@ -43,7 +43,8 @@ const allMarketAlerts = async (fetchAll, notified) => { if (!fetchAll){ queryObject.where = { - notified: 
notified + notified: notified, + hasLocation: true } } diff --git a/app/migrations/20190710141356-add-has-location-to-marketalerts.js b/app/migrations/20190710141356-add-has-location-to-marketalerts.js new file mode 100644 index 0000000..049efe3 --- /dev/null +++ b/app/migrations/20190710141356-add-has-location-to-marketalerts.js @@ -0,0 +1,20 @@ +'use strict'; + +module.exports = { + up: (queryInterface, Sequelize) => { + return queryInterface.addColumn( + 'MarketAlerts', + 'hasLocation', + { + type: Sequelize.BOOLEAN + } + ); + }, + + down: (queryInterface, Sequelize) => { + return queryInterface.removeColumn( + 'MarketAlerts', + 'hasLocation' + ); + } +}; diff --git a/app/models/marketalert.js b/app/models/marketalert.js index 72c797c..14742ff 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -14,6 +14,7 @@ module.exports = (sequelize, DataTypes) => { notified : DataTypes.BOOLEAN, title : DataTypes.STRING, request: DataTypes.STRING, + hasLocation: DataTypes.BOOLEAN, email: { type: DataTypes.STRING, diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 243c38d..e410c4c 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -13,29 +13,35 @@ const crawlers = [ async function crawlAll() { console.log("CRAWLER SERVICE: crawlAll"); + try { + const marketAlertsFromDb = await allMarketAlerts(true); + const hrefs = []; + + marketAlertsFromDb.map(marketAlert => { + if (hrefs[marketAlert.request] === undefined) { + hrefs[marketAlert.request] = [] + } + + hrefs[marketAlert.request].push(marketAlert.url); + }) + + global.hrefs = hrefs; + console.log("CRAWLER SERVICE: GLOBAL HREFS"); + console.log(global.hrefs); + + } catch (e) { + console.error("CRAWLER SERVICE:could not fetch marketalerts ", e); + } + return Promise.map(crawlers, function (crawler) { return crawler.crawl(); }).then(async (results) => { try { - const marketAlertsFromDb = await allMarketAlerts(true); - const hrefs = []; - const 
subscribedMakretAlerts = marketAlertsFromDb.filter(marketAlert => { - return marketAlert.subscribed; - }); - marketAlertsFromDb.map(marketAlert => { - if (hrefs[marketAlert.request] === undefined) { - hrefs[marketAlert.request] = [] - } + const marketAlertsFromDb = await allMarketAlerts(false, true); - hrefs[marketAlert.request].push(marketAlert.url); - }) - - global.hrefs = hrefs; - console.log(global.hrefs); - - console.log("CRAWLER SERVICE: number of existing MarketAlerts from db: " + subscribedMakretAlerts.length); + console.log("CRAWLER SERVICE: number of existing MarketAlerts from db: " + marketAlertsFromDb.length); const marketAlerts = []; const mergedResults = [].concat.apply([], results); @@ -55,14 +61,15 @@ async function crawlAll() { gardenSize: isNaN(result.gardenSize) ? 0 : result.gardenSize, realEstateType: result.realEstateType, title: result.title, - notified: false + notified: false, + hasLocation: result.hasLocation }) } console.log("CRAWLER SERVICE: Number of crawler results: " + marketAlerts.length); try { - const filteredMarketAlerts = marketAlerts.filter((elem) => !subscribedMakretAlerts.find(({ url }) => elem.url === url)); + const filteredMarketAlerts = marketAlerts.filter((elem) => !marketAlertsFromDb.find(({ url }) => elem.url === url)); console.log("CRAWLER SERVICE: Number of new crawler results: " + filteredMarketAlerts.length); await db.MarketAlert.bulkCreate(filteredMarketAlerts); -- 2.47.3