From fdd0124924674718f545a1576405dec0fb1c9575 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Thu, 13 Jun 2019 13:31:35 +0200 Subject: [PATCH 01/13] Added crawler service --- app/helpers/crawlers/olixClawler.js | 0 app/services/crawlerService.js | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 app/helpers/crawlers/olixClawler.js create mode 100644 app/services/crawlerService.js diff --git a/app/helpers/crawlers/olixClawler.js b/app/helpers/crawlers/olixClawler.js new file mode 100644 index 0000000..e69de29 diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js new file mode 100644 index 0000000..e69de29 From 6aaaea161285795e13d465010659c7c87dbe5650 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Thu, 13 Jun 2019 15:49:31 +0200 Subject: [PATCH 02/13] working on crawler --- app/helpers/crawlers/olixClawler.js | 30 +++++++++++++++++++++++++++++ app/services/crawlerService.js | 30 +++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/app/helpers/crawlers/olixClawler.js b/app/helpers/crawlers/olixClawler.js index e69de29..1c37df9 100644 --- a/app/helpers/crawlers/olixClawler.js +++ b/app/helpers/crawlers/olixClawler.js @@ -0,0 +1,30 @@ +const fetch = require('node-fetch'); +const cheerio = require('cheerio'); + +export default class OlxCrawler { + + constructor(fromPage = 0, toPage = 10, maxResults = 1000) { + this.fromPage = fromPage; + this.toPage = toPage; + this.maxResults = maxResults; + } + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async indexPages(start, end, maxResults = 1000) { + let results = {}; + for (let i = start; i <= end; i++) { + let result = await this.indexPage(i, maxResults); + Object.assign(results, result) + await this.sleep(5000); + } + return results; + } + + async crawl() { + let results = await this.indexPages(this.fromPage, this.toPage, this.maxResults); + return results; + } +} \ No newline at end of file diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index e69de29..073195c 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -0,0 +1,30 @@ + + +const crawlers = [ + //new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), + ]; + + async function crawlAll() { + + for (let crawler of crawlers) { + try { + let results = await crawler.crawl() + for (let saver of savers) { + try { + await saver.connect(); + await saver.save(results); + } catch (e) { + console.log("Error saving. Trying next saver! ", e); + } + } + } catch (e) { + console.log("Error crawling. Trying next crawler! ", e); + } + } + + for (let saver of savers) { + saver.close(); + } + } + + crawlAll(); \ No newline at end of file From b17b6862ba363af0c64193fb0667bf3e5a0d7a0e Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Tue, 18 Jun 2019 13:13:16 +0200 Subject: [PATCH 03/13] Added migrations, expanded maketalert table --- .../20190618103020-expand-maketalert.js | 37 +++++++++++++++++++ ...0618124522-marketalerts-additional-info.js | 33 +++++++++++++++++ app/models/marketalert.js | 4 +- 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 app/migrations/20190618103020-expand-maketalert.js create mode 100644 app/migrations/20190618124522-marketalerts-additional-info.js diff --git a/app/migrations/20190618103020-expand-maketalert.js b/app/migrations/20190618103020-expand-maketalert.js new file mode 100644 index 0000000..54eed96 --- /dev/null +++ b/app/migrations/20190618103020-expand-maketalert.js @@ -0,0 +1,37 @@ +'use strict'; + +module.exports = { + up: (queryInterface, Sequelize) => { + return queryInterface.sequelize.transaction((t) => { + return Promise.all([ + queryInterface.addColumn('MarketAlerts', 'size', { + type: Sequelize.INTEGER, + }, { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'gardenSize', { + type: Sequelize.INTEGER, + }, { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'price', { + type: Sequelize.INTEGER, + }, { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'municipality', { + type: Sequelize.STRING, + }, { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'region', { + type: Sequelize.STRING, + }, { transaction: t }) + ]) + }) + }, + + down: (queryInterface, Sequelize) => { + return queryInterface.sequelize.transaction((t) => { + return Promise.all([ + queryInterface.removeColumn('MarketAlerts', 'size', { transaction: t }), + queryInterface.removeColumn('MarketAlerts', 'gardenSize', { transaction: t }), + queryInterface.removeColumn('MarketAlerts', 'price', { transaction: t }), + queryInterface.removeColumn('MarketAlerts', 'municipality', { transaction: t }), + queryInterface.removeColumn('MarketAlerts', 'region', { transaction: t }) + ]) + }) + } +}; diff --git a/app/migrations/20190618124522-marketalerts-additional-info.js b/app/migrations/20190618124522-marketalerts-additional-info.js new file mode 100644 index 0000000..48847d1 --- /dev/null +++ b/app/migrations/20190618124522-marketalerts-additional-info.js @@ -0,0 +1,33 @@ +'use strict'; + +module.exports = { + up: (queryInterface, Sequelize) => { + return queryInterface.sequelize.transaction((t) => { + return Promise.all([ + queryInterface.removeColumn('MarketAlerts', 'olxUrl', { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'url', { + type: Sequelize.STRING, + }, { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'realestateOrigin', { + type: Sequelize.STRING, + }, { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'originId', { + type: Sequelize.STRING, + }, { transaction: t }) + ]) + }) + }, + + down: (queryInterface, Sequelize) => { + return queryInterface.sequelize.transaction((t) => { + return Promise.all([ + queryInterface.removeColumn('MarketAlerts', 'url', { transaction: t }), + queryInterface.removeColumn('MarketAlerts', 'realestateOrigin', { transaction: t }), + queryInterface.removeColumn('MarketAlerts', 'originId', { transaction: t }), + queryInterface.addColumn('MarketAlerts', 'olxUrl', { + type: Sequelize.STRING + }, { transaction: t }) + ]) + }) + } +}; diff --git a/app/models/marketalert.js b/app/models/marketalert.js index ac08ac4..c0665a3 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -1,7 +1,9 @@ 'use strict'; module.exports = (sequelize, DataTypes) => { const MarketAlert = sequelize.define('MarketAlert', { - olxUrl: DataTypes.STRING, + url: DataTypes.STRING, + realestateOrigin: DataTypes.STRING, + originId: DataTypes.STRING, lastDate: DataTypes.STRING, email: { type: DataTypes.STRING, From 9a8a27d1d99224fea49cbe06a4fc15a7afe67cc2 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Tue, 18 Jun 2019 15:05:40 +0200 Subject: [PATCH 04/13] Scheduler --- app/helpers/crawlers/olixClawler.js | 169 +++++++++++++++++++++++++--- app/helpers/enums.js | 6 +- app/services/crawlerService.js | 1 + package.json | 1 + 4 files changed, 156 insertions(+), 21 deletions(-) diff --git a/app/helpers/crawlers/olixClawler.js b/app/helpers/crawlers/olixClawler.js index 1c37df9..0b9b01c 100644 --- a/app/helpers/crawlers/olixClawler.js +++ b/app/helpers/crawlers/olixClawler.js @@ -3,28 +3,161 @@ const cheerio = require('cheerio'); export default class OlxCrawler { - constructor(fromPage = 0, toPage = 10, maxResults = 1000) { - this.fromPage = fromPage; - this.toPage = toPage; - this.maxResults = maxResults; + constructor(fromPage = 0, toPage = 10, maxResults = 1000) { + this.fromPage = fromPage; + this.toPage = toPage; + this.maxResults = maxResults; + } + + async indexSingle(url) { + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + + const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); + + if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { + return null; } - async sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); + const title = $('#naslovartikla').text(); + const category = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + + const price = $('#pc > p:nth-child(2)').text(); + const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); + const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); + const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); + + const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); + const time = $('time').attr('datetime'); + const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); + + const descriptions = $('.artikal_detaljniopis_tekst'); + const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); + const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; + const imgRe = /href":("[^"]*")/g; + const matches = latLngRe.exec(body); + let lng = '', + lat = ''; + + const images = []; + const imgMatches = body.match(imgRe); + + const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) + const parsePrice = (price) => parseFloat(price.replace(".", "")) + + + for (let i = 0; imgMatches && i < imgMatches.length; i++) { + let img = imgMatches[i].replace("href\":", "") + img = img.replace("\"", ""); + img = img.replace("\"", ""); + images.push(img); } - async indexPages(start, end, maxResults = 1000) { - let results = {}; - for (let i = start; i <= end; i++) { - let result = await this.indexPage(i, maxResults); - Object.assign(results, result) - await this.sleep(5000); + const uploadPromises = images.map(img => { + const imgFixed = eval(`'${img}'`); + return cloudinary.uploader.upload(eval(`'${img}'`)); + }); + + const uploadResults = await Promise.all(uploadPromises); + const cloudinaryImages = uploadResults.map(ur => ur.url); + + if (matches && matches.length >= 3) { + lat = matches[1]; + lng = matches[2]; + } + + const parsedPrice = parsePrice(price); + let parsedRooms; + + if (rooms === 'Garsonjera') { + parsedRooms = 0; + } else { + parsedRooms = parseRooms(rooms); + } + + const data = { + category: this.getCategoryId(category), + url, + title, + price: isNaN(parsedPrice) ? price : parsedPrice, + size: parseFloat(size), + rooms: parsedRooms, + floor: parseInt(floor), + address, + location, + adType: AD_TYPE_SALE, + time, + shortDescription: descriptions.first().text(), + longDescription: descriptions.last().text(), + lat, + lng, + loc: [parseFloat(lat), parseFloat(lng)], + images: cloudinaryImages + }; + + return data; + } catch (e) { + console.error('Exception caught: ' + e.message); + } + + return null; + } + + async indexPage(url, pageNr, maxResults = 1000) { + try { + console.log('Starting to index page: ' + pageNr); + const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; + + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + const hrefs = []; + const results = {}; + + $('#rezultatipretrage').find('.listitem').each((i, elem) => { + const href = $(elem).find('a').first().attr('href'); + hrefs.push(href); + }); + + let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; + + for (let i = 0; i < hrefs.length; i++) { + console.log(`indexing: ${hrefs[i]}`); + + const singleData = await this.indexSingle(hrefs[i]); + + if (singleData) { + results[hrefs[i]] = singleData; } - return results; - } - - async crawl() { - let results = await this.indexPages(this.fromPage, this.toPage, this.maxResults); - return results; + await this.sleep(500); } + + return results; + } catch (e) { + console.error('Exception caught:' + e); + } + } + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async indexPages(url, start, end, maxResults = 1000) { + let results = {}; + for (let i = start; i <= end; i++) { + let result = await this.indexPage(i, maxResults); + Object.assign(results, result) + await this.sleep(5000); + } + return results; + } + + async crawl() { + // TODO create URLS from db + let results = await this.indexPages(this.fromPage, this.toPage, this.maxResults); + return results; + } } \ No newline at end of file diff --git a/app/helpers/enums.js b/app/helpers/enums.js index 2c3b177..9ab2d58 100644 --- a/app/helpers/enums.js +++ b/app/helpers/enums.js @@ -1,7 +1,7 @@ const realEstateTypes = [ - { title: "Kuća", id: "kuca", hasGardenSize: true }, - { title: "Stan", id: "stan", hasGardenSize: false }, - { title: "Vikendica", id: "vikendica", hasGardenSize: true } + { title: "Kuća", id: "kuca", hasGardenSize: true, olixCategory: 23 }, + { title: "Stan", id: "stan", hasGardenSize: false, olixCategory: 24}, + { title: "Vikendica", id: "vikendica", hasGardenSize: true, olixCategory: 26 } ]; const sizes = [ diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 073195c..c6f0c82 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,5 +1,6 @@ +var http = require('http'); const crawlers = [ //new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), ]; diff --git a/package.json b/package.json index 8196722..a8b5aa4 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "test": "echo \"Error: no test specified\" && exit 1", "start": "node ./index.js", "start-mon": "nodemon ./index.js", + "scheduler": "node ./app/services/crawlerService.js", "migrate": "cd app && npx sequelize db:migrate", "setup": "docker build -t marketalerts . && docker run -e POSTGRES_USER=docker -e POSTGRES_PASSWORD=docker -e POSTGRES_DB=marketalerts --name pg_marketalerts -d -p 5432:5432 marketalerts && sleep 4 && npm run migrate", "docker-start": "docker start pg_marketalerts", From 0f630e9ea4ebe92a983f2b05183f22c4cb1675e7 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Wed, 19 Jun 2019 17:12:22 +0200 Subject: [PATCH 05/13] Olix crawling, filter crawling result by lat, lng --- app/helpers/codes.js | 1042 ++++++++++++++------------- app/helpers/crawlers/olixClawler.js | 163 ----- app/helpers/crawlers/olxClawler.js | 230 ++++++ app/helpers/enums.js | 6 +- app/helpers/url.js | 12 +- app/services/crawlerService.js | 6 +- 6 files changed, 777 insertions(+), 682 deletions(-) delete mode 100644 app/helpers/crawlers/olixClawler.js create mode 100644 app/helpers/crawlers/olxClawler.js diff --git a/app/helpers/codes.js b/app/helpers/codes.js index 5ec86cd..bcec456 100644 --- a/app/helpers/codes.js +++ b/app/helpers/codes.js @@ -1,871 +1,871 @@ const regions = [ { - "name":" Sarajevo", - "id":"sarajevo", + "name": " Sarajevo", + "id": "sarajevo", "olxid": "9", - "municipalities":[ + "municipalities": [ { - "name":"Hadžići", - "id":"hadii", - "olxid":"3817" + "name": "Hadžići", + "id": "hadii", + "olxid": "3817" }, { - "name":"Ilidža", - "id":"ilida", - "olxid":"3879" + "name": "Ilidža", + "id": "ilida", + "olxid": "3879" }, { - "name":"Ilijaš", - "id":"ilija", - "olxid":"3892" + "name": "Ilijaš", + "id": "ilija", + "olxid": "3892" }, { - "name":"Sarajevo - Centar", - "id":"sarajevocentar", - "olxid":"3812" + "name": "Sarajevo - Centar", + "id": "sarajevocentar", + "olxid": "3812" }, { - "name":"Sarajevo-Novi Grad", - "id":"sarajevonovigrad", - "olxid":"3969" + "name": "Sarajevo-Novi Grad", + "id": "sarajevonovigrad", + "olxid": "3969" }, { - "name":"Sarajevo-Novo Sarajevo", - "id":"sarajevonovosarajevo", - "olxid":"5896" + "name": "Sarajevo-Novo Sarajevo", + "id": "sarajevonovosarajevo", + "olxid": "5896" }, { - "name":"Sarajevo-Stari Grad", - "id":"sarajevostarigrad", - "olxid":"4048" + "name": "Sarajevo-Stari Grad", + "id": "sarajevostarigrad", + "olxid": "4048" }, { - "name":"Trnovo", - "id":"trnovo", - "olxid":"4063" + "name": "Trnovo", + "id": "trnovo", + "olxid": "4063" }, { - "name":"Vogošća", - "id":"vogoa", - "olxid":"4126" + "name": "Vogošća", + "id": "vogoa", + "olxid": "4126" } ] }, { - "name":" Unsko-sanski", - "id":"unskosanski", + "name": " Unsko-sanski", + "id": "unskosanski", "olxid": "9", - "municipalities":[ + "municipalities": [ { - "name":"Bihać", - "id":"biha", - "olxid":"75" + "name": "Bihać", + "id": "biha", + "olxid": "75" }, { - "name":"Bosanska Krupa", - "id":"bosanskakrupa", - "olxid":"373" + "name": "Bosanska Krupa", + "id": "bosanskakrupa", + "olxid": "373" }, { - "name":"Bosanski Petrovac", - "id":"bosanskipetrovac", - "olxid":"504" + "name": "Bosanski Petrovac", + "id": "bosanskipetrovac", + "olxid": "504" }, { - "name":"Bužim", - "id":"buim", - "olxid":"374" + "name": "Bužim", + "id": "buim", + "olxid": "374" }, { - "name":"Cazin", - "id":"cazin", - "olxid":"857" + "name": "Cazin", + "id": "cazin", + "olxid": "857" }, { - "name":"Ključ", - "id":"klju", - "olxid":"2362" + "name": "Ključ", + "id": "klju", + "olxid": "2362" }, { - "name":"Sanski Most", - "id":"sanskimost", - "olxid":"3738" + "name": "Sanski Most", + "id": "sanskimost", + "olxid": "3738" }, { - "name":"Velika Kladuša", - "id":"velikakladua", - "olxid":"5122" + "name": "Velika Kladuša", + "id": "velikakladua", + "olxid": "5122" } ] }, { - "name":" Posavski", - "id":"posavski", + "name": " Posavski", + "id": "posavski", "olxid": "15", - "municipalities":[ + "municipalities": [ { - "name":"Domaljevac", - "id":"domaljevac", - "olxid":"6144" + "name": "Domaljevac", + "id": "domaljevac", + "olxid": "6144" }, { - "name":"Odžak", - "id":"odak", - "olxid":"424" + "name": "Odžak", + "id": "odak", + "olxid": "424" }, { - "name":"Orašje", - "id":"oraje", - "olxid":"3252" + "name": "Orašje", + "id": "oraje", + "olxid": "3252" }, { - "name":"Šamac", - "id":"amac", - "olxid":"540" + "name": "Šamac", + "id": "amac", + "olxid": "540" } ] }, { - "name":" Tuzlanski", - "id":"tuzlanski", + "name": " Tuzlanski", + "id": "tuzlanski", "olxid": "15", - "municipalities":[ + "municipalities": [ { - "name":"Banovići", - "id":"banovii", - "olxid":"2" + "name": "Banovići", + "id": "banovii", + "olxid": "2" }, { - "name":"Doboj-Istok", - "id":"dobojistok", - "olxid":"1090" + "name": "Doboj-Istok", + "id": "dobojistok", + "olxid": "1090" }, { - "name":"Gradačac", - "id":"gradaac", - "olxid":"1854" + "name": "Gradačac", + "id": "gradaac", + "olxid": "1854" }, { - "name":"Gračanica", - "id":"graanica", - "olxid":"1826" + "name": "Gračanica", + "id": "graanica", + "olxid": "1826" }, { - "name":"Kalesija", - "id":"kalesija", - "olxid":"2129" + "name": "Kalesija", + "id": "kalesija", + "olxid": "2129" }, { - "name":"Kladanj", - "id":"kladanj", - "olxid":"2319" + "name": "Kladanj", + "id": "kladanj", + "olxid": "2319" }, { - "name":"Lukavac", - "id":"lukavac", - "olxid":"2840" + "name": "Lukavac", + "id": "lukavac", + "olxid": "2840" }, { - "name":"Sapna", - "id":"sapna", - "olxid":"5699" + "name": "Sapna", + "id": "sapna", + "olxid": "5699" }, { - "name":"Srebrenik", - "id":"srebrenik", - "olxid":"4391" + "name": "Srebrenik", + "id": "srebrenik", + "olxid": "4391" }, { - "name":"Teočak", - "id":"teoak", - "olxid":"5010" + "name": "Teočak", + "id": "teoak", + "olxid": "5010" }, { - "name":"Tuzla", - "id":"tuzla", - "olxid":"4944" + "name": "Tuzla", + "id": "tuzla", + "olxid": "4944" }, { - "name":"Čelić", - "id":"eli", - "olxid":"2801" + "name": "Čelić", + "id": "eli", + "olxid": "2801" }, { - "name":"Živinice", - "id":"ivinice", - "olxid":"5774" + "name": "Živinice", + "id": "ivinice", + "olxid": "5774" } ] }, { - "name":" Zeničko-dobojski", - "id":"zenickodobojski", + "name": " Zeničko-dobojski", + "id": "zenickodobojski", "olxid": "15", - "municipalities":[ + "municipalities": [ { - "name":"Breza", - "id":"breza", - "olxid":"704" + "name": "Breza", + "id": "breza", + "olxid": "704" }, { - "name":"Doboj-Jug", - "id":"dobojjug", - "olxid":"1122" + "name": "Doboj-Jug", + "id": "dobojjug", + "olxid": "1122" }, { - "name":"Kakanj", - "id":"kakanj", - "olxid":"2022" + "name": "Kakanj", + "id": "kakanj", + "olxid": "2022" }, { - "name":"Maglaj", - "id":"maglaj", - "olxid":"2941" + "name": "Maglaj", + "id": "maglaj", + "olxid": "2941" }, { - "name":"Olovo", - "id":"olovo", - "olxid":"1925" + "name": "Olovo", + "id": "olovo", + "olxid": "1925" }, { - "name":"Tešanj", - "id":"teanj", - "olxid":"4594" + "name": "Tešanj", + "id": "teanj", + "olxid": "4594" }, { - "name":"Usora", - "id":"usora", - "olxid":"1087" + "name": "Usora", + "id": "usora", + "olxid": "1087" }, { - "name":"Vareš", - "id":"vare", - "olxid":"5037" + "name": "Vareš", + "id": "vare", + "olxid": "5037" }, { - "name":"Visoko", - "id":"visoko", - "olxid":"5171" + "name": "Visoko", + "id": "visoko", + "olxid": "5171" }, { - "name":"Zavidovići", - "id":"zavidovii", - "olxid":"5548" + "name": "Zavidovići", + "id": "zavidovii", + "olxid": "5548" }, { - "name":"Zenica", - "id":"zenica", - "olxid":"4571" + "name": "Zenica", + "id": "zenica", + "olxid": "4571" }, { - "name":"Žepče", - "id":"epe", - "olxid":"2940" + "name": "Žepče", + "id": "epe", + "olxid": "2940" } ] }, { - "name":" Bosansko-podrinjski", - "id":"bosanskopodrinjski", + "name": " Bosansko-podrinjski", + "id": "bosanskopodrinjski", "olxid": "15", - "municipalities":[ + "municipalities": [ { - "name":"Foča", - "id":"foa", - "olxid":"1289" + "name": "Foča", + "id": "foa", + "olxid": "1289" }, { - "name":"Goražde", - "id":"gorade", - "olxid":"1588" + "name": "Goražde", + "id": "gorade", + "olxid": "1588" }, { - "name":"Pale", - "id":"pale", - "olxid":"3546" + "name": "Pale", + "id": "pale", + "olxid": "3546" } ] }, { - "name":" Srednjobosanski", - "id":"srednjobosanski", + "name": " Srednjobosanski", + "id": "srednjobosanski", "olxid": "6", - "municipalities":[ + "municipalities": [ { - "name":"Bugojno", - "id":"bugojno", - "olxid":"732" + "name": "Bugojno", + "id": "bugojno", + "olxid": "732" }, { - "name":"Busovača", - "id":"busovaa", - "olxid":"810" + "name": "Busovača", + "id": "busovaa", + "olxid": "810" }, { - "name":"Dobretići", - "id":"dobretii", - "olxid":"4151" + "name": "Dobretići", + "id": "dobretii", + "olxid": "4151" }, { - "name":"Donji Vakuf", - "id":"donjivakuf", - "olxid":"1160" + "name": "Donji Vakuf", + "id": "donjivakuf", + "olxid": "1160" }, { - "name":"Fojnica", - "id":"fojnica", - "olxid":"1407" + "name": "Fojnica", + "id": "fojnica", + "olxid": "1407" }, { - "name":"Gornji Vakuf - Uskoplje", - "id":"gornjivakufuskoplje", - "olxid":"1775" + "name": "Gornji Vakuf - Uskoplje", + "id": "gornjivakufuskoplje", + "olxid": "1775" }, { - "name":"Jajce", - "id":"jajce", - "olxid":"1960" + "name": "Jajce", + "id": "jajce", + "olxid": "1960" }, { - "name":"Kiseljak", - "id":"kiseljak", - "olxid":"2237" + "name": "Kiseljak", + "id": "kiseljak", + "olxid": "2237" }, { - "name":"Kreševo", - "id":"kreevo", - "olxid":"2608" + "name": "Kreševo", + "id": "kreevo", + "olxid": "2608" }, { - "name":"Novi Travnik", - "id":"novitravnik", - "olxid":"3477" + "name": "Novi Travnik", + "id": "novitravnik", + "olxid": "3477" }, { - "name":"Travnik", - "id":"travnik", - "olxid":"4678" + "name": "Travnik", + "id": "travnik", + "olxid": "4678" }, { - "name":"Vitez", - "id":"vitez", - "olxid":"5422" + "name": "Vitez", + "id": "vitez", + "olxid": "5422" } ] }, { - "name":" Hercegovačko-neretvanski", - "id":"hercegovackoneretvanski", + "name": " Hercegovačko-neretvanski", + "id": "hercegovackoneretvanski", "olxid": "7", - "municipalities":[ + "municipalities": [ { - "name":"Grad Mostar", - "id":"gradmostar", - "olxid":"3017" + "name": "Grad Mostar", + "id": "gradmostar", + "olxid": "3017" }, { - "name":"Jablanica", - "id":"jablanica", - "olxid":"1930" + "name": "Jablanica", + "id": "jablanica", + "olxid": "1930" }, { - "name":"Konjic", - "id":"konjic", - "olxid":"2169" + "name": "Konjic", + "id": "konjic", + "olxid": "2169" }, { - "name":"Neum", - "id":"neum", - "olxid":"3111" + "name": "Neum", + "id": "neum", + "olxid": "3111" }, { - "name":"Prozor", - "id":"prozor", - "olxid":"3421" + "name": "Prozor", + "id": "prozor", + "olxid": "3421" }, { - "name":"Ravno", - "id":"ravno", - "olxid":"4769" + "name": "Ravno", + "id": "ravno", + "olxid": "4769" }, { - "name":"Stolac", - "id":"stolac", - "olxid":"4439" + "name": "Stolac", + "id": "stolac", + "olxid": "4439" }, { - "name":"Čapljina", - "id":"apljina", - "olxid":"947" + "name": "Čapljina", + "id": "apljina", + "olxid": "947" }, { - "name":"Čitluk", - "id":"itluk", - "olxid":"1009" + "name": "Čitluk", + "id": "itluk", + "olxid": "1009" } ] }, { - "name":" Zapadno-hercegovački", - "id":"zapadnohercegovacki", + "name": " Zapadno-hercegovački", + "id": "zapadnohercegovacki", "olxid": "8", - "municipalities":[ + "municipalities": [ { - "name":"Grude", - "id":"grude", - "olxid":"1892" + "name": "Grude", + "id": "grude", + "olxid": "1892" }, { - "name":"Ljubuški", - "id":"ljubuki", - "olxid":"2905" + "name": "Ljubuški", + "id": "ljubuki", + "olxid": "2905" }, { - "name":"Posušje", - "id":"posuje", - "olxid":"3268" + "name": "Posušje", + "id": "posuje", + "olxid": "3268" }, { - "name":"Široki Brijeg", - "id":"irokibrijeg", - "olxid":"2708" + "name": "Široki Brijeg", + "id": "irokibrijeg", + "olxid": "2708" } ] }, { - "name":" Livanjski", - "id":"livanjski", + "name": " Livanjski", + "id": "livanjski", "olxid": "10", - "municipalities":[ + "municipalities": [ { - "name":"Bosansko Grahovo", - "id":"bosanskograhovo", - "olxid":"560" + "name": "Bosansko Grahovo", + "id": "bosanskograhovo", + "olxid": "560" }, { - "name":"Drvar", - "id":"drvar", - "olxid":"4640" + "name": "Drvar", + "id": "drvar", + "olxid": "4640" }, { - "name":"Glamoč", - "id":"glamo", - "olxid":"1533" + "name": "Glamoč", + "id": "glamo", + "olxid": "1533" }, { - "name":"Kupres", - "id":"kupres", - "olxid":"2635" + "name": "Kupres", + "id": "kupres", + "olxid": "2635" }, { - "name":"Livno", - "id":"livno", - "olxid":"2741" + "name": "Livno", + "id": "livno", + "olxid": "2741" }, { - "name":"Tomislavgrad", - "id":"tomislavgrad", - "olxid":"1228" + "name": "Tomislavgrad", + "id": "tomislavgrad", + "olxid": "1228" } ] }, { - "name":" Banjalučka", - "id":"banjalučka", + "name": " Banjalučka", + "id": "banjalučka", "olxid": "14", - "municipalities":[ + "municipalities": [ { - "name":"Banja Luka", - "id":"banjaluka", - "olxid":"21" + "name": "Banja Luka", + "id": "banjaluka", + "olxid": "21" }, { - "name":"Gradiška", - "id":"gradika", - "olxid":"305" + "name": "Gradiška", + "id": "gradika", + "olxid": "305" }, { - "name":"Istočni Drvar", - "id":"istonidrvar", - "olxid":"4662" + "name": "Istočni Drvar", + "id": "istonidrvar", + "olxid": "4662" }, { - "name":"Jezero", - "id":"jezero", - "olxid":"1965" + "name": "Jezero", + "id": "jezero", + "olxid": "1965" }, { - "name":"Kneževo", - "id":"kneevo", - "olxid":"4147" + "name": "Kneževo", + "id": "kneevo", + "olxid": "4147" }, { - "name":"Kostajnica", - "id":"kostajnica", - "olxid":"6142" + "name": "Kostajnica", + "id": "kostajnica", + "olxid": "6142" }, { - "name":"Kotor Varoš", - "id":"kotorvaro", - "olxid":"2574" + "name": "Kotor Varoš", + "id": "kotorvaro", + "olxid": "2574" }, { - "name":"Kozarska Dubica", - "id":"kozarskadubica", - "olxid":"244" + "name": "Kozarska Dubica", + "id": "kozarskadubica", + "olxid": "244" }, { - "name":"Krupa na uni", - "id":"krupanauni", - "olxid":"382" + "name": "Krupa na uni", + "id": "krupanauni", + "olxid": "382" }, { - "name":"Kupres ", - "id":"kupres", - "olxid":"2654" + "name": "Kupres ", + "id": "kupres", + "olxid": "2654" }, { - "name":"Laktaši", - "id":"laktai", - "olxid":"2671" + "name": "Laktaši", + "id": "laktai", + "olxid": "2671" }, { - "name":"Mrkonjić Grad", - "id":"mrkonjigrad", - "olxid":"3073" + "name": "Mrkonjić Grad", + "id": "mrkonjigrad", + "olxid": "3073" }, { - "name":"Novi Grad", - "id":"novigrad", - "olxid":"444" + "name": "Novi Grad", + "id": "novigrad", + "olxid": "444" }, { - "name":"Oštra Luka", - "id":"otraluka", - "olxid":"3737" + "name": "Oštra Luka", + "id": "otraluka", + "olxid": "3737" }, { - "name":"Petrovac", - "id":"petrovac", - "olxid":"515" + "name": "Petrovac", + "id": "petrovac", + "olxid": "515" }, { - "name":"Prijedor", - "id":"prijedor", - "olxid":"3287" + "name": "Prijedor", + "id": "prijedor", + "olxid": "3287" }, { - "name":"Prnjavor", - "id":"prnjavor", - "olxid":"3358" + "name": "Prnjavor", + "id": "prnjavor", + "olxid": "3358" }, { - "name":"Ribnik", - "id":"ribnik", - "olxid":"2365" + "name": "Ribnik", + "id": "ribnik", + "olxid": "2365" }, { - "name":"Srbac", - "id":"srbac", - "olxid":"4271" + "name": "Srbac", + "id": "srbac", + "olxid": "4271" }, { - "name":"Čelinac", - "id":"elinac", - "olxid":"979" + "name": "Čelinac", + "id": "elinac", + "olxid": "979" }, { - "name":"Šipovo", - "id":"ipovo", - "olxid":"4509" + "name": "Šipovo", + "id": "ipovo", + "olxid": "4509" } ] }, { - "name":" Dobojsko-Bijeljinska", - "id":"dobojskobijeljinska", + "name": " Dobojsko-Bijeljinska", + "id": "dobojskobijeljinska", "olxid": "15", - "municipalities":[ + "municipalities": [ { - "name":"Bijeljina", - "id":"bijeljina", - "olxid":"123" + "name": "Bijeljina", + "id": "bijeljina", + "olxid": "123" }, { - "name":"Bosanski Brod", - "id":"bosanskibrod", - "olxid":"421" + "name": "Bosanski Brod", + "id": "bosanskibrod", + "olxid": "421" }, { - "name":"Derventa", - "id":"derventa", - "olxid":"1030" + "name": "Derventa", + "id": "derventa", + "olxid": "1030" }, { - "name":"Doboj", - "id":"doboj", - "olxid":"1088" + "name": "Doboj", + "id": "doboj", + "olxid": "1088" }, { - "name":"Donji Žabar", - "id":"donjiabar", - "olxid":"3254" + "name": "Donji Žabar", + "id": "donjiabar", + "olxid": "3254" }, { - "name":"Lopare", - "id":"lopare", - "olxid":"2800" + "name": "Lopare", + "id": "lopare", + "olxid": "2800" }, { - "name":"Lukavac", - "id":"lukavac", - "olxid":"6029" + "name": "Lukavac", + "id": "lukavac", + "olxid": "6029" }, { - "name":"Modriča", - "id":"modria", - "olxid":"2996" + "name": "Modriča", + "id": "modria", + "olxid": "2996" }, { - "name":"Pelagićevo", - "id":"pelagievo", - "olxid":"1856" + "name": "Pelagićevo", + "id": "pelagievo", + "olxid": "1856" }, { - "name":"Petrovo", - "id":"petrovo", - "olxid":"1827" + "name": "Petrovo", + "id": "petrovo", + "olxid": "1827" }, { - "name":"Stanari", - "id":"stanari", - "olxid":"1148" + "name": "Stanari", + "id": "stanari", + "olxid": "1148" }, { - "name":"Teslić", - "id":"tesli", - "olxid":"4549" + "name": "Teslić", + "id": "tesli", + "olxid": "4549" }, { - "name":"Tešanj", - "id":"teanj", - "olxid":"4636" + "name": "Tešanj", + "id": "teanj", + "olxid": "4636" }, { - "name":"Travnik", - "id":"travnik", - "olxid":"4692" + "name": "Travnik", + "id": "travnik", + "olxid": "4692" }, { - "name":"Tuzla", - "id":"tuzla", - "olxid":"4966" + "name": "Tuzla", + "id": "tuzla", + "olxid": "4966" }, { - "name":"Ugljevik", - "id":"ugljevik", - "olxid":"5009" + "name": "Ugljevik", + "id": "ugljevik", + "olxid": "5009" }, { - "name":"Vukosavlje", - "id":"vukosavlje", - "olxid":"3197" + "name": "Vukosavlje", + "id": "vukosavlje", + "olxid": "3197" }, { - "name":"Šamac", - "id":"amac", - "olxid":"539" + "name": "Šamac", + "id": "amac", + "olxid": "539" } ] }, { - "name":" Sarajevsko-Zvornička", - "id":"sarajevskozvornicka", + "name": " Sarajevsko-Zvornička", + "id": "sarajevskozvornicka", "olxid": "16", - "municipalities":[ + "municipalities": [ { - "name":"Bratunac", - "id":"bratunac", - "olxid":"595" + "name": "Bratunac", + "id": "bratunac", + "olxid": "595" }, { - "name":"Han Pijesak", - "id":"hanpijesak", - "olxid":"1904" + "name": "Han Pijesak", + "id": "hanpijesak", + "olxid": "1904" }, { - "name":"Ilijaš", - "id":"ilija", - "olxid":"3947" + "name": "Ilijaš", + "id": "ilija", + "olxid": "3947" }, { - "name":"Istočni Stari Grad", - "id":"istonistarigrad", - "olxid":"4049" + "name": "Istočni Stari Grad", + "id": "istonistarigrad", + "olxid": "4049" }, { - "name":"Kasindo", - "id":"kasindo", - "olxid":"3880" + "name": "Kasindo", + "id": "kasindo", + "olxid": "3880" }, { - "name":"Kladanj", - "id":"kladanj", - "olxid":"2325" + "name": "Kladanj", + "id": "kladanj", + "olxid": "2325" }, { - "name":"Lukavica", - "id":"lukavica", - "olxid":"3971" + "name": "Lukavica", + "id": "lukavica", + "olxid": "3971" }, { - "name":"Milići", - "id":"milii", - "olxid":"6143" + "name": "Milići", + "id": "milii", + "olxid": "6143" }, { - "name":"Olovo", - "id":"olovo", - "olxid":"3221" + "name": "Olovo", + "id": "olovo", + "olxid": "3221" }, { - "name":"Osmaci", - "id":"osmaci", - "olxid":"2128" + "name": "Osmaci", + "id": "osmaci", + "olxid": "2128" }, { - "name":"Pale", - "id":"pale", - "olxid":"3978" + "name": "Pale", + "id": "pale", + "olxid": "3978" }, { - "name":"Rogatica", - "id":"rogatica", - "olxid":"3529" + "name": "Rogatica", + "id": "rogatica", + "olxid": "3529" }, { - "name":"Rudo", - "id":"rudo", - "olxid":"3648" + "name": "Rudo", + "id": "rudo", + "olxid": "3648" }, { - "name":"Sarajevo-Novi Grad", - "id":"sarajevonovigrad", - "olxid":"6069" + "name": "Sarajevo-Novi Grad", + "id": "sarajevonovigrad", + "olxid": "6069" }, { - "name":"Sokolac", - "id":"sokolac", - "olxid":"4183" + "name": "Sokolac", + "id": "sokolac", + "olxid": "4183" }, { - "name":"Srebrenica", - "id":"srebrenica", - "olxid":"4310" + "name": "Srebrenica", + "id": "srebrenica", + "olxid": "4310" }, { - "name":"Trnovo", - "id":"trnovo", - "olxid":"4067" + "name": "Trnovo", + "id": "trnovo", + "olxid": "4067" }, { - "name":"Ustiprača", - "id":"ustipraa", - "olxid":"1593" + "name": "Ustiprača", + "id": "ustipraa", + "olxid": "1593" }, { - "name":"Višegrad", - "id":"viegrad", - "olxid":"5259" + "name": "Višegrad", + "id": "viegrad", + "olxid": "5259" }, { - "name":"Vlasenica", - "id":"vlasenica", - "olxid":"5456" + "name": "Vlasenica", + "id": "vlasenica", + "olxid": "5456" }, { - "name":"Zvornik", - "id":"zvornik", - "olxid":"5684" + "name": "Zvornik", + "id": "zvornik", + "olxid": "5684" }, { - "name":"Šekovići", - "id":"ekovii", - "olxid":"4475" + "name": "Šekovići", + "id": "ekovii", + "olxid": "4475" }, { - "name":"Žepa", - "id":"epa", - "olxid":"1906" + "name": "Žepa", + "id": "epa", + "olxid": "1906" } ] }, { - "name":" Trebinjsko-Fočanska", - "id":"trebinjskofocanska", + "name": " Trebinjsko-Fočanska", + "id": "trebinjskofocanska", "olxid": "17", - "municipalities":[ + "municipalities": [ { - "name":"Berkovići", - "id":"berkovii", - "olxid":"4441" + "name": "Berkovići", + "id": "berkovii", + "olxid": "4441" }, { - "name":"Bileća", - "id":"bilea", - "olxid":"183" + "name": "Bileća", + "id": "bilea", + "olxid": "183" }, { - "name":"Foča", - "id":"foa", - "olxid":"1287" + "name": "Foča", + "id": "foa", + "olxid": "1287" }, { - "name":"Gacko", - "id":"gacko", - "olxid":"1462" + "name": "Gacko", + "id": "gacko", + "olxid": "1462" }, { - "name":"Istočni Mostar", - "id":"istonimostar", - "olxid":"3038" + "name": "Istočni Mostar", + "id": "istonimostar", + "olxid": "3038" }, { - "name":"Kalinovik", - "id":"kalinovik", - "olxid":"2164" + "name": "Kalinovik", + "id": "kalinovik", + "olxid": "2164" }, { - "name":"Ljubinje", - "id":"ljubinje", - "olxid":"2884" + "name": "Ljubinje", + "id": "ljubinje", + "olxid": "2884" }, { - "name":"Nevesinje", - "id":"nevesinje", - "olxid":"3138" + "name": "Nevesinje", + "id": "nevesinje", + "olxid": "3138" }, { - "name":"Trebinje", - "id":"trebinje", - "olxid":"4766" + "name": "Trebinje", + "id": "trebinje", + "olxid": "4766" }, { - "name":"Čajniče", - "id":"ajnie", - "olxid":"911" + "name": "Čajniče", + "id": "ajnie", + "olxid": "911" } ] }, { - "name":"Distrikt Brčko", - "id":"distriktbrcko", + "name": "Distrikt Brčko", + "id": "distriktbrcko", "olxid": "12", - "municipalities":[ - { - "name":"Brčko", - "id":"brko", - "olxid":"12" + "municipalities": [ + { + "name": "Brčko", + "id": "brko", + "olxid": "12" } ] @@ -873,7 +873,7 @@ const regions = [ ]; const getRegions = () => { - return regions.map( (g) => ({ name: g.name, id: g.id, olxid: g.olxid }) ); + return regions.map((g) => ({ name: g.name, id: g.id, olxid: g.olxid })); }; const getRegion = (regionId) => { @@ -886,13 +886,27 @@ const getRegionName = (regionId) => { }; const getMunicipalitiesForRegion = (regionId) => { - const region = getRegion(regionId); - return (region && region.municipalities) ? region.municipalities : null; + const region = getRegion(regionId); + return (region && region.municipalities) ? region.municipalities : null; +}; + +const getMunicipality = (regionId, municipalityId) => { + const region = getRegion(regionId); + if (!region) { + return null; + } + + const municipality = region.municipalities.find(municipality => municipality.id === municipalityId); + if (!municipality) { + return null; + } + + return municipality; }; const getMunicipalityName = (regionId, municipalityId) => { const region = getRegion(regionId); - if (!region){ + if (!region) { return null; } @@ -905,8 +919,10 @@ const getMunicipalityName = (regionId, municipalityId) => { }; module.exports = { - getRegions, - getRegionName, - getMunicipalitiesForRegion, - getMunicipalityName, + getRegion, + getRegions, + getRegionName, + getMunicipalitiesForRegion, + getMunicipalityName, + getMunicipality }; diff --git a/app/helpers/crawlers/olixClawler.js b/app/helpers/crawlers/olixClawler.js deleted file mode 100644 index 0b9b01c..0000000 --- a/app/helpers/crawlers/olixClawler.js +++ /dev/null @@ -1,163 +0,0 @@ -const fetch = require('node-fetch'); -const cheerio = require('cheerio'); - -export default class OlxCrawler { - - constructor(fromPage = 0, toPage = 10, maxResults = 1000) { - this.fromPage = fromPage; - this.toPage = toPage; - this.maxResults = maxResults; - } - - async indexSingle(url) { - try { - const res = await fetch(url); - const body = await res.text(); - const $ = cheerio.load(body); - - const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); - - if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { - return null; - } - - const title = $('#naslovartikla').text(); - const category = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); - - const price = $('#pc > p:nth-child(2)').text(); - const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); - const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); - const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); - const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); - - const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); - const time = $('time').attr('datetime'); - const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); - - const descriptions = $('.artikal_detaljniopis_tekst'); - const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); - const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; - const imgRe = /href":("[^"]*")/g; - const matches = latLngRe.exec(body); - let lng = '', - lat = ''; - - const images = []; - const imgMatches = body.match(imgRe); - - const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) - const parsePrice = (price) => parseFloat(price.replace(".", "")) - - - for (let i = 0; imgMatches && i < imgMatches.length; i++) { - let img = imgMatches[i].replace("href\":", "") - img = img.replace("\"", ""); - img = img.replace("\"", ""); - images.push(img); - } - - const uploadPromises = images.map(img => { - const imgFixed = eval(`'${img}'`); - return cloudinary.uploader.upload(eval(`'${img}'`)); - }); - - const uploadResults = await Promise.all(uploadPromises); - const cloudinaryImages = uploadResults.map(ur => ur.url); - - if (matches && matches.length >= 3) { - lat = matches[1]; - lng = matches[2]; - } - - const parsedPrice = parsePrice(price); - let parsedRooms; - - if (rooms === 'Garsonjera') { - parsedRooms = 0; - } else { - parsedRooms = parseRooms(rooms); - } - - const data = { - category: this.getCategoryId(category), - url, - title, - price: isNaN(parsedPrice) ? price : parsedPrice, - size: parseFloat(size), - rooms: parsedRooms, - floor: parseInt(floor), - address, - location, - adType: AD_TYPE_SALE, - time, - shortDescription: descriptions.first().text(), - longDescription: descriptions.last().text(), - lat, - lng, - loc: [parseFloat(lat), parseFloat(lng)], - images: cloudinaryImages - }; - - return data; - } catch (e) { - console.error('Exception caught: ' + e.message); - } - - return null; - } - - async indexPage(url, pageNr, maxResults = 1000) { - try { - console.log('Starting to index page: ' + pageNr); - const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; - - const res = await fetch(url); - const body = await res.text(); - const $ = cheerio.load(body); - const hrefs = []; - const results = {}; - - $('#rezultatipretrage').find('.listitem').each((i, elem) => { - const href = $(elem).find('a').first().attr('href'); - hrefs.push(href); - }); - - let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; - - for (let i = 0; i < hrefs.length; i++) { - console.log(`indexing: ${hrefs[i]}`); - - const singleData = await this.indexSingle(hrefs[i]); - - if (singleData) { - results[hrefs[i]] = singleData; - } - await this.sleep(500); - } - - return results; - } catch (e) { - console.error('Exception caught:' + e); - } - } - - async sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); - } - - async indexPages(url, start, end, maxResults = 1000) { - let results = {}; - for (let i = start; i <= end; i++) { - let result = await this.indexPage(i, maxResults); - Object.assign(results, result) - await this.sleep(5000); - } - return results; - } - - async crawl() { - // TODO create URLS from db - let results = await this.indexPages(this.fromPage, this.toPage, this.maxResults); - return results; - } -} \ No newline at end of file diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js new file mode 100644 index 0000000..eb535ed --- /dev/null +++ b/app/helpers/crawlers/olxClawler.js @@ -0,0 +1,230 @@ +const fetch = require('node-fetch'); +const cheerio = require('cheerio'); +const { allRERequest, findPointInsideBoundingBox } = require('../url'); +const { getRealEstateTypeEnum } = require('../enums'); +const { getRegion, getMunicipality } = require('../codes') + +module.exports = class OlxCrawler { + + constructor(fromPage = 0, toPage = 10, maxResults = 1000) { + this.fromPage = fromPage; + this.toPage = toPage; + this.maxResults = maxResults; + } + + async indexSingle(url) { + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + + const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); + + // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { + // return null; + // } + + const title = $('#naslovartikla').text(); + const category = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + + const price = $('#pc > p:nth-child(2)').text(); + const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); + const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); + const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); + + const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); + const time = $('time').attr('datetime'); + const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); + + const descriptions = $('.artikal_detaljniopis_tekst'); + const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); + const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; + const imgRe = /href":("[^"]*")/g; + const matches = latLngRe.exec(body); + let lng = '', + lat = ''; + + const images = []; + const imgMatches = body.match(imgRe); + + const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) + const parsePrice = (price) => parseFloat(price.replace(".", "")) + + + for (let i = 0; imgMatches && i < imgMatches.length; i++) { + let img = imgMatches[i].replace("href\":", "") + img = img.replace("\"", ""); + img = img.replace("\"", ""); + images.push(img); + } + + // const uploadPromises = images.map(img => { + // const imgFixed = eval(`'${img}'`); + // return cloudinary.uploader.upload(eval(`'${img}'`)); + // }); + + // const uploadResults = await Promise.all(uploadPromises); + // const cloudinaryImages = uploadResults.map(ur => ur.url); + + if (matches && matches.length >= 3) { + lat = matches[1]; + lng = matches[2]; + } + + const parsedPrice = parsePrice(price); + let parsedRooms; + + if (rooms === 'Garsonjera') { + parsedRooms = 0; + } else { + parsedRooms = parseRooms(rooms); + } + + const data = { + // category: this.getCategoryId(category), + category: category, + url, + title, + price: isNaN(parsedPrice) ? price : parsedPrice, + size: parseFloat(size), + rooms: parsedRooms, + floor: parseInt(floor), + address, + location, + // adType: AD_TYPE_SALE, + time, + shortDescription: descriptions.first().text(), + longDescription: descriptions.last().text(), + lat, + lng, + loc: [parseFloat(lat), parseFloat(lng)], + // images: cloudinaryImages + }; + + return data; + } catch (e) { + console.error('Exception caught: ' + e.message); + } + + return null; + } + + + + async indexPage(url, maxResults = 1000) { + try { + // console.log('Starting to index page: ' + pageNr); + // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; + + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + const hrefs = []; + const results = []; + + $('#rezultatipretrage').find('.listitem').each((i, elem) => { + const href = $(elem).find('a').first().attr('href'); + hrefs.push(href); + }); + + let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; + + for (let i = 0; i < hrefs.length; i++) { + console.log(`indexing: ${hrefs[i]}`); + + const singleData = await this.indexSingle(hrefs[i]); + + if (singleData) { + results.push(singleData); + } + await this.sleep(500); + } + + return results; + } catch (e) { + console.error('Exception caught:' + e); + } + } + + // getCategoryId (category) { + // if (category === 'Stanovi') { + // return CATEGORY_FLAT; + // } else if (category === 'Zemljišta') { + // return CATEGORY_LAND; + // } else if (category === 'Kuće') { + // return CATEGORY_HOUSE; + // } else if (category === 'Poslovni prostori') { + // return CATEGORY_OFFICE; + // } + // } + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async indexPages(urls, start, end, maxResults = 1000) { + // let results = {}; + // for (let i = start; i <= end; i++) { + // let result = await this.indexPage(i, maxResults); + // Object.assign(results, result) + // await this.sleep(5000); + // } + // return results; + + let results = []; + for (let url of urls) { + let result = await this.indexPage(url, maxResults); + // Object.assign(results, result) + results.push(result); + await this.sleep(5000); + } + return results; + } + + async crawl() { + // TODO create URLS from db + const filteredResults = []; + const realestateRequests = await allRERequest() + const urls = this.createRequestUrls(realestateRequests); + let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); + + for (const result of results) { + console.log(result); + for (const re1 of result) { + if (re1.lat !== undefined) { + console.log(re1.lat); + const pointInsideBoundingBox = await findPointInsideBoundingBox([re1.lng, re1.lat]); + console.log(pointInsideBoundingBox); + } + } + } + // console.log(results); + + + + return results; + } + + createRequestUrls(realestateRequests) { + const urls = [] + + for (const request of realestateRequests) { + const realsestateType = "kategorija=" + getRealEstateTypeEnum(request.realEstateType).olxCategory; + const region = "kanton=" + getRegion(request.region).olxid; + const municipality = "grad%5B%5D=" + getMunicipality(request.region, request.municipality).olxid; + const sizeMin = "kvadrata_min=" + request.sizeMin; + const sizeMax = "kvadrata_max=" + request.sizeMax; + const priceMin = "od=" + request.priceMin; + const priceMax = "do=" + request.priceMax; + + + + const olxUrl = "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax + console.log(olxUrl); + urls.push(olxUrl); + } + + return urls; + } +}; \ No newline at end of file diff --git a/app/helpers/enums.js b/app/helpers/enums.js index 9ab2d58..ff78e09 100644 --- a/app/helpers/enums.js +++ b/app/helpers/enums.js @@ -1,7 +1,7 @@ const realEstateTypes = [ - { title: "Kuća", id: "kuca", hasGardenSize: true, olixCategory: 23 }, - { title: "Stan", id: "stan", hasGardenSize: false, olixCategory: 24}, - { title: "Vikendica", id: "vikendica", hasGardenSize: true, olixCategory: 26 } + { title: "Kuća", id: "kuca", hasGardenSize: true, olxCategory: 24 }, + { title: "Stan", id: "stan", hasGardenSize: false, olxCategory: 23}, + { title: "Vikendica", id: "vikendica", hasGardenSize: true, olxCategory: 26 } ]; const sizes = [ diff --git a/app/helpers/url.js b/app/helpers/url.js index 81a73b4..d36e04a 100644 --- a/app/helpers/url.js +++ b/app/helpers/url.js @@ -8,6 +8,16 @@ const currentRERequest = async (req) => { return request; }; +const allRERequest = async () => { + return await db.RealEstateRequest.findAll(); +} + +const findPointInsideBoundingBox = async (latLng) => { + return await db.sequelize.query("SELECT * FROM \"RealEstateRequests\" WHERE ST_Contains(\"RealEstateRequests\".bounding_box, ST_GEOMFROMTEXT(\'POINT (" + latLng[0] + " " + latLng[1]+ ")\'))"); +} + module.exports = { - currentRERequest + currentRERequest, + allRERequest, + findPointInsideBoundingBox }; diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index c6f0c82..d211f1f 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,8 +1,10 @@ +// import OlxCrawler from '../helpers/crawlers/olixClawler' +const OlxCrawler = require("../helpers/crawlers/olxClawler"); -var http = require('http'); const crawlers = [ - //new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), + new OlxCrawler(1, 2, 3), + // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), ]; async function crawlAll() { From c8ee848f0ed82a7ad383a202006839f1359cebb8 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Thu, 20 Jun 2019 10:57:37 +0200 Subject: [PATCH 06/13] Improved results filtering by lat lng --- app/helpers/crawlers/olxClawler.js | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index eb535ed..3584bf6 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -183,27 +183,24 @@ module.exports = class OlxCrawler { } async crawl() { - // TODO create URLS from db + const filteredResults = []; const realestateRequests = await allRERequest() const urls = this.createRequestUrls(realestateRequests); let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); for (const result of results) { - console.log(result); for (const re1 of result) { - if (re1.lat !== undefined) { - console.log(re1.lat); + if (re1.lat !== undefined && re1.lat !== null && re1.lat !== "") { const pointInsideBoundingBox = await findPointInsideBoundingBox([re1.lng, re1.lat]); - console.log(pointInsideBoundingBox); + + if (pointInsideBoundingBox[0].length !== 0) { + filteredResults.push(result); + } } } } - // console.log(results); - - - - return results; + return filteredResults; } createRequestUrls(realestateRequests) { @@ -218,8 +215,6 @@ module.exports = class OlxCrawler { const priceMin = "od=" + request.priceMin; const priceMax = "do=" + request.priceMax; - - const olxUrl = "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax console.log(olxUrl); urls.push(olxUrl); From 1bcc5e8e5d31541e703998d725317f16912de029 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Thu, 20 Jun 2019 14:51:14 +0200 Subject: [PATCH 07/13] Preparing to save results to db --- app/helpers/crawlers/olxClawler.js | 6 ++---- app/helpers/db/dbHelper.js | 10 ++++++++++ app/helpers/url.js | 2 +- app/models/marketalert.js | 6 ++++++ app/services/crawlerService.js | 29 +++++++++++++++++++++-------- 5 files changed, 40 insertions(+), 13 deletions(-) create mode 100644 app/helpers/db/dbHelper.js diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 3584bf6..075cffe 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -110,8 +110,6 @@ module.exports = class OlxCrawler { return null; } - - async indexPage(url, maxResults = 1000) { try { // console.log('Starting to index page: ' + pageNr); @@ -138,7 +136,7 @@ module.exports = class OlxCrawler { if (singleData) { results.push(singleData); } - await this.sleep(500); + // await this.sleep(500); } return results; @@ -177,7 +175,7 @@ module.exports = class OlxCrawler { let result = await this.indexPage(url, maxResults); // Object.assign(results, result) results.push(result); - await this.sleep(5000); + // await this.sleep(5000); } return results; } diff --git a/app/helpers/db/dbHelper.js b/app/helpers/db/dbHelper.js new file mode 100644 index 0000000..6c81004 --- /dev/null +++ b/app/helpers/db/dbHelper.js @@ -0,0 +1,10 @@ + +// const db = require('../../models/index'); + + +// const bulkInsert = async (reuslts) => { +// db.MarketAlert.bulkCreate({ + +// }) + +// } \ No newline at end of file diff --git a/app/helpers/url.js b/app/helpers/url.js index d36e04a..854d2a7 100644 --- a/app/helpers/url.js +++ b/app/helpers/url.js @@ -7,7 +7,7 @@ const currentRERequest = async (req) => { const request = await db.RealEstateRequest.findOne({ where: {uniqueId} }); return request; }; - +// TODO Fetch only subscribed realestate requests const allRERequest = async () => { return await db.RealEstateRequest.findAll(); } diff --git a/app/models/marketalert.js b/app/models/marketalert.js index c0665a3..3aa2d42 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -5,6 +5,12 @@ module.exports = (sequelize, DataTypes) => { realestateOrigin: DataTypes.STRING, originId: DataTypes.STRING, lastDate: DataTypes.STRING, + size : DataTypes.INTEGER, + gardenSize : DataTypes.INTEGER, + price : DataTypes.INTEGER, + municipailty : DataTypes.STRING, + region : DataTypes.STRING, + email: { type: DataTypes.STRING, allowNul: false diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index d211f1f..13a7f3b 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,6 +1,8 @@ // import OlxCrawler from '../helpers/crawlers/olixClawler' const OlxCrawler = require("../helpers/crawlers/olxClawler"); +const db = require("../models/index"); +const MarketAlert = require("../models/marketalert"); const crawlers = [ new OlxCrawler(1, 2, 3), @@ -8,22 +10,33 @@ const crawlers = [ ]; async function crawlAll() { + console.log(db.MarketAlert); for (let crawler of crawlers) { try { - let results = await crawler.crawl() - for (let saver of savers) { - try { - await saver.connect(); - await saver.save(results); - } catch (e) { - console.log("Error saving. Trying next saver! ", e); - } + let results = await crawler.crawl(); + + for (const result of results) { + + const newMAlert = Object.assign({}, MarketAlert) + console.log(newMAlert); + db.MarketAlert.bulkCreate( [{ + + }]) } + // for (let saver of savers) { + // try { + // await saver.connect(); + // await saver.save(results); + // } catch (e) { + // console.log("Error saving. Trying next saver! ", e); + // } + // } } catch (e) { console.log("Error crawling. Trying next crawler! ", e); } } + for (let saver of savers) { saver.close(); From 3c59292f2366794e0e603b664e0a5a9a9ec2d476 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Thu, 20 Jun 2019 21:27:51 +0200 Subject: [PATCH 08/13] refactoring --- app/services/crawlerService.js | 85 +++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 13a7f3b..6771012 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -5,42 +5,65 @@ const db = require("../models/index"); const MarketAlert = require("../models/marketalert"); const crawlers = [ - new OlxCrawler(1, 2, 3), - // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), - ]; + new OlxCrawler(1, 2, 3), + // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), +]; - async function crawlAll() { - console.log(db.MarketAlert); +async function crawlAll() { + const properties = db.MarketAlert.rawAttributes; + console.log(properties); - for (let crawler of crawlers) { - try { - let results = await crawler.crawl(); + for (let crawler of crawlers) { + try { + let results = await crawler.crawl(); + const marketAlerts = []; - for (const result of results) { + for (const result of results) { + console.log("This is result", result); + console.log("This is result", result.size); - const newMAlert = Object.assign({}, MarketAlert) - console.log(newMAlert); - db.MarketAlert.bulkCreate( [{ + // category: category, + // url, + // title, + // price: isNaN(parsedPrice) ? price : parsedPrice, + // size: parseFloat(size), + // rooms: parsedRooms, + // floor: parseInt(floor), + // address, + // location, + // // adType: AD_TYPE_SALE, + // time, + // shortDescription: descriptions.first().text(), + // longDescription: descriptions.last().text(), + // lat, + // lng, + // loc: [parseFloat(lat), parseFloat(lng)], - }]) - } - // for (let saver of savers) { - // try { - // await saver.connect(); - // await saver.save(results); - // } catch (e) { - // console.log("Error saving. Trying next saver! ", e); - // } - // } - } catch (e) { - console.log("Error crawling. Trying next crawler! ", e); + marketAlerts.push({ + url: result.url, + realestateOrigin: "OLX", + originId: "1", + size: "" + result.size, + price: result.price, + email: "em" + // lastDate: DataTypes.STRING, + // municipailty: DataTypes.STRING, + // region: DataTypes.STRING, + // gardenSize: DataTypes.INTEGER, + + + }) } - } - - - for (let saver of savers) { - saver.close(); + + try { + await db.MarketAlert.bulkCreate(marketAlerts); + } catch (e) { + console.log("Could not bulkCreate marketalers reason: ", e); + } + } catch (e) { + console.log("Error crawling. Trying next crawler! ", e); } } - - crawlAll(); \ No newline at end of file +} + +crawlAll(); \ No newline at end of file From 80ff9bcb6ba5f3a9112301f137099467691d3cd7 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Fri, 21 Jun 2019 15:14:43 +0200 Subject: [PATCH 09/13] saving additional fields, improved async functions with promises --- app/helpers/crawlers/olxClawler.js | 42 ++++---- app/models/marketalert.js | 2 +- app/services/crawlerService.js | 149 ++++++++++++++++++++++++++--- package-lock.json | 6 +- package.json | 1 + 5 files changed, 165 insertions(+), 35 deletions(-) diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 075cffe..e28f2a8 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -12,7 +12,7 @@ module.exports = class OlxCrawler { this.maxResults = maxResults; } - async indexSingle(url) { + async indexSingle(url, email) { try { const res = await fetch(url); const body = await res.text(); @@ -38,7 +38,7 @@ module.exports = class OlxCrawler { const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); const descriptions = $('.artikal_detaljniopis_tekst'); - const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); + // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const imgRe = /href":("[^"]*")/g; const matches = latLngRe.exec(body); @@ -73,25 +73,25 @@ module.exports = class OlxCrawler { } const parsedPrice = parsePrice(price); - let parsedRooms; - if (rooms === 'Garsonjera') { - parsedRooms = 0; - } else { - parsedRooms = parseRooms(rooms); - } + const locationArray = location.split(","); + const region = locationArray[0]; + const municipality = locationArray[1]; + console.log(location); + console.log(locationArray); const data = { // category: this.getCategoryId(category), + email : email, + olxId: olxId, category: category, url, title, price: isNaN(parsedPrice) ? price : parsedPrice, size: parseFloat(size), - rooms: parsedRooms, - floor: parseInt(floor), address, - location, + region, + municipality, // adType: AD_TYPE_SALE, time, shortDescription: descriptions.first().text(), @@ -110,12 +110,12 @@ module.exports = class OlxCrawler { return null; } - async indexPage(url, maxResults = 1000) { + async indexPage(olxUrl, maxResults = 1000) { try { // console.log('Starting to index page: ' + pageNr); // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; - const res = await fetch(url); + const res = await fetch(olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; @@ -131,7 +131,7 @@ module.exports = class OlxCrawler { for (let i = 0; i < hrefs.length; i++) { console.log(`indexing: ${hrefs[i]}`); - const singleData = await this.indexSingle(hrefs[i]); + const singleData = await this.indexSingle(hrefs[i], olxUrl.email); if (singleData) { results.push(singleData); @@ -193,11 +193,13 @@ module.exports = class OlxCrawler { const pointInsideBoundingBox = await findPointInsideBoundingBox([re1.lng, re1.lat]); if (pointInsideBoundingBox[0].length !== 0) { - filteredResults.push(result); + filteredResults.push(re1); } } } } + + console.log(filteredResults); return filteredResults; } @@ -213,11 +215,15 @@ module.exports = class OlxCrawler { const priceMin = "od=" + request.priceMin; const priceMax = "do=" + request.priceMax; - const olxUrl = "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax - console.log(olxUrl); + const olxUrl = { + url: "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax, + email: request.email + } + console.log(olxUrl.url); urls.push(olxUrl); } return urls; } -}; \ No newline at end of file +}; + diff --git a/app/models/marketalert.js b/app/models/marketalert.js index 3aa2d42..130f065 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -8,7 +8,7 @@ module.exports = (sequelize, DataTypes) => { size : DataTypes.INTEGER, gardenSize : DataTypes.INTEGER, price : DataTypes.INTEGER, - municipailty : DataTypes.STRING, + municipality : DataTypes.STRING, region : DataTypes.STRING, email: { diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 6771012..aae4ecc 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,26 +1,31 @@ // import OlxCrawler from '../helpers/crawlers/olixClawler' +var Promise = require("bluebird"); const OlxCrawler = require("../helpers/crawlers/olxClawler"); const db = require("../models/index"); const MarketAlert = require("../models/marketalert"); +const olxCrawler = new OlxCrawler(1, 2, 3); + const crawlers = [ - new OlxCrawler(1, 2, 3), + olxCrawler, // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), ]; async function crawlAll() { - const properties = db.MarketAlert.rawAttributes; - console.log(properties); - for (let crawler of crawlers) { + Promise.map(crawlers, function (crawler) { + + return crawler.crawl(); + + }).then(async (results) => { + // let results = await crawler.crawl(); try { - let results = await crawler.crawl(); const marketAlerts = []; + const mergedResults = [].concat.apply([], results); + + for (const result of mergedResults) { - for (const result of results) { - console.log("This is result", result); - console.log("This is result", result.size); // category: category, // url, @@ -39,16 +44,17 @@ async function crawlAll() { // lng, // loc: [parseFloat(lat), parseFloat(lng)], + marketAlerts.push({ url: result.url, realestateOrigin: "OLX", - originId: "1", - size: "" + result.size, + originId: result.olxId, + size: result.size, price: result.price, - email: "em" + email: result.email, // lastDate: DataTypes.STRING, - // municipailty: DataTypes.STRING, - // region: DataTypes.STRING, + municipality: result.municipality, + region:result.region, // gardenSize: DataTypes.INTEGER, @@ -64,6 +70,123 @@ async function crawlAll() { console.log("Error crawling. Trying next crawler! ", e); } } + + + ) +}; + + + + // Promise.all( + + + // ).then((results) => { + // console.log(results); + // console.log(results.length); + // console.log("Executing save results"); + // Promise.all([extractAndSaveResults(results)]).then(() => { + // console.log("Executed save results"); + // }); + // }); + + +// for (let crawler of crawlers) { +// try { +// let results = await crawler.crawl(); +// const marketAlerts = []; + +// for (const result of results) { +// console.log("This is result", result); +// console.log("This is result", result.size); + +// // category: category, +// // url, +// // title, +// // price: isNaN(parsedPrice) ? price : parsedPrice, +// // size: parseFloat(size), +// // rooms: parsedRooms, +// // floor: parseInt(floor), +// // address, +// // location, +// // // adType: AD_TYPE_SALE, +// // time, +// // shortDescription: descriptions.first().text(), +// // longDescription: descriptions.last().text(), +// // lat, +// // lng, +// // loc: [parseFloat(lat), parseFloat(lng)], + + +// marketAlerts.push({ +// url: result.url, +// realestateOrigin: "OLX", +// originId: result.olxId, +// size: result.size, +// price: result.price, +// email: "em" +// // lastDate: DataTypes.STRING, +// // municipailty: DataTypes.STRING, +// // region: DataTypes.STRING, +// // gardenSize: DataTypes.INTEGER, + + +// }) +// } + +// try { +// await db.MarketAlert.bulkCreate(marketAlerts); +// } catch (e) { +// console.log("Could not bulkCreate marketalers reason: ", e); +// } +// } catch (e) { +// console.log("Error crawling. Trying next crawler! ", e); +// } +// } +// } + +async function extractAndSaveResults(results) { + const marketAlerts = [] + + for (const result of results) { + // console.log("This is result", result); + // console.log("This is result", result.size); + + // category: category, + // url, + // title, + // price: isNaN(parsedPrice) ? price : parsedPrice, + // size: parseFloat(size), + // rooms: parsedRooms, + // floor: parseInt(floor), + // address, + // location, + // // adType: AD_TYPE_SALE, + // time, + // shortDescription: descriptions.first().text(), + // longDescription: descriptions.last().text(), + // lat, + // lng, + // loc: [parseFloat(lat), parseFloat(lng)], + + + marketAlerts.push({ + url: result.url, + realestateOrigin: "OLX", + originId: result.olxId, + size: result.size, + price: result.price, + email: "em" + // lastDate: DataTypes.STRING, + // municipailty: DataTypes.STRING, + // region: DataTypes.STRING, + // gardenSize: DataTypes.INTEGER, + + + }) + } + + return marketAlerts; + } crawlAll(); \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index a69443e..65de1ce 100644 --- a/package-lock.json +++ b/package-lock.json @@ -327,9 +327,9 @@ "dev": true }, "bluebird": { - "version": "3.5.3", - "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", - "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==" + "version": "3.5.5", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.5.tgz", + "integrity": "sha512-5am6HnnfN+urzt4yfg7IgTbotDjIT/u8AJpEt0sIU9FtXfVeezXAPKswrG+xKUCOYAINpSdgZVDU6QFh+cuH3w==" }, "body-parser": { "version": "1.18.3", diff --git a/package.json b/package.json index a8b5aa4..360b7bd 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "2checkout-node": "0.0.1", "@sendgrid/mail": "^6.3.1", "aws-sdk": "^2.422.0", + "bluebird": "^3.5.5", "cheerio": "^1.0.0-rc.2", "compression": "^1.7.4", "dotenv": "^7.0.0", From 2f474619caef8773ccf9ab3bb9958946f46f156b Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Fri, 21 Jun 2019 16:48:19 +0200 Subject: [PATCH 10/13] Compare crawler results with db, and only save new if necessary --- app/helpers/crawlers/olxClawler.js | 58 ++--- ...90621162321-add-category-to-marketalert.js | 20 ++ app/models/marketalert.js | 1 + app/services/crawlerService.js | 210 +++--------------- 4 files changed, 86 insertions(+), 203 deletions(-) create mode 100644 app/migrations/20190621162321-add-category-to-marketalert.js diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index e28f2a8..365813f 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -3,9 +3,10 @@ const cheerio = require('cheerio'); const { allRERequest, findPointInsideBoundingBox } = require('../url'); const { getRealEstateTypeEnum } = require('../enums'); const { getRegion, getMunicipality } = require('../codes') +const Promise = require("bluebird"); module.exports = class OlxCrawler { - + //TODO figure best way to handle paging constructor(fromPage = 0, toPage = 10, maxResults = 1000) { this.fromPage = fromPage; this.toPage = toPage; @@ -18,19 +19,22 @@ module.exports = class OlxCrawler { const body = await res.text(); const $ = cheerio.load(body); + //TODO figure out what to do with username const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { // return null; // } + //TODO remove properties that are not needed, and add some if they are missing const title = $('#naslovartikla').text(); - const category = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); const price = $('#pc > p:nth-child(2)').text(); const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); @@ -45,19 +49,22 @@ module.exports = class OlxCrawler { let lng = '', lat = ''; - const images = []; - const imgMatches = body.match(imgRe); const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) const parsePrice = (price) => parseFloat(price.replace(".", "")) - for (let i = 0; imgMatches && i < imgMatches.length; i++) { - let img = imgMatches[i].replace("href\":", "") - img = img.replace("\"", ""); - img = img.replace("\"", ""); - images.push(img); - } + // TODO we dont save images ?? + + // const images = []; + // const imgMatches = body.match(imgRe); + + // for (let i = 0; imgMatches && i < imgMatches.length; i++) { + // let img = imgMatches[i].replace("href\":", "") + // img = img.replace("\"", ""); + // img = img.replace("\"", ""); + // images.push(img); + // } // const uploadPromises = images.map(img => { // const imgFixed = eval(`'${img}'`); @@ -77,18 +84,17 @@ module.exports = class OlxCrawler { const locationArray = location.split(","); const region = locationArray[0]; const municipality = locationArray[1]; - console.log(location); - console.log(locationArray); const data = { - // category: this.getCategoryId(category), + realEstateType: this.getCategoryId(realEstateType), email : email, olxId: olxId, - category: category, + // category: category, url, title, price: isNaN(parsedPrice) ? price : parsedPrice, size: parseFloat(size), + gardenSize: parseFloat(gardenSize), address, region, municipality, @@ -145,21 +151,15 @@ module.exports = class OlxCrawler { } } - // getCategoryId (category) { - // if (category === 'Stanovi') { - // return CATEGORY_FLAT; - // } else if (category === 'Zemljišta') { - // return CATEGORY_LAND; - // } else if (category === 'Kuće') { - // return CATEGORY_HOUSE; - // } else if (category === 'Poslovni prostori') { - // return CATEGORY_OFFICE; - // } - // } - - async sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); - } + getCategoryId (category) { + if (category === 'Stanovi') { + return 'stan'; + } else if (category === 'Vikendice') { + return 'vikendica'; + } else if (category === 'Kuće') { + return 'kuca'; + } + } async indexPages(urls, start, end, maxResults = 1000) { // let results = {}; diff --git a/app/migrations/20190621162321-add-category-to-marketalert.js b/app/migrations/20190621162321-add-category-to-marketalert.js new file mode 100644 index 0000000..d8a49f5 --- /dev/null +++ b/app/migrations/20190621162321-add-category-to-marketalert.js @@ -0,0 +1,20 @@ +'use strict'; + +module.exports = { + up: (queryInterface, Sequelize) => { + return queryInterface.addColumn( + 'MarketAlerts', + 'realEstateType', + { + type: Sequelize.STRING + } + ); + }, + + down: (queryInterface, Sequelize) => { + return queryInterface.removeColumn( + 'MarketAlerts', + 'realEstateType' + ); + } +}; diff --git a/app/models/marketalert.js b/app/models/marketalert.js index 130f065..9f1e092 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -10,6 +10,7 @@ module.exports = (sequelize, DataTypes) => { price : DataTypes.INTEGER, municipality : DataTypes.STRING, region : DataTypes.STRING, + realEstateType : DataTypes.STRING, email: { type: DataTypes.STRING, diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index aae4ecc..7d210c6 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,192 +1,54 @@ -// import OlxCrawler from '../helpers/crawlers/olixClawler' -var Promise = require("bluebird"); +const Promise = require("bluebird"); const OlxCrawler = require("../helpers/crawlers/olxClawler"); const db = require("../models/index"); -const MarketAlert = require("../models/marketalert"); const olxCrawler = new OlxCrawler(1, 2, 3); const crawlers = [ - olxCrawler, - // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), + olxCrawler, ]; async function crawlAll() { - Promise.map(crawlers, function (crawler) { + Promise.map(crawlers, function (crawler) { + return crawler.crawl(); + }).then(async (results) => { - return crawler.crawl(); + try { - }).then(async (results) => { - // let results = await crawler.crawl(); - try { - const marketAlerts = []; - const mergedResults = [].concat.apply([], results); + const marketAlertsFromDb = await db.MarketAlert.findAll(); - for (const result of mergedResults) { + const marketAlerts = []; + const mergedResults = [].concat.apply([], results); - - // category: category, - // url, - // title, - // price: isNaN(parsedPrice) ? price : parsedPrice, - // size: parseFloat(size), - // rooms: parsedRooms, - // floor: parseInt(floor), - // address, - // location, - // // adType: AD_TYPE_SALE, - // time, - // shortDescription: descriptions.first().text(), - // longDescription: descriptions.last().text(), - // lat, - // lng, - // loc: [parseFloat(lat), parseFloat(lng)], - - - marketAlerts.push({ - url: result.url, - realestateOrigin: "OLX", - originId: result.olxId, - size: result.size, - price: result.price, - email: result.email, - // lastDate: DataTypes.STRING, - municipality: result.municipality, - region:result.region, - // gardenSize: DataTypes.INTEGER, - - - }) - } - - try { - await db.MarketAlert.bulkCreate(marketAlerts); - } catch (e) { - console.log("Could not bulkCreate marketalers reason: ", e); - } - } catch (e) { - console.log("Error crawling. Trying next crawler! ", e); - } - } - - - ) + for (const result of mergedResults) { + marketAlerts.push({ + url: result.url, + realestateOrigin: "OLX", + originId: result.olxId, + size: result.size, + price: result.price, + email: result.email, + // lastDate: DataTypes.STRING, + municipality: result.municipality, + region: result.region, + gardenSize: result.gardenSize, + realEstateType: result.realEstateType + }) + } + try { + const filteredMarketAlerts = marketAlerts.filter((elem) => !marketAlertsFromDb.find(({ url }) => elem.url === url)); + await db.MarketAlert.bulkCreate(filteredMarketAlerts); + process.exit() + } catch (e) { + console.log("Could not bulkCreate marketalers reason: ", e); + } + } catch (e) { + console.log("Error crawling. Trying next crawler! ", e); + } + }) }; +crawlAll(); - - // Promise.all( - - - // ).then((results) => { - // console.log(results); - // console.log(results.length); - // console.log("Executing save results"); - // Promise.all([extractAndSaveResults(results)]).then(() => { - // console.log("Executed save results"); - // }); - // }); - - -// for (let crawler of crawlers) { -// try { -// let results = await crawler.crawl(); -// const marketAlerts = []; - -// for (const result of results) { -// console.log("This is result", result); -// console.log("This is result", result.size); - -// // category: category, -// // url, -// // title, -// // price: isNaN(parsedPrice) ? price : parsedPrice, -// // size: parseFloat(size), -// // rooms: parsedRooms, -// // floor: parseInt(floor), -// // address, -// // location, -// // // adType: AD_TYPE_SALE, -// // time, -// // shortDescription: descriptions.first().text(), -// // longDescription: descriptions.last().text(), -// // lat, -// // lng, -// // loc: [parseFloat(lat), parseFloat(lng)], - - -// marketAlerts.push({ -// url: result.url, -// realestateOrigin: "OLX", -// originId: result.olxId, -// size: result.size, -// price: result.price, -// email: "em" -// // lastDate: DataTypes.STRING, -// // municipailty: DataTypes.STRING, -// // region: DataTypes.STRING, -// // gardenSize: DataTypes.INTEGER, - - -// }) -// } - -// try { -// await db.MarketAlert.bulkCreate(marketAlerts); -// } catch (e) { -// console.log("Could not bulkCreate marketalers reason: ", e); -// } -// } catch (e) { -// console.log("Error crawling. Trying next crawler! ", e); -// } -// } -// } - -async function extractAndSaveResults(results) { - const marketAlerts = [] - - for (const result of results) { - // console.log("This is result", result); - // console.log("This is result", result.size); - - // category: category, - // url, - // title, - // price: isNaN(parsedPrice) ? price : parsedPrice, - // size: parseFloat(size), - // rooms: parsedRooms, - // floor: parseInt(floor), - // address, - // location, - // // adType: AD_TYPE_SALE, - // time, - // shortDescription: descriptions.first().text(), - // longDescription: descriptions.last().text(), - // lat, - // lng, - // loc: [parseFloat(lat), parseFloat(lng)], - - - marketAlerts.push({ - url: result.url, - realestateOrigin: "OLX", - originId: result.olxId, - size: result.size, - price: result.price, - email: "em" - // lastDate: DataTypes.STRING, - // municipailty: DataTypes.STRING, - // region: DataTypes.STRING, - // gardenSize: DataTypes.INTEGER, - - - }) - } - - return marketAlerts; - -} - -crawlAll(); \ No newline at end of file From 6eba5c2a97f26a2350e8a6fd46d96b6d856df921 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Mon, 24 Jun 2019 11:49:13 +0200 Subject: [PATCH 11/13] gardenSize nan --- app/services/crawlerService.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 7d210c6..49ad166 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -26,14 +26,14 @@ async function crawlAll() { marketAlerts.push({ url: result.url, realestateOrigin: "OLX", - originId: result.olxId, + originId: 1, size: result.size, price: result.price, email: result.email, // lastDate: DataTypes.STRING, municipality: result.municipality, region: result.region, - gardenSize: result.gardenSize, + gardenSize: isNaN(result.gardenSize) ? 0 : result.gardenSize, realEstateType: result.realEstateType }) } From 2cf6f6f1ff6b092a5cc58d8144b135945bd59d4a Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Mon, 24 Jun 2019 14:20:31 +0200 Subject: [PATCH 12/13] Code refactoring, fixed bug with price parsing: --- app/helpers/crawlers/olxClawler.js | 43 +++++++++++++++++------------- app/helpers/db/dbHelper.js | 19 ++++++++----- app/services/crawlerService.js | 1 + 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 365813f..d6b7292 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -1,6 +1,6 @@ const fetch = require('node-fetch'); const cheerio = require('cheerio'); -const { allRERequest, findPointInsideBoundingBox } = require('../url'); +const { allRERequest, findPointInsideBoundingBox } = require('../db/dbHelper'); const { getRealEstateTypeEnum } = require('../enums'); const { getRegion, getMunicipality } = require('../codes') const Promise = require("bluebird"); @@ -92,9 +92,9 @@ module.exports = class OlxCrawler { // category: category, url, title, - price: isNaN(parsedPrice) ? price : parsedPrice, + price: isNaN(parsedPrice) ? 0 : parsedPrice, size: parseFloat(size), - gardenSize: parseFloat(gardenSize), + gardenSize: isNaN(parseFloat(gardenSize)) ? parseFloat(gardenSize) : 0, address, region, municipality, @@ -118,6 +118,7 @@ module.exports = class OlxCrawler { async indexPage(olxUrl, maxResults = 1000) { try { + //TODO fix paging // console.log('Starting to index page: ' + pageNr); // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; @@ -142,7 +143,6 @@ module.exports = class OlxCrawler { if (singleData) { results.push(singleData); } - // await this.sleep(500); } return results; @@ -152,30 +152,35 @@ module.exports = class OlxCrawler { } getCategoryId (category) { - if (category === 'Stanovi') { - return 'stan'; - } else if (category === 'Vikendice') { - return 'vikendica'; - } else if (category === 'Kuće') { - return 'kuca'; - } + + switch(category) { + case 'Stanovi': + return 'stan'; + + case 'Vikendice': + return 'vikendica' + + case 'Kuće': + return 'kuca'; + + default: + return ''; + } } async indexPages(urls, start, end, maxResults = 1000) { + //TODO fix paging // let results = {}; // for (let i = start; i <= end; i++) { // let result = await this.indexPage(i, maxResults); // Object.assign(results, result) - // await this.sleep(5000); // } // return results; let results = []; for (let url of urls) { let result = await this.indexPage(url, maxResults); - // Object.assign(results, result) results.push(result); - // await this.sleep(5000); } return results; } @@ -188,12 +193,12 @@ module.exports = class OlxCrawler { let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); for (const result of results) { - for (const re1 of result) { - if (re1.lat !== undefined && re1.lat !== null && re1.lat !== "") { - const pointInsideBoundingBox = await findPointInsideBoundingBox([re1.lng, re1.lat]); + for (const finalResult of result) { + if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { + const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat]); if (pointInsideBoundingBox[0].length !== 0) { - filteredResults.push(re1); + filteredResults.push(finalResult); } } } @@ -216,7 +221,7 @@ module.exports = class OlxCrawler { const priceMax = "do=" + request.priceMax; const olxUrl = { - url: "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax, + url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}`, email: request.email } console.log(olxUrl.url); diff --git a/app/helpers/db/dbHelper.js b/app/helpers/db/dbHelper.js index 6c81004..f51638b 100644 --- a/app/helpers/db/dbHelper.js +++ b/app/helpers/db/dbHelper.js @@ -1,10 +1,15 @@ +const db = require('../../models/index'); -// const db = require('../../models/index'); +// TODO Fetch only subscribed realestate requests +const allRERequest = async () => { + return await db.RealEstateRequest.findAll(); +} +const findPointInsideBoundingBox = async (latLng) => { + return await db.sequelize.query("SELECT * FROM \"RealEstateRequests\" WHERE ST_Contains(\"RealEstateRequests\".bounding_box, ST_GEOMFROMTEXT(\'POINT (" + latLng[0] + " " + latLng[1]+ ")\'))"); +} -// const bulkInsert = async (reuslts) => { -// db.MarketAlert.bulkCreate({ - -// }) - -// } \ No newline at end of file +module.exports = { + allRERequest, + findPointInsideBoundingBox +}; diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 49ad166..5eb160b 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -38,6 +38,7 @@ async function crawlAll() { }) } try { + console.log(marketAlerts); const filteredMarketAlerts = marketAlerts.filter((elem) => !marketAlertsFromDb.find(({ url }) => elem.url === url)); await db.MarketAlert.bulkCreate(filteredMarketAlerts); process.exit() From 1aa91fb4e2498b91c996bf3b49e9e97e7a4b6bca Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Mon, 24 Jun 2019 15:34:59 +0200 Subject: [PATCH 13/13] Fixed gardenSize --- app/helpers/crawlers/olxClawler.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index d6b7292..1f7ea1b 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -94,7 +94,7 @@ module.exports = class OlxCrawler { title, price: isNaN(parsedPrice) ? 0 : parsedPrice, size: parseFloat(size), - gardenSize: isNaN(parseFloat(gardenSize)) ? parseFloat(gardenSize) : 0, + gardenSize: isNaN(parseFloat(gardenSize)) ? 0 : parseFloat(gardenSize), address, region, municipality,