From 80ff9bcb6ba5f3a9112301f137099467691d3cd7 Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Fri, 21 Jun 2019 15:14:43 +0200 Subject: [PATCH] saving additional fields, improved async functions with promises --- app/helpers/crawlers/olxClawler.js | 42 ++++---- app/models/marketalert.js | 2 +- app/services/crawlerService.js | 149 ++++++++++++++++++++++++++--- package-lock.json | 6 +- package.json | 1 + 5 files changed, 165 insertions(+), 35 deletions(-) diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 075cffe..e28f2a8 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -12,7 +12,7 @@ module.exports = class OlxCrawler { this.maxResults = maxResults; } - async indexSingle(url) { + async indexSingle(url, email) { try { const res = await fetch(url); const body = await res.text(); @@ -38,7 +38,7 @@ module.exports = class OlxCrawler { const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); const descriptions = $('.artikal_detaljniopis_tekst'); - const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); + // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const imgRe = /href":("[^"]*")/g; const matches = latLngRe.exec(body); @@ -73,25 +73,25 @@ module.exports = class OlxCrawler { } const parsedPrice = parsePrice(price); - let parsedRooms; - if (rooms === 'Garsonjera') { - parsedRooms = 0; - } else { - parsedRooms = parseRooms(rooms); - } + const locationArray = location.split(","); + const region = locationArray[0]; + const municipality = locationArray[1]; + console.log(location); + console.log(locationArray); const data = { // category: this.getCategoryId(category), + email : email, + olxId: olxId, category: category, url, title, price: isNaN(parsedPrice) ? price : parsedPrice, size: parseFloat(size), - rooms: parsedRooms, - floor: parseInt(floor), address, - location, + region, + municipality, // adType: AD_TYPE_SALE, time, shortDescription: descriptions.first().text(), @@ -110,12 +110,12 @@ module.exports = class OlxCrawler { return null; } - async indexPage(url, maxResults = 1000) { + async indexPage(olxUrl, maxResults = 1000) { try { // console.log('Starting to index page: ' + pageNr); // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; - const res = await fetch(url); + const res = await fetch(olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; @@ -131,7 +131,7 @@ module.exports = class OlxCrawler { for (let i = 0; i < hrefs.length; i++) { console.log(`indexing: ${hrefs[i]}`); - const singleData = await this.indexSingle(hrefs[i]); + const singleData = await this.indexSingle(hrefs[i], olxUrl.email); if (singleData) { results.push(singleData); @@ -193,11 +193,13 @@ module.exports = class OlxCrawler { const pointInsideBoundingBox = await findPointInsideBoundingBox([re1.lng, re1.lat]); if (pointInsideBoundingBox[0].length !== 0) { - filteredResults.push(result); + filteredResults.push(re1); } } } } + + console.log(filteredResults); return filteredResults; } @@ -213,11 +215,15 @@ module.exports = class OlxCrawler { const priceMin = "od=" + request.priceMin; const priceMax = "do=" + request.priceMax; - const olxUrl = "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax - console.log(olxUrl); + const olxUrl = { + url: "https://www.olx.ba/pretraga?" + realsestateType + "&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&" + region + "&" + municipality + "&" + priceMin + "&" + priceMax + "&vrsta=samoprodaja&" + sizeMin + "&" + sizeMax, + email: request.email + } + console.log(olxUrl.url); urls.push(olxUrl); } return urls; } -}; \ No newline at end of file +}; + diff --git a/app/models/marketalert.js b/app/models/marketalert.js index 3aa2d42..130f065 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -8,7 +8,7 @@ module.exports = (sequelize, DataTypes) => { size : DataTypes.INTEGER, gardenSize : DataTypes.INTEGER, price : DataTypes.INTEGER, - municipailty : DataTypes.STRING, + municipality : DataTypes.STRING, region : DataTypes.STRING, email: { diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 6771012..aae4ecc 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,26 +1,31 @@ // import OlxCrawler from '../helpers/crawlers/olixClawler' +var Promise = require("bluebird"); const OlxCrawler = require("../helpers/crawlers/olxClawler"); const db = require("../models/index"); const MarketAlert = require("../models/marketalert"); +const olxCrawler = new OlxCrawler(1, 2, 3); + const crawlers = [ - new OlxCrawler(1, 2, 3), + olxCrawler, // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), ]; async function crawlAll() { - const properties = db.MarketAlert.rawAttributes; - console.log(properties); - for (let crawler of crawlers) { + Promise.map(crawlers, function (crawler) { + + return crawler.crawl(); + + }).then(async (results) => { + // let results = await crawler.crawl(); try { - let results = await crawler.crawl(); const marketAlerts = []; + const mergedResults = [].concat.apply([], results); + + for (const result of mergedResults) { - for (const result of results) { - console.log("This is result", result); - console.log("This is result", result.size); // category: category, // url, @@ -39,16 +44,17 @@ async function crawlAll() { // lng, // loc: [parseFloat(lat), parseFloat(lng)], + marketAlerts.push({ url: result.url, realestateOrigin: "OLX", - originId: "1", - size: "" + result.size, + originId: result.olxId, + size: result.size, price: result.price, - email: "em" + email: result.email, // lastDate: DataTypes.STRING, - // municipailty: DataTypes.STRING, - // region: DataTypes.STRING, + municipality: result.municipality, + region:result.region, // gardenSize: DataTypes.INTEGER, @@ -64,6 +70,123 @@ async function crawlAll() { console.log("Error crawling. Trying next crawler! ", e); } } + + + ) +}; + + + + // Promise.all( + + + // ).then((results) => { + // console.log(results); + // console.log(results.length); + // console.log("Executing save results"); + // Promise.all([extractAndSaveResults(results)]).then(() => { + // console.log("Executed save results"); + // }); + // }); + + +// for (let crawler of crawlers) { +// try { +// let results = await crawler.crawl(); +// const marketAlerts = []; + +// for (const result of results) { +// console.log("This is result", result); +// console.log("This is result", result.size); + +// // category: category, +// // url, +// // title, +// // price: isNaN(parsedPrice) ? price : parsedPrice, +// // size: parseFloat(size), +// // rooms: parsedRooms, +// // floor: parseInt(floor), +// // address, +// // location, +// // // adType: AD_TYPE_SALE, +// // time, +// // shortDescription: descriptions.first().text(), +// // longDescription: descriptions.last().text(), +// // lat, +// // lng, +// // loc: [parseFloat(lat), parseFloat(lng)], + + +// marketAlerts.push({ +// url: result.url, +// realestateOrigin: "OLX", +// originId: result.olxId, +// size: result.size, +// price: result.price, +// email: "em" +// // lastDate: DataTypes.STRING, +// // municipailty: DataTypes.STRING, +// // region: DataTypes.STRING, +// // gardenSize: DataTypes.INTEGER, + + +// }) +// } + +// try { +// await db.MarketAlert.bulkCreate(marketAlerts); +// } catch (e) { +// console.log("Could not bulkCreate marketalers reason: ", e); +// } +// } catch (e) { +// console.log("Error crawling. Trying next crawler! ", e); +// } +// } +// } + +async function extractAndSaveResults(results) { + const marketAlerts = [] + + for (const result of results) { + // console.log("This is result", result); + // console.log("This is result", result.size); + + // category: category, + // url, + // title, + // price: isNaN(parsedPrice) ? price : parsedPrice, + // size: parseFloat(size), + // rooms: parsedRooms, + // floor: parseInt(floor), + // address, + // location, + // // adType: AD_TYPE_SALE, + // time, + // shortDescription: descriptions.first().text(), + // longDescription: descriptions.last().text(), + // lat, + // lng, + // loc: [parseFloat(lat), parseFloat(lng)], + + + marketAlerts.push({ + url: result.url, + realestateOrigin: "OLX", + originId: result.olxId, + size: result.size, + price: result.price, + email: "em" + // lastDate: DataTypes.STRING, + // municipailty: DataTypes.STRING, + // region: DataTypes.STRING, + // gardenSize: DataTypes.INTEGER, + + + }) + } + + return marketAlerts; + } crawlAll(); \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index a69443e..65de1ce 100644 --- a/package-lock.json +++ b/package-lock.json @@ -327,9 +327,9 @@ "dev": true }, "bluebird": { - "version": "3.5.3", - "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.3.tgz", - "integrity": "sha512-/qKPUQlaW1OyR51WeCPBvRnAlnZFUJkCSG5HzGnuIqhgyJtF+T94lFnn33eiazjRm2LAHVy2guNnaq48X9SJuw==" + "version": "3.5.5", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.5.tgz", + "integrity": "sha512-5am6HnnfN+urzt4yfg7IgTbotDjIT/u8AJpEt0sIU9FtXfVeezXAPKswrG+xKUCOYAINpSdgZVDU6QFh+cuH3w==" }, "body-parser": { "version": "1.18.3", diff --git a/package.json b/package.json index a8b5aa4..360b7bd 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "2checkout-node": "0.0.1", "@sendgrid/mail": "^6.3.1", "aws-sdk": "^2.422.0", + "bluebird": "^3.5.5", "cheerio": "^1.0.0-rc.2", "compression": "^1.7.4", "dotenv": "^7.0.0",