diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index e28f2a8..365813f 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -3,9 +3,10 @@ const cheerio = require('cheerio'); const { allRERequest, findPointInsideBoundingBox } = require('../url'); const { getRealEstateTypeEnum } = require('../enums'); const { getRegion, getMunicipality } = require('../codes') +const Promise = require("bluebird"); module.exports = class OlxCrawler { - + //TODO figure best way to handle paging constructor(fromPage = 0, toPage = 10, maxResults = 1000) { this.fromPage = fromPage; this.toPage = toPage; @@ -18,19 +19,22 @@ module.exports = class OlxCrawler { const body = await res.text(); const $ = cheerio.load(body); + //TODO figure out what to do with username const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { // return null; // } + //TODO remove properties that are not needed, and add some if they are missing const title = $('#naslovartikla').text(); - const category = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); const price = $('#pc > p:nth-child(2)').text(); const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); @@ -45,19 +49,22 @@ module.exports = class OlxCrawler { let lng = '', lat = ''; - const images = []; - const imgMatches = body.match(imgRe); const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) const parsePrice = (price) => parseFloat(price.replace(".", "")) - for (let i = 0; imgMatches && i < imgMatches.length; i++) { - let img = imgMatches[i].replace("href\":", "") - img = img.replace("\"", ""); - img = img.replace("\"", ""); - images.push(img); - } + // TODO we dont save images ?? + + // const images = []; + // const imgMatches = body.match(imgRe); + + // for (let i = 0; imgMatches && i < imgMatches.length; i++) { + // let img = imgMatches[i].replace("href\":", "") + // img = img.replace("\"", ""); + // img = img.replace("\"", ""); + // images.push(img); + // } // const uploadPromises = images.map(img => { // const imgFixed = eval(`'${img}'`); @@ -77,18 +84,17 @@ module.exports = class OlxCrawler { const locationArray = location.split(","); const region = locationArray[0]; const municipality = locationArray[1]; - console.log(location); - console.log(locationArray); const data = { - // category: this.getCategoryId(category), + realEstateType: this.getCategoryId(realEstateType), email : email, olxId: olxId, - category: category, + // category: category, url, title, price: isNaN(parsedPrice) ? price : parsedPrice, size: parseFloat(size), + gardenSize: parseFloat(gardenSize), address, region, municipality, @@ -145,21 +151,15 @@ module.exports = class OlxCrawler { } } - // getCategoryId (category) { - // if (category === 'Stanovi') { - // return CATEGORY_FLAT; - // } else if (category === 'Zemljišta') { - // return CATEGORY_LAND; - // } else if (category === 'Kuće') { - // return CATEGORY_HOUSE; - // } else if (category === 'Poslovni prostori') { - // return CATEGORY_OFFICE; - // } - // } - - async sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); - } + getCategoryId (category) { + if (category === 'Stanovi') { + return 'stan'; + } else if (category === 'Vikendice') { + return 'vikendica'; + } else if (category === 'Kuće') { + return 'kuca'; + } + } async indexPages(urls, start, end, maxResults = 1000) { // let results = {}; diff --git a/app/migrations/20190621162321-add-category-to-marketalert.js b/app/migrations/20190621162321-add-category-to-marketalert.js new file mode 100644 index 0000000..d8a49f5 --- /dev/null +++ b/app/migrations/20190621162321-add-category-to-marketalert.js @@ -0,0 +1,20 @@ +'use strict'; + +module.exports = { + up: (queryInterface, Sequelize) => { + return queryInterface.addColumn( + 'MarketAlerts', + 'realEstateType', + { + type: Sequelize.STRING + } + ); + }, + + down: (queryInterface, Sequelize) => { + return queryInterface.removeColumn( + 'MarketAlerts', + 'realEstateType' + ); + } +}; diff --git a/app/models/marketalert.js b/app/models/marketalert.js index 130f065..9f1e092 100644 --- a/app/models/marketalert.js +++ b/app/models/marketalert.js @@ -10,6 +10,7 @@ module.exports = (sequelize, DataTypes) => { price : DataTypes.INTEGER, municipality : DataTypes.STRING, region : DataTypes.STRING, + realEstateType : DataTypes.STRING, email: { type: DataTypes.STRING, diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index aae4ecc..7d210c6 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -1,192 +1,54 @@ -// import OlxCrawler from '../helpers/crawlers/olixClawler' -var Promise = require("bluebird"); +const Promise = require("bluebird"); const OlxCrawler = require("../helpers/crawlers/olxClawler"); const db = require("../models/index"); -const MarketAlert = require("../models/marketalert"); const olxCrawler = new OlxCrawler(1, 2, 3); const crawlers = [ - olxCrawler, - // new OlxCrawler(process.env.OLX_FROM_PAGE, process.env.OLX_TO_PAGE, process.env.OLX_MAX_RESULTS), + olxCrawler, ]; async function crawlAll() { - Promise.map(crawlers, function (crawler) { + Promise.map(crawlers, function (crawler) { + return crawler.crawl(); + }).then(async (results) => { - return crawler.crawl(); + try { - }).then(async (results) => { - // let results = await crawler.crawl(); - try { - const marketAlerts = []; - const mergedResults = [].concat.apply([], results); + const marketAlertsFromDb = await db.MarketAlert.findAll(); - for (const result of mergedResults) { + const marketAlerts = []; + const mergedResults = [].concat.apply([], results); - - // category: category, - // url, - // title, - // price: isNaN(parsedPrice) ? price : parsedPrice, - // size: parseFloat(size), - // rooms: parsedRooms, - // floor: parseInt(floor), - // address, - // location, - // // adType: AD_TYPE_SALE, - // time, - // shortDescription: descriptions.first().text(), - // longDescription: descriptions.last().text(), - // lat, - // lng, - // loc: [parseFloat(lat), parseFloat(lng)], - - - marketAlerts.push({ - url: result.url, - realestateOrigin: "OLX", - originId: result.olxId, - size: result.size, - price: result.price, - email: result.email, - // lastDate: DataTypes.STRING, - municipality: result.municipality, - region:result.region, - // gardenSize: DataTypes.INTEGER, - - - }) - } - - try { - await db.MarketAlert.bulkCreate(marketAlerts); - } catch (e) { - console.log("Could not bulkCreate marketalers reason: ", e); - } - } catch (e) { - console.log("Error crawling. Trying next crawler! ", e); - } - } - - - ) + for (const result of mergedResults) { + marketAlerts.push({ + url: result.url, + realestateOrigin: "OLX", + originId: result.olxId, + size: result.size, + price: result.price, + email: result.email, + // lastDate: DataTypes.STRING, + municipality: result.municipality, + region: result.region, + gardenSize: result.gardenSize, + realEstateType: result.realEstateType + }) + } + try { + const filteredMarketAlerts = marketAlerts.filter((elem) => !marketAlertsFromDb.find(({ url }) => elem.url === url)); + await db.MarketAlert.bulkCreate(filteredMarketAlerts); + process.exit() + } catch (e) { + console.log("Could not bulkCreate marketalers reason: ", e); + } + } catch (e) { + console.log("Error crawling. Trying next crawler! ", e); + } + }) }; +crawlAll(); - - // Promise.all( - - - // ).then((results) => { - // console.log(results); - // console.log(results.length); - // console.log("Executing save results"); - // Promise.all([extractAndSaveResults(results)]).then(() => { - // console.log("Executed save results"); - // }); - // }); - - -// for (let crawler of crawlers) { -// try { -// let results = await crawler.crawl(); -// const marketAlerts = []; - -// for (const result of results) { -// console.log("This is result", result); -// console.log("This is result", result.size); - -// // category: category, -// // url, -// // title, -// // price: isNaN(parsedPrice) ? price : parsedPrice, -// // size: parseFloat(size), -// // rooms: parsedRooms, -// // floor: parseInt(floor), -// // address, -// // location, -// // // adType: AD_TYPE_SALE, -// // time, -// // shortDescription: descriptions.first().text(), -// // longDescription: descriptions.last().text(), -// // lat, -// // lng, -// // loc: [parseFloat(lat), parseFloat(lng)], - - -// marketAlerts.push({ -// url: result.url, -// realestateOrigin: "OLX", -// originId: result.olxId, -// size: result.size, -// price: result.price, -// email: "em" -// // lastDate: DataTypes.STRING, -// // municipailty: DataTypes.STRING, -// // region: DataTypes.STRING, -// // gardenSize: DataTypes.INTEGER, - - -// }) -// } - -// try { -// await db.MarketAlert.bulkCreate(marketAlerts); -// } catch (e) { -// console.log("Could not bulkCreate marketalers reason: ", e); -// } -// } catch (e) { -// console.log("Error crawling. Trying next crawler! ", e); -// } -// } -// } - -async function extractAndSaveResults(results) { - const marketAlerts = [] - - for (const result of results) { - // console.log("This is result", result); - // console.log("This is result", result.size); - - // category: category, - // url, - // title, - // price: isNaN(parsedPrice) ? price : parsedPrice, - // size: parseFloat(size), - // rooms: parsedRooms, - // floor: parseInt(floor), - // address, - // location, - // // adType: AD_TYPE_SALE, - // time, - // shortDescription: descriptions.first().text(), - // longDescription: descriptions.last().text(), - // lat, - // lng, - // loc: [parseFloat(lat), parseFloat(lng)], - - - marketAlerts.push({ - url: result.url, - realestateOrigin: "OLX", - originId: result.olxId, - size: result.size, - price: result.price, - email: "em" - // lastDate: DataTypes.STRING, - // municipailty: DataTypes.STRING, - // region: DataTypes.STRING, - // gardenSize: DataTypes.INTEGER, - - - }) - } - - return marketAlerts; - -} - -crawlAll(); \ No newline at end of file