const fetch = require("node-fetch"); const cheerio = require("cheerio"); const { allRERequest, findPointInsideBoundingBox } = require("../db/dbHelper"); const { getRealEstateTypeEnum } = require("../enums"); const { getRegion, getMunicipality } = require("../codes"); const Promise = require("bluebird"); module.exports = class OlxCrawler { //TODO figure best way to handle paging constructor(hrefs = []) { this.hrefs = hrefs; } async indexPages(urls) { const indexers = []; urls.forEach(url => { indexers.push(new Indexer(url)); }); return Promise.map(indexers, function(indexer) { return indexer.indexWithPagination(); }).then(async results => { return results; }); } async crawl() { const filteredResults = []; const realestateRequests = await allRERequest(); const urls = this.createRequestUrls(realestateRequests); let results = await this.indexPages( urls, this.fromPage, this.toPage, this.maxResults ); const flatResults = results.flat(); if (flatResults) { for (const finalResult of flatResults) { if (null !== finalResult) { if ( finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "" ) { const pointInsideBoundingBox = await findPointInsideBoundingBox( [finalResult.lng, finalResult.lat], finalResult.email, finalResult.uuid ); if (pointInsideBoundingBox[0].length !== 0) { finalResult.hasLocation = true; filteredResults.push(finalResult); } else { finalResult.hasLocation = false; filteredResults.push(finalResult); } } } } return filteredResults; } return []; } createRequestUrls(realestateRequests) { const urls = []; for (const request of realestateRequests) { const realsestateType = "kategorija=" + getRealEstateTypeEnum(request.realEstateType).olxCategory; const region = "kanton=" + getRegion(request.region).olxid; const municipality = "grad%5B%5D=" + getMunicipality(request.region, request.municipality).olxid; const sizeMin = "kvadrata_min=" + request.sizeMin; const sizeMax = "kvadrata_max=" + request.sizeMax; const priceMin = "od=" + request.priceMin; const priceMax = "do=" + request.priceMax; const olxUrl = { url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}&stranica=`, email: request.email, uuid: request.uniqueId, hrefs: this.hrefs }; urls.push(olxUrl); } return urls; } }; class Indexer { /** * * @param {String|Array} olxUrl single or array of objects containing url email and uuid * @param {Array} hrefResutls array contaning urls from crawler results */ constructor(olxUrl, hrefResutls) { this.olxUrl = olxUrl; this.hrefResutls = hrefResutls; } async indexWithPagination(pageNumber = 1) { const pageNr = this.olxUrl.url.match(/\d+$/); const indexers = this.prepareIndexers(pageNumber ? [pageNumber] : pageNr); try { return Promise.map(indexers.indexers, function(indexer) { return indexer.indexPage(pageNumber); }).then(async results => { let hasResults = false; results.forEach(result => { if (!hasResults) { hasResults = result.hasResults; } }); if (!hasResults) { const singlePageIndexers = this.prepareHrefIndexers(results); if (singlePageIndexers.length === 0) { return []; } return Promise.map(singlePageIndexers, function(indexer) { return indexer.indexSingle(); }).then(async results => { return results; }); } else { const newResults = await this.indexWithPagination( results[0].pageNumber + 5 ); const singlePageIndexers = this.prepareHrefIndexers(results); const newerResults = await Promise.map(singlePageIndexers, function( indexer ) { return indexer.indexSingle(); }).then(async results => { return results; }); Array.prototype.push.apply(newResults, newerResults); return newResults; } }); } catch (e) { console.error("Error has accured", e); } } prepareIndexers(pageNr) { const indexers = []; let lastPageNumber; if (pageNr) { for ( let index = Number(pageNr[0]); index <= Number(pageNr[0]) + 5; index++ ) { lastPageNumber = index; const newOlxUrl = { url: this.olxUrl.url.replace(/\d+$/, "") + index, email: this.olxUrl.email, uuid: this.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); } } else { for (let index = 1; index <= 5; index++) { lastPageNumber = index; const newOlxUrl = { url: this.olxUrl.url + index, email: this.olxUrl.email, uuid: this.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); } } return { indexers: indexers, lastPageNumber: lastPageNumber }; } prepareHrefIndexers(results) { const indexers = []; if (!Array.isArray(results)) { results.hrefs.forEach(href => { const newOlxUrl = { url: href, email: results.olxUrl.email, uuid: results.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); }); } else { results.forEach(result => { if (result !== null && result.hasOwnProperty("hrefs")) { result.hrefs.forEach(href => { const newOlxUrl = { url: href, email: result.olxUrl.email, uuid: result.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); }); } }); } return indexers; } async indexPage(pageNumber) { try { const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; let hasResults = false; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { hasResults = true; const href = $(elem) .find("a") .first() .attr("href"); hrefs.push(href); }); return { hrefs: hrefs, hasResults: hasResults, pageNumber: pageNumber, olxUrl: this.olxUrl }; } catch (e) { console.error("Exception caught:" + e); } } async indexSingle() { try { if (this.olxUrl.url === undefined) { return {}; } // if (global.hrefs) { if ( this.olxUrl.hrefs[this.olxUrl.uuid] && this.olxUrl.hrefs[this.olxUrl.uuid].includes(this.olxUrl.url) ) { return null; } // } const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const title = $("#naslovartikla") .text() .trim(); const realEstateType = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" ).text(); const price = $("#pc > p:nth-child(2)").text(); const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text(); const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text(); const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text(); const gardenSize = $( "#dodatnapolja1 > div:nth-child(6) > div.df2" ).text(); const location = $( "#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija" ).attr("data-content"); const time = $("time").attr("datetime"); const olxId = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2" ).text(); const descriptions = $(".artikal_detaljniopis_tekst"); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const imgRe = /href":("[^"]*")/g; const matches = latLngRe.exec(body); let lng = "", lat = ""; const parsePrice = price => parseFloat(price.replace(".", "")); if (matches && matches.length >= 3) { lat = matches[1]; lng = matches[2]; } const parsedPrice = parsePrice(price); const locationArray = location.split(","); const region = locationArray[0]; const municipality = locationArray[1]; const data = { realEstateType: this.getCategoryId(realEstateType), email: this.olxUrl.email, uuid: this.olxUrl.uuid, olxId: olxId, url: this.olxUrl.url, title, price: isNaN(parsedPrice) ? 0 : parsedPrice, size: parseFloat(size), gardenSize: isNaN(parseFloat(gardenSize)) ? 0 : parseFloat(gardenSize), address, region, municipality, time, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), lat, lng, loc: [parseFloat(lat), parseFloat(lng)] }; return data; } catch (e) { console.error("Exception caught: " + e.message); } return null; } getCategoryId(category) { switch (category) { case "Stanovi": return "stan"; case "Vikendice": return "vikendica"; case "Kuće": return "kuca"; default: return ""; } } }