const fetch = require("node-fetch"); const cheerio = require("cheerio"); const { allRERequest, findPointInsideBoundingBox } = require("../db/dbHelper"); const { getRealEstateTypeEnum } = require("../enums"); const { getRegion, getMunicipality } = require("../codes"); const Promise = require("bluebird"); module.exports = class OlxCrawler { //TODO figure best way to handle paging constructor(hrefs = []) { this.hrefs = hrefs; } async indexPages(urls) { const indexers = []; urls.forEach(url => { indexers.push(new Indexer(url)); }); return Promise.map(indexers, function(indexer) { return indexer.indexWithPagination(); }).then(async results => { return results; }); } async crawl() { const filteredResults = []; const realEstateRequests = await allRERequest(); const urls = this.createRequestUrls(realEstateRequests); let results = await this.indexPages( urls, this.fromPage, this.toPage, this.maxResults ); const flatResults = results.flat(); if (flatResults) { for (const finalResult of flatResults) { if (null !== finalResult) { if ( finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "" ) { const pointInsideBoundingBox = await findPointInsideBoundingBox( [finalResult.lng, finalResult.lat], finalResult.email, finalResult.uuid ); if (pointInsideBoundingBox[0].length !== 0) { finalResult.hasLocation = true; filteredResults.push(finalResult); } else { finalResult.hasLocation = false; filteredResults.push(finalResult); } } } } return filteredResults; } return []; } createRequestUrls(realEstateRequests) { const urls = []; for (const request of realEstateRequests) { const { realEstateType, region, sizeMin, sizeMax, priceMin, priceMax } = request; const urlRealEstateParams = [ { paramName: "kanton", paramValue: region, useParam: false }, { paramName: "kategorija", paramValue: getRealEstateTypeEnum(realEstateType).olxid, useParam: true }, { paramName: "kvadrata_min", paramValue: sizeMin, useParam: true }, { paramName: "kvadrata_max", paramValue: sizeMax, useParam: true }, { paramName: "od", paramValue: priceMin, useParam: true }, { paramName: "do", paramValue: priceMax, useParam: true } ]; const urlResultsParams = [ { paramName: "vrstapregleda", paramValue: "tabela", useParam: true }, { paramName: "sort_order", paramValue: "desc", useParam: true }, { paramName: "vrsta", paramValue: "samoprodaja", useParam: true }, { paramName: "stranica", paramValue: "0", useParam: true } ]; const paramsReduceFunction = (accumulatedValue, currentParam) => { const { paramName, paramValue, useParam } = currentParam; if (useParam) { return `${accumulatedValue}&${paramName}=${paramValue}`; } else { return accumulatedValue; } }; const reducedRealEstateParams = urlRealEstateParams.reduce( paramsReduceFunction, "" ); const reducedResultsParams = urlResultsParams.reduce( paramsReduceFunction, "" ); const olxUrl = { url: `https://www.olx.ba/pretraga?${reducedRealEstateParams}${reducedResultsParams}`, email: request.email, uuid: request.uniqueId, hrefs: this.hrefs }; urls.push(olxUrl); } return urls; } }; class Indexer { /** * * @param {String|Array} olxUrl single or array of objects containing url email and uuid * @param {Array} hrefResutls array contaning urls from crawler results */ constructor(olxUrl, hrefResutls) { this.olxUrl = olxUrl; this.hrefResutls = hrefResutls; } async indexWithPagination(pageNumber = 1) { const pageNr = this.olxUrl.url.match(/\d+$/); const indexers = this.prepareIndexers(pageNumber ? [pageNumber] : pageNr); try { return Promise.map(indexers.indexers, function(indexer) { return indexer.indexPage(pageNumber); }).then(async results => { let hasResults = false; results.forEach(result => { if (!hasResults) { hasResults = result.hasResults; } }); if (!hasResults) { const singlePageIndexers = this.prepareHrefIndexers(results); if (singlePageIndexers.length === 0) { return []; } return Promise.map(singlePageIndexers, function(indexer) { return indexer.indexSingle(); }).then(async results => { return results; }); } else { const newResults = await this.indexWithPagination( results[0].pageNumber + 5 ); const singlePageIndexers = this.prepareHrefIndexers(results); const newerResults = await Promise.map(singlePageIndexers, function( indexer ) { return indexer.indexSingle(); }).then(async results => { return results; }); Array.prototype.push.apply(newResults, newerResults); return newResults; } }); } catch (e) { console.error("Error has accured", e); } } prepareIndexers(pageNr) { const indexers = []; let lastPageNumber; if (pageNr) { for ( let index = Number(pageNr[0]); index <= Number(pageNr[0]) + 5; index++ ) { lastPageNumber = index; const newOlxUrl = { url: this.olxUrl.url.replace(/\d+$/, "") + index, email: this.olxUrl.email, uuid: this.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); } } else { for (let index = 1; index <= 5; index++) { lastPageNumber = index; const newOlxUrl = { url: this.olxUrl.url + index, email: this.olxUrl.email, uuid: this.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); } } return { indexers: indexers, lastPageNumber: lastPageNumber }; } prepareHrefIndexers(results) { const indexers = []; if (!Array.isArray(results)) { results.hrefs.forEach(href => { const newOlxUrl = { url: href, email: results.olxUrl.email, uuid: results.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); }); } else { results.forEach(result => { if (result !== null && result.hasOwnProperty("hrefs")) { result.hrefs.forEach(href => { const newOlxUrl = { url: href, email: result.olxUrl.email, uuid: result.olxUrl.uuid, hrefs: this.olxUrl.hrefs }; indexers.push(new Indexer(newOlxUrl)); }); } }); } return indexers; } async indexPage(pageNumber) { try { const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; let hasResults = false; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { hasResults = true; const href = $(elem) .find("a") .first() .attr("href"); hrefs.push(href); }); return { hrefs: hrefs, hasResults: hasResults, pageNumber: pageNumber, olxUrl: this.olxUrl }; } catch (e) { console.error("Exception caught:" + e); } } async indexSingle() { try { if (this.olxUrl.url === undefined) { return {}; } // if (global.hrefs) { if ( this.olxUrl.hrefs[this.olxUrl.uuid] && this.olxUrl.hrefs[this.olxUrl.uuid].includes(this.olxUrl.url) ) { return null; } // } const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const title = $("#naslovartikla") .text() .trim(); const realEstateType = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" ).text(); const price = $("#pc > p:nth-child(2)").text(); const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text(); const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text(); const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text(); const gardenSize = $( "#dodatnapolja1 > div:nth-child(6) > div.df2" ).text(); const location = $( "#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija" ).attr("data-content"); const time = $("time").attr("datetime"); const olxId = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2" ).text(); const descriptions = $(".artikal_detaljniopis_tekst"); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const imgRe = /href":("[^"]*")/g; const matches = latLngRe.exec(body); let lng = "", lat = ""; const parsePrice = price => parseFloat(price.replace(".", "")); if (matches && matches.length >= 3) { lat = matches[1]; lng = matches[2]; } const parsedPrice = parsePrice(price); const locationArray = location && location.length > 0 ? location.split(",") : []; const region = locationArray.length > 0 ? locationArray[0] : ""; const municipality = locationArray.length > 1 ? locationArray[1] : ""; const data = { realEstateType: this.getCategoryId(realEstateType), email: this.olxUrl.email, uuid: this.olxUrl.uuid, olxId: olxId, url: this.olxUrl.url, title, price: isNaN(parsedPrice) ? 0 : parsedPrice, size: parseFloat(size), gardenSize: isNaN(parseFloat(gardenSize)) ? 0 : parseFloat(gardenSize), address, region, municipality, time, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), lat, lng, loc: [parseFloat(lat), parseFloat(lng)] }; return data; } catch (e) { console.error("Exception caught: " + e.message); } return null; } getCategoryId(category) { switch (category) { case "Stanovi": return "stan"; case "Vikendice": return "vikendica"; case "Kuće": return "kuca"; default: return ""; } } }