const fetch = require('node-fetch'); const cheerio = require('cheerio'); const { allRERequest, findPointInsideBoundingBox } = require('../db/dbHelper'); const { getRealEstateTypeEnum } = require('../enums'); const { getRegion, getMunicipality } = require('../codes') const Promise = require("bluebird"); module.exports = class OlxCrawler { //TODO figure best way to handle paging constructor(fromPage = 0, toPage = 10, maxResults = 1000) { this.fromPage = fromPage; this.toPage = toPage; this.maxResults = maxResults; } async indexPages(urls) { const indexers = []; urls.forEach(url => { indexers.push(new Indexer(url)); }); return Promise.map(indexers, function (indexer) { return indexer.indexWithPagination(); }).then(async (results) => { return results }) } async crawl() { console.log("OLX CRAWLER: start crawl"); const filteredResults = []; const realestateRequests = await allRERequest(); console.log("OLX CRAWLER: found " + realestateRequests.length + "subscribed RealEstateRequests"); const urls = this.createRequestUrls(realestateRequests); let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); console.log("Final crawler results"); console.log(results[0].length); for (const finalResult of results[0]) { if (null !== finalResult) { if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); if (pointInsideBoundingBox[0].length !== 0) { filteredResults.push(finalResult); } } } } console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); return filteredResults; } createRequestUrls(realestateRequests) { const urls = [] for (const request of realestateRequests) { const realsestateType = "kategorija=" + getRealEstateTypeEnum(request.realEstateType).olxCategory; const region = "kanton=" + getRegion(request.region).olxid; const municipality = "grad%5B%5D=" + getMunicipality(request.region, request.municipality).olxid; const sizeMin = "kvadrata_min=" + request.sizeMin; const sizeMax = "kvadrata_max=" + request.sizeMax; const priceMin = "od=" + request.priceMin; const priceMax = "do=" + request.priceMax; const olxUrl = { url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}&stranica=`, email: request.email, uuid: request.uniqueId } console.log(olxUrl.url); urls.push(olxUrl); } return urls; } }; class Indexer { /** * * @param {String|Array} olxUrl single or array of objects containing url email and uuid * @param {Array} hrefResutls array contaning urls from crawler results */ constructor(olxUrl, hrefResutls) { this.olxUrl = olxUrl; this.hrefResutls = hrefResutls; } async indexWithPagination(pageNumber = 1) { console.log("This is olxUrl:" + this.olxUrl.url); const pageNr = this.olxUrl.url.match(/\d+$/); const indexers = this.prepareIndexers(pageNumber ? [pageNumber] : pageNr); try { return Promise.map(indexers.indexers, function (indexer) { return indexer.indexPage(pageNumber); }).then(async (results) => { let hasResults = false; results.forEach(result => { if (!hasResults) { console.log("No results detected") hasResults = result.hasResults } }); if (!hasResults) { console.log("HAS NO MORE RESULTS, stop the paging, there are some results and they should contain only HREFS"); console.log(results.length); const singlePageIndexers = this.prepareHrefIndexers(results); if (singlePageIndexers.length === 0) { console.log("THERE IS NOT EVEN SINGLE RESULT"); return [] } return Promise.map(singlePageIndexers, function (indexer) { return indexer.indexSingle(); }).then(async (results) => { console.log("SinglePageMethod in HAS NO RESULTS, MarketAralms"); console.log(results.length); return results; }); } else { console.log("HAS MORE RESULTS, should only contain HREFS"); console.log(results.length); const newResults = await this.indexWithPagination(results[0].pageNumber + 5); const singlePageIndexers = this.prepareHrefIndexers(results); const newerResults = await Promise.map(singlePageIndexers, function (indexer) { return indexer.indexSingle(); }).then(async (results) => { console.log("SinglePageMethod HAS RESULTS, should contain MarketAlerts only"); console.log(results.length); return results; }); Array.prototype.push.apply(newResults, newerResults); return newResults; } }); } catch (e) { console.error("Error has accured", e); } } prepareIndexers(pageNr) { console.log("Entering prepareIndexers : page nr - " + pageNr); const indexers = []; let lastPageNumber; if (pageNr) { for (let index = Number(pageNr[0]); index <= Number(pageNr[0]) + 5; index++) { lastPageNumber = index; const newOlxUrl = { url: this.olxUrl.url.replace(/\d+$/, "") + index, email: this.olxUrl.email, uuid: this.olxUrl.uuid } indexers.push(new Indexer(newOlxUrl)); } } else { for (let index = 1; index <= 5; index++) { lastPageNumber = index; const newOlxUrl = { url: this.olxUrl.url + index, email: this.olxUrl.email, uuid: this.olxUrl.uuid } indexers.push(new Indexer(newOlxUrl)); } } return { indexers: indexers, lastPageNumber: lastPageNumber }; } prepareHrefIndexers(results) { const indexers = [] if (!Array.isArray(results)) { results.hrefs.forEach(href => { const newOlxUrl = { url: href, email: results.olxUrl.email, uuid: results.olxUrl.uuid } indexers.push(new Indexer(newOlxUrl)); }); } else { results.forEach(result => { if (result !== null && result.hasOwnProperty('hrefs')) { result.hrefs.forEach(href => { // console.log(href); const newOlxUrl = { url: href, email: result.olxUrl.email, uuid: result.olxUrl.uuid } indexers.push(new Indexer(newOlxUrl)); }) } }); } return indexers; } async indexPage(pageNumber) { console.log("Page number in index page, max page number :") console.log(pageNumber); try { console.log("Indexing page: " + this.olxUrl.url); const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; let hasResults = false $('#rezultatipretrage').find('.listitem').each((i, elem) => { hasResults = true const href = $(elem).find('a').first().attr('href'); hrefs.push(href); }); console.log("this is hrefs for olxUrl" + this.olxUrl.url); console.log("NUMBER OF HREFS " + hrefs.length); return { hrefs: hrefs, hasResults: hasResults, pageNumber: pageNumber, olxUrl: this.olxUrl } } catch (e) { console.error('Exception caught:' + e); } } async indexSingle() { try { console.log("Index single"); console.log(this.olxUrl.url); if (this.olxUrl.url === undefined) { return {} } const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const title = $('#naslovartikla').text().trim(); const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); const price = $('#pc > p:nth-child(2)').text(); const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); const time = $('time').attr('datetime'); const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); const descriptions = $('.artikal_detaljniopis_tekst'); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const imgRe = /href":("[^"]*")/g; const matches = latLngRe.exec(body); let lng = '', lat = ''; const parsePrice = (price) => parseFloat(price.replace(".", "")) if (matches && matches.length >= 3) { lat = matches[1]; lng = matches[2]; } const parsedPrice = parsePrice(price); const locationArray = location.split(","); const region = locationArray[0]; const municipality = locationArray[1]; const data = { realEstateType: this.getCategoryId(realEstateType), email: this.olxUrl.email, uuid: this.olxUrl.uuid, olxId: olxId, url: this.olxUrl.url, title, price: isNaN(parsedPrice) ? 0 : parsedPrice, size: parseFloat(size), gardenSize: isNaN(parseFloat(gardenSize)) ? 0 : parseFloat(gardenSize), address, region, municipality, time, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), lat, lng, loc: [parseFloat(lat), parseFloat(lng)], }; return data; } catch (e) { console.error('Exception caught: ' + e.message); } return null; } getCategoryId(category) { switch (category) { case 'Stanovi': return 'stan'; case 'Vikendice': return 'vikendica' case 'Kuće': return 'kuca'; default: return ''; } } }