"use strict"; let fetch = require("node-fetch"); let cheerio = require("cheerio"); const { AD_TYPE, AD_CATEGORY, IGNORED_USERNAMES } = require("../../common/enums"); class OlxCrawler { constructor(fromPage = 0, toPage = 10, maxResults = 1000) { this.fromPage = fromPage; this.toPage = toPage; this.maxResults = maxResults; } async indexSingle(url) { try { const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); const username = $( "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" ).text(); if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { return null; } const title = $("#naslovartikla").text(); const category = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" ).text(); const price = $("#pc > p:nth-child(2)").text(); const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text(); const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text(); const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text(); const location = $( "#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija" ).attr("data-content"); const adType = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2" ).text(); const time = $("time").attr("datetime"); const olxId = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2" ).text(); const descriptions = $(".artikal_detaljniopis_tekst"); const floor = $("#dodatnapolja1") .find(":contains(Sprat)") .last() .nextAll() .text(); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const matches = latLngRe.exec(body); let lng = "", lat = ""; const parseRooms = rooms => parseInt( [...rooms] .filter(c => !isNaN(c)) .filter(c => c.trim()) .join() ); const parsePrice = price => parseFloat(price.replace(".", "")); if (matches && matches.length >= 3) { lat = matches[1]; lng = matches[2]; } const parsedPrice = parsePrice(price); let parsedRooms; if (rooms === "Garsonjera") { parsedRooms = 0; } else { parsedRooms = parseRooms(rooms); } const data = { category: this.getCategoryId(category), url, title, price: isNaN(parsedPrice) ? price : parsedPrice, size: parseFloat(size), rooms: parsedRooms, floor: parseInt(floor), address, location, adType: AD_TYPE.AD_TYPE_SALE, time, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), lat, lng, loc: [parseFloat(lat), parseFloat(lng)] }; return data; } catch (e) { console.error("Exception caught: " + e.message); } return null; } async indexPage(pageNr, maxResults = 1000) { try { console.log("Starting to index page: " + pageNr); const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; const results = {}; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); hrefs.push(href); }); let actualNoOfResults = hrefs.length <= maxResults ? hrefs.length : maxResults; for (let i = 0; i < hrefs.length; i++) { console.log(`indexing: ${hrefs[i]}`); const singleData = await this.indexSingle(hrefs[i]); if (singleData) { results[hrefs[i]] = singleData; } await this.sleep(500); } return results; } catch (e) { console.error("Exception caught:" + e); } } getCategoryId(category) { if (category === "Stanovi") { return AD_CATEGORY.CATEGORY_FLAT; } else if (category === "Zemljišta") { return AD_CATEGORY.CATEGORY_LAND; } else if (category === "Kuće") { return AD_CATEGORY.CATEGORY_HOUSE; } else if (category === "Poslovni prostori") { return AD_CATEGORY.CATEGORY_OFFICE; } } async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } async indexPages(start, end, maxResults = 1000) { let results = {}; for (let i = start; i <= end; i++) { let result = await this.indexPage(i, maxResults); Object.assign(results, result); await this.sleep(5000); } return results; } async crawl() { let results = await this.indexPages( this.fromPage, this.toPage, this.maxResults ); return results; } } module.exports = OlxCrawler;