'use strict';

const fetch = require('node-fetch');
const cheerio = require('cheerio');
const fs = require('fs');
const cloudinary = require('cloudinary');
const FormData = require('form-data');
import {
  AD_TYPE_SALE,
  IGNORED_USERNAMES,
  CATEGORY_FLAT,
  CATEGORY_HOUSE,
  CATEGORY_OFFICE,
  CATEGORY_LAND,
  CATEGORY_APARTMENT,
  CATEGORY_GARAGE,
  STATUS_NORMAL,
  STATUS_RESERVED,
  STATUS_SOLD,
} from '../../common/enums';

// Base URL of the paginated sale listing; the page number is appended.
const LISTING_URL_PREFIX = 'http://www.rental.ba/pretraga/prodaja-1/stranica-';

// Main content column of an ad page; every structural selector hangs off it.
const CONTENT_MAIN_SELECTOR =
  'body > div.container-fluid > div.container > div:nth-child(2) > ' +
  'div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main';

// Details box (price, property list, category heading) of an ad page.
const BOX_DETAILS_SELECTOR =
  `${CONTENT_MAIN_SELECTOR} > div:nth-child(1) > div > div > ` +
  'div.col-xs-12.col-sm-4.box-details';

const TITLE_SELECTOR =
  'div.container-fluid > div.container > div.row.content-top > ' +
  'div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1';
const DESCRIPTION_SELECTOR = '#b1 > div > div > div';
const STATUS_BADGE_SELECTOR = '#a1 > div.box-badges > div';
const GALLERY_SELECTOR = '.img-gallery';

// The embedded data <script> is searched for in the nth-child range below.
const JSON_SCRIPT_FIRST_CHILD = 5;
const JSON_SCRIPT_LAST_CHILD = 15;
// Number of leading characters dropped from the script text before parsing
// (the trailing character is dropped as well).
// NOTE(review): presumably a `var name = ` style prefix - confirm on a live page.
const JSON_SCRIPT_PREFIX_LENGTH = 21;

// Numeric `re_types_id` values of the embedded JSON mapped to category enums.
// The numbering of the JSON does not match the numbering of the enums.
const CATEGORY_BY_TYPE_ID = new Map([
  [1, CATEGORY_HOUSE],
  [2, CATEGORY_FLAT],
  [3, CATEGORY_APARTMENT],
  [4, CATEGORY_OFFICE],
  [5, CATEGORY_LAND],
  [6, CATEGORY_GARAGE],
]);

// Subtype labels (matched case-sensitively) grouped by the category they map to.
const CATEGORY_LABELS = [
  [
    CATEGORY_HOUSE,
    [
      'samostojeća',
      'dvojna',
      'kuća u nizu',
      'stambeno-poslovni objekt',
      'prizemnica',
      'kuća na moru',
      'kuća u izgradnji',
      'dvorac',
      'apartmanska kuća',
      'porodična kuća',
      'vikend kuća',
      'luksuzna kuća',
      'kamena',
      'vila',
      'splav',
    ],
  ],
  [
    CATEGORY_FLAT,
    [
      'stan u zgradi',
      'stan u kući',
      'stan višeetažni',
      'stan višeetažni u kući',
      'stan u starijoj zgradi',
      'stan u novogradnji',
      'stan u neboderu',
      'Korišten stan u novogradnji',
    ],
  ],
  [CATEGORY_APARTMENT, ['apartman na moru', 'apartman u planini']],
  [
    CATEGORY_GARAGE,
    ['unutrašnje garažno mjesto', 'unutrašnje parkirno mjesto'],
  ],
  [
    CATEGORY_LAND,
    [
      'građevinsko',
      'građevinsko stambeno',
      'zemljište, ostalo',
      'odmaralište',
      'oranica',
      'šuma',
      'livada',
      'građevinsko M2',
      'građevinsko M1',
      'građevinsko - turističko',
      'građevinsko - poslovno',
      'otok',
      'poljoprivredno',
    ],
  ],
  [
    CATEGORY_OFFICE,
    [
      'lokal',
      'ured',
      'skladište ili garaža',
      'radionica',
      'tvornica',
      'restoran',
      'sportski centar',
      'ordinacija',
      'kiosk',
      'auto-praonica',
      'poslovna zgrada',
      'skladište',
      // NOTE(review): 'garaža' maps to the office category, as it always has
      // in this file - confirm that this is intended.
      'garaža',
      'hotel',
      'pansion',
      'apartmanska zgrada',
      'trgovina',
      'prodajno skladišni',
      'proizvodno skladišni',
      'Kancelarije',
      'Poslovni prostor',
    ],
  ],
];

const CATEGORY_BY_LABEL = new Map();
for (const [categoryId, labels] of CATEGORY_LABELS) {
  for (const label of labels) {
    CATEGORY_BY_LABEL.set(label, categoryId);
  }
}

/** Maps a numeric type id to a category enum; `undefined` when unknown. */
function categoryFromNumber(typeId) {
  return CATEGORY_BY_TYPE_ID.get(typeId);
}

/**
 * Maps a subtype label to a category enum; `undefined` when unknown.
 * Surrounding whitespace is ignored (no known label starts or ends with it).
 */
function categoryFromText(label) {
  if (typeof label !== 'string') {
    return undefined;
  }
  return CATEGORY_BY_LABEL.get(label.trim());
}

/**
 * Resolves the category from the comma separated heading of the details box.
 * The first part is tried on its own; when it is unknown, the first two parts
 * are tried together because some labels contain a comma themselves
 * (e.g. 'zemljište, ostalo').
 */
function categoryFromHeading(heading) {
  const parts = heading.split(',');
  const fromFirstPart = categoryFromText(parts[0]);
  if (fromFirstPart !== undefined || parts.length < 2) {
    return fromFirstPart;
  }
  return categoryFromText(`${parts[0]},${parts[1]}`);
}

/**
 * Parses a formatted number by stripping every '.' and ',' and dividing the
 * remaining digits by 100. Returns NaN for missing or non-numeric text.
 * NOTE(review): like the original code, this assumes the page always prints
 * exactly two decimals - confirm against live data.
 */
function parseTwoDecimalNumber(text) {
  if (typeof text !== 'string') {
    return NaN;
  }
  return parseFloat(text.replace(/[.,]/g, '')) / 100;
}

/** Turns a non-finite or non-numeric limit into "no limit". */
function toResultLimit(maxResults) {
  return Number.isFinite(maxResults) ? maxResults : Infinity;
}

/**
 * Looks for the embedded JSON data script of an ad page.
 * Returns the parsed object, or null when no candidate script parses.
 */
function findEmbeddedAdJson($) {
  for (
    let child = JSON_SCRIPT_FIRST_CHILD;
    child <= JSON_SCRIPT_LAST_CHILD;
    child++
  ) {
    const scriptText = $(
      `${CONTENT_MAIN_SELECTOR} > div:nth-child(${child}) > script`
    ).text();
    try {
      const parsed = JSON.parse(
        scriptText.slice(JSON_SCRIPT_PREFIX_LENGTH, -1)
      );
      if (parsed !== null && typeof parsed === 'object') {
        return parsed;
      }
    } catch (e) {
      // Not the data script (or no script at all); try the next sibling.
    }
  }
  return null;
}

/** Builds the ad fields from the embedded JSON object. */
function extractFromJson(adJson) {
  // The timestamp looks like 'YYYY-MM-DD <time>'; only the date part is kept
  // and reformatted as 'DD.MM.YYYY'.
  const inserted = adJson['re_realEstates_inserted'];
  let time;
  if (typeof inserted === 'string') {
    const [year, month, day] = inserted.split(' ')[0].split('-');
    time = day + '.' + month + '.' + year;
  }

  return {
    title: adJson['re_realEstates_portalName'],
    category: categoryFromNumber(parseInt(adJson['re_types_id'], 10)),
    price: parseFloat(adJson['re_realEstates_price']),
    size: parseFloat(adJson['re_realEstates_area']),
    rooms: parseInt(adJson['re_realEstates_roomsNO'], 10),
    floor: parseInt(adJson['re_realEstates_floorNO'], 10),
    address: adJson['re_realEstates_address'],
    time,
    lat: adJson['re_realEstates_latitude'],
    lng: adJson['re_realEstates_longitude'],
    hasMap: true,
  };
}

/** Reads the 'key: value' entries of the details box into a Map. */
function readPropertyList($) {
  const props = new Map();
  $(`${BOX_DETAILS_SELECTOR} > div.body`)
    .contents()
    .each((i, node) => {
      const text = $(node).text().trim();
      const separatorAt = text.indexOf(':');
      const key = (separatorAt === -1
        ? text
        : text.slice(0, separatorAt)
      ).trim();
      if (!key) {
        return;
      }
      // Everything after the first ':' is the value, so values that contain
      // a colon themselves are kept whole.
      const value =
        separatorAt === -1 ? undefined : text.slice(separatorAt + 1).trim();
      props.set(key, value);
    });
  return props;
}

/** Builds the ad fields from the HTML when no embedded JSON is available. */
function extractFromHtml($) {
  const props = readPropertyList($);
  return {
    title: $(TITLE_SELECTOR).text(),
    category: categoryFromHeading(
      $(`${BOX_DETAILS_SELECTOR} > div.title > p`).text()
    ),
    price: parseTwoDecimalNumber(
      $(`${BOX_DETAILS_SELECTOR} > div.prices > span.pull-left`).text()
    ),
    size: parseTwoDecimalNumber(props.get('Površina')),
    rooms: parseInt(props.get('Broj soba'), 10),
    floor: parseInt(props.get('Spratnost'), 10),
    address: props.get('Ulica'),
    // Without the JSON string there is neither an insertion date nor a map.
    time: undefined,
    lat: 0,
    lng: 0,
    hasMap: false,
  };
}

/**
 * Crawler for the sale listings of rental.ba.
 *
 * Listing pages are fetched one by one and every ad linked from them is
 * fetched and converted to a plain data object.
 */
export default class RentalCrawler {
  /**
   * @param {number} fromPage - first listing page to crawl
   * @param {number} toPage - last listing page to crawl (inclusive)
   * @param {number} maxResults - upper bound on the number of collected ads
   */
  constructor(fromPage = 0, toPage = 10, maxResults = 1000) {
    console.log('Rental Crawler');
    this.fromPage = fromPage;
    this.toPage = toPage;
    this.maxResults = maxResults;
  }

  /**
   * Fetches a single ad page and extracts its data.
   *
   * The embedded JSON data script is preferred; when the page has none, the
   * fields are read from the HTML instead and the ad is reported without a
   * map (`hasMap: false`, `lat`/`lng` 0, `time` undefined).
   *
   * @param {string} url - address of the ad page
   * @returns {Promise<Object|null>} the ad data, or null when fetching or
   *   parsing failed (the error is logged)
   */
  async indexSingle(url) {
    try {
      const res = await fetch(url);
      const body = await res.text();
      const $ = cheerio.load(body);

      const adJson = findEmbeddedAdJson($);
      if (adJson === null) {
        console.log('No JSON string');
      }
      const fields =
        adJson !== null ? extractFromJson(adJson) : extractFromHtml($);

      const descriptions = $(DESCRIPTION_SELECTOR).text();
      const status = this.getStatusIdFromText($(STATUS_BADGE_SELECTOR).text());

      const images = [];
      $(GALLERY_SELECTOR)
        .contents()
        .each((i, elem) => {
          const preview = $(elem).attr('data-preview');
          if (preview) {
            images.push(preview);
          }
        });

      return {
        category: fields.category,
        url,
        title: fields.title,
        price: fields.price,
        size: fields.size,
        rooms: fields.rooms,
        floor: fields.floor,
        address: fields.address,
        adType: AD_TYPE_SALE,
        time: fields.time,
        shortDescription: fields.title,
        longDescription: descriptions,
        lat: fields.lat,
        lng: fields.lng,
        loc: [parseFloat(fields.lat), parseFloat(fields.lng)],
        hasMap: fields.hasMap,
        status,
        images,
      };
    } catch (e) {
      console.error('Exception caught: ' + e.message);
    }
    return null;
  }

  /**
   * Indexes every ad linked from one listing page, waiting 500 ms after each
   * ad.
   *
   * @param {number} pageNr - number of the listing page
   * @param {number} maxResults - stop once this many ads were collected
   * @returns {Promise<Object|undefined>} ad data keyed by ad URL, or
   *   undefined when the listing page itself could not be processed
   */
  async indexPage(pageNr, maxResults = 1000) {
    try {
      console.log('Starting to index page: ' + pageNr);
      const url = LISTING_URL_PREFIX + pageNr;
      /*
      Search form fields, currently not sent with the request:

      const data = new FormData();
      data.append('sales', 1); // I think this selects ads of the sale type
      data.append('re_types_id', ''); // refers to the property type (house, flat, apartment, ...)
      data.append('full_text', '');
      data.append('re_realEstates_code', '');
      data.append('re_realEstates_price_max', '');
      data.append('re_realEstates_price_min', '');
      data.append('re_realEstates_area_min', '');
      data.append('re_realEstates_area_max', '');
      data.append('re_realEstates_roomsNO_min', '');
      data.append('re_realEstates_roomsNO_max', '');
      data.append('re_realEstates_floorNO_min', '');
      data.append('re_realEstates_floorNO_max', '');
      data.append('re_subTypes_id', 1);
      */
      const res = await fetch(url, {
        method: 'POST',
        //body: data
      });
      const body = await res.text();
      const $ = cheerio.load(body);

      const hrefs = [];
      $('.middle').each((i, elem) => {
        const href = $(elem).find('a').first().attr('href');
        // Entries without a link cannot be fetched; skip them.
        if (href) {
          hrefs.push(href);
        }
      });

      const limit = toResultLimit(maxResults);
      const results = {};
      for (const href of hrefs) {
        if (Object.keys(results).length >= limit) {
          break;
        }
        console.log(`indexing: ${href}`);
        const singleData = await this.indexSingle(href);
        if (singleData) {
          results[href] = singleData;
        }
        await this.sleep(500);
      }
      return results;
    } catch (e) {
      console.error('Exception caught:' + e);
    }
    return undefined;
  }

  /**
   * Maps the numeric property type of the embedded JSON to a category enum.
   *
   * @param {number} category - numeric type id
   * @returns {*} the category enum value, or undefined for unknown ids
   */
  getCategoryIdfromNumber(category) {
    return categoryFromNumber(category);
  }

  /**
   * Maps a subtype label of the HTML page to a category enum.
   *
   * @param {string} category - subtype label (matched case-sensitively)
   * @returns {*} the category enum value, or undefined for unknown labels
   */
  getCategoryIdfromText(category) {
    return categoryFromText(category);
  }

  /**
   * Maps the text of the status badge to a status enum.
   *
   * @param {string} status - badge text
   * @returns {*} STATUS_SOLD for 'Prodato', STATUS_NORMAL otherwise
   */
  getStatusIdFromText(status) {
    if (typeof status === 'string' && status.trim() === 'Prodato') {
      return STATUS_SOLD;
    }
    return STATUS_NORMAL;
  }

  /**
   * @param {number} ms - delay in milliseconds
   * @returns {Promise<void>} resolves after the delay
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Indexes a range of listing pages, waiting 5 s after each page.
   *
   * @param {number} start - first page number
   * @param {number} end - last page number (inclusive)
   * @param {number} maxResults - stop once this many ads were collected
   * @returns {Promise<Object>} ad data keyed by ad URL
   */
  async indexPages(start, end, maxResults = 1000) {
    const limit = toResultLimit(maxResults);
    const results = {};
    for (let page = start; page <= end; page++) {
      const remaining = limit - Object.keys(results).length;
      if (remaining <= 0) {
        break;
      }
      // A failed page yields undefined, which Object.assign ignores.
      const pageResults = await this.indexPage(page, remaining);
      Object.assign(results, pageResults);
      await this.sleep(5000);
    }
    return results;
  }

  /**
   * Crawls the page range and result limit given to the constructor.
   *
   * @returns {Promise<Object>} ad data keyed by ad URL
   */
  async crawl() {
    return this.indexPages(this.fromPage, this.toPage, this.maxResults);
  }
}