425 lines
12 KiB
JavaScript
425 lines
12 KiB
JavaScript
'use strict';
|
|
|
|
let fetch = require ('node-fetch');
|
|
let cheerio = require ('cheerio');
|
|
let fs = require ('fs');
|
|
let cloudinary = require ('cloudinary');
|
|
let FormData = require ('form-data');
|
|
|
|
import {
|
|
AD_TYPE_SALE,
|
|
IGNORED_USERNAMES,
|
|
CATEGORY_FLAT,
|
|
CATEGORY_HOUSE,
|
|
CATEGORY_OFFICE,
|
|
CATEGORY_LAND,
|
|
CATEGORY_APARTMENT,
|
|
CATEGORY_GARAGE,
|
|
STATUS_NORMAL,
|
|
STATUS_RESERVED,
|
|
STATUS_SOLD,
|
|
} from '../../common/enums';
|
|
|
|
/**
 * Crawler for the sale listings published under
 * http://www.rental.ba/pretraga/prodaja-1/.
 *
 * Search-result pages are fetched one at a time, every ad linked from a page
 * is fetched and parsed, and the parsed ads are returned as one object keyed
 * by ad URL.
 */
export default class RentalCrawler {
  /**
   * @param {number} [fromPage=0] First search-result page to index.
   * @param {number} [toPage=10] Last search-result page to index (inclusive).
   * @param {number} [maxResults=1000] Upper bound on the number of ads that
   *   `crawl()` returns.
   */
  constructor (fromPage = 0, toPage = 10, maxResults = 1000) {
    console.log ('Rental Crawler');

    this.fromPage = fromPage;
    this.toPage = toPage;
    this.maxResults = maxResults;
  }

  /**
   * Fetches one ad page and extracts its details.
   *
   * The details are read from the JSON literal embedded in the page when one
   * is found; otherwise (or when reading that JSON fails) they are scraped
   * from the HTML. Description, status badge and gallery images always come
   * from the HTML.
   *
   * @param {string} url Absolute URL of the ad page.
   * @returns {Promise<Object|null>} The ad record, or `null` when the page
   *   could not be fetched or parsed (the error is logged, not rethrown).
   */
  async indexSingle (url) {
    try {
      const res = await fetch (url);
      if (!res.ok) {
        // Do not parse an error page as if it were an ad.
        throw new Error (`Unexpected HTTP status ${res.status} for ${url}`);
      }
      const body = await res.text ();
      const $ = cheerio.load (body);

      let details;
      try {
        details = this._readDetailsFromJson ($);
      } catch (e) {
        console.log ('error : ' + e);
        // No usable JSON string: fall back to the HTML selectors.
        details = this._readDetailsFromHtml ($);
      }

      const longDescription = $ ('#b1 > div > div > div').text ();
      const status = this.getStatusIdFromText (
        $ ('#a1 > div.box-badges > div').text ()
      );

      const images = [];
      $ ('.img-gallery').contents ().each ((i, elem) => {
        const preview = $ (elem).attr ('data-preview');
        if (preview) images.push (preview);
      });

      return {
        category: details.category,
        url,
        title: details.title,
        price: details.price,
        size: details.size,
        rooms: details.rooms,
        floor: details.floor,
        address: details.address,
        adType: AD_TYPE_SALE,
        time: details.time,
        shortDescription: details.title,
        longDescription,
        lat: details.lat,
        lng: details.lng,
        loc: [parseFloat (details.lat), parseFloat (details.lng)],
        hasMap: details.hasMap,
        status,
        images,
      };
    } catch (e) {
      console.error ('Exception caught: ' + e.message);
    }

    return null;
  }

  /**
   * Reads the ad details from the JSON literal embedded in a `<script>` tag.
   *
   * The script is looked for in the 5th to 15th child `div` of the main
   * content column; the first one whose text parses as JSON wins.
   *
   * @param {Function} $ Cheerio root loaded with the ad page.
   * @returns {Object} title, category, price, size, rooms, address, floor,
   *   time (`dd.mm.yyyy`), lat, lng and `hasMap: true`.
   * @throws {Error} When no script parses as JSON or a required field is
   *   missing; the caller treats this as "use the HTML fallback".
   */
  _readDetailsFromJson ($) {
    const contentMain =
      'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main';
    const firstChild = 5;
    const lastChild = 15;
    let dataJson;

    for (let i = firstChild; i <= lastChild; i++) {
      try {
        const scriptText = $ (
          `${contentMain} > div:nth-child(${i}) > script`
        ).text ();
        // Drops the first 21 characters and the final character around the
        // JSON payload (presumably an assignment prefix and a trailing
        // semicolon -- TODO confirm against a live page).
        dataJson = JSON.parse (scriptText.slice (21, -1));
        break;
      } catch (e) {
        console.log ('No JSON string');
        if (i === lastChild) throw e;
      }
    }

    // Only the date part (before the first space) is kept and reordered
    // from yyyy-mm-dd to dd.mm.yyyy.
    const insertedDate = dataJson['re_realEstates_inserted'].split (' ')[0];
    const [year, month, day] = insertedDate.split ('-');

    return {
      title: dataJson['re_realEstates_portalName'],
      // Category numbers in the JSON do not match the shared enums, so they
      // are translated explicitly.
      category: this.getCategoryIdfromNumber (
        parseInt (dataJson['re_types_id'], 10)
      ),
      price: parseFloat (dataJson['re_realEstates_price']),
      size: parseFloat (dataJson['re_realEstates_area']),
      rooms: parseInt (dataJson['re_realEstates_roomsNO'], 10),
      address: dataJson['re_realEstates_address'],
      floor: parseInt (dataJson['re_realEstates_floorNO'], 10),
      time: `${day}.${month}.${year}`,
      lat: dataJson['re_realEstates_latitude'],
      lng: dataJson['re_realEstates_longitude'],
      hasMap: true,
    };
  }

  /**
   * Scrapes the ad details from the HTML of the details box.
   *
   * Used when the page carries no usable JSON; such ads get no insertion
   * time and no coordinates (`hasMap` is `false`, `lat`/`lng` are 0).
   * Missing numeric properties yield `NaN` instead of aborting the ad.
   *
   * @param {Function} $ Cheerio root loaded with the ad page.
   * @returns {Object} Same shape as `_readDetailsFromJson`.
   */
  _readDetailsFromHtml ($) {
    const boxDetails =
      'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details';

    // Collect the "Label: value" entries of the details box.
    const props = {};
    $ (`${boxDetails} > div.body`).contents ().each ((i, elem) => {
      const text = $ (elem).text ().trim ();
      const colon = text.indexOf (':');
      const label = (colon === -1 ? text : text.slice (0, colon)).trim ();
      if (label) {
        // Split on the first colon only, so values containing ':' survive.
        props[label] =
          colon === -1 ? undefined : text.slice (colon + 1).trim ();
      }
    });

    // The heading is a comma-separated text that starts with the category.
    const categoryParts = $ (`${boxDetails} > div.title > p`)
      .text ()
      .split (',', 3);
    let category = this.getCategoryIdfromText (categoryParts[0].trim ());
    if (category === undefined && categoryParts.length > 1) {
      // Some category names contain a comma themselves
      // (e.g. 'zemljište, ostalo'), so retry with the first two parts.
      category = this.getCategoryIdfromText (
        `${categoryParts[0]},${categoryParts[1]}`.trim ()
      );
    }

    return {
      title: $ (
        'div.container-fluid > div.container > div.row.content-top > div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1'
      ).text (),
      category,
      price: this._parseHundredths (
        $ (`${boxDetails} > div.prices > span.pull-left`).text ()
      ),
      size: this._parseHundredths (props['Površina']), // "area"
      rooms: parseInt (props['Broj soba'], 10), // "number of rooms"
      address: props['Ulica'], // "street"
      floor: parseInt (props['Spratnost'], 10),
      time: undefined,
      lat: 0,
      lng: 0,
      hasMap: false,
    };
  }

  /**
   * Parses a formatted number by removing every '.' and ',' and dividing the
   * remaining digits by 100.
   *
   * NOTE(review): this assumes the text always ends in two decimal digits,
   * as the original parsing did -- confirm against live pages.
   *
   * @param {string} [text] Formatted number, e.g. '1.250.000,00'.
   * @returns {number} The parsed value, or `NaN` when `text` is not a string
   *   or does not start with a number.
   */
  _parseHundredths (text) {
    if (typeof text !== 'string') return NaN;
    return parseFloat (text.replace (/[.,]/g, '')) / 100;
  }

  /**
   * Normalises a `maxResults` argument: anything that is not a number (or is
   * NaN) means "no limit".
   *
   * @param {*} maxResults Raw limit as passed by the caller.
   * @returns {number} The limit, or `Infinity` when none applies.
   */
  _resultLimit (maxResults) {
    return typeof maxResults === 'number' && !Number.isNaN (maxResults)
      ? maxResults
      : Infinity;
  }

  /**
   * Fetches one search-result page and indexes the ads linked from it,
   * pausing 500 ms after each ad.
   *
   * @param {number} pageNr Page number appended to the listing URL.
   * @param {number} [maxResults=1000] Maximum number of ads to index from
   *   this page.
   * @returns {Promise<Object|undefined>} Ad records keyed by ad URL, or
   *   `undefined` when the page itself could not be processed (the error is
   *   logged).
   */
  async indexPage (pageNr, maxResults = 1000) {
    try {
      console.log ('Starting to index page: ' + pageNr);

      const limit = this._resultLimit (maxResults);
      const url = 'http://www.rental.ba/pretraga/prodaja-1/stranica-' + pageNr;

      // The listing is requested with POST; no form body is sent.
      const res = await fetch (url, {
        method: 'POST',
      });
      if (!res.ok) {
        throw new Error (`Unexpected HTTP status ${res.status} for ${url}`);
      }
      const body = await res.text ();
      const $ = cheerio.load (body);

      const hrefs = [];
      $ ('.middle').each ((i, elem) => {
        const href = $ (elem).find ('a').first ().attr ('href');
        // Entries without a link cannot be fetched; skip them.
        if (href) hrefs.push (href);
      });

      const results = {};
      for (const href of hrefs) {
        if (Object.keys (results).length >= limit) break;

        console.log (`indexing: ${href}`);

        const singleData = await this.indexSingle (href);

        if (singleData) {
          results[href] = singleData;
        }

        await this.sleep (500);
      }

      return results;
    } catch (e) {
      console.error ('Exception caught:' + e);
    }

    return undefined;
  }

  /**
   * Maps the numeric `re_types_id` found in the embedded JSON to a category
   * enum value.
   *
   * @param {number} category Numeric category id.
   * @returns {*} The matching `CATEGORY_*` value, or `undefined` for unknown
   *   ids.
   */
  getCategoryIdfromNumber (category) {
    switch (category) {
      case 1:
        return CATEGORY_HOUSE;
      case 2:
        return CATEGORY_FLAT;
      case 3:
        return CATEGORY_APARTMENT;
      case 4:
        return CATEGORY_OFFICE;
      case 5:
        return CATEGORY_LAND;
      case 6:
        return CATEGORY_GARAGE;
      default:
        return undefined;
    }
  }

  /**
   * Maps a category label as shown on the site to a category enum value.
   * The comparison is exact (case-sensitive, no trimming).
   *
   * @param {string} category Category label.
   * @returns {*} The matching `CATEGORY_*` value, or `undefined` for unknown
   *   labels.
   */
  getCategoryIdfromText (category) {
    switch (category) {
      case 'samostojeća':
      case 'dvojna':
      case 'kuća u nizu':
      case 'stambeno-poslovni objekt':
      case 'prizemnica':
      case 'kuća na moru':
      case 'kuća u izgradnji':
      case 'dvorac':
      case 'apartmanska kuća':
      case 'porodična kuća':
      case 'vikend kuća':
      case 'luksuzna kuća':
      case 'kamena':
      case 'vila':
      case 'splav':
        return CATEGORY_HOUSE;

      case 'stan u zgradi':
      case 'stan u kući':
      case 'stan višeetažni':
      case 'stan višeetažni u kući':
      case 'stan u starijoj zgradi':
      case 'stan u novogradnji':
      case 'stan u neboderu':
      case 'Korišten stan u novogradnji':
        return CATEGORY_FLAT;

      case 'apartman na moru':
      case 'apartman u planini':
        return CATEGORY_APARTMENT;

      case 'unutrašnje garažno mjesto':
      case 'unutrašnje parkirno mjesto':
        return CATEGORY_GARAGE;

      case 'građevinsko':
      case 'građevinsko stambeno':
      case 'zemljište, ostalo':
      case 'odmaralište':
      case 'oranica':
      case 'šuma':
      case 'livada':
      case 'građevinsko M2':
      case 'građevinsko M1':
      case 'građevinsko - turističko':
      case 'građevinsko - poslovno':
      case 'otok':
      case 'poljoprivredno':
        return CATEGORY_LAND;

      case 'lokal':
      case 'ured':
      case 'skladište ili garaža':
      case 'radionica':
      case 'tvornica':
      case 'restoran':
      case 'sportski centar':
      case 'ordinacija':
      case 'kiosk':
      case 'auto-praonica':
      case 'poslovna zgrada':
      case 'skladište':
      case 'garaža':
      case 'hotel':
      case 'pansion':
      case 'apartmanska zgrada':
      case 'trgovina':
      case 'prodajno skladišni':
      case 'proizvodno skladišni':
      case 'Kancelarije':
      case 'Poslovni prostor':
        return CATEGORY_OFFICE;

      default:
        return undefined;
    }
  }

  /**
   * Maps the text of the status badge to a status enum value.
   *
   * @param {string} status Badge text; surrounding whitespace is ignored.
   * @returns {*} `STATUS_SOLD` for 'Prodato' ("sold"), otherwise
   *   `STATUS_NORMAL`.
   */
  getStatusIdFromText (status) {
    if (typeof status === 'string' && status.trim () === 'Prodato') {
      return STATUS_SOLD;
    }

    return STATUS_NORMAL;
  }

  /**
   * @param {number} ms Delay in milliseconds.
   * @returns {Promise<void>} Resolves after `ms` milliseconds.
   */
  sleep (ms) {
    return new Promise (resolve => setTimeout (resolve, ms));
  }

  /**
   * Indexes the search-result pages `start` to `end` (inclusive) in order,
   * pausing 5 s between pages, and merges their ads into one object.
   * Stops early once `maxResults` ads have been collected.
   *
   * @param {number} start First page number.
   * @param {number} end Last page number (inclusive).
   * @param {number} [maxResults=1000] Maximum number of ads to return.
   * @returns {Promise<Object>} Ad records keyed by ad URL.
   */
  async indexPages (start, end, maxResults = 1000) {
    const limit = this._resultLimit (maxResults);
    const results = {};
    let count = 0;

    for (let page = start; page <= end && count < limit; page++) {
      // A failed page yields undefined; treat it as an empty page.
      const pageResults = (await this.indexPage (page, limit - count)) || {};

      for (const href of Object.keys (pageResults)) {
        if (count >= limit) break;
        if (!Object.prototype.hasOwnProperty.call (results, href)) count++;
        results[href] = pageResults[href];
      }

      // Only wait when another page is actually going to be requested.
      if (page < end && count < limit) {
        await this.sleep (5000);
      }
    }

    return results;
  }

  /**
   * Runs the crawl over the page range and result limit given to the
   * constructor.
   *
   * @returns {Promise<Object>} Ad records keyed by ad URL.
   */
  async crawl () {
    return this.indexPages (this.fromPage, this.toPage, this.maxResults);
  }
}
|