336 lines
11 KiB
JavaScript
336 lines
11 KiB
JavaScript
'use strict'
|
|
|
|
let fetch = require('node-fetch');
|
|
let cheerio = require('cheerio');
|
|
let fs = require('fs');
|
|
let cloudinary = require('cloudinary');
|
|
let FormData = require('form-data');
|
|
|
|
import {
|
|
AD_TYPE_SALE,
|
|
|
|
IGNORED_USERNAMES,
|
|
|
|
CATEGORY_FLAT,
|
|
CATEGORY_HOUSE,
|
|
CATEGORY_OFFICE,
|
|
CATEGORY_LAND,
|
|
CATEGORY_APARTMENT,
|
|
CATEGORY_GARAGE,
|
|
|
|
STATUS_NORMAL,
|
|
STATUS_RESERVED,
|
|
STATUS_SOLD
|
|
} from '../enums';
|
|
|
|
// CSS selector for the main content column of an ad page. The selectors below
// are tied to the exact markup of the crawled site and will need updating
// whenever that markup changes.
const CONTENT_MAIN_SELECTOR = 'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main';

// Details box (price, property list, category heading) inside the main column.
const DETAILS_BOX_SELECTOR = CONTENT_MAIN_SELECTOR + ' > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details';

// Heading that holds the ad title.
const TITLE_SELECTOR = 'div.container-fluid > div.container > div.row.content-top > div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1';

// Range of `div:nth-child(n)` positions probed for the inline <script> that
// carries the ad data as JSON.
const JSON_BLOCK_FIRST = 5;
const JSON_BLOCK_LAST = 15;

// Number of characters preceding the JSON payload inside that <script>; the
// payload is additionally followed by one trailing character that is dropped.
const JSON_PREFIX_LENGTH = 21;

/**
 * Crawler for the sale listings of rental.ba.
 *
 * Listing pages are fetched one by one, every ad linked from a page is fetched
 * and parsed, and the parsed ads are returned as a plain object keyed by the
 * ad URL.
 */
export default class RentalCrawler {

  /**
   * @param {number} [fromPage=0] First listing page to crawl (inclusive).
   * @param {number} [toPage=10] Last listing page to crawl (inclusive).
   * @param {number} [maxResults=1000] Stored and passed along to the indexing
   *   methods. NOTE(review): nothing in this class enforces it yet.
   */
  constructor(fromPage = 0, toPage = 10, maxResults = 1000) {
    console.log("Rental Crawler");

    this.fromPage = fromPage;
    this.toPage = toPage;
    this.maxResults = maxResults;
  }

  /**
   * Fetches a single ad page and extracts its data.
   *
   * The ad data is read from the JSON embedded in an inline <script> when one
   * is found; such ads carry coordinates (`has_map: true`). Otherwise the
   * values are scraped from the HTML and the coordinates are set to 0.
   *
   * @param {string} url Absolute URL of the ad page.
   * @returns {Promise<Object|null>} The parsed ad, or `null` when the page
   *   could not be fetched or parsed (the error is logged, never thrown).
   */
  async indexSingle(url) {
    try {
      const res = await fetch(url);
      if (!res.ok) {
        throw new Error(`HTTP ${res.status} for ${url}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);

      let fields = null;
      const embedded = extractEmbeddedJson($);
      if (embedded === null) {
        console.log("No JSON string");
      } else {
        try {
          fields = fieldsFromJson(embedded, this);
        } catch (e) {
          // Incomplete JSON (e.g. no insertion date): fall back to the HTML.
          console.log("error : " + e);
        }
      }
      if (fields === null) {
        fields = fieldsFromHtml($, this);
      }

      // Description, status badge and gallery are always read from the HTML.
      const longDescription = $('#b1 > div > div > div').text();
      const status = this.getStatusIdFromText($('#a1 > div.box-badges > div').text());

      const images = [];
      $(".img-gallery").contents().each((i, elem) => {
        const preview = $(elem).attr('data-preview');
        if (preview) {
          images.push(preview);
        }
      });

      return {
        category: fields.category,
        url,
        title: fields.title,
        price: fields.price,
        size: fields.size,
        rooms: fields.rooms,
        floor: fields.floor,
        address: fields.address,
        adType: AD_TYPE_SALE,
        time: fields.time,
        shortDescription: fields.title,
        longDescription,
        lat: fields.lat,
        lng: fields.lng,
        loc: [parseFloat(fields.lat), parseFloat(fields.lng)],
        has_map: fields.has_map,
        status,
        images,
      };
    } catch (e) {
      console.error('Exception caught: ' + e.message);
    }

    return null;
  }

  /**
   * Fetches one listing page and indexes every ad linked from it, waiting
   * 500 ms after each ad.
   *
   * @param {number} pageNr Number of the listing page.
   * @param {number} [maxResults=1000] Accepted for interface compatibility;
   *   currently not used.
   * @returns {Promise<Object|undefined>} Parsed ads keyed by ad URL, or
   *   `undefined` when the listing page itself failed (the error is logged).
   */
  async indexPage(pageNr, maxResults = 1000) {
    try {
      console.log('Starting to index page: ' + pageNr);

      const url = "http://www.rental.ba/pretraga/prodaja-1/stranica-" + pageNr;

      // The request is sent as a POST without a body. An earlier, disabled
      // version sent a search form (form-data) with these fields:
      //   sales = 1          (the author believed this selects ads of type "sale")
      //   re_types_id = ''   (property type: house, flat, apartment, ...)
      //   re_subTypes_id = 1
      //   and empty values for full_text, re_realEstates_code and the
      //   price / area / roomsNO / floorNO "_min" and "_max" filters.
      const res = await fetch(url, {
        method: 'POST'
      });
      if (!res.ok) {
        throw new Error(`HTTP ${res.status} for ${url}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);

      const hrefs = [];
      $('.middle').each((i, elem) => {
        const href = $(elem).find("a").first().attr('href');
        // Entries without a link would only produce a failing fetch.
        if (href) {
          hrefs.push(href);
        }
      });

      const results = {};
      // Results are keyed by href, so fetching a repeated link twice is wasted work.
      for (const href of new Set(hrefs)) {
        console.log(`indexing: ${href}`);

        const singleData = await this.indexSingle(href);
        if (singleData) {
          results[href] = singleData;
        }

        await this.sleep(500);
      }

      return results;
    } catch (e) {
      console.error('Exception caught:' + e);
    }

    return undefined;
  }

  /**
   * Maps the numeric property type found in the embedded JSON (`re_types_id`)
   * to a category constant. Those numbers do not match the values in the
   * enums module, hence the translation.
   *
   * @param {number} category Numeric property type.
   * @returns {*} The matching CATEGORY_* constant, or `undefined` when the
   *   number is not known.
   */
  getCategoryIdfromNumber(category) {
    switch (category) {
      case 1: return CATEGORY_HOUSE;
      case 2: return CATEGORY_FLAT;
      case 3: return CATEGORY_APARTMENT;
      case 4: return CATEGORY_OFFICE;
      case 5: return CATEGORY_LAND;
      case 6: return CATEGORY_GARAGE;
      default: return undefined;
    }
  }

  /**
   * Maps the property sub-type text shown on an ad page to a category
   * constant. The comparison is exact (case- and whitespace-sensitive).
   *
   * @param {string} category Sub-type text as shown on the site.
   * @returns {*} The matching CATEGORY_* constant, or `undefined` when the
   *   text is not known.
   */
  getCategoryIdfromText (category) {
    switch (category) {
      case 'samostojeća':
      case 'dvojna':
      case 'kuća u nizu':
      case 'stambeno-poslovni objekt':
      case 'prizemnica':
      case 'kuća na moru':
      case 'kuća u izgradnji':
      case 'dvorac':
      case 'apartmanska kuća':
      case 'porodična kuća':
      case 'vikend kuća':
      case 'luksuzna kuća':
      case 'kamena':
      case 'vila':
      case 'splav':
        return CATEGORY_HOUSE;

      case 'stan u zgradi':
      case 'stan u kući':
      case 'stan višeetažni':
      case 'stan višeetažni u kući':
      case 'stan u starijoj zgradi':
      case 'stan u novogradnji':
      case 'stan u neboderu':
      case 'Korišten stan u novogradnji':
        return CATEGORY_FLAT;

      case 'apartman na moru':
      case 'apartman u planini':
        return CATEGORY_APARTMENT;

      case 'unutrašnje garažno mjesto':
      case 'unutrašnje parkirno mjesto':
        return CATEGORY_GARAGE;

      case 'građevinsko':
      case 'građevinsko stambeno':
      case 'zemljište, ostalo':
      case 'odmaralište':
      case 'oranica':
      case 'šuma':
      case 'livada':
      case 'građevinsko M2':
      case 'građevinsko M1':
      case 'građevinsko - turističko':
      case 'građevinsko - poslovno':
      case 'otok':
      case 'poljoprivredno':
        return CATEGORY_LAND;

      // Business premises; note that plain 'garaža' is listed here, not
      // under CATEGORY_GARAGE.
      case 'lokal':
      case 'ured':
      case 'skladište ili garaža':
      case 'radionica':
      case 'tvornica':
      case 'restoran':
      case 'sportski centar':
      case 'ordinacija':
      case 'kiosk':
      case 'auto-praonica':
      case 'poslovna zgrada':
      case 'skladište':
      case 'garaža':
      case 'hotel':
      case 'pansion':
      case 'apartmanska zgrada':
      case 'trgovina':
      case 'prodajno skladišni':
      case 'proizvodno skladišni':
      case 'Kancelarije':
      case 'Poslovni prostor':
        return CATEGORY_OFFICE;

      default:
        return undefined;
    }
  }

  /**
   * Maps the text of the status badge to a status constant.
   *
   * @param {string} status Badge text; surrounding whitespace is ignored.
   * @returns {*} STATUS_SOLD for 'Prodato', STATUS_NORMAL for anything else.
   */
  getStatusIdFromText(status) {
    if (typeof status === 'string' && status.trim() === 'Prodato') {
      return STATUS_SOLD;
    }

    return STATUS_NORMAL;
  }

  /**
   * @param {number} ms Delay in milliseconds.
   * @returns {Promise<void>} Promise resolved after `ms` milliseconds.
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Indexes the listing pages `start`..`end` (both inclusive) one after the
   * other, pausing 5 seconds between two pages.
   *
   * @param {number} start First page number.
   * @param {number} end Last page number.
   * @param {number} [maxResults=1000] Passed on to `indexPage`.
   * @returns {Promise<Object>} All parsed ads keyed by ad URL; pages that
   *   failed contribute nothing.
   */
  async indexPages(start, end, maxResults = 1000) {
    const results = {};
    for (let i = start; i <= end; i++) {
      const pageResults = await this.indexPage(i, maxResults);
      // Object.assign ignores an undefined source, i.e. a failed page.
      Object.assign(results, pageResults);
      // No request follows the last page, so there is nothing to wait for.
      if (i < end) {
        await this.sleep(5000);
      }
    }
    return results;
  }

  /**
   * Crawls the page range configured in the constructor.
   *
   * @returns {Promise<Object>} All parsed ads keyed by ad URL.
   */
  async crawl() {
    return this.indexPages(this.fromPage, this.toPage, this.maxResults);
  }
}

// Looks for the inline <script> carrying the ad data and returns the parsed
// JSON object, or null when none of the probed positions holds valid JSON.
function extractEmbeddedJson($) {
  for (let i = JSON_BLOCK_FIRST; i <= JSON_BLOCK_LAST; i++) {
    const scriptText = $(CONTENT_MAIN_SELECTOR + ' > div:nth-child(' + i + ') > script').text();
    try {
      const parsed = JSON.parse(scriptText.slice(JSON_PREFIX_LENGTH, -1));
      if (parsed !== null && typeof parsed === 'object') {
        return parsed;
      }
    } catch (e) {
      // No JSON at this position; keep probing.
    }
  }
  return null;
}

// Builds the ad fields from the embedded JSON. Throws when the insertion
// timestamp is missing, which makes the caller fall back to the HTML.
function fieldsFromJson(json, crawler) {
  // Timestamp looks like "YYYY-MM-DD hh:mm:ss"; only the date part is kept and
  // re-ordered to "DD.MM.YYYY". Splitting on the space (instead of slicing up
  // to its index) also handles a value without a time part.
  const [year, month, day] = json["re_realEstates_inserted"].split(' ')[0].split('-');

  return {
    title: json["re_realEstates_portalName"],
    category: crawler.getCategoryIdfromNumber(parseInt(json["re_types_id"], 10)),
    price: parseFloat(json["re_realEstates_price"]),
    size: parseFloat(json["re_realEstates_area"]),
    rooms: parseInt(json["re_realEstates_roomsNO"], 10),
    address: json["re_realEstates_address"],
    floor: parseInt(json["re_realEstates_floorNO"], 10),
    time: day + '.' + month + '.' + year,
    lat: json["re_realEstates_latitude"],
    lng: json["re_realEstates_longitude"],
    has_map: true,
  };
}

// Builds the ad fields by scraping the details box of the page. Used for ads
// without embedded JSON; such ads get no date and no coordinates.
function fieldsFromHtml($, crawler) {
  const price = parseHundredths($(DETAILS_BOX_SELECTOR + ' > div.prices > span.pull-left').text());

  // "Label: value" entries of the details box.
  const props = Object.create(null);
  $(DETAILS_BOX_SELECTOR + ' > div.body').contents().each((i, elem) => {
    const text = $(elem).text().trim();
    const colon = text.indexOf(':');
    const label = (colon === -1 ? text : text.slice(0, colon)).trim();
    if (label) {
      props[label] = colon === -1 ? undefined : text.slice(colon + 1).trim();
    }
  });

  // An ad without an area has always been skipped (previously through an
  // accidental TypeError); keep skipping it, but say why.
  if (props['Površina'] === undefined) {
    throw new Error('ad page has no "Površina" (area) entry');
  }

  const title = $(TITLE_SELECTOR).text();
  const heading = $(DETAILS_BOX_SELECTOR + ' > div.title > p').text();

  return {
    title,
    category: categoryFromHeading(heading, crawler),
    price,
    size: parseHundredths(props['Površina']),
    rooms: parseInt(props['Broj soba'], 10),
    address: props['Ulica'],
    floor: parseInt(props['Spratnost'], 10),
    time: undefined,
    lat: 0,
    lng: 0,
    has_map: false,
  };
}

// Resolves the category from the comma-separated heading of the details box.
// The first part is tried first; when it is unknown, the first two parts are
// tried together, because a sub-type name may itself contain a comma
// ('zemljište, ostalo').
// NOTE(review): the original branch for this (`full_category.size > 2`) was
// dead code, so the intended rule is inferred - confirm against real pages.
function categoryFromHeading(heading, crawler) {
  const parts = heading.split(',', 3);
  const category = crawler.getCategoryIdfromText(parts[0].trim());
  if (category !== undefined || parts.length < 2) {
    return category;
  }
  return crawler.getCategoryIdfromText((parts[0] + ',' + parts[1]).trim());
}

// Parses the first number in `text`, written with '.'/',' separators and
// - as the original code assumed - exactly two decimal digits ("1.250.000,00"
// -> 1250000). Every separator is removed, not just the first one of each
// kind. Returns NaN when the text contains no digits.
function parseHundredths(text) {
  const match = /\d[\d.,]*/.exec(String(text));
  if (match === null) {
    return NaN;
  }
  return parseFloat(match[0].replace(/[.,]/g, '')) / 100;
}
|