new structure; code polish

This commit is contained in:
GotPPay
2017-10-31 20:20:09 +01:00
parent a63c108259
commit 7a5f7242ac
25 changed files with 1670 additions and 6108 deletions

View File

@@ -1,45 +1,39 @@
'use strict'
'use strict';
let fetch = require('node-fetch');
let cheerio = require('cheerio');
let fs = require('fs');
let cloudinary = require('cloudinary');
let FormData = require('form-data');
let fetch = require ('node-fetch');
let cheerio = require ('cheerio');
let fs = require ('fs');
let cloudinary = require ('cloudinary');
let FormData = require ('form-data');
import {
AD_TYPE_SALE,
IGNORED_USERNAMES,
CATEGORY_FLAT,
CATEGORY_HOUSE,
CATEGORY_OFFICE,
CATEGORY_LAND,
CATEGORY_APARTMENT,
CATEGORY_GARAGE,
STATUS_NORMAL,
STATUS_RESERVED,
STATUS_SOLD
} from '../enums';
STATUS_SOLD,
} from '../../common/enums';
export default class RentalCrawler {
constructor(fromPage = 0, toPage = 10, maxResults = 1000) {
console.log("Rental Crawler");
constructor (fromPage = 0, toPage = 10, maxResults = 1000) {
console.log ('Rental Crawler');
this.fromPage = fromPage;
this.toPage = toPage;
this.maxResults = maxResults;
}
async indexSingle(url) {
async indexSingle (url) {
try {
const res = await fetch(url);
const body = await res.text();
const $ = cheerio.load(body);
const res = await fetch (url);
const body = await res.text ();
const $ = cheerio.load (body);
var title;
var category;
@@ -50,90 +44,119 @@ export default class RentalCrawler {
var descriptions;
var floor;
var floor;
var time;
var time;
var lat;
var lng;
var has_map;
var hasMap;
var status;
//No JSON string -> No map
try{
let complete_data;
let data_json_string;
let data_json;
try {
let completeData;
let dataJsonString;
let dataJson;
const start_n = 5;
const last_n = 15;
const startN = 5;
const lastN = 15;
for (let i=start_n;i<=last_n;i++){
try{
complete_data = $('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child('+i+') > script').text();
data_json_string = complete_data.slice(21,-1);
data_json = JSON.parse(data_json_string);
for (let i = startN; i <= lastN; i++) {
try {
completeData = $ (
'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(' +
i +
') > script'
).text ();
dataJsonString = completeData.slice (21, -1);
dataJson = JSON.parse (dataJsonString);
break;
}catch(e){
console.log("No JSON string");
if (i===last_n) throw(e);
} catch (e) {
console.log ('No JSON string');
if (i === lastN) throw e;
}
}
}
title = data_json["re_realEstates_portalName"];
category = this.getCategoryIdfromNumber(parseInt(data_json["re_types_id"])); //categories from JSON string doesn't match categories in ENUMS
price = parseFloat(data_json["re_realEstates_price"]);
size = parseFloat(data_json["re_realEstates_area"]);
rooms = parseInt(data_json["re_realEstates_roomsNO"]);
address = data_json["re_realEstates_address"];
//descriptions = data_json["re_realEstates_description"];
floor = parseInt(data_json["re_realEstates_floorNO"]);
let time_array = data_json["re_realEstates_inserted"].slice(0,data_json["re_realEstates_inserted"].indexOf(' ')).split('-');
time = time_array[2]+'.'+time_array[1]+'.'+time_array[0];
lat = data_json["re_realEstates_latitude"];
lng = data_json["re_realEstates_longitude"];
has_map = true;
}catch(e){
console.log("error : " + e);
title = dataJson['re_realEstates_portalName'];
category = this.getCategoryIdfromNumber (
parseInt (dataJson['re_types_id'])
); //categories from JSON string doesn't match categories in ENUMS
price = parseFloat (dataJson['re_realEstates_price']);
size = parseFloat (dataJson['re_realEstates_area']);
rooms = parseInt (dataJson['re_realEstates_roomsNO']);
address = dataJson['re_realEstates_address'];
//descriptions = dataJson["re_realEstates_description"];
floor = parseInt (dataJson['re_realEstates_floorNO']);
let timeArray = dataJson['re_realEstates_inserted']
.slice (0, dataJson['re_realEstates_inserted'].indexOf (' '))
.split ('-');
time = timeArray[2] + '.' + timeArray[1] + '.' + timeArray[0];
lat = dataJson['re_realEstates_latitude'];
lng = dataJson['re_realEstates_longitude'];
hasMap = true;
} catch (e) {
console.log ('error : ' + e);
//This ad has no JSON string, informations should be retrieved using HTML selectors
time=undefined;
lat=0;
lng=0;
has_map = false;
time = undefined;
lat = 0;
lng = 0;
hasMap = false;
price = (parseFloat($('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.prices > span.pull-left').text().replace(',','').replace('.','')))/100;
price =
parseFloat (
$ (
'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.prices > span.pull-left'
)
.text ()
.replace (',', '')
.replace ('.', '')
) / 100;
const props_list = {};
const propsList = {};
$('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body').contents().map((i,elem)=>{
const entry = $(elem).text().trim().split(':');
if (entry[0]) props_list[entry[0]]=entry[1];
});
$ (
'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body'
)
.contents ()
.map ((i, elem) => {
const entry = $ (elem).text ().trim ().split (':');
if (entry[0]) propsList[entry[0]] = entry[1];
});
address = props_list['Ulica'];
size = parseFloat((props_list['Površina']).replace(',','').replace('.',''))/100;
rooms = parseInt(props_list['Broj soba']);
floor = parseInt(props_list['Spratnost']);
address = propsList['Ulica'];
size =
parseFloat (
propsList['Površina'].replace (',', '').replace ('.', '')
) / 100;
rooms = parseInt (propsList['Broj soba']);
floor = parseInt (propsList['Spratnost']);
title = $('div.container-fluid > div.container > div.row.content-top > div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1').text();
descriptions = $('#b1 > div > div > div').text();
title = $ (
'div.container-fluid > div.container > div.row.content-top > div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1'
).text ();
descriptions = $ ('#b1 > div > div > div').text ();
const full_category = $('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.title > p').text().split(',',3);
category = (full_category.size > 2) ?
this.getCategoryIdfromText(full_category[0]+full_category[1]) :
this.getCategoryIdfromText(full_category[0]);
const fullCategory = $ (
'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.title > p'
)
.text ()
.split (',', 3);
category = fullCategory.size > 2
? this.getCategoryIdfromText (fullCategory[0] + fullCategory[1])
: this.getCategoryIdfromText (fullCategory[0]);
}
descriptions = $('#b1 > div > div > div').text();
status = this.getStatusIdFromText($('#a1 > div.box-badges > div').text());
descriptions = $ ('#b1 > div > div > div').text ();
status = this.getStatusIdFromText (
$ ('#a1 > div.box-badges > div').text ()
);
const images = [];
$(".img-gallery").contents().map((i,elem)=>{
const tmp =$(elem).attr('data-preview');
if(tmp) images.push(tmp);
$ ('.img-gallery').contents ().map ((i, elem) => {
const tmp = $ (elem).attr ('data-preview');
if (tmp) images.push (tmp);
});
const data = {
@@ -151,28 +174,26 @@ export default class RentalCrawler {
longDescription: descriptions,
lat,
lng,
loc: [parseFloat(lat), parseFloat(lng)],
has_map,
loc: [parseFloat (lat), parseFloat (lng)],
hasMap,
status,
//images: cloudinaryImages
images
images,
};
return data;
} catch (e) {
console.error('Exception caught: ' + e.message);
console.error ('Exception caught: ' + e.message);
}
return null;
}
async indexPage(pageNr, maxResults = 1000) {
async indexPage (pageNr, maxResults = 1000) {
try {
console.log ('Starting to index page: ' + pageNr);
console.log('Starting to index page: ' + pageNr);
const url = "http://www.rental.ba/pretraga/prodaja-1/stranica-" + pageNr;
const url = 'http://www.rental.ba/pretraga/prodaja-1/stranica-' + pageNr;
/*
const data = new FormData();
@@ -191,145 +212,213 @@ export default class RentalCrawler {
data.append('re_subTypes_id', 1);
*/
const res = await fetch(url, {
method: 'POST'
const res = await fetch (url, {
method: 'POST',
//body: data
});
const body = await res.text();
const $ = cheerio.load(body);
const body = await res.text ();
const $ = cheerio.load (body);
const hrefs = [];
$('.middle').each((i, elem) => {
const href = $(elem).find("a").first().attr('href');
hrefs.push(href);
$ ('.middle').each ((i, elem) => {
const href = $ (elem).find ('a').first ().attr ('href');
hrefs.push (href);
});
const results = {};
for (const href of hrefs) {
console.log(`indexing: ${href}`);
console.log (`indexing: ${href}`);
const singleData = await this.indexSingle(href);
const singleData = await this.indexSingle (href);
if (singleData) {
results[href] = singleData;
}
await this.sleep(500);
await this.sleep (500);
}
return results;
} catch (e) {
console.error('Exception caught:' + e);
console.error ('Exception caught:' + e);
}
}
getCategoryIdfromNumber(category){
switch(category){
case (1): return CATEGORY_HOUSE; break;
case (2): return CATEGORY_FLAT; break;
case (3): return CATEGORY_APARTMENT; break;
case (4): return CATEGORY_OFFICE; break;
case (5): return CATEGORY_LAND; break;
case (6): return CATEGORY_GARAGE; break;
getCategoryIdfromNumber (category) {
switch (category) {
case 1:
return CATEGORY_HOUSE;
case 2:
return CATEGORY_FLAT;
case 3:
return CATEGORY_APARTMENT;
case 4:
return CATEGORY_OFFICE;
case 5:
return CATEGORY_LAND;
case 6:
return CATEGORY_GARAGE;
}
}
getCategoryIdfromText (category) {
switch(category){
case ('samostojeća'): return CATEGORY_HOUSE
case ('dvojna'): return CATEGORY_HOUSE
case ('kuća u nizu'): return CATEGORY_HOUSE
case ('stambeno-poslovni objekt'): return CATEGORY_HOUSE
case ('prizemnica'): return CATEGORY_HOUSE
case ('kuća na moru'): return CATEGORY_HOUSE
case ('kuća u izgradnji'): return CATEGORY_HOUSE
case ('dvorac'): return CATEGORY_HOUSE
case ('apartmanska kuća'): return CATEGORY_HOUSE
case ('porodična kuća'): return CATEGORY_HOUSE
case ('vikend kuća'): return CATEGORY_HOUSE
case ('luksuzna kuća'): return CATEGORY_HOUSE
case ('kamena'): return CATEGORY_HOUSE
case ('vila'): return CATEGORY_HOUSE
case ('splav'): return CATEGORY_HOUSE
switch (category) {
case 'samostojeća':
return CATEGORY_HOUSE;
case 'dvojna':
return CATEGORY_HOUSE;
case 'kuća u nizu':
return CATEGORY_HOUSE;
case 'stambeno-poslovni objekt':
return CATEGORY_HOUSE;
case 'prizemnica':
return CATEGORY_HOUSE;
case 'kuća na moru':
return CATEGORY_HOUSE;
case 'kuća u izgradnji':
return CATEGORY_HOUSE;
case 'dvorac':
return CATEGORY_HOUSE;
case 'apartmanska kuća':
return CATEGORY_HOUSE;
case 'porodična kuća':
return CATEGORY_HOUSE;
case 'vikend kuća':
return CATEGORY_HOUSE;
case 'luksuzna kuća':
return CATEGORY_HOUSE;
case 'kamena':
return CATEGORY_HOUSE;
case 'vila':
return CATEGORY_HOUSE;
case 'splav':
return CATEGORY_HOUSE;
case ('stan u zgradi'): return CATEGORY_FLAT
case ('stan u kući'): return CATEGORY_FLAT
case ('stan višeetažni'): return CATEGORY_FLAT
case ('stan višeetažni u kući'): return CATEGORY_FLAT
case ('stan u starijoj zgradi'): return CATEGORY_FLAT
case ('stan u novogradnji'): return CATEGORY_FLAT
case ('stan u neboderu'): return CATEGORY_FLAT
case ('Korišten stan u novogradnji'): return CATEGORY_FLAT
case 'stan u zgradi':
return CATEGORY_FLAT;
case 'stan u kući':
return CATEGORY_FLAT;
case 'stan višeetažni':
return CATEGORY_FLAT;
case 'stan višeetažni u kući':
return CATEGORY_FLAT;
case 'stan u starijoj zgradi':
return CATEGORY_FLAT;
case 'stan u novogradnji':
return CATEGORY_FLAT;
case 'stan u neboderu':
return CATEGORY_FLAT;
case 'Korišten stan u novogradnji':
return CATEGORY_FLAT;
case ('apartman na moru'): return CATEGORY_APARTMENT
case ('apartman u planini'): return CATEGORY_APARTMENT
case ('unutrašnje garažno mjesto'): return CATEGORY_GARAGE
case ('unutrašnje parkirno mjesto'): return CATEGORY_GARAGE
case 'apartman na moru':
return CATEGORY_APARTMENT;
case 'apartman u planini':
return CATEGORY_APARTMENT;
case ('građevinsko'): return CATEGORY_LAND
case ('građevinsko stambeno'): return CATEGORY_LAND
case ('zemljište, ostalo'): return CATEGORY_LAND
case ('odmaralište'): return CATEGORY_LAND
case ('oranica'): return CATEGORY_LAND
case ('šuma'): return CATEGORY_LAND
case ('livada'): return CATEGORY_LAND
case ('građevinsko M2'): return CATEGORY_LAND
case ('građevinsko M1'): return CATEGORY_LAND
case ('građevinsko - turističko'): return CATEGORY_LAND
case ('građevinsko - poslovno'): return CATEGORY_LAND
case ('otok'): return CATEGORY_LAND
case ('poljoprivredno'): return CATEGORY_LAND
case ('lokal'): return CATEGORY_OFFICE
case ('ured'): return CATEGORY_OFFICE
case ('skladište ili garaža'): return CATEGORY_OFFICE
case ('radionica'): return CATEGORY_OFFICE
case ('tvornica'): return CATEGORY_OFFICE
case ('restoran'): return CATEGORY_OFFICE
case ('sportski centar'): return CATEGORY_OFFICE
case ('ordinacija'): return CATEGORY_OFFICE
case ('kiosk'): return CATEGORY_OFFICE
case ('auto-praonica'): return CATEGORY_OFFICE
case ('poslovna zgrada'): return CATEGORY_OFFICE
case ('skladište'): return CATEGORY_OFFICE
case ('garaža'): return CATEGORY_OFFICE
case ('hotel'): return CATEGORY_OFFICE
case ('pansion'): return CATEGORY_OFFICE
case ('apartmanska zgrada'): return CATEGORY_OFFICE
case ('trgovina'): return CATEGORY_OFFICE
case ('prodajno skladišni'): return CATEGORY_OFFICE
case ('proizvodno skladišni'): return CATEGORY_OFFICE
case ('Kancelarije'): return CATEGORY_OFFICE
case ('Poslovni prostor'): return CATEGORY_OFFICE
case 'unutrašnje garažno mjesto':
return CATEGORY_GARAGE;
case 'unutrašnje parkirno mjesto':
return CATEGORY_GARAGE;
case 'građevinsko':
return CATEGORY_LAND;
case 'građevinsko stambeno':
return CATEGORY_LAND;
case 'zemljište, ostalo':
return CATEGORY_LAND;
case 'odmaralište':
return CATEGORY_LAND;
case 'oranica':
return CATEGORY_LAND;
case 'šuma':
return CATEGORY_LAND;
case 'livada':
return CATEGORY_LAND;
case 'građevinsko M2':
return CATEGORY_LAND;
case 'građevinsko M1':
return CATEGORY_LAND;
case 'građevinsko - turističko':
return CATEGORY_LAND;
case 'građevinsko - poslovno':
return CATEGORY_LAND;
case 'otok':
return CATEGORY_LAND;
case 'poljoprivredno':
return CATEGORY_LAND;
case 'lokal':
return CATEGORY_OFFICE;
case 'ured':
return CATEGORY_OFFICE;
case 'skladište ili garaža':
return CATEGORY_OFFICE;
case 'radionica':
return CATEGORY_OFFICE;
case 'tvornica':
return CATEGORY_OFFICE;
case 'restoran':
return CATEGORY_OFFICE;
case 'sportski centar':
return CATEGORY_OFFICE;
case 'ordinacija':
return CATEGORY_OFFICE;
case 'kiosk':
return CATEGORY_OFFICE;
case 'auto-praonica':
return CATEGORY_OFFICE;
case 'poslovna zgrada':
return CATEGORY_OFFICE;
case 'skladište':
return CATEGORY_OFFICE;
case 'garaža':
return CATEGORY_OFFICE;
case 'hotel':
return CATEGORY_OFFICE;
case 'pansion':
return CATEGORY_OFFICE;
case 'apartmanska zgrada':
return CATEGORY_OFFICE;
case 'trgovina':
return CATEGORY_OFFICE;
case 'prodajno skladišni':
return CATEGORY_OFFICE;
case 'proizvodno skladišni':
return CATEGORY_OFFICE;
case 'Kancelarije':
return CATEGORY_OFFICE;
case 'Poslovni prostor':
return CATEGORY_OFFICE;
}
}
getStatusIdFromText(status){
getStatusIdFromText (status) {
if (status === 'Prodato') return STATUS_SOLD;
return STATUS_NORMAL;
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
async sleep (ms) {
return new Promise (resolve => setTimeout (resolve, ms));
}
async indexPages(start, end, maxResults = 1000) {
async indexPages (start, end, maxResults = 1000) {
let results = {};
for (let i = start; i <= end; i++) {
let result = await this.indexPage(i, maxResults);
Object.assign(results, result)
await this.sleep(5000);
let result = await this.indexPage (i, maxResults);
Object.assign (results, result);
await this.sleep (5000);
}
return results;
}
async crawl() {
let results = await this.indexPages(this.fromPage, this.toPage, this.maxResults);
async crawl () {
let results = await this.indexPages (
this.fromPage,
this.toPage,
this.maxResults
);
return results;
}
}