crawler upgrade, server upgrade

This commit is contained in:
GotPPay
2017-10-30 22:54:56 +01:00
parent 039e34237d
commit a63c108259
14 changed files with 8757 additions and 2517 deletions

View File

@@ -8,11 +8,17 @@ let FormData = require('form-data');
import {
AD_TYPE_SALE,
IGNORED_USERNAMES,
CATEGORY_FLAT,
CATEGORY_HOUSE,
CATEGORY_OFFICE,
CATEGORY_LAND
CATEGORY_LAND,
STATUS_NORMAL,
STATUS_RESERVED,
STATUS_SOLD
} from '../enums';
export default class ProstorCrawler {
@@ -58,14 +64,25 @@ export default class ProstorCrawler {
const latLngRe = /marker=([0-9]+\.[0-9]+)\,\s*([0-9]+\.[0-9]+)/g;
var has_map = false;
var tmpTitle = title.toUpperCase();
var status = STATUS_NORMAL;
if (tmpTitle.indexOf("PRODANO") !== -1) status = STATUS_SOLD;
if (tmpTitle.indexOf("REZERVISANO") !== -1) status = STATUS_RESERVED;
//const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g;
const matches = latLngRe.exec(body);
let lng = '',
lat = '';
has_map = false;
if (matches && matches.length >= 3) {
lat = matches[1];
lng = matches[2];
has_map = true;
}
//console.log({
@@ -136,6 +153,8 @@ export default class ProstorCrawler {
lat,
lng,
loc: [parseFloat(lat), parseFloat(lng)],
has_map,
status,
//images: cloudinaryImages
images
};

View File

@@ -8,13 +8,19 @@ let FormData = require('form-data');
import {
AD_TYPE_SALE,
IGNORED_USERNAMES,
CATEGORY_FLAT,
CATEGORY_HOUSE,
CATEGORY_OFFICE,
CATEGORY_LAND,
CATEGORY_APARTMENT,
CATEGORY_GARAGE
CATEGORY_GARAGE,
STATUS_NORMAL,
STATUS_RESERVED,
STATUS_SOLD
} from '../enums';
export default class RentalCrawler {
@@ -44,36 +50,55 @@ export default class RentalCrawler {
var descriptions;
var floor;
var floor;
var time;
var time;
var lat;
var lng;
var has_map;
var status;
//Oglas koji nema JSON objekat, nema ni mape
//No JSON string -> No map
try{
const complete_data = $('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(7) > script').text();
const data_json_string = complete_data.slice(21,-1);
const data_json = JSON.parse(data_json_string);
let complete_data;
let data_json_string;
let data_json;
const start_n = 5;
const last_n = 15;
for (let i=start_n;i<=last_n;i++){
try{
complete_data = $('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child('+i+') > script').text();
data_json_string = complete_data.slice(21,-1);
data_json = JSON.parse(data_json_string);
break;
}catch(e){
console.log("No JSON string");
if (i===last_n) throw(e);
}
}
title = data_json["re_realEstates_portalName"];
category = this.getCategoryIdfromNumber(data_json["re_types_id"]); //kategorije ne odgovaraju brojevima u Enums !!!
price = data_json["re_realEstates_price"];
size = data_json["re_realEstates_area"];
rooms = data_json["re_realEstates_roomsNO"];
category = this.getCategoryIdfromNumber(parseInt(data_json["re_types_id"])); //categories from JSON string doesn't match categories in ENUMS
price = parseFloat(data_json["re_realEstates_price"]);
size = parseFloat(data_json["re_realEstates_area"]);
rooms = parseInt(data_json["re_realEstates_roomsNO"]);
address = data_json["re_realEstates_address"];
descriptions = data_json["re_realEstates_description"];
floor = data_json["re_realEstates_floorNO"];
time = data_json["re_realEstates_inserted"];
lat = data_json["re_realEstates_latitude"];
lng = data_json["re_realEstates_longitude"];
//descriptions = data_json["re_realEstates_description"];
floor = parseInt(data_json["re_realEstates_floorNO"]);
let time_array = data_json["re_realEstates_inserted"].slice(0,data_json["re_realEstates_inserted"].indexOf(' ')).split('-');
time = time_array[2]+'.'+time_array[1]+'.'+time_array[0];
time = data_json["re_realEstates_inserted"];
lat = data_json["re_realEstates_latitude"];
lng = data_json["re_realEstates_longitude"];
has_map = true;
}catch(e){
//oglas nema JSON objekat, informacije izvući preko selektora
console.log("error : " + e);
//This ad has no JSON string; its information has to be retrieved using HTML selectors
time=undefined;
lat=undefined;
lng=undefined;
lat=0;
lng=0;
has_map = false;
price = (parseFloat($('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.prices > span.pull-left').text().replace(',','').replace('.','')))/100;
@@ -86,23 +111,27 @@ export default class RentalCrawler {
address = props_list['Ulica'];
size = parseFloat((props_list['Površina']).replace(',','').replace('.',''))/100;
rooms = props_list['Broj soba'];
rooms = parseInt(props_list['Broj soba']);
floor = parseInt(props_list['Spratnost']);
title = $('div.container-fluid > div.container > div.row.content-top > div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1').text();
descriptions = $('#b1 > div > div > div').text();
const full_category = $('body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.title > p').text().split(',',3);
category = (full_category.size > 2) ?
this.getCategoryIdfromText(full_category[0]+full_category[1]) :
this.getCategoryIdfromText(full_category[0]);
}
descriptions = $('#b1 > div > div > div').text();
status = this.getStatusIdFromText($('#a1 > div.box-badges > div').text());
const images = [];
$(".img-gallery").contents().map((i,elem)=>{
//the link stored in tmp also carries parameters for the max height and width
const tmp =$(elem).attr('data-preview');
if(tmp) images.push(tmp);
});
@@ -123,12 +152,12 @@ export default class RentalCrawler {
lat,
lng,
loc: [parseFloat(lat), parseFloat(lng)],
has_map,
status,
//images: cloudinaryImages
images
};
console.log(data);
return data;
} catch (e) {
@@ -197,12 +226,12 @@ export default class RentalCrawler {
getCategoryIdfromNumber(category){
switch(category){
case (1): return CATEGORY_HOUSE;
case (2): return CATEGORY_FLAT;
case (3): return CATEGORY_APARTMENT;
case (4): return CATEGORY_OFFICE;
case (5): return CATEGORY_LAND;
case (6): return CATEGORY_GARAGE;
case (1): return CATEGORY_HOUSE; break;
case (2): return CATEGORY_FLAT; break;
case (3): return CATEGORY_APARTMENT; break;
case (4): return CATEGORY_OFFICE; break;
case (5): return CATEGORY_LAND; break;
case (6): return CATEGORY_GARAGE; break;
}
}
@@ -279,6 +308,12 @@ export default class RentalCrawler {
}
}
getStatusIdFromText(status){
if (status === 'Prodato') return STATUS_SOLD;
return STATUS_NORMAL;
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}