Rental crawler #2

Merged
senaduka merged 6 commits from rental-crawler into master 2017-11-02 21:37:02 +01:00
5 changed files with 488 additions and 0 deletions
Showing only changes of commit b2787ebda5 - Show all commits

10
crawler/detalji Normal file
View File

@@ -0,0 +1,10 @@
Početna stranica : http://www.rental.ba/pretraga/prodaja-1/stranica-1
kategorije :
kuća = 1
stan = 2
apartman = 3
poslovni prostor = 4
zemljište = 5
garaža = 6

1
crawler/hello Normal file

File diff suppressed because one or more lines are too long

1
crawler/helloworld.txt Normal file

File diff suppressed because one or more lines are too long

176
crawler/privremeni Normal file
View File

@@ -0,0 +1,176 @@
http://www.rental.ba/pretraga
sales=1
re_types_id=
full_text=
re_realEstates_code=
re_realEstates_price_max=
re_realEstates_price_min=
re_realEstates_area_max=
re_realEstates_area_min=
re_realEstates_roomsNO_min=
re_realEstates_roomsNO_max=
re_realEstate_floorNO_min=
re_realEstate_floorNO_max=
re_subTypes_id=1
search_count=1
<script>var json_map_data = [{
"re_realEstates_id":"1084",
"re_realEstates_code":"1084-1",
"re_realEstates_parent":"0",
"ag_agents_id":"1",
"re_types_id":"2",
"re_subTypes_id":"1",
"re_action_id":"1",
"re_status_id":"1",
"loc_countries_id":"4",
"loc_counties_id":"400008",
"loc_islands_id":"0",
"loc_cities_id":"400076",
"loc_cityAreas_id":"400071",
"loc_quarters_id":"400128",
"re_realEstates_address":"Adema Bu\u0107e ",
"re_realEstates_addressNO":null,
"op_realEstates_addressSync":"1",
"pr_projects_id":"0",
"pr_buildings_id":"0",
"re_realEstates_name":"",
"re_realEstates_longitude":"18.364784424420122",
"re_realEstates_latitude":"43.85508550138114",
"op_realEstates_gmapSync":"1",
"re_realEstates_price":"127000",
"re_realEstates_priceM2":"2116.67",
"re_realEstates_priceOld":"",
"re_realEstates_priceCustomer":null,
"op_realEstates_priceOnRequest":"0",
"op_realEstates_priceWithoutTaxes":"0",
"op_realEstates_utilitiesIncluded":"0",
"re_realEstates_reservePrice":"0",
"re_realEstates_repurchase":null,
"re_realEstates_area":"60",
"re_realEstates_infield":"0",
"re_realEstates_warehouseArea":"0",
"re_realEstates_constructionQuotient":"0",
"re_realEstates_constructionFloors":"0",
"re_realEstates_grossDevelopedArea":"0",
"re_realEstates_parkingNO":"0",
"re_realEstates_bedNO":"0",
"re_realEstates_bathroomNO":"1",
"re_realEstates_roomsNO":"2",
"op_realEstates_openSpace":"0",
"op_realEstates_roomApartment":"0",
"re_realEstates_flatsNO":"0",
"re_realEstates_floorNO":"3",
"re_realEstates_floorsNO":"8",
"re_realEstates_ceilingHeight":"",
"re_realEstates_actualAge":"2010",
"re_realEstates_movingIn":"",
"re_realEstates_LRcartridge":null,
"re_realEstates_LRsubCartridge":null,
"re_realEstates_LRlotNo":null,
"re_realEstates_LRcounties":null,
"op_realEstates_newBuilding":"0",
"op_realEstates_buildingPermit":"0",
"op_realEstates_locationPermit":"0",
"op_realEstates_inspectionCertificat":"0",
"re_realEstates_landRegisterNotification":null,
"op_realEstates_seafront":"0",
"di_realEstates_transportation":"5",
"di_realEstates_seaDistance":"",
"di_realEstates_center":"",
"i18n_id":"57",
"re_access_id":"9",
"re_agencyCommission_id":"0",
"re_categories_id":"0",
"re_descriptions_id":"2,14,1,28,26,3,33,19",
"re_descriptions_area":null,
"re_energyEfficiency_id":"0",
"re_floorNO_id":"",
"re_heating_id":"4",
"re_infrastructure_id":"9,12,16,11,5,4,1,8,2",
"re_joinery_id":"6",
"re_orientation_id":"1",
"re_ownerships_id":"0",
"re_proofs_id":"0",
"re_propertyCondition_id":"8",
"re_registryStatus_id":"0",
"re_spaces_id":"1,4,2",
"re_spaces_values":"a:3:{i:1;s:0:\"\";i:4;s:2:\"22\";i:2;s:0:\"\";}",
"re_transportation_id":"",
"re_realEstates_priority":"0",
"cp_realEstates_name":null,
"cp_realEstates_sex":null,
"cp_realEstates_phone":null,
"cp_realEstates_phone2":null,
"cp_realEstates_email":null,
"cl_clients_buyer_id":null,
"cl_transactionRiskLevel_id":null,
"re_realEstates_contractualPrice":null,
"re_realEstates_commission":null,
"re_realEstates_conclusionPlace":null,
"re_realEstates_conclusionDate":null,
"re_realEstates_downPayment":null,
"re_realEstates_downPaymentDate":null,
"re_realEstates_payoffDate":null,
"adm_realEstates_discount":"0",
"adm_realEstates_discountDate":"0000-00-00",
"op_realEstates_web":"0",
"op_realEstates_specialOffer":"1",
"op_realEstates_bestBuy":"1",
"re_portals_id":null,
"re_realEstates_rentedTo":null,
"re_realEstates_recived":null,
"re_realEstates_inserted":"2017-10-11 14:13:41",
"re_realEstates_edited":"2017-10-11 14:26:34",
"re_realEstates_reservedDate":null,
"re_realEstates_reservedDeadline":null,
"re_realEstates_resource":null,
"re_realEstates_contractNO":null,
"re_position_id":"0",
"re_realEstates_old_id":null,
"re_realEstates_yearAdapted":null,
"op_realEstates_shortLease":"0",
"re_advertisement_text":null,
"re_advertisement_published_date":null,
"re_advertisement_status_id":null,
"re_advertisement_spotted_date":null,
"re_realEstates_commission2":null,
"re_realEstates_commission_suffix_id":null,
"re_realEstates_commission2_suffix_id":null,
"re_realEstates_fieldArea":"0",
"re_roomType_id":"0",
"op_realEstates_ownerPermit":"0",
"re_realEstates_eop_note":null,
"re_realEstates_print_ads_description":null,
"cl_profile_id":null,
"cl_clients_id":null,
"re_photos_name":"1084\/1084_1_1507724744.jpg",
"op_realEstates_lux":"0",
"loc_cities_name":"Sarajevo",
"loc_countries_name":"Bosna i Hercegovina",
"loc_counties_name":"Sarajevo",
"loc_cityAreas_name":"Sarajevo \u2013 Novo Sarajevo",
"loc_quarters_name":"Bu\u0107a Potok",
"loc_islands_name":null,
"re_action_priority":"1",
"i18n_id1":"22133",
"re_types_priority":"1",
"i18n_id2":"2",
"i18n_id3":"8",
"re_subTypes_oldID":"101",
"re_subTypes_priority":"4",
"i18n_id4":"57",
"i18n_id5":"125",
"re_subTypes_name":"stan u zgradi",
"re_subTypes_shortName":"s. u zgradi",
"re_realEstates_description":"&lt;p style=&quot;text-align: justify;&quot;&gt;Vrlo, vrlo dobar stan! U Tibri u Bu\u0107a Potoku, pored Binga, s pogledom prema dje\u010dijem igrali&amp;scaron;tu, ovaj fantasti\u010dni dovosban stan ispuni\u0107e sva o\u010dekivnaja manje porodice ili para koji \u017eeli miran \u017eivot u kvalitetnoj novogradnji u porodi\u010dnom naselju.&lt;\/p&gt;\r\n&lt;p style=&quot;text-align: justify;&quot;&gt;Knji\u017ene povr&amp;scaron;ine 59,42 m&amp;sup2;, stan se sastoji od dnevne sobe s trpezarijom, kuhinje, spava\u0107e sobe, kupatila, dva balkona, hodnika, te ostave. Sve je vrlo kvalitetno ura\u0111eno i odr\u017eavano pa\u017enjom dobrog doma\u0107ina. Stan je opremljen vrhunskom kuhinjom s aparatima, a prodajemo ga sa svim stvarima koje vidite na slikama, izuzev TV aparata.&amp;nbsp;&lt;\/p&gt;\r\n&lt;p style=&quot;text-align: justify;&quot;&gt;Dva balkona daju poseban &amp;scaron;arm, a optimalna spratnost omogu\u0107it \u0107e vam u\u017eivanje u punoj intimi. Zgrada je opremljena liftom.&lt;\/p&gt;\r\n&lt;p style=&quot;text-align: justify;&quot;&gt;Prostrana gara\u017ea povr&amp;scaron;ine 22,36 m&amp;sup2; je povezana liftom sa stanom. Mogu\u0107e je kupiti i stan bez gara\u017ee.&lt;\/p&gt;\r\n&lt;p style=&quot;text-align: justify;&quot;&gt;Za pogledati!&lt;\/p&gt;\r\n&lt;p style=&quot;text-align: justify;&quot;&gt;Mirza - 061 188 252&amp;nbsp;&lt;\/p&gt;",
"re_realEstates_portalName":"East 17",
"re_types_name":"stan",
"re_types_plural":"stanovi",
"re_types_shortName":"S",
"re_action_name":"prodaja",
"re_action_shortName":"P",
"re_propertyCondition_name":"Ure\u0111eno i odr\u017eavano"
}]

300
crawler/specific/rental.js Normal file
View File

@@ -0,0 +1,300 @@
'use strict'
let fetch = require('node-fetch');
let cheerio = require('cheerio');
let fs = require('fs');
let cloudinary = require('cloudinary');
let FormData = require('form-data');
import {
AD_TYPE_SALE,
IGNORED_USERNAMES,
CATEGORY_FLAT,
CATEGORY_HOUSE,
CATEGORY_OFFICE,
CATEGORY_LAND,
CATEGORY_APARTMENT,
CATEGORY_GARAGE
} from '../enums';
/**
 * Crawler for sale listings published on rental.ba.
 *
 * Result pages are fetched one by one, every ad linked from a result page is
 * fetched and converted into a plain data object, and all ads are returned in
 * a single object keyed by the ad URL.
 */
export default class RentalCrawler {
  /**
   * @param {number} [fromPage=0] - First result page to index.
   * @param {number} [toPage=10] - Last result page to index (inclusive).
   * @param {number} [maxResults=1000] - Upper bound on the number of ads collected by crawl().
   */
  constructor(fromPage = 0, toPage = 10, maxResults = 1000) {
    console.log("Rental Crawler");
    this.fromPage = fromPage;
    this.toPage = toPage;
    this.maxResults = maxResults;
  }

  /**
   * Fetches a single ad page and extracts its details.
   *
   * The page may embed a JSON object (in a `var json_map_data = [...]` script)
   * that carries all fields; when it is absent or unparsable the details are
   * scraped from the markup instead.
   *
   * @param {string} url - Absolute URL of the ad page.
   * @returns {Promise<Object|null>} The ad data, or null when fetching or
   *   parsing failed (the error is logged).
   */
  async indexSingle(url) {
    try {
      const res = await fetch(url);
      const body = await res.text();
      const $ = cheerio.load(body);

      const contentMain = 'body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main';

      // An ad that has no JSON object has no map either; in that case the
      // information is extracted through selectors.
      const mapScript = $(`${contentMain} > div:nth-child(7) > script`).text();
      const mapData = this.parseMapData(mapScript);
      const details = mapData
        ? this.readDetailsFromMapData(mapData)
        : this.readDetailsFromMarkup($, contentMain);

      const images = [];
      $(".img-gallery").contents().each((i, elem) => {
        // The link stored in data-preview also carries the max height and
        // width parameters.
        const preview = $(elem).attr('data-preview');
        if (preview) images.push(preview);
      });

      const data = {
        category: details.category,
        url,
        title: details.title,
        price: details.price,
        size: details.size,
        rooms: details.rooms,
        floor: details.floor,
        address: details.address,
        adType: AD_TYPE_SALE,
        time: details.time,
        shortDescription: details.title,
        longDescription: details.descriptions,
        lat: details.lat,
        lng: details.lng,
        // NOTE(review): this is [NaN, NaN] for ads without coordinates —
        // confirm that consumers of `loc` tolerate that.
        loc: [parseFloat(details.lat), parseFloat(details.lng)],
        //images: cloudinaryImages
        images
      };
      console.log(data);
      return data;
    } catch (e) {
      console.error('Exception caught: ' + e.message);
    }
    return null;
  }

  /**
   * Extracts the first ad object from the text of the map script.
   *
   * The JSON array is located by its outermost brackets instead of fixed
   * character offsets, so a trailing semicolon or whitespace does not break
   * parsing.
   *
   * @param {string} scriptText - Script source, e.g. `var json_map_data = [{...}]`.
   * @returns {Object|null} The first object of the array, or null when the
   *   text holds no parsable JSON array.
   */
  parseMapData(scriptText) {
    if (typeof scriptText !== 'string') return null;
    const start = scriptText.indexOf('[');
    const end = scriptText.lastIndexOf(']');
    if (start === -1 || end <= start) return null;
    let parsed;
    try {
      parsed = JSON.parse(scriptText.slice(start, end + 1));
    } catch (e) {
      return null;
    }
    const first = Array.isArray(parsed) ? parsed[0] : parsed;
    return (first && typeof first === 'object') ? first : null;
  }

  /**
   * Maps the embedded JSON object of an ad to the crawler's field names.
   *
   * Price, size and floor arrive as strings and are converted to numbers so
   * that both extraction paths produce the same types.
   *
   * @param {Object} mapData - Object returned by parseMapData().
   * @returns {Object} title, category, price, size, rooms, address,
   *   descriptions, floor, time, lat and lng of the ad.
   */
  readDetailsFromMapData(mapData) {
    // The site's type ids do not correspond to the numbers in Enums, hence
    // the explicit mapping; the sub-type name is used when the id is unknown.
    let category = this.getCategoryIdfromNumber(mapData["re_types_id"]);
    if (category === undefined) {
      category = this.getCategoryIdfromText(mapData["re_subTypes_name"]);
    }
    return {
      title: mapData["re_realEstates_portalName"],
      category,
      price: parseFloat(mapData["re_realEstates_price"]),
      size: parseFloat(mapData["re_realEstates_area"]),
      rooms: mapData["re_realEstates_roomsNO"],
      address: mapData["re_realEstates_address"],
      // NOTE(review): in the captured sample this value is entity-encoded
      // HTML, unlike the plain text the markup path yields — confirm whether
      // it should be decoded before storing.
      descriptions: mapData["re_realEstates_description"],
      floor: parseInt(mapData["re_realEstates_floorNO"], 10),
      time: mapData["re_realEstates_inserted"],
      lat: mapData["re_realEstates_latitude"],
      lng: mapData["re_realEstates_longitude"]
    };
  }

  /**
   * Scrapes the ad details from the page markup; used when the page has no
   * embedded JSON object. Time and coordinates are not available this way.
   *
   * @param {Function} $ - cheerio instance loaded with the ad page.
   * @param {string} contentMain - Selector of the main content column.
   * @returns {Object} Same shape as readDetailsFromMapData().
   */
  readDetailsFromMarkup($, contentMain) {
    const boxDetails = `${contentMain} > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details`;

    // Collect the "label: value" rows of the details box.
    const props = {};
    $(`${boxDetails} > div.body`).contents().each((i, elem) => {
      const text = $(elem).text().trim();
      const separator = text.indexOf(':');
      if (separator > 0) {
        props[text.slice(0, separator).trim()] = text.slice(separator + 1).trim();
      }
    });

    return {
      title: $('div.container-fluid > div.container > div.row.content-top > div.col-xs-12.col-sm-6.col-md-9 > div.description.pull-left > h1').text(),
      category: this.resolveCategoryFromTitle($(`${boxDetails} > div.title > p`).text()),
      price: this.parseFormattedNumber($(`${boxDetails} > div.prices > span.pull-left`).text()),
      size: this.parseFormattedNumber(props['Površina']),
      rooms: props['Broj soba'],
      address: props['Ulica'],
      descriptions: $('#b1 > div > div > div').text(),
      floor: parseInt(props['Spratnost'], 10),
      time: undefined,
      lat: undefined,
      lng: undefined
    };
  }

  /**
   * Parses a number rendered with thousands separators and two decimals,
   * e.g. "1.127.000,00 KM" -> 1127000.
   *
   * All separators are removed (not just the first of each kind) and the
   * remaining digits are divided by 100.
   * NOTE(review): assumes the site always renders exactly two decimal
   * digits — confirm against live pages.
   *
   * @param {string} text - Formatted number, optionally followed by a unit.
   * @returns {number|undefined} The parsed value, NaN when the text holds no
   *   leading digits, or undefined when text is not a string.
   */
  parseFormattedNumber(text) {
    if (typeof text !== 'string') return undefined;
    return parseFloat(text.replace(/[.,]/g, '')) / 100;
  }

  /**
   * Resolves the category from the comma separated title line of the details
   * box. Some sub-type names contain a comma themselves, so the first two
   * parts joined are tried before the first part alone.
   *
   * @param {string} titleText - Text of the title line.
   * @returns {*} The matching CATEGORY_* value, or undefined.
   */
  resolveCategoryFromTitle(titleText) {
    const parts = String(titleText).split(',', 3);
    if (parts.length > 1) {
      const twoPart = this.getCategoryIdfromText(`${parts[0]},${parts[1]}`);
      if (twoPart !== undefined) return twoPart;
    }
    return this.getCategoryIdfromText(parts[0]);
  }

  /**
   * Indexes every ad linked from one result page.
   *
   * @param {number} pageNr - Number of the result page.
   * @param {number} [maxResults=1000] - Stop after this many ads were collected.
   * @returns {Promise<Object|undefined>} Ads keyed by their URL, or undefined
   *   when the result page itself could not be processed (the error is logged).
   */
  async indexPage(pageNr, maxResults = 1000) {
    try {
      console.log('Starting to index page: ' + pageNr);
      const url = "http://www.rental.ba/pretraga/prodaja-1/stranica-" + pageNr;
      // The site's search form additionally posts the fields sales,
      // re_types_id (real-estate type: house, flat, apartment, ...),
      // full_text, re_realEstates_code, price/area/rooms/floor min and max,
      // and re_subTypes_id; they are not sent here.
      const res = await fetch(url, {
        method: 'POST'
        //body: data
      });
      const body = await res.text();
      const $ = cheerio.load(body);

      const hrefs = [];
      $('.middle').each((i, elem) => {
        const href = $(elem).find("a").first().attr('href');
        // Skip entries without a link and links that were already seen.
        if (href && !hrefs.includes(href)) hrefs.push(href);
      });

      const results = {};
      let collected = 0;
      for (const href of hrefs) {
        if (collected >= maxResults) break;
        console.log(`indexing: ${href}`);
        const singleData = await this.indexSingle(href);
        if (singleData) {
          results[href] = singleData;
          collected++;
        }
        // Pause between ad requests.
        await this.sleep(500);
      }
      return results;
    } catch (e) {
      console.error('Exception caught:' + e);
    }
  }

  /**
   * Maps the site's numeric real-estate type id to a CATEGORY_* value.
   *
   * @param {number|string} category - Type id; the site delivers it as a string.
   * @returns {*} The matching CATEGORY_* value, or undefined for unknown ids.
   */
  getCategoryIdfromNumber(category) {
    // Coerce first: `switch` compares strictly and the JSON holds strings.
    switch (Number(category)) {
      case 1: return CATEGORY_HOUSE;
      case 2: return CATEGORY_FLAT;
      case 3: return CATEGORY_APARTMENT;
      case 4: return CATEGORY_OFFICE;
      case 5: return CATEGORY_LAND;
      case 6: return CATEGORY_GARAGE;
      default: return undefined;
    }
  }

  /**
   * Maps a sub-type name as shown on the site to a CATEGORY_* value.
   * Matching ignores case and surrounding or repeated whitespace.
   *
   * @param {string} category - Sub-type name, e.g. "stan u zgradi".
   * @returns {*} The matching CATEGORY_* value, or undefined when unknown.
   */
  getCategoryIdfromText(category) {
    if (typeof category !== 'string') return undefined;
    switch (category.trim().replace(/\s+/g, ' ').toLowerCase()) {
      case 'samostojeća':
      case 'dvojna':
      case 'kuća u nizu':
      case 'stambeno-poslovni objekt':
      case 'prizemnica':
      case 'kuća na moru':
      case 'kuća u izgradnji':
      case 'dvorac':
      case 'apartmanska kuća':
      case 'porodična kuća':
      case 'vikend kuća':
      case 'luksuzna kuća':
      case 'kamena':
      case 'vila':
      case 'splav':
        return CATEGORY_HOUSE;
      case 'stan u zgradi':
      case 'stan u kući':
      case 'stan višeetažni':
      case 'stan višeetažni u kući':
      case 'stan u starijoj zgradi':
      case 'stan u novogradnji':
      case 'stan u neboderu':
      case 'korišten stan u novogradnji':
        return CATEGORY_FLAT;
      case 'apartman na moru':
      case 'apartman u planini':
        return CATEGORY_APARTMENT;
      case 'unutrašnje garažno mjesto':
      case 'unutrašnje parkirno mjesto':
        return CATEGORY_GARAGE;
      case 'građevinsko':
      case 'građevinsko stambeno':
      case 'zemljište, ostalo':
      case 'odmaralište':
      case 'oranica':
      case 'šuma':
      case 'livada':
      case 'građevinsko m2':
      case 'građevinsko m1':
      case 'građevinsko - turističko':
      case 'građevinsko - poslovno':
      case 'otok':
      case 'poljoprivredno':
        return CATEGORY_LAND;
      case 'lokal':
      case 'ured':
      case 'skladište ili garaža':
      case 'radionica':
      case 'tvornica':
      case 'restoran':
      case 'sportski centar':
      case 'ordinacija':
      case 'kiosk':
      case 'auto-praonica':
      case 'poslovna zgrada':
      case 'skladište':
      case 'garaža':
      case 'hotel':
      case 'pansion':
      case 'apartmanska zgrada':
      case 'trgovina':
      case 'prodajno skladišni':
      case 'proizvodno skladišni':
      case 'kancelarije':
      case 'poslovni prostor':
        return CATEGORY_OFFICE;
      default:
        return undefined;
    }
  }

  /**
   * Resolves after the given delay; used to pace requests.
   *
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Indexes the result pages start..end (inclusive) and merges their ads.
   *
   * @param {number} start - First page number.
   * @param {number} end - Last page number (inclusive).
   * @param {number} [maxResults=1000] - Stop once this many ads were collected.
   * @returns {Promise<Object>} All collected ads keyed by their URL.
   */
  async indexPages(start, end, maxResults = 1000) {
    const results = {};
    for (let page = start; page <= end; page++) {
      const remaining = maxResults - Object.keys(results).length;
      if (remaining <= 0) break;
      // A failed page yields undefined, which Object.assign ignores.
      const pageResults = await this.indexPage(page, remaining);
      Object.assign(results, pageResults);
      // Pause between result pages.
      await this.sleep(5000);
    }
    return results;
  }

  /**
   * Runs the crawl with the page range and limit given to the constructor.
   *
   * @returns {Promise<Object>} All collected ads keyed by their URL.
   */
  async crawl() {
    return this.indexPages(this.fromPage, this.toPage, this.maxResults);
  }
}