Add more real estate filters to crawler #72

Merged
bilal.catic merged 20 commits from add-more-real-estate-filters-to-crawler into master 2019-11-14 14:58:59 +01:00
12 changed files with 2084 additions and 208 deletions

View File

@@ -156,7 +156,8 @@ const AD_STATUS = {
STATUS_SOLD: 3,
STATUS_DELETED: 4,
STATUS_URGENT: 5,
STATUS_DISCOUNTED: 6
STATUS_DISCOUNTED: 6,
STATUS_RENTED: 7
};
const AD_AGENCY = {
@@ -187,11 +188,87 @@ const EMAIL_FREQUENCY = {
}
};
const HEATING_TYPE = {
NO_HEATING: {
id: "NO_HEATING",
title: "Nije uvedeno"
},
ELECTRICITY: {
id: "ELECTRICITY",
title: "Struja"
},
GAS: {
id: "GAS",
title: "Plin"
},
WOOD: {
id: "WOOD",
title: "Drva"
},
CENTRAL_CITY: {
id: "CENTRAL_CITY",
title: "Centralno (gradsko)"
},
CENTRAL_BOILER: {
id: "CENTRAL_BOILER",
title: "Centralno (kotlovnica)"
},
CENTRAL_GAS: {
id: "CENTRAL_GAS",
title: "Centralno (plin)"
},
HEAT_PUMP: {
id: "HEAT_PUMP",
title: "Toplotna pumpa"
},
OTHER: {
id: "OTHER",
title: "Drugo"
}
};
const ACCESS_ROAD_TYPE = {
ASPHALT: {
id: "ASPHALT",
title: "Asfalt"
},
CONCRETE: {
id: "CONCRETE",
title: "Beton"
},
MACADAM: {
id: "MACADAM",
title: "Makadam"
},
OTHER: {
id: "OTHER",
title: "Drugo"
}
};
const FURNISHING_TYPE = {
NOT_FURNISHED: {
id: "NOT_FURNISHED",
title: "Nenamješten"
},
HALF_FURNISHED: {
id: "HALF_FURNISHED",
title: "Polunamješten"
},
FURNISHED: {
id: "FURNISHED",
title: "Namješten"
}
};
module.exports = {
AD_TYPE,
AD_CATEGORY,
AD_STATUS,
AD_AGENCY,
CRAWLER_AD_TYPE,
EMAIL_FREQUENCY
EMAIL_FREQUENCY,
HEATING_TYPE,
ACCESS_ROAD_TYPE,
FURNISHING_TYPE
};

View File

@@ -29,5 +29,6 @@ module.exports = {
PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories,
PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [],
PROSTOR_DELAY_BETWEEN_PAGES:
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000
parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000,
PROSTOR_FORCE_CRAWL: !!parseInt(process.env.PROSTOR_FORCE_CRAWL)
};

View File

@@ -11,7 +11,10 @@ const {
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
CRAWLER_AD_TYPE,
HEATING_TYPE,
ACCESS_ROAD_TYPE,
FURNISHING_TYPE
} = require("../../common/enums");
const {
@@ -219,6 +222,7 @@ class AktidoCrawler {
throw { message: "Can't find ad data JSON" };
}
let adStatus = AD_STATUS.STATUS_NORMAL;
const aktidoId = extractedData["re_realEstates_id"];
const adCategory = this.getKiviCategoryIdFromAktidoId(
parseInt(extractedData["re_types_id"])
@@ -237,6 +241,181 @@ class AktidoCrawler {
};
}
const descriptionIds = extractedData["re_descriptions_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(descriptionIds)) {
throw {
message:
'Expected array od descriptions but "re_descriptions_id" not found !'
};
}
const spaceIds = extractedData["re_spaces_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(spaceIds)) {
throw {
message: 'Expected array od spaces but "re_spaces_id" not found !'
};
}
const infrastructureIds = extractedData["re_infrastructure_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(infrastructureIds)) {
throw {
message:
'Expected array od infrastructures but "re_infrastructure_id" not found !'
};
}
const floorNoIds = extractedData["re_floorNO_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(floorNoIds)) {
throw {
message:
'Expected array od infrastructures but "re_floorNO_id" not found !'
};
}
// counting floor enums
// for (let i = 1; i < 10; i++) {
// const floorEnumsTitle = $(
// `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(1)`
// )
// .text()
// .trim();
// if (floorEnumsTitle === "Spratnost:") {
// const floorEnumsValue = $(
// `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(2)`
// )
// .text()
// .trim()
// .split(",");
//
// console.log("==========");
// floorNoIds.forEach((id, index) => {
// console.log("\t", id, " = ", floorEnumsValue[index]);
// });
// break;
// }
// }
// enumerating infrastructure - relation between id and infrastructure title
// let found = false;
// let infrastructureDescriptions = {};
// for (let i = 1; i < 5; i++) {
// found = false;
// for (let j = 1; j < 10; j++) {
// const infrastructureTitle = $(
// `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > strong`
// )
// .text()
// .trim();
// if (infrastructureTitle === "Osnovna infrastruktura:") {
// found = true;
//
// const infrastructureValues = $(
// `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > div`
// )
// .text()
// .trim()
// .split(",");
//
// infrastructureIds.forEach((id, index) => {
// infrastructureDescriptions[id] = infrastructureValues[index];
// });
// }
// }
// if (found) {
// break;
// }
// }
const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions(
descriptionIds
);
const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces(
spaceIds
);
const realEstatePropertiesFromInfrastructure = this.getPropertiesFromInfrastructure(
infrastructureIds
);
if (extractedData["adm_realEstates_discount"] === "1") {
adStatus = AD_STATUS.STATUS_DISCOUNTED;
}
let numberOfRooms =
parseInt(extractedData["re_realEstates_roomsNO"]) +
parseInt(extractedData["re_realEstates_bedroomNO"]) || null,
numberOfFloors =
parseInt(extractedData["re_realEstates_floorsNO"]) ||
this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]),
floor =
parseInt(extractedData["re_realEstates_floorNO"]) ||
this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]),
accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType,
heatingType =
this.getHeatingTypeId(extractedData["re_heating_id"]) || null,
furnishingType = realEstatePropertiesFromDescriptions.furnishingType,
balcony =
realEstatePropertiesFromDescriptions.balcony ||
realEstatePropertiesFromSpaces.balcony,
newBuilding = extractedData["op_realEstates_newBuilding"]
? extractedData["op_realEstates_newBuilding"] === "1"
: null,
elevator = realEstatePropertiesFromDescriptions.elevator,
water =
realEstatePropertiesFromDescriptions.water ||
realEstatePropertiesFromInfrastructure.water,
electricity =
realEstatePropertiesFromDescriptions.electricity ||
realEstatePropertiesFromInfrastructure.electricity,
drainageSystem =
realEstatePropertiesFromInfrastructure.drainageSystem,
registeredInZkBooks =
extractedData["op_realEstates_ownerPermit"] === 1 || null,
recentlyAdapted = null,
parking =
realEstatePropertiesFromDescriptions.parking ||
realEstatePropertiesFromSpaces.parking,
garage = realEstatePropertiesFromSpaces.garage,
gas = realEstatePropertiesFromInfrastructure.gas,
antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor,
airCondition = realEstatePropertiesFromDescriptions.airCondition,
phoneConnection =
realEstatePropertiesFromInfrastructure.phoneConnection,
cableTV = realEstatePropertiesFromInfrastructure.cableTV,
internet = realEstatePropertiesFromInfrastructure.internet,
basementAttic = realEstatePropertiesFromSpaces.basementAttic,
storeRoom = realEstatePropertiesFromSpaces.storeRoom,
videoSurveillance =
realEstatePropertiesFromDescriptions.videoSurveillance ||
realEstatePropertiesFromInfrastructure.videoSurveillance,
alarm = realEstatePropertiesFromDescriptions.alarm,
suitableForStudents = null,
includingBills =
extractedData["op_realEstates_utilitiesIncluded"] === "1" || null,
animalsAllowed = null,
pool = realEstatePropertiesFromDescriptions.pool,
urbanPlanPermit =
extractedData["op_realEstates_locationPermit"] === "1" ||
realEstatePropertiesFromDescriptions.urbanPlanPermit,
buildingPermit =
extractedData["op_realEstates_buildingPermit"] === "1" || null,
utilityConnection =
realEstatePropertiesFromDescriptions.utilityConnection,
distanceToRiver = null,
numberOfViewsAgency = null;
const title = extractedData["re_realEstates_portalName"];
const extractedPrice = parseFloat(
extractedData["re_realEstates_price"]
@@ -277,8 +456,6 @@ class AktidoCrawler {
};
}
const adStatus = AD_STATUS.STATUS_NORMAL;
const data = {
url,
agencyObjectId: aktidoId,
@@ -303,7 +480,42 @@ class AktidoCrawler {
locationLong,
adStatus,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString()
renewedDate: renewedDateMoment.toISOString(),
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency
};
return data;
@@ -350,6 +562,270 @@ class AktidoCrawler {
}
}
getPropertiesFromDescriptions(descriptionIds) {
const result = {
accessRoadType: null,
furnishingType: null,
balcony: null,
elevator: null,
parking: null,
antiTheftDoor: null,
airCondition: null,
videoSurveillance: null,
alarm: null,
pool: null,
urbanPlanPermit: null,
utilityConnection: null,
water: null,
electricity: null
};
for (const descriptionId of descriptionIds) {
switch (descriptionId) {
case 16:
result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
break;
case 17:
result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
break;
case 1:
case 28:
result.furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case 14:
result.elevator = true;
break;
case 39:
result.electricity = true;
break;
case 40:
result.water = true;
break;
case 41:
case 58:
result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id;
break;
case 26:
result.balcony = true;
break;
case 62:
result.parking = true;
break;
case 3:
result.antiTheftDoor = true;
break;
case 2:
case 21:
result.airCondition = true;
break;
case 4:
result.alarm = true;
break;
case 55:
result.videoSurveillance = true;
break;
case 9:
result.pool = true;
break;
case 60:
result.urbanPlanPermit = true;
break;
case 38:
result.utilityConnection = true;
break;
}
}
return result;
}
getPropertiesFromSpaces(spaceIds) {
const result = {
balcony: null,
parking: null,
garage: null,
basementAttic: null,
storeRoom: null
};
for (const spaceId of spaceIds) {
switch (spaceId) {
case 36:
case 12:
result.parking = true;
break;
case 1:
case 2:
case 3:
result.balcony = true;
break;
case 4:
case 30:
result.garage = true;
break;
case 9:
case 10:
result.storeRoom = true;
break;
case 18:
case 34:
case 37:
case 27:
result.basementAttic = true;
break;
}
}
return result;
}
getHeatingTypeId(heatingRentalId) {
// heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value
const heatingId = parseInt(heatingRentalId);
switch (heatingId) {
case 27:
case 16:
return HEATING_TYPE.GAS.id;
case 4:
return HEATING_TYPE.CENTRAL_GAS.id;
case 3:
case 23:
case 6:
case 7:
case 8:
case 9:
case 10:
return HEATING_TYPE.CENTRAL_BOILER.id;
case 2:
case 13:
case 30:
case 17:
case 29:
case 31:
return HEATING_TYPE.ELECTRICITY.id;
case 24:
case 25:
case 12:
return HEATING_TYPE.CENTRAL_CITY.id;
case 26:
case 21:
case 20:
return HEATING_TYPE.WOOD.id;
case 28:
case 19:
return HEATING_TYPE.HEAT_PUMP.id;
case 14:
case 32:
return HEATING_TYPE.OTHER.id;
default:
return null;
}
}
getPropertiesFromInfrastructure(infrastructureIds) {
const result = {
electricity: null,
water: null,
gas: null,
drainageSystem: null,
phoneConnection: null,
internet: null,
videoSurveillance: null,
cableTV: null
};
for (const infrastructureId of infrastructureIds) {
switch (infrastructureId) {
case 1:
result.electricity = true;
break;
case 2:
result.water = true;
break;
case 4:
result.gas = true;
break;
case 5:
result.drainageSystem = true;
break;
case 7:
case 8:
result.phoneConnection = true;
break;
case 10:
result.internet = true;
break;
case 11:
result.cableTV = true;
break;
case 16:
case 17:
result.videoSurveillance = true;
break;
}
}
return result;
}
getFloorNumberFromFloorId(floorsIdText) {
// floorIdText can be array of numbers, separated by comma or number
// just extracting floor number from first element
const floorsId = floorsIdText.split(",");
if (floorsId.length === 0) {
return null;
}
const firstFloorId = parseInt(floorsId[0]);
// 1 pod
// 2 sut
// 3 raz
// 4 pri
// 5 vpri
// 6 prv
// 7 dru
// 8 tre
// 9 čet
// 10 man
// 11
// 12 pot
// 13 vpot
// 14 tav
// 15 pet
const floorNumber = [
-1,
-1,
0,
0,
1,
1,
2,
3,
4,
null,
null,
null,
null,
null,
5
];
return floorNumber[firstFloorId - 1] || null;
}
getNumberOfFloorsFromFloorId(floorsIdText) {
// floorIdText can be array of numbers, separated by comma or number
const floorIds = floorsIdText.split(",");
if (floorIds.length === 0) {
return null;
}
return floorIds.length;
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
@@ -362,7 +838,7 @@ class AktidoCrawler {
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}

View File

@@ -10,7 +10,10 @@ const {
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
CRAWLER_AD_TYPE,
HEATING_TYPE,
FURNISHING_TYPE,
ACCESS_ROAD_TYPE
} = require("../../common/enums");
const {
@@ -271,6 +274,7 @@ class OlxCrawler {
//====== OTHER AD INFORMATION ===============
let adType = null;
let olxId = null;
let numberOfViewsAgency = null;
let otherInformationDivId;
//We need to locate DIV ID where other information are stored
@@ -293,6 +297,7 @@ class OlxCrawler {
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`;
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
const publishedDate = $(publishedDateValueSelector)
@@ -331,60 +336,7 @@ class OlxCrawler {
)
.text()
.trim();
const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`)
.text()
.trim();
olxId = $(`${olxIdFieldSelector} > div.df2`)
.text()
.trim();
if (olxIdFieldTitle !== "OLX ID") {
throw { message: "Cannot find correct OLX ID" };
}
//===========================================
//====== DETAIL INFORMATION FIELDS ==========
let area = null;
let gardenSize = null;
let fieldIndex = 1;
do {
const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`;
const fieldTitleSelector = `${fieldSelector} > div.df1`;
const fieldValueSelector = `${fieldSelector} > div.df2`;
const fieldTitle = $(fieldTitleSelector)
.text()
.trim();
const fieldValue = $(fieldValueSelector)
.text()
.trim();
switch (fieldTitle) {
case "Kvadrata":
area = fieldValue;
break;
case "Okućnica (kvadratura)":
gardenSize = fieldValue;
break;
}
if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
break;
}
} while (true);
//===========================================
//====== UNUSED FIELDS FOR NOW ==============
const time = $("time").attr("datetime");
const numberOfViews = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
)
.text()
.trim();
//===========================================
//=========================================
const parsedCategory = this.getAdCategoryId(category);
if (!parsedCategory) {
throw { message: `Unknown ad category [${category}]` };
@@ -395,6 +347,221 @@ class OlxCrawler {
throw { message: "Unknown ad type" };
}
const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`)
.text()
.trim();
olxId = $(`${olxIdFieldSelector} > div.df2`)
.text()
.trim();
numberOfViewsAgency = parseInt(
$(numberOfViewsAgencyValueSelector)
.text()
.trim()
);
if (olxIdFieldTitle !== "OLX ID") {
throw { message: "Cannot find correct OLX ID" };
}
//===========================================
//====== DETAIL INFORMATION FIELDS ==========
let area,
gardenSize,
numberOfRooms = null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = null,
furnishingType = null,
balcony = null,
newBuilding = null,
elevator = null,
water = null,
electricity = null,
drainageSystem = null,
registeredInZkBooks = null,
recentlyAdapted = null,
parking = null,
garage = null,
gas = null,
antiTheftDoor = null,
airCondition = null,
phoneConnection = null,
cableTV = null,
internet = null,
basementAttic = null,
storeRoom = null,
videoSurveillance = null,
alarm = null,
suitableForStudents = null,
includingBills = null,
animalsAllowed = null,
pool = null,
urbanPlanPermit = null,
buildingPermit = null,
utilityConnection = null,
distanceToRiver = null;
let fieldIndex = 1;
do {
const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`;
const fieldTitleSelector = `${fieldSelector} > div.df1`;
const fieldValueSelector = `${fieldSelector} > div.df2`;
const fieldTitle = $(fieldTitleSelector)
.text()
.trim()
.toLowerCase();
const fieldValue = $(fieldValueSelector)
.text()
.trim()
.toLowerCase();
switch (fieldTitle) {
case "kvadrata":
area = fieldValue;
break;
case "okućnica (kvadratura)":
gardenSize = fieldValue;
break;
case "broj soba":
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
break;
case "broj prostorija":
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
break;
case "broj spratova":
numberOfFloors = this.parseNumberOfFloors(
fieldValue,
parsedCategory
);
break;
case "sprat":
floor = this.parseFloorNumber(fieldValue, parsedCategory);
break;
case "vrsta grijanja":
heatingType = this.getHeatingTypeId(fieldValue);
break;
case "namješten?":
furnishingType = this.getFurnishingTypeId(fieldValue);
break;
case "namješten":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case "namještena":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case "voda":
water = true;
break;
case "struja":
electricity = true;
break;
case "kanalizacija":
drainageSystem = fieldValue !== "nema";
break;
case "godina izgradnje":
newBuilding = newBuilding || fieldValue === "novogradnja";
break;
case "kućni ljubimci":
animalsAllowed = fieldValue === "da";
break;
case "uknjiženo / zk":
registeredInZkBooks = true;
break;
case "uknjiženo (zk)":
registeredInZkBooks = true;
break;
case "novogradnja":
newBuilding = true;
break;
case "nedavno adaptiran":
recentlyAdapted = true;
break;
case "nedavno adaptirana":
recentlyAdapted = true;
break;
case "balkon":
balcony = true;
break;
case "lift":
elevator = true;
break;
case "parking":
parking = true;
break;
case "garaža":
garage = true;
break;
case "plin":
gas = true;
break;
case "blindirana vrata":
antiTheftDoor = true;
break;
case "klima":
airCondition = true;
break;
case "telefonski priključak":
phoneConnection = true;
break;
case "kablovska tv":
cableTV = true;
break;
case "internet":
internet = true;
break;
case "podrum/tavan":
basementAttic = true;
break;
case "ostava/špajz":
storeRoom = true;
break;
case "video nadzor":
videoSurveillance = true;
break;
case "alarm":
alarm = true;
break;
case "za studente":
suitableForStudents = true;
break;
case "uključen trošak režija":
includingBills = true;
break;
case "građevinska dozvola":
buildingPermit = true;
break;
case "komunalni priključak":
utilityConnection = true;
break;
case "urbanistička dozvola":
urbanPlanPermit = true;
break;
case "udaljenost od rijeke (m)":
distanceToRiver = parseInt(fieldValue) || null;
break;
case "prilaz":
accessRoadType = this.getAccessRoadTypeId(fieldValue);
break;
case "bazen":
pool = true;
break;
case "iznajmljeno":
status = AD_STATUS.STATUS_RENTED;
break;
default:
// console.log(fieldTitle, " = ", fieldValue);
break;
}
if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
break;
}
} while (true);
//===========================================
//=========================================
const parsedArea = this.parseArea(area) || null;
const parsedGardenSize = this.parseArea(gardenSize) || null;
const parsedPrice = this.parsePrice(price) || null;
@@ -409,6 +576,13 @@ class OlxCrawler {
locationLong = parseFloat(locationLatLngMatches[2]) || null;
}
if (
title.indexOf("[PRODANO]") !== -1 ||
title.indexOf("[ZAVRŠENO]") !== -1
) {
status = AD_STATUS.STATUS_SOLD;
}
const data = {
url,
agencyObjectId: olxId,
@@ -439,7 +613,42 @@ class OlxCrawler {
locationLong,
adStatus: status,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString()
renewedDate: renewedDateMoment.toISOString(),
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency
};
return data;
@@ -485,6 +694,64 @@ class OlxCrawler {
}
}
getHeatingTypeId(heatingTypeText) {
switch (heatingTypeText) {
case "struja":
return HEATING_TYPE.ELECTRICITY.id;
case "plin":
return HEATING_TYPE.GAS.id;
case "drva":
return HEATING_TYPE.WOOD.id;
case "centralno (gradsko)":
return HEATING_TYPE.CENTRAL_CITY.id;
case "centralno (kotlovnica)":
return HEATING_TYPE.CENTRAL_BOILER.id;
case "centralno (plin)":
return HEATING_TYPE.CENTRAL_GAS.id;
case "nije uvedeno":
return HEATING_TYPE.NO_HEATING.id;
case "ostalo":
return HEATING_TYPE.OTHER.id;
case "drugo":
return HEATING_TYPE.OTHER.id;
default:
console.log("grijanje = NEPOZNATO [", heatingTypeText, "]");
return null;
}
}
getFurnishingTypeId(furnishingTypeText) {
switch (furnishingTypeText) {
case "namješten":
return FURNISHING_TYPE.FURNISHED.id;
case "polunamješten":
return FURNISHING_TYPE.HALF_FURNISHED.id;
case "nenamješten":
return FURNISHING_TYPE.NOT_FURNISHED.id;
case "":
return FURNISHING_TYPE.FURNISHED.id;
default:
console.log("namješten = NEPOZNATO [", furnishingTypeText, "]");
return null;
}
}
getAccessRoadTypeId(accessRoadTypeText) {
switch (accessRoadTypeText) {
case "asfalt":
return ACCESS_ROAD_TYPE.ASPHALT.id;
case "beton":
return ACCESS_ROAD_TYPE.CONCRETE.id;
case "makadam":
return ACCESS_ROAD_TYPE.MACADAM.id;
case "ostalo":
return ACCESS_ROAD_TYPE.OTHER.id;
default:
console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]");
return null;
}
}
parseArea(areaText) {
if (!areaText) {
return NaN;
@@ -505,56 +772,100 @@ class OlxCrawler {
return parseFloat(formattedPriceText);
}
parseRenewedDate(renewedDateText) {
const currentMoment = moment.tz(DEFAULT_TIMEZONE);
if (renewedDateText.includes("Prije mjesec dana")) {
return currentMoment.add(-1, "month");
}
if (renewedDateText.includes("Jučer")) {
return currentMoment.add(-1, "day");
}
if (renewedDateText.includes("Prije sat")) {
return currentMoment.add(-1, "hour");
}
if (renewedDateText.includes("dan")) {
// format for this case should be "Prije N dana" or "Prije N dan"
const dateParts = renewedDateText.split(" ");
if (dateParts[0] === "Prije") {
const numberOfDays = parseInt(dateParts[1]);
return currentMoment.add(-1 * numberOfDays, "days");
} else {
return undefined;
parseNumberOfRooms(numberOfRoomsText, categoryId) {
if (categoryId === AD_CATEGORY.FLAT.id) {
switch (numberOfRoomsText) {
case "garsonjera":
return 0;
case "jednosoban (1)":
return 1;
case "jednoiposoban (1.5)":
return 1.5;
case "dvosoban (2)":
return 2;
case "trosoban (3)":
return 3;
case "četverosoban (4)":
return 4;
case "petosoban i više":
return 5;
default:
console.log(
"broj soba [stan] = NEPOZNATO [",
numberOfRoomsText,
", ",
categoryId,
"]"
);
return null;
}
}
if (renewedDateText.includes("sat")) {
const dateParts = renewedDateText.split(" ");
const parsedHours =
dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined;
if (!parsedHours) {
return undefined;
}
return currentMoment.add(-1 * parsedHours, "hours");
if (
categoryId === AD_CATEGORY.HOUSE.id ||
categoryId === AD_CATEGORY.COTTAGE.id ||
categoryId === AD_CATEGORY.APARTMENT.id ||
categoryId === AD_CATEGORY.OFFICE.id
) {
return parseInt(numberOfRoomsText) || null;
}
const todayVariations = ["min", "sekund", "maloprije"];
for (const todayVariation of todayVariations) {
if (renewedDateText.includes(todayVariation)) {
return currentMoment;
}
console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]");
return null;
}
parseNumberOfFloors(numberOfFloorsText, categoryId) {
if (
categoryId === AD_CATEGORY.HOUSE.id ||
categoryId === AD_CATEGORY.COTTAGE.id
) {
return parseInt(numberOfFloorsText) || null;
}
const renewedDateMoment = moment.tz(
renewedDateText,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (categoryId === AD_CATEGORY.OFFICE.id) {
if (
numberOfFloorsText === "suteren" ||
numberOfFloorsText === "prizemlje"
) {
return 0;
}
if (numberOfFloorsText === "6+") {
return 7;
}
return parseInt(numberOfFloorsText) || null;
}
return renewedDateMoment.isValid() ? renewedDateMoment : undefined;
console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]");
return null;
}
parseFloorNumber(floorText, categoryId) {
if (
categoryId === AD_CATEGORY.FLAT.id ||
categoryId === AD_CATEGORY.APARTMENT.id
) {
if (
floorText === "suteren" ||
floorText === "prizemlje" ||
floorText === "visoko prizemlje"
) {
return 0;
}
return parseInt(floorText) || null;
}
if (categoryId === AD_CATEGORY.OFFICE.id) {
if (floorText === "zaseban objekat") {
return null;
}
if (floorText === "prizemlje" || floorText === "visoko prizemlje") {
return 0;
}
return parseInt(floorText) || null;
}
console.log("sprat = NEPOZNATO [", floorText, "]");
return null;
}
async sleep(ms) {
@@ -569,7 +880,7 @@ class OlxCrawler {
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}

View File

@@ -2,16 +2,23 @@
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
const PROSTOR_ENUMS = {
PROSTOR_AD_TYPE: {
@@ -48,37 +55,359 @@ class ProstorCrawler {
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages;
}
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
if (crawlAdCategories) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
const urlAdTypePart =
PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
const singleCategoryResults = await this.extractRealEstates(
urlPageToCrawl
);
indexGenerators.push(this.categoryIndexer(adCategory));
}
const resultsSubset = singleCategoryResults.slice(
0,
this.maxResultsPerPage
);
const saveResults = await this.saveCrawledResults(resultsSubset);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
let done = false;
while (!done) {
const categoryIndexerPromises = [];
const generatorsToRemove = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
generatorsToRemove.push(false);
}
const singlePageResults = await Promise.all(categoryIndexerPromises);
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
if (
Array.isArray(newRecords) &&
newRecords.length === 0 &&
!PROSTOR_FORCE_CRAWL
) {
generatorsToRemove[index] = true;
}
} else {
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
}
}
// console.log("Generators state : ", generatorsToRemove);
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
if (generatorsToRemove[i]) {
// console.log("\tRemove generator ", i + 1);
indexGenerators.splice(i, 1);
}
}
if (indexGenerators.length === 0) {
done = true;
}
await this.sleep(this.delayBetweenPages);
}
}
return newRealEstates;
}
async *categoryIndexer(adCategory) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl
);
let elementToStartIndexFrom = 0;
while (true) {
const realEstatesForSinglePage = listOfAllRealEstates.slice(
elementToStartIndexFrom,
elementToStartIndexFrom + this.maxResultsPerPage
);
if (realEstatesForSinglePage.length > 0) {
elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage
);
const filteredSinglePageResults = singlePageResults.filter(
singleResult => !!singleResult
);
if (
Array.isArray(filteredSinglePageResults) &&
filteredSinglePageResults.length > 0
) {
yield filteredSinglePageResults;
} else {
return undefined;
}
} else {
return undefined;
}
}
} else {
return undefined;
}
}
async indexSinglePage(realEstatesList) {
const asyncActions = [];
for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate));
}
try {
return await Promise.all(asyncActions);
} catch (e) {
console.log(
"[PROSTOR] Error crawling ads : ",
e.message || "UNKNOWN ERROR"
);
return [];
}
}
async scrapeAd(realEstate) {
const { lat, lng, property_name, price, size, link, status } = realEstate;
const url = `https://prostor.ba${link}`;
// console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
if (!adType || !realEstateType || !prostorId) {
return null;
}
const allDataSelector =
"body > div > div.container-fluid > div > div.column-right > table > tbody";
const realEstateProperties = {};
$(allDataSelector)
.find("p")
.each((i, element) => {
const propertyElement = $(element)
.text()
.split(":")
.map(text => text.trim().toLowerCase());
const propertyTitle = propertyElement[0];
realEstateProperties[propertyTitle] = propertyElement[1];
});
$(allDataSelector)
.find("div.mb-2")
.each((i, element) => {
const propertyElement = $(element)
.text()
.trim()
.toLowerCase();
realEstateProperties[propertyElement] = true;
});
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
return null;
}
let numberOfRooms =
parseFloat(realEstateProperties["broj soba"]) +
parseFloat(realEstateProperties["broj spavaćih soba"]) || null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties),
furnishingType = null,
balcony =
realEstateProperties["balkon"] ||
realEstateProperties["terasa"] ||
realEstateProperties["lođa"] ||
null,
newBuilding = linkParts[1] === "novogradnja",
elevator = realEstateProperties["lift"] || null,
water = realEstateProperties["voda"] || null,
electricity = realEstateProperties["električna energija"] || null,
drainageSystem = realEstateProperties["kanalizacija"] || null,
registeredInZkBooks = null,
recentlyAdapted = null,
parking = realEstateProperties["parking"] || null,
garage = realEstateProperties["garaža"] || null,
gas = realEstateProperties["plin"] || null,
antiTheftDoor = realEstateProperties["blindo vrata"] || null,
airCondition = realEstateProperties["klima"] || null,
phoneConnection = realEstateProperties["telefon"] || null,
cableTV = realEstateProperties["kablovksa tv"] || null,
internet =
realEstateProperties["internet"] ||
realEstateProperties["adsl"] ||
null,
basementAttic = realEstateProperties["podrum"] || null,
storeRoom = realEstateProperties["ostava"] || null,
videoSurveillance = realEstateProperties["video nadzor"],
alarm = realEstateProperties["alarm"] || null,
suitableForStudents = null,
includingBills = null,
animalsAllowed = null,
pool = realEstateProperties["bazen"] || null,
urbanPlanPermit = null,
buildingPermit = null,
utilityConnection = null,
distanceToRiver = null,
numberOfViewsAgency = null;
// Floor versions (there are possibly more versions) :
// Sprat: 3/3
// Sprat: 1 - 2/2
// Sprat: Pr - 7/7
// Sprat: -2/0
// If there are two parts, that represents more real estates are sold
// numberOfFloors is contained in second part, after / sign
const floorsArray = realEstateProperties["sprat"].split(" - ");
let floorText = "";
if (floorsArray.length === 1) {
const floorDescription = floorsArray[0].split("/");
numberOfFloors = parseInt(floorDescription[1]) || null;
floorText = floorDescription[0];
floor = Math.round(parseFloat(floorText));
} else if (floorsArray.length === 2) {
const floorDescription = floorsArray[1].split("/");
numberOfFloors = parseInt(floorDescription[1]) || null;
floorText = floorsArray[0];
floor = Math.round(parseFloat(floorText));
} else {
// This is something strange
}
if (isNaN(floor)) {
// It was textual representation of floor, like "Pr", "Su" or similar
switch (floorText) {
case "pr":
floor = 0;
break;
case "su":
floor = -1;
break;
default:
console.log(
"[PROSTOR] Unknown textual representation of floor : ",
floorText
);
floor = null;
}
}
if (realEstateProperties["namješteno"]) {
furnishingType = FURNISHING_TYPE.FURNISHED.id;
} else if (realEstateProperties["polunamješteno"]) {
furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
} else {
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
}
const adStatus = ProstorCrawler.getStatusId(status);
const title = property_name;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
const gardenSize = null;
const longDescription = null;
const data = {
url,
agencyObjectId: prostorId,
originAgencyName: AD_AGENCY.PROSTOR,
realEstateType,
adType,
title,
price: parsedPrice,
area: parsedArea,
gardenSize,
shortDescription: "",
longDescription: longDescription,
streetNumber: 0,
streetName: realEstateProperties["adresa"],
locality: "",
municipality: "",
city: "",
region: "",
entity: "",
country: "",
locationLat: lat,
locationLong: lng,
adStatus,
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency
};
return data;
} catch (e) {
console.error(
"[PROSTOR] Exception caught: " + e.message,
"\r\nURL:",
url
);
return null;
}
}
async extractRealEstates(url) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
@@ -115,18 +444,19 @@ class ProstorCrawler {
const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
const realEstates = JSON.parse(jsonData);
const transformedRealEstates = [];
for (const realEstate of realEstates) {
const transformedRealEstate = ProstorCrawler.transformRealEstateData(
realEstate
);
if (transformedRealEstate) {
transformedRealEstates.push(transformedRealEstate);
}
}
return transformedRealEstates;
// const transformedRealEstates = [];
//
// for (const realEstate of realEstates) {
// const transformedRealEstate = ProstorCrawler.transformRealEstateData(
// realEstate
// );
// if (transformedRealEstate) {
// transformedRealEstates.push(transformedRealEstate);
// }
// }
//
// return transformedRealEstates;
return realEstates;
} else {
throw {
message: "Something is wrong with JSON data or data is moved"
@@ -134,73 +464,15 @@ class ProstorCrawler {
}
} catch (e) {
console.log(e);
throw { message: "Can't find ad data JSON" };
throw e;
}
}
} catch (e) {
console.error("[PROSTOR] Exception caught:", e.message);
return [];
}
}
static transformRealEstateData(realEstateData) {
try {
const { lat, lng, property_name, price, size, link } = realEstateData;
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
const url = `https://prostor.ba${link}`;
if (!adType || !realEstateType || !prostorId) {
return null;
}
const adStatus = AD_STATUS.STATUS_NORMAL;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
const data = {
url,
agencyObjectId: prostorId,
originAgencyName: AD_AGENCY.PROSTOR,
realEstateType,
adType,
title: property_name,
price: parsedPrice,
area: parsedArea,
gardenSize: null,
shortDescription: "",
longDescription: "",
streetNumber: 0,
streetName: "",
locality: "",
municipality: "",
city: "",
region: "",
entity: "",
country: "",
locationLat: lat,
locationLong: lng,
adStatus,
publishedDate: null,
renewedDate: null
};
return data;
} catch (e) {
console.error(
"[PROSTOR] Exception caught: " + e.message,
"\r\nURL:",
url
"[PROSTOR] Exception caught:",
e.message || "UNKNOWN MESSAGE"
);
return null;
return [];
}
}
@@ -231,11 +503,61 @@ class ProstorCrawler {
return AD_TYPE.AD_TYPE_SALE.stringId;
case "najam":
return AD_TYPE.AD_TYPE_RENT.stringId;
case "novogradnja":
return AD_TYPE.AD_TYPE_SALE.stringId;
default:
return undefined;
}
}
static getHeatingTypeId(realEstateProperties) {
const realEstatePropertiesKeys = Object.keys(realEstateProperties);
for (const property of realEstatePropertiesKeys) {
switch (property) {
case "centralno toplane":
return HEATING_TYPE.CENTRAL_CITY.id;
case "etažno plinsko":
return HEATING_TYPE.CENTRAL_GAS.id;
case "termo blok":
case "podno grijanje":
return HEATING_TYPE.OTHER.id;
case "etažno električno":
case "konvektori":
return HEATING_TYPE.ELECTRICITY.id;
case "plinske peći":
return HEATING_TYPE.GAS.id;
case "vlastita kotlovnica":
return HEATING_TYPE.CENTRAL_BOILER.id;
case "toplotna pumpa":
return HEATING_TYPE.HEAT_PUMP.id;
case "kamin":
return HEATING_TYPE.WOOD.id;
default:
//console.log("[PROSTOR] Nepoznato >>> [", property, "]");
}
}
}
static getStatusId(statusText) {
switch (statusText) {
case "":
return AD_STATUS.STATUS_NORMAL;
case "Rezervisano":
return AD_STATUS.STATUS_RESERVED;
case "Prodano":
return AD_STATUS.STATUS_SOLD;
case "Iznajmljeno":
return AD_STATUS.STATUS_RENTED;
default:
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
return AD_STATUS.STATUS_NORMAL;
}
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results) {
const savers = this.savers;
@@ -244,7 +566,7 @@ class ProstorCrawler {
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}

View File

@@ -11,7 +11,10 @@ const {
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
CRAWLER_AD_TYPE,
HEATING_TYPE,
ACCESS_ROAD_TYPE,
FURNISHING_TYPE
} = require("../../common/enums");
const {
@@ -219,6 +222,7 @@ class RentalCrawler {
throw { message: "Can't find ad data JSON" };
}
let adStatus = AD_STATUS.STATUS_NORMAL;
const rentalId = extractedData["re_realEstates_id"];
const adCategory = this.getKiviCategoryIdFromRentalId(
parseInt(extractedData["re_types_id"])
@@ -237,6 +241,141 @@ class RentalCrawler {
};
}
const descriptionIds = extractedData["re_descriptions_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(descriptionIds)) {
throw {
message:
'Expected array od descriptions but "re_descriptions_id" not found !'
};
}
const spaceIds = extractedData["re_spaces_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(spaceIds)) {
throw {
message: 'Expected array od spaces but "re_spaces_id" not found !'
};
}
const infrastructureIds = extractedData["re_infrastructure_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(infrastructureIds)) {
throw {
message:
'Expected array od infrastructures but "re_infrastructure_id" not found !'
};
}
const floorNoIds = extractedData["re_floorNO_id"]
.split(",")
.map(stringNumber => parseInt(stringNumber));
if (!Array.isArray(floorNoIds)) {
throw {
message:
'Expected array od infrastructures but "re_floorNO_id" not found !'
};
}
const numberOfViewsAgencySelector = $(
"body > div > div.container > div.row.content-top > div.col-xs-12.col-sm-12.col-md-9 > div > div.box-viewcount"
);
// number of views is written as : "Broj pregledavanja: NNN"
const numberOfViewsAgencyFullText = numberOfViewsAgencySelector
.text()
.trim();
const numberOfViewsAgencyParts = numberOfViewsAgencyFullText.split(":");
const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions(
descriptionIds
);
const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces(
spaceIds
);
const realEstatePropertiesFromInfrastructure = this.getPropertiesFromInfrastructure(
infrastructureIds
);
if (extractedData["adm_realEstates_discount"] === "1") {
adStatus = AD_STATUS.STATUS_DISCOUNTED;
}
let numberOfRooms =
parseInt(extractedData["re_realEstates_roomsNO"]) +
parseInt(extractedData["re_realEstates_bedroomNO"]) || null,
numberOfFloors =
parseInt(extractedData["re_realEstates_floorsNO"]) ||
this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]),
floor =
parseInt(extractedData["re_realEstates_floorNO"]) ||
this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]),
accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType,
heatingType =
this.getHeatingTypeId(extractedData["re_heating_id"]) || null,
furnishingType = realEstatePropertiesFromDescriptions.furnishingType,
balcony =
realEstatePropertiesFromDescriptions.balcony ||
realEstatePropertiesFromSpaces.balcony,
newBuilding = extractedData["op_realEstates_newBuilding"]
? extractedData["op_realEstates_newBuilding"] === "1"
: null,
elevator = realEstatePropertiesFromDescriptions.elevator,
water =
realEstatePropertiesFromDescriptions.water ||
realEstatePropertiesFromInfrastructure.water,
electricity =
realEstatePropertiesFromDescriptions.electricity ||
realEstatePropertiesFromInfrastructure.electricity,
drainageSystem =
realEstatePropertiesFromInfrastructure.drainageSystem,
registeredInZkBooks =
extractedData["op_realEstates_ownerPermit"] === 1 || null,
recentlyAdapted = null,
parking =
realEstatePropertiesFromDescriptions.parking ||
realEstatePropertiesFromSpaces.parking,
garage = realEstatePropertiesFromSpaces.garage,
gas = realEstatePropertiesFromInfrastructure.gas,
antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor,
airCondition = realEstatePropertiesFromDescriptions.airCondition,
phoneConnection =
realEstatePropertiesFromInfrastructure.phoneConnection,
cableTV = realEstatePropertiesFromInfrastructure.cableTV,
internet = realEstatePropertiesFromInfrastructure.internet,
basementAttic = realEstatePropertiesFromSpaces.basementAttic,
storeRoom = realEstatePropertiesFromSpaces.storeRoom,
videoSurveillance =
realEstatePropertiesFromDescriptions.videoSurveillance ||
realEstatePropertiesFromInfrastructure.videoSurveillance,
alarm = realEstatePropertiesFromDescriptions.alarm,
suitableForStudents = null,
includingBills =
extractedData["op_realEstates_utilitiesIncluded"] === "1" || null,
animalsAllowed = null,
pool = realEstatePropertiesFromDescriptions.pool,
urbanPlanPermit =
extractedData["op_realEstates_locationPermit"] === "1" ||
realEstatePropertiesFromDescriptions.urbanPlanPermit,
buildingPermit =
extractedData["op_realEstates_buildingPermit"] === "1" || null,
utilityConnection =
realEstatePropertiesFromDescriptions.utilityConnection,
distanceToRiver = null,
numberOfViewsAgency =
numberOfViewsAgencyParts.length > 1
? parseInt(numberOfViewsAgencyParts[1])
: null;
const title = extractedData["re_realEstates_portalName"];
const extractedPrice = parseFloat(
extractedData["re_realEstates_price"]
@@ -277,8 +416,6 @@ class RentalCrawler {
};
}
const adStatus = AD_STATUS.STATUS_NORMAL;
const data = {
url,
agencyObjectId: rentalId,
@@ -303,7 +440,42 @@ class RentalCrawler {
locationLong,
adStatus,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString()
renewedDate: renewedDateMoment.toISOString(),
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency
};
return data;
@@ -350,6 +522,270 @@ class RentalCrawler {
}
}
getPropertiesFromDescriptions(descriptionIds) {
const result = {
accessRoadType: null,
furnishingType: null,
balcony: null,
elevator: null,
parking: null,
antiTheftDoor: null,
airCondition: null,
videoSurveillance: null,
alarm: null,
pool: null,
urbanPlanPermit: null,
utilityConnection: null,
water: null,
electricity: null
};
for (const descriptionId of descriptionIds) {
switch (descriptionId) {
case 16:
result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
break;
case 17:
result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
break;
case 1:
case 28:
result.furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case 14:
result.elevator = true;
break;
case 39:
result.electricity = true;
break;
case 40:
result.water = true;
break;
case 41:
case 58:
result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id;
break;
case 26:
result.balcony = true;
break;
case 62:
result.parking = true;
break;
case 3:
result.antiTheftDoor = true;
break;
case 2:
case 21:
result.airCondition = true;
break;
case 4:
result.alarm = true;
break;
case 55:
result.videoSurveillance = true;
break;
case 9:
result.pool = true;
break;
case 60:
result.urbanPlanPermit = true;
break;
case 38:
result.utilityConnection = true;
break;
}
}
return result;
}
getPropertiesFromSpaces(spaceIds) {
const result = {
balcony: null,
parking: null,
garage: null,
basementAttic: null,
storeRoom: null
};
for (const spaceId of spaceIds) {
switch (spaceId) {
case 36:
case 12:
result.parking = true;
break;
case 1:
case 2:
case 3:
result.balcony = true;
break;
case 4:
case 30:
result.garage = true;
break;
case 9:
case 10:
result.storeRoom = true;
break;
case 18:
case 34:
case 37:
case 27:
result.basementAttic = true;
break;
}
}
return result;
}
getHeatingTypeId(heatingRentalId) {
// heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value
const heatingId = parseInt(heatingRentalId);
switch (heatingId) {
case 27:
case 16:
return HEATING_TYPE.GAS.id;
case 4:
return HEATING_TYPE.CENTRAL_GAS.id;
case 3:
case 23:
case 6:
case 7:
case 8:
case 9:
case 10:
return HEATING_TYPE.CENTRAL_BOILER.id;
case 2:
case 13:
case 30:
case 17:
case 29:
case 31:
return HEATING_TYPE.ELECTRICITY.id;
case 24:
case 25:
case 12:
return HEATING_TYPE.CENTRAL_CITY.id;
case 26:
case 21:
case 20:
return HEATING_TYPE.WOOD.id;
case 28:
case 19:
return HEATING_TYPE.HEAT_PUMP.id;
case 14:
case 32:
return HEATING_TYPE.OTHER.id;
default:
return null;
}
}
getPropertiesFromInfrastructure(infrastructureIds) {
const result = {
electricity: null,
water: null,
gas: null,
drainageSystem: null,
phoneConnection: null,
internet: null,
videoSurveillance: null,
cableTV: null
};
for (const infrastructureId of infrastructureIds) {
switch (infrastructureId) {
case 1:
result.electricity = true;
break;
case 2:
result.water = true;
break;
case 4:
result.gas = true;
break;
case 5:
result.drainageSystem = true;
break;
case 7:
case 8:
result.phoneConnection = true;
break;
case 10:
result.internet = true;
break;
case 11:
result.cableTV = true;
break;
case 16:
case 17:
result.videoSurveillance = true;
break;
}
}
return result;
}
getFloorNumberFromFloorId(floorsIdText) {
// floorIdText can be array of numbers, separated by comma or number
// just extracting floor number from first element
const floorsId = floorsIdText.split(",");
if (floorsId.length === 0) {
return null;
}
const firstFloorId = parseInt(floorsId[0]);
// 1 pod
// 2 sut
// 3 raz
// 4 pri
// 5 vpri
// 6 prv
// 7 dru
// 8 tre
// 9 čet
// 10 man
// 11
// 12 pot
// 13 vpot
// 14 tav
// 15 pet
const floorNumber = [
-1,
-1,
0,
0,
1,
1,
2,
3,
4,
null,
null,
null,
null,
null,
5
];
return floorNumber[firstFloorId - 1] || null;
}
getNumberOfFloorsFromFloorId(floorsIdText) {
// floorIdText can be array of numbers, separated by comma or number
const floorIds = floorsIdText.split(",");
if (floorIds.length === 0) {
return null;
}
return floorIds.length;
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
@@ -362,7 +798,7 @@ class RentalCrawler {
// }
//For now, we use only Postgres saver, so ...
return await savers[0].save(results);
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}

View File

@@ -26,7 +26,42 @@ const bulkUpsertRealEstates = async realEstateData => {
"gardenSize",
"adStatus",
"updatedAt",
"renewedDate"
"renewedDate",
"numberOfRooms",
"numberOfFloors",
"floor",
"accessRoadType",
"heatingType",
"furnishingType",
"balcony",
"newBuilding",
"elevator",
"water",
"electricity",
"drainageSystem",
"registeredInZkBooks",
"recentlyAdapted",
"parking",
"garage",
"gas",
"antiTheftDoor",
"airCondition",
"phoneConnection",
"cableTV",
"internet",
"basementAttic",
"storeRoom",
"videoSurveillance",
"alarm",
"suitableForStudents",
"includingBills",
"animalsAllowed",
"pool",
"urbanPlanPermit",
"buildingPermit",
"utilityConnection",
"distanceToRiver",
"numberOfViewsAgency"
];
const order = [["updatedAt", "desc"]];

View File

@@ -0,0 +1,163 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return Promise.all([
queryInterface.addColumn("RealEstates", "numberOfRooms", {
type: Sequelize.REAL
}),
queryInterface.addColumn("RealEstates", "numberOfFloors", {
type: Sequelize.INTEGER
}),
queryInterface.addColumn("RealEstates", "floor", {
type: Sequelize.INTEGER
}),
queryInterface.addColumn("RealEstates", "accessRoadType", {
type: Sequelize.TEXT
}),
queryInterface.addColumn("RealEstates", "heatingType", {
type: Sequelize.TEXT
}),
queryInterface.addColumn("RealEstates", "furnishingType", {
type: Sequelize.TEXT
}),
queryInterface.addColumn("RealEstates", "balcony", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "newBuilding", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "elevator", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "water", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "electricity", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "drainageSystem", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "registeredInZkBooks", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "recentlyAdapted", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "parking", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "garage", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "gas", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "antiTheftDoor", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "airCondition", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "phoneConnection", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "cableTV", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "internet", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "basementAttic", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "storeRoom", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "videoSurveillance", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "alarm", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "suitableForStudents", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "includingBills", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "animalsAllowed", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "pool", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "exchange", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "urbanPlanPermit", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "buildingPermit", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "utilityConnection", {
type: Sequelize.BOOLEAN
}),
queryInterface.addColumn("RealEstates", "distanceToRiver", {
type: Sequelize.INTEGER
}),
queryInterface.addColumn("RealEstates", "numberOfViewsAgency", {
type: Sequelize.INTEGER,
defaultValue: 0
}),
queryInterface.addColumn("RealEstates", "numberOfViewsKivi", {
type: Sequelize.INTEGER,
defaultValue: 0
})
]);
},
down: (queryInterface, Sequelize) => {
return Promise.all([
queryInterface.removeColumn("RealEstates", "numberOfRooms"),
queryInterface.removeColumn("RealEstates", "numberOfFloors"),
queryInterface.removeColumn("RealEstates", "floor"),
queryInterface.removeColumn("RealEstates", "accessRoadType"),
queryInterface.removeColumn("RealEstates", "heatingType"),
queryInterface.removeColumn("RealEstates", "furnishingType"),
queryInterface.removeColumn("RealEstates", "balcony"),
queryInterface.removeColumn("RealEstates", "newBuilding"),
queryInterface.removeColumn("RealEstates", "elevator"),
queryInterface.removeColumn("RealEstates", "water"),
queryInterface.removeColumn("RealEstates", "electricity"),
queryInterface.removeColumn("RealEstates", "drainageSystem"),
queryInterface.removeColumn("RealEstates", "registeredInZkBooks"),
queryInterface.removeColumn("RealEstates", "recentlyAdapted"),
queryInterface.removeColumn("RealEstates", "parking"),
queryInterface.removeColumn("RealEstates", "garage"),
queryInterface.removeColumn("RealEstates", "gas"),
queryInterface.removeColumn("RealEstates", "antiTheftDoor"),
queryInterface.removeColumn("RealEstates", "airCondition"),
queryInterface.removeColumn("RealEstates", "phoneConnection"),
queryInterface.removeColumn("RealEstates", "cableTV"),
queryInterface.removeColumn("RealEstates", "internet"),
queryInterface.removeColumn("RealEstates", "basementAttic"),
queryInterface.removeColumn("RealEstates", "storeRoom"),
queryInterface.removeColumn("RealEstates", "videoSurveillance"),
queryInterface.removeColumn("RealEstates", "alarm"),
queryInterface.removeColumn("RealEstates", "suitableForStudents"),
queryInterface.removeColumn("RealEstates", "includingBills"),
queryInterface.removeColumn("RealEstates", "animalsAllowed"),
queryInterface.removeColumn("RealEstates", "pool"),
queryInterface.removeColumn("RealEstates", "exchange"),
queryInterface.removeColumn("RealEstates", "urbanPlanPermit"),
queryInterface.removeColumn("RealEstates", "buildingPermit"),
queryInterface.removeColumn("RealEstates", "utilityConnection"),
queryInterface.removeColumn("RealEstates", "distanceToRiver"),
queryInterface.removeColumn("RealEstates", "numberOfViewsAgency"),
queryInterface.removeColumn("RealEstates", "numberOfViewsKivi")
]);
}
};

View File

@@ -48,7 +48,44 @@ module.exports = (sequelize, DataTypes) => {
longDescription: DataTypes.TEXT,
adStatus: DataTypes.INTEGER,
publishedDate: DataTypes.DATE,
renewedDate: DataTypes.DATE
renewedDate: DataTypes.DATE,
numberOfRooms: DataTypes.INTEGER,
numberOfFloors: DataTypes.INTEGER,
floor: DataTypes.INTEGER,
accessRoadType: DataTypes.TEXT,
heatingType: DataTypes.TEXT,
furnishingType: DataTypes.TEXT,
balcony: DataTypes.BOOLEAN,
newBuilding: DataTypes.BOOLEAN,
elevator: DataTypes.BOOLEAN,
water: DataTypes.BOOLEAN,
electricity: DataTypes.BOOLEAN,
drainageSystem: DataTypes.BOOLEAN,
registeredInZkBooks: DataTypes.BOOLEAN,
recentlyAdapted: DataTypes.BOOLEAN,
parking: DataTypes.BOOLEAN,
garage: DataTypes.BOOLEAN,
gas: DataTypes.BOOLEAN,
antiTheftDoor: DataTypes.BOOLEAN,
airCondition: DataTypes.BOOLEAN,
phoneConnection: DataTypes.BOOLEAN,
cableTV: DataTypes.BOOLEAN,
internet: DataTypes.BOOLEAN,
basementAttic: DataTypes.BOOLEAN,
storeRoom: DataTypes.BOOLEAN,
videoSurveillance: DataTypes.BOOLEAN,
alarm: DataTypes.BOOLEAN,
suitableForStudents: DataTypes.BOOLEAN,
includingBills: DataTypes.BOOLEAN,
animalsAllowed: DataTypes.BOOLEAN,
pool: DataTypes.BOOLEAN,
exchange: DataTypes.BOOLEAN,
urbanPlanPermit: DataTypes.BOOLEAN,
buildingPermit: DataTypes.BOOLEAN,
utilityConnection: DataTypes.BOOLEAN,
distanceToRiver: DataTypes.INTEGER,
numberOfViewsAgency: DataTypes.INTEGER,
numberOfViewsKivi: DataTypes.INTEGER
});
return RealEstate;

View File

@@ -42,7 +42,7 @@ RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page
RENTAL_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==PROSTOR==
PROSTOR_MAX_PAGES=!!! This is not used for prostor crawler !!!
PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents MAX RESULTS in total
PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents how many ads are crawled at once
PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!!

View File

@@ -14,7 +14,8 @@
"crawl": "cd app/crawler && node npmCrawl.js",
"daily-notify": "cd app/npmScripts && node npmDailyNotify.js",
"test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js"
"test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
},
"repository": {
"type": "git",

17
test/rentalScrapeTest.js Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
const rentalCrawler = require("../app/crawler/specificCrawlers/rental");
const urlToScrape = process.argv[2] || undefined;
if (urlToScrape) {
const crawler = new rentalCrawler();
(async () => {
const data = await crawler.scrapeAd(urlToScrape);
console.log(data);
})();
} else {
console.log("No URL to scrape. Use like this : ");
console.log("npm run test-olx-scraper -- URL_TO_SCRAPE");
}