add more fields to the Prostor real estates crawler

This commit is contained in:
Bilal Catic
2019-11-14 07:23:23 +01:00
parent 1e68d640e2
commit 168b2186e7

View File

@@ -9,7 +9,9 @@ const {
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const {
@@ -121,7 +123,7 @@ class ProstorCrawler {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl
);
@@ -179,7 +181,7 @@ class ProstorCrawler {
}
async scrapeAd(realEstate) {
const { lat, lng, property_name, price, size, link } = realEstate;
const { lat, lng, property_name, price, size, link, status } = realEstate;
const url = `https://prostor.ba${link}`;
// console.log("[PROSTOR] Scraping : ", url);
try {
@@ -198,16 +200,6 @@ class ProstorCrawler {
const prostorId = linkParts[4];
if (!adType || !realEstateType || !prostorId) {
console.log(
"adType: ",
adType,
" reType: ",
realEstateType,
" prostorId: ",
prostorId,
"url: ",
url
);
return null;
}
@@ -218,52 +210,70 @@ class ProstorCrawler {
$(allDataSelector)
.find("p")
.each((i, elem) => {
const propertyElement = $(elem)
.each((i, element) => {
const propertyElement = $(element)
.text()
.split(":")
.map(text => text.trim());
.map(text => text.trim().toLowerCase());
const propertyTitle = propertyElement[0];
realEstateProperties[propertyTitle] = propertyElement[1];
});
$(allDataSelector)
.find("div.mb-2")
.each((i, element) => {
const propertyElement = $(element)
.text()
.trim()
.toLowerCase();
realEstateProperties[propertyElement] = true;
});
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
return null;
}
let numberOfRooms =
parseFloat(realEstateProperties["Broj soba"]) +
parseFloat(realEstateProperties["Broj spavaćih soba"]) || null,
parseFloat(realEstateProperties["broj soba"]) +
parseFloat(realEstateProperties["broj spavaćih soba"]) || null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = null,
heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties),
furnishingType = null,
balcony = null,
balcony =
realEstateProperties["balkon"] ||
realEstateProperties["terasa"] ||
realEstateProperties["lođa"] ||
null,
newBuilding = linkParts[1] === "novogradnja",
elevator = null,
water = null,
electricity = null,
drainageSystem = null,
elevator = realEstateProperties["lift"] || null,
water = realEstateProperties["voda"] || null,
electricity = realEstateProperties["električna energija"] || null,
drainageSystem = realEstateProperties["kanalizacija"] || null,
registeredInZkBooks = null,
recentlyAdapted = null,
parking = null,
garage = null,
gas = null,
antiTheftDoor = null,
airCondition = null,
phoneConnection = null,
cableTV = null,
internet = null,
basementAttic = null,
storeRoom = null,
videoSurveillance = null,
alarm = null,
parking = realEstateProperties["parking"] || null,
garage = realEstateProperties["garaža"] || null,
gas = realEstateProperties["plin"] || null,
antiTheftDoor = realEstateProperties["blindo vrata"] || null,
airCondition = realEstateProperties["klima"] || null,
phoneConnection = realEstateProperties["telefon"] || null,
cableTV = realEstateProperties["kablovksa tv"] || null,
internet =
realEstateProperties["internet"] ||
realEstateProperties["adsl"] ||
null,
basementAttic = realEstateProperties["podrum"] || null,
storeRoom = realEstateProperties["ostava"] || null,
videoSurveillance = realEstateProperties["video nadzor"],
alarm = realEstateProperties["alarm"] || null,
suitableForStudents = null,
includingBills = null,
animalsAllowed = null,
pool = null,
pool = realEstateProperties["bazen"] || null,
urbanPlanPermit = null,
buildingPermit = null,
utilityConnection = null,
@@ -278,7 +288,7 @@ class ProstorCrawler {
// If there are two parts, that represents more real estates are sold
// numberOfFloors is contained in second part, after / sign
const floorsArray = realEstateProperties["Sprat"].split(" - ");
const floorsArray = realEstateProperties["sprat"].split(" - ");
let floorText = "";
if (floorsArray.length === 1) {
const floorDescription = floorsArray[0].split("/");
@@ -296,7 +306,7 @@ class ProstorCrawler {
if (isNaN(floor)) {
// It was textual representation of floor, like "Pr", "Su" or similar
switch (floorText.toLowerCase()) {
switch (floorText) {
case "pr":
floor = 0;
break;
@@ -312,7 +322,15 @@ class ProstorCrawler {
}
}
const adStatus = AD_STATUS.STATUS_NORMAL;
if (realEstateProperties["namješteno"]) {
furnishingType = FURNISHING_TYPE.FURNISHED.id;
} else if (realEstateProperties["polunamješteno"]) {
furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
} else {
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
}
const adStatus = ProstorCrawler.getStatusId(status);
const title = property_name;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
@@ -332,7 +350,7 @@ class ProstorCrawler {
shortDescription: "",
longDescription: longDescription,
streetNumber: 0,
streetName: realEstateProperties["Adresa"],
streetName: realEstateProperties["adresa"],
locality: "",
municipality: "",
city: "",
@@ -492,6 +510,50 @@ class ProstorCrawler {
}
}
static getHeatingTypeId(realEstateProperties) {
const realEstatePropertiesKeys = Object.keys(realEstateProperties);
for (const property of realEstatePropertiesKeys) {
switch (property) {
case "centralno toplane":
return HEATING_TYPE.CENTRAL_CITY.id;
case "etažno plinsko":
return HEATING_TYPE.CENTRAL_GAS.id;
case "termo blok":
case "podno grijanje":
return HEATING_TYPE.OTHER.id;
case "etažno električno":
case "konvektori":
return HEATING_TYPE.ELECTRICITY.id;
case "plinske peći":
return HEATING_TYPE.GAS.id;
case "vlastita kotlovnica":
return HEATING_TYPE.CENTRAL_BOILER.id;
case "toplotna pumpa":
return HEATING_TYPE.HEAT_PUMP.id;
case "kamin":
return HEATING_TYPE.WOOD.id;
default:
//console.log("[PROSTOR] Nepoznato >>> [", property, "]");
}
}
}
static getStatusId(statusText) {
switch (statusText) {
case "":
return AD_STATUS.STATUS_NORMAL;
case "Rezervisano":
return AD_STATUS.STATUS_RESERVED;
case "Prodano":
return AD_STATUS.STATUS_SOLD;
case "Iznajmljeno":
return AD_STATUS.STATUS_RENTED;
default:
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
return AD_STATUS.STATUS_NORMAL;
}
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}