Add more fields to the Prostor real estate crawler
This commit is contained in:
@@ -9,7 +9,9 @@ const {
|
|||||||
AD_CATEGORY,
|
AD_CATEGORY,
|
||||||
AD_AGENCY,
|
AD_AGENCY,
|
||||||
AD_STATUS,
|
AD_STATUS,
|
||||||
CRAWLER_AD_TYPE
|
CRAWLER_AD_TYPE,
|
||||||
|
FURNISHING_TYPE,
|
||||||
|
HEATING_TYPE
|
||||||
} = require("../../common/enums");
|
} = require("../../common/enums");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
@@ -121,7 +123,7 @@ class ProstorCrawler {
|
|||||||
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
||||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
||||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
|
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
|
||||||
const listOfAllRealEstates = await this.extractRealEstates(
|
const listOfAllRealEstates = await this.extractRealEstates(
|
||||||
urlPageToCrawl
|
urlPageToCrawl
|
||||||
);
|
);
|
||||||
@@ -179,7 +181,7 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(realEstate) {
|
async scrapeAd(realEstate) {
|
||||||
const { lat, lng, property_name, price, size, link } = realEstate;
|
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
||||||
const url = `https://prostor.ba${link}`;
|
const url = `https://prostor.ba${link}`;
|
||||||
// console.log("[PROSTOR] Scraping : ", url);
|
// console.log("[PROSTOR] Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
@@ -198,16 +200,6 @@ class ProstorCrawler {
|
|||||||
const prostorId = linkParts[4];
|
const prostorId = linkParts[4];
|
||||||
|
|
||||||
if (!adType || !realEstateType || !prostorId) {
|
if (!adType || !realEstateType || !prostorId) {
|
||||||
console.log(
|
|
||||||
"adType: ",
|
|
||||||
adType,
|
|
||||||
" reType: ",
|
|
||||||
realEstateType,
|
|
||||||
" prostorId: ",
|
|
||||||
prostorId,
|
|
||||||
"url: ",
|
|
||||||
url
|
|
||||||
);
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -218,52 +210,70 @@ class ProstorCrawler {
|
|||||||
|
|
||||||
$(allDataSelector)
|
$(allDataSelector)
|
||||||
.find("p")
|
.find("p")
|
||||||
.each((i, elem) => {
|
.each((i, element) => {
|
||||||
const propertyElement = $(elem)
|
const propertyElement = $(element)
|
||||||
.text()
|
.text()
|
||||||
.split(":")
|
.split(":")
|
||||||
.map(text => text.trim());
|
.map(text => text.trim().toLowerCase());
|
||||||
|
|
||||||
const propertyTitle = propertyElement[0];
|
const propertyTitle = propertyElement[0];
|
||||||
realEstateProperties[propertyTitle] = propertyElement[1];
|
realEstateProperties[propertyTitle] = propertyElement[1];
|
||||||
});
|
});
|
||||||
|
|
||||||
|
$(allDataSelector)
|
||||||
|
.find("div.mb-2")
|
||||||
|
.each((i, element) => {
|
||||||
|
const propertyElement = $(element)
|
||||||
|
.text()
|
||||||
|
.trim()
|
||||||
|
.toLowerCase();
|
||||||
|
|
||||||
|
realEstateProperties[propertyElement] = true;
|
||||||
|
});
|
||||||
|
|
||||||
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
|
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
let numberOfRooms =
|
let numberOfRooms =
|
||||||
parseFloat(realEstateProperties["Broj soba"]) +
|
parseFloat(realEstateProperties["broj soba"]) +
|
||||||
parseFloat(realEstateProperties["Broj spavaćih soba"]) || null,
|
parseFloat(realEstateProperties["broj spavaćih soba"]) || null,
|
||||||
numberOfFloors = null,
|
numberOfFloors = null,
|
||||||
floor = null,
|
floor = null,
|
||||||
accessRoadType = null,
|
accessRoadType = null,
|
||||||
heatingType = null,
|
heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties),
|
||||||
furnishingType = null,
|
furnishingType = null,
|
||||||
balcony = null,
|
balcony =
|
||||||
|
realEstateProperties["balkon"] ||
|
||||||
|
realEstateProperties["terasa"] ||
|
||||||
|
realEstateProperties["lođa"] ||
|
||||||
|
null,
|
||||||
newBuilding = linkParts[1] === "novogradnja",
|
newBuilding = linkParts[1] === "novogradnja",
|
||||||
elevator = null,
|
elevator = realEstateProperties["lift"] || null,
|
||||||
water = null,
|
water = realEstateProperties["voda"] || null,
|
||||||
electricity = null,
|
electricity = realEstateProperties["električna energija"] || null,
|
||||||
drainageSystem = null,
|
drainageSystem = realEstateProperties["kanalizacija"] || null,
|
||||||
registeredInZkBooks = null,
|
registeredInZkBooks = null,
|
||||||
recentlyAdapted = null,
|
recentlyAdapted = null,
|
||||||
parking = null,
|
parking = realEstateProperties["parking"] || null,
|
||||||
garage = null,
|
garage = realEstateProperties["garaža"] || null,
|
||||||
gas = null,
|
gas = realEstateProperties["plin"] || null,
|
||||||
antiTheftDoor = null,
|
antiTheftDoor = realEstateProperties["blindo vrata"] || null,
|
||||||
airCondition = null,
|
airCondition = realEstateProperties["klima"] || null,
|
||||||
phoneConnection = null,
|
phoneConnection = realEstateProperties["telefon"] || null,
|
||||||
cableTV = null,
|
cableTV = realEstateProperties["kablovksa tv"] || null,
|
||||||
internet = null,
|
internet =
|
||||||
basementAttic = null,
|
realEstateProperties["internet"] ||
|
||||||
storeRoom = null,
|
realEstateProperties["adsl"] ||
|
||||||
videoSurveillance = null,
|
null,
|
||||||
alarm = null,
|
basementAttic = realEstateProperties["podrum"] || null,
|
||||||
|
storeRoom = realEstateProperties["ostava"] || null,
|
||||||
|
videoSurveillance = realEstateProperties["video nadzor"],
|
||||||
|
alarm = realEstateProperties["alarm"] || null,
|
||||||
suitableForStudents = null,
|
suitableForStudents = null,
|
||||||
includingBills = null,
|
includingBills = null,
|
||||||
animalsAllowed = null,
|
animalsAllowed = null,
|
||||||
pool = null,
|
pool = realEstateProperties["bazen"] || null,
|
||||||
urbanPlanPermit = null,
|
urbanPlanPermit = null,
|
||||||
buildingPermit = null,
|
buildingPermit = null,
|
||||||
utilityConnection = null,
|
utilityConnection = null,
|
||||||
@@ -278,7 +288,7 @@ class ProstorCrawler {
|
|||||||
// If there are two parts, that represents more real estates are sold
|
// If there are two parts, that represents more real estates are sold
|
||||||
// numberOfFloors is contained in second part, after / sign
|
// numberOfFloors is contained in second part, after / sign
|
||||||
|
|
||||||
const floorsArray = realEstateProperties["Sprat"].split(" - ");
|
const floorsArray = realEstateProperties["sprat"].split(" - ");
|
||||||
let floorText = "";
|
let floorText = "";
|
||||||
if (floorsArray.length === 1) {
|
if (floorsArray.length === 1) {
|
||||||
const floorDescription = floorsArray[0].split("/");
|
const floorDescription = floorsArray[0].split("/");
|
||||||
@@ -296,7 +306,7 @@ class ProstorCrawler {
|
|||||||
|
|
||||||
if (isNaN(floor)) {
|
if (isNaN(floor)) {
|
||||||
// It was textual representation of floor, like "Pr", "Su" or similar
|
// It was textual representation of floor, like "Pr", "Su" or similar
|
||||||
switch (floorText.toLowerCase()) {
|
switch (floorText) {
|
||||||
case "pr":
|
case "pr":
|
||||||
floor = 0;
|
floor = 0;
|
||||||
break;
|
break;
|
||||||
@@ -312,7 +322,15 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const adStatus = AD_STATUS.STATUS_NORMAL;
|
if (realEstateProperties["namješteno"]) {
|
||||||
|
furnishingType = FURNISHING_TYPE.FURNISHED.id;
|
||||||
|
} else if (realEstateProperties["polunamješteno"]) {
|
||||||
|
furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
|
||||||
|
} else {
|
||||||
|
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
|
||||||
|
}
|
||||||
|
|
||||||
|
const adStatus = ProstorCrawler.getStatusId(status);
|
||||||
const title = property_name;
|
const title = property_name;
|
||||||
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
||||||
const parsedArea = parseFloat(size);
|
const parsedArea = parseFloat(size);
|
||||||
@@ -332,7 +350,7 @@ class ProstorCrawler {
|
|||||||
shortDescription: "",
|
shortDescription: "",
|
||||||
longDescription: longDescription,
|
longDescription: longDescription,
|
||||||
streetNumber: 0,
|
streetNumber: 0,
|
||||||
streetName: realEstateProperties["Adresa"],
|
streetName: realEstateProperties["adresa"],
|
||||||
locality: "",
|
locality: "",
|
||||||
municipality: "",
|
municipality: "",
|
||||||
city: "",
|
city: "",
|
||||||
@@ -492,6 +510,50 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static getHeatingTypeId(realEstateProperties) {
|
||||||
|
const realEstatePropertiesKeys = Object.keys(realEstateProperties);
|
||||||
|
for (const property of realEstatePropertiesKeys) {
|
||||||
|
switch (property) {
|
||||||
|
case "centralno toplane":
|
||||||
|
return HEATING_TYPE.CENTRAL_CITY.id;
|
||||||
|
case "etažno plinsko":
|
||||||
|
return HEATING_TYPE.CENTRAL_GAS.id;
|
||||||
|
case "termo blok":
|
||||||
|
case "podno grijanje":
|
||||||
|
return HEATING_TYPE.OTHER.id;
|
||||||
|
case "etažno električno":
|
||||||
|
case "konvektori":
|
||||||
|
return HEATING_TYPE.ELECTRICITY.id;
|
||||||
|
case "plinske peći":
|
||||||
|
return HEATING_TYPE.GAS.id;
|
||||||
|
case "vlastita kotlovnica":
|
||||||
|
return HEATING_TYPE.CENTRAL_BOILER.id;
|
||||||
|
case "toplotna pumpa":
|
||||||
|
return HEATING_TYPE.HEAT_PUMP.id;
|
||||||
|
case "kamin":
|
||||||
|
return HEATING_TYPE.WOOD.id;
|
||||||
|
default:
|
||||||
|
//console.log("[PROSTOR] Nepoznato >>> [", property, "]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static getStatusId(statusText) {
|
||||||
|
switch (statusText) {
|
||||||
|
case "":
|
||||||
|
return AD_STATUS.STATUS_NORMAL;
|
||||||
|
case "Rezervisano":
|
||||||
|
return AD_STATUS.STATUS_RESERVED;
|
||||||
|
case "Prodano":
|
||||||
|
return AD_STATUS.STATUS_SOLD;
|
||||||
|
case "Iznajmljeno":
|
||||||
|
return AD_STATUS.STATUS_RENTED;
|
||||||
|
default:
|
||||||
|
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
|
||||||
|
return AD_STATUS.STATUS_NORMAL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async sleep(ms) {
|
async sleep(ms) {
|
||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user