Added crawler for Saljic nekretnine.
This commit is contained in:
@@ -174,6 +174,25 @@ class SaljicCrawler {
|
||||
}
|
||||
});
|
||||
|
||||
let adTypesTmp = [];
|
||||
|
||||
$("#shop")
|
||||
.find(".product")
|
||||
.each((i, elem) => {
|
||||
const adType = $(elem)
|
||||
.find(".trakica-search-page")
|
||||
.text()
|
||||
.trim();
|
||||
if (adType) {
|
||||
adTypesTmp.push(adType);
|
||||
}
|
||||
});
|
||||
|
||||
//Converting to AD_TYPE
|
||||
const adTypes = adTypesTmp.map(adTypeText => {
|
||||
return this.getAdTypeId(adTypeText);
|
||||
});
|
||||
|
||||
//Converting to absolute URLs
|
||||
const hrefsAbs = hrefs.map(link => {
|
||||
return "https://www.saljicnekretnine.ba" + link;
|
||||
@@ -186,7 +205,7 @@ class SaljicCrawler {
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push(this.scrapeAd(hrefsAbs[i]));
|
||||
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
|
||||
}
|
||||
|
||||
const scrapedData = await Promise.all(asyncScraping);
|
||||
@@ -198,16 +217,19 @@ class SaljicCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
async scrapeAd(url, adType) {
|
||||
console.log("[SALJIC] Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
// ??? treba li nesto za status
|
||||
let status = AD_STATUS.STATUS_NORMAL;
|
||||
// No information for status ex. PRODAN
|
||||
const status = AD_STATUS.STATUS_NORMAL;
|
||||
//Extracting agency ID from url
|
||||
const agencyObjectId = parseInt(url.substring(46, url.length));
|
||||
|
||||
//Extracting main properties
|
||||
const propertySelectors = {
|
||||
title:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
|
||||
@@ -227,7 +249,6 @@ class SaljicCrawler {
|
||||
.replace(/ {1,}/g, " ")
|
||||
.trim();
|
||||
|
||||
console.log("Title:", title);
|
||||
const priceText = $(propertySelectors.price)
|
||||
.text()
|
||||
.replace(/(\r\n|\n|\r)/gm, "")
|
||||
@@ -240,18 +261,14 @@ class SaljicCrawler {
|
||||
priceText.substring(8, priceText.length - 3).replace(",", "")
|
||||
);
|
||||
|
||||
console.log("Price:", price);
|
||||
|
||||
const streetName = $(propertySelectors.streetName)
|
||||
.text()
|
||||
.replace(/(\r\n|\n|\r)/gm, "")
|
||||
.trim();
|
||||
console.log("Street:", streetName);
|
||||
|
||||
const descriptions = $(propertySelectors.descriptions)
|
||||
.text()
|
||||
.trim();
|
||||
console.log("Description:", descriptions);
|
||||
|
||||
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
|
||||
const latText = latAndLongSrc.substring(
|
||||
@@ -264,8 +281,6 @@ class SaljicCrawler {
|
||||
);
|
||||
const locationLat = parseFloat(latText) || null;
|
||||
const locationLong = parseFloat(longText) || null;
|
||||
console.log("Lat:", locationLat);
|
||||
console.log("Long:", locationLong);
|
||||
|
||||
//====== DETAIL INFORMATION FIELDS ==========
|
||||
let area,
|
||||
@@ -306,6 +321,8 @@ class SaljicCrawler {
|
||||
distanceToRiver = null;
|
||||
let publishedDate = null;
|
||||
let renewedDate = null;
|
||||
let realEstateType;
|
||||
let numberOfViewsAgency = null;
|
||||
|
||||
//Extracting data - Glavne karakteristike
|
||||
let mainFieldIndex = 1;
|
||||
@@ -346,18 +363,15 @@ class SaljicCrawler {
|
||||
recentlyAdapted = true;
|
||||
break;
|
||||
case "Broj parking mjesta":
|
||||
`${month}/${day}/${year}`;
|
||||
parking = true;
|
||||
break;
|
||||
case "Dostupno od":
|
||||
const day = mainFieldValue.substring(0, 2);
|
||||
const month = mainFieldValue.substring(3, 5);
|
||||
const year = mainFieldValue.substring(6, mainFieldValue.length);
|
||||
console.log(`${month}/${day}/${year}`);
|
||||
publishedDate = new Date(`${month}/${day}/${year}`);
|
||||
break;
|
||||
default:
|
||||
// console.log(fieldTitle, " = ", fieldValue);
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -367,39 +381,121 @@ class SaljicCrawler {
|
||||
mainFieldIndex++;
|
||||
} while (true);
|
||||
|
||||
console.log("Area:", area);
|
||||
console.log("Garden size:", gardenSize);
|
||||
console.log("Number of rooms:", numberOfRooms);
|
||||
console.log("Number of floors", numberOfFloors);
|
||||
console.log("Floor:", floor);
|
||||
console.log("Adapted:", recentlyAdapted);
|
||||
console.log("Parking:", parking);
|
||||
console.log("Published date:", publishedDate);
|
||||
//Extracting data - Sadrzaji
|
||||
let additionalFieldIndex = 1;
|
||||
do {
|
||||
const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`;
|
||||
|
||||
//const category = $(propertySelectors.category)
|
||||
//.text()
|
||||
//.trim();
|
||||
const additionalField = $(additionalFieldSelector)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
if (additionalFieldIndex === 1) {
|
||||
//Extracting data of real estate type
|
||||
const categoryTmp = additionalField
|
||||
.replace(/[\n\r\t]/gm, "")
|
||||
.substring(
|
||||
additionalField.indexOf("Kategorija") + 10,
|
||||
additionalField.length
|
||||
)
|
||||
.trim();
|
||||
realEstateType = this.getAdCategoryId(categoryTmp);
|
||||
} else {
|
||||
switch (additionalField) {
|
||||
case "Internet":
|
||||
internet = true;
|
||||
break;
|
||||
case "Garaža":
|
||||
garage = true;
|
||||
break;
|
||||
case "Klima":
|
||||
airCondition = true;
|
||||
break;
|
||||
case "Balkon":
|
||||
balcony = true;
|
||||
break;
|
||||
case "Ostava":
|
||||
storeRoom = true;
|
||||
break;
|
||||
case "Podrum":
|
||||
basementAttic = true;
|
||||
break;
|
||||
case "Blindirana vrata":
|
||||
antiTheftDoor = true;
|
||||
break;
|
||||
case "Voda":
|
||||
water = true;
|
||||
break;
|
||||
case "Kablovska":
|
||||
cableTV = true;
|
||||
break;
|
||||
case "Uknjiženo":
|
||||
registeredInZkBooks = true;
|
||||
break;
|
||||
case "Grijanje - centralno":
|
||||
heatingType = HEATING_TYPE.CENTRAL_CITY.id;
|
||||
break;
|
||||
case "Grijanje - plin":
|
||||
heatingType = HEATING_TYPE.GAS.id;
|
||||
break;
|
||||
case "Grijanje - struja":
|
||||
heatingType = HEATING_TYPE.ELECTRICITY.id;
|
||||
break;
|
||||
case "Grijanje":
|
||||
heatingType = HEATING_TYPE.OTHER.id;
|
||||
break;
|
||||
case "Plin":
|
||||
gas = true;
|
||||
break;
|
||||
case "Namješten":
|
||||
furnishingType = FURNISHING_TYPE.FURNISHED.id;
|
||||
break;
|
||||
case "Alarm":
|
||||
alarm = true;
|
||||
break;
|
||||
case "Video nadzor":
|
||||
videoSurveillance = true;
|
||||
break;
|
||||
case "Lift":
|
||||
elevator = true;
|
||||
break;
|
||||
case "Novogradnja":
|
||||
newBuilding = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (additionalField === "") {
|
||||
break;
|
||||
}
|
||||
additionalFieldIndex++;
|
||||
} while (true);
|
||||
|
||||
//If no published date it takes current date of crawling
|
||||
if (publishedDate) {
|
||||
renewedDate = new Date();
|
||||
} else {
|
||||
publishedDate = new Date();
|
||||
renewedDate = new Date();
|
||||
}
|
||||
|
||||
const data = {
|
||||
url,
|
||||
agencyObjectId: olxId,
|
||||
originAgencyName: AD_AGENCY.OLX,
|
||||
realEstateType: parsedCategory,
|
||||
adType: parsedAdType,
|
||||
agencyObjectId,
|
||||
originAgencyName: AD_AGENCY.SALJIC,
|
||||
realEstateType,
|
||||
adType,
|
||||
title,
|
||||
price: parsedPrice,
|
||||
area: parsedArea,
|
||||
gardenSize: parsedGardenSize,
|
||||
shortDescription: descriptions
|
||||
.first()
|
||||
.text()
|
||||
.trim(),
|
||||
longDescription: descriptions
|
||||
.last()
|
||||
.text()
|
||||
.trim(),
|
||||
price,
|
||||
area,
|
||||
gardenSize,
|
||||
shortDescription: descriptions.substring(0, descriptions.indexOf(".")),
|
||||
longDescription: descriptions,
|
||||
streetNumber: 0,
|
||||
streetName: "",
|
||||
streetName,
|
||||
locality: "",
|
||||
municipality: "",
|
||||
city: "",
|
||||
@@ -409,8 +505,8 @@ class SaljicCrawler {
|
||||
locationLat,
|
||||
locationLong,
|
||||
adStatus: status,
|
||||
publishedDate: publishedDateMoment.toISOString(),
|
||||
renewedDate: renewedDateMoment.toISOString(),
|
||||
publishedDate,
|
||||
renewedDate,
|
||||
numberOfRooms,
|
||||
numberOfFloors,
|
||||
floor,
|
||||
@@ -447,7 +543,7 @@ class SaljicCrawler {
|
||||
distanceToRiver,
|
||||
numberOfViewsAgency
|
||||
};
|
||||
|
||||
console.log(data);
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
@@ -459,19 +555,25 @@ class SaljicCrawler {
|
||||
|
||||
getAdCategoryId(categoryText) {
|
||||
switch (categoryText) {
|
||||
case "Stanovi":
|
||||
case "Stan":
|
||||
return AD_CATEGORY.FLAT.id;
|
||||
case "Zemljišta":
|
||||
case "Građevinsko zemljiste":
|
||||
return AD_CATEGORY.LAND.id;
|
||||
case "Kuće":
|
||||
case "Industrijsko zemljiste":
|
||||
return AD_CATEGORY.LAND.id;
|
||||
case "Poljoprivredno zemljiste":
|
||||
return AD_CATEGORY.LAND.id;
|
||||
case "Kuća":
|
||||
return AD_CATEGORY.HOUSE.id;
|
||||
case "Poslovni prostori":
|
||||
case "Poslovni prostor":
|
||||
return AD_CATEGORY.OFFICE.id;
|
||||
case "Kancelarije":
|
||||
return AD_CATEGORY.OFFICE.id;
|
||||
case "Apartmani":
|
||||
return AD_CATEGORY.APARTMENT.id;
|
||||
case "Garaže":
|
||||
case "Garaža":
|
||||
return AD_CATEGORY.GARAGE.id;
|
||||
case "Vikendice":
|
||||
case "Vikendica":
|
||||
return AD_CATEGORY.COTTAGE.id;
|
||||
default:
|
||||
return undefined;
|
||||
@@ -480,191 +582,15 @@ class SaljicCrawler {
|
||||
|
||||
getAdTypeId(adTypeText) {
|
||||
switch (adTypeText) {
|
||||
case "Prodaja":
|
||||
case "PRODAJA":
|
||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||
case "Izdavanje":
|
||||
case "NAJAM":
|
||||
return AD_TYPE.AD_TYPE_RENT.stringId;
|
||||
case "Potražnja":
|
||||
return AD_TYPE.AD_TYPE_REQUEST.stringId;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
getHeatingTypeId(heatingTypeText) {
|
||||
switch (heatingTypeText) {
|
||||
case "struja":
|
||||
return HEATING_TYPE.ELECTRICITY.id;
|
||||
case "plin":
|
||||
return HEATING_TYPE.GAS.id;
|
||||
case "drva":
|
||||
return HEATING_TYPE.WOOD.id;
|
||||
case "centralno (gradsko)":
|
||||
return HEATING_TYPE.CENTRAL_CITY.id;
|
||||
case "centralno (kotlovnica)":
|
||||
return HEATING_TYPE.CENTRAL_BOILER.id;
|
||||
case "centralno (plin)":
|
||||
return HEATING_TYPE.CENTRAL_GAS.id;
|
||||
case "nije uvedeno":
|
||||
return HEATING_TYPE.NO_HEATING.id;
|
||||
case "ostalo":
|
||||
return HEATING_TYPE.OTHER.id;
|
||||
case "drugo":
|
||||
return HEATING_TYPE.OTHER.id;
|
||||
default:
|
||||
console.log("grijanje = NEPOZNATO [", heatingTypeText, "]");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
getFurnishingTypeId(furnishingTypeText) {
|
||||
switch (furnishingTypeText) {
|
||||
case "namješten":
|
||||
return FURNISHING_TYPE.FURNISHED.id;
|
||||
case "polunamješten":
|
||||
return FURNISHING_TYPE.HALF_FURNISHED.id;
|
||||
case "nenamješten":
|
||||
return FURNISHING_TYPE.NOT_FURNISHED.id;
|
||||
case "":
|
||||
return FURNISHING_TYPE.FURNISHED.id;
|
||||
default:
|
||||
console.log("namješten = NEPOZNATO [", furnishingTypeText, "]");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
getAccessRoadTypeId(accessRoadTypeText) {
|
||||
switch (accessRoadTypeText) {
|
||||
case "asfalt":
|
||||
return ACCESS_ROAD_TYPE.ASPHALT.id;
|
||||
case "beton":
|
||||
return ACCESS_ROAD_TYPE.CONCRETE.id;
|
||||
case "makadam":
|
||||
return ACCESS_ROAD_TYPE.MACADAM.id;
|
||||
case "ostalo":
|
||||
return ACCESS_ROAD_TYPE.OTHER.id;
|
||||
default:
|
||||
console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
parseArea(areaText) {
|
||||
if (!areaText) {
|
||||
return NaN;
|
||||
}
|
||||
const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g;
|
||||
const textWithOnlyOneDecimalDot = areaText
|
||||
.replace(",", ".")
|
||||
.replace(removeDotsExceptLastOneRegex, "");
|
||||
|
||||
return parseFloat(textWithOnlyOneDecimalDot);
|
||||
}
|
||||
|
||||
parsePrice(priceText) {
|
||||
if (!priceText) {
|
||||
return NaN;
|
||||
}
|
||||
const formattedPriceText = priceText.replace(".", "").replace(",", ".");
|
||||
return parseFloat(formattedPriceText);
|
||||
}
|
||||
|
||||
parseNumberOfRooms(numberOfRoomsText, categoryId) {
|
||||
if (categoryId === AD_CATEGORY.FLAT.id) {
|
||||
switch (numberOfRoomsText) {
|
||||
case "garsonjera":
|
||||
return 0;
|
||||
case "jednosoban (1)":
|
||||
return 1;
|
||||
case "jednoiposoban (1.5)":
|
||||
return 1.5;
|
||||
case "dvosoban (2)":
|
||||
return 2;
|
||||
case "trosoban (3)":
|
||||
return 3;
|
||||
case "četverosoban (4)":
|
||||
return 4;
|
||||
case "petosoban i više":
|
||||
return 5;
|
||||
default:
|
||||
console.log(
|
||||
"broj soba [stan] = NEPOZNATO [",
|
||||
numberOfRoomsText,
|
||||
", ",
|
||||
categoryId,
|
||||
"]"
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
categoryId === AD_CATEGORY.HOUSE.id ||
|
||||
categoryId === AD_CATEGORY.COTTAGE.id ||
|
||||
categoryId === AD_CATEGORY.APARTMENT.id ||
|
||||
categoryId === AD_CATEGORY.OFFICE.id
|
||||
) {
|
||||
return parseInt(numberOfRoomsText) || null;
|
||||
}
|
||||
|
||||
console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]");
|
||||
return null;
|
||||
}
|
||||
|
||||
parseNumberOfFloors(numberOfFloorsText, categoryId) {
|
||||
if (
|
||||
categoryId === AD_CATEGORY.HOUSE.id ||
|
||||
categoryId === AD_CATEGORY.COTTAGE.id
|
||||
) {
|
||||
return parseInt(numberOfFloorsText) || null;
|
||||
}
|
||||
|
||||
if (categoryId === AD_CATEGORY.OFFICE.id) {
|
||||
if (
|
||||
numberOfFloorsText === "suteren" ||
|
||||
numberOfFloorsText === "prizemlje"
|
||||
) {
|
||||
return 0;
|
||||
}
|
||||
if (numberOfFloorsText === "6+") {
|
||||
return 7;
|
||||
}
|
||||
return parseInt(numberOfFloorsText) || null;
|
||||
}
|
||||
|
||||
console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]");
|
||||
return null;
|
||||
}
|
||||
|
||||
parseFloorNumber(floorText, categoryId) {
|
||||
if (
|
||||
categoryId === AD_CATEGORY.FLAT.id ||
|
||||
categoryId === AD_CATEGORY.APARTMENT.id
|
||||
) {
|
||||
if (
|
||||
floorText === "suteren" ||
|
||||
floorText === "prizemlje" ||
|
||||
floorText === "visoko prizemlje"
|
||||
) {
|
||||
return 0;
|
||||
}
|
||||
return parseInt(floorText) || null;
|
||||
}
|
||||
|
||||
if (categoryId === AD_CATEGORY.OFFICE.id) {
|
||||
if (floorText === "zaseban objekat") {
|
||||
return null;
|
||||
}
|
||||
if (floorText === "prizemlje" || floorText === "visoko prizemlje") {
|
||||
return 0;
|
||||
}
|
||||
return parseInt(floorText) || null;
|
||||
}
|
||||
|
||||
console.log("sprat = NEPOZNATO [", floorText, "]");
|
||||
return null;
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user