Added crawler for Saljic nekretnine.

This commit is contained in:
Naida Vatric
2020-01-31 22:03:39 +01:00
parent 7a7aecb3ee
commit 1ba7cf8531

View File

@@ -174,6 +174,25 @@ class SaljicCrawler {
}
});
let adTypesTmp = [];
$("#shop")
.find(".product")
.each((i, elem) => {
const adType = $(elem)
.find(".trakica-search-page")
.text()
.trim();
if (adType) {
adTypesTmp.push(adType);
}
});
//Converting to AD_TYPE
const adTypes = adTypesTmp.map(adTypeText => {
return this.getAdTypeId(adTypeText);
});
//Converting to absolute URLs
const hrefsAbs = hrefs.map(link => {
return "https://www.saljicnekretnine.ba" + link;
@@ -186,7 +205,7 @@ class SaljicCrawler {
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefsAbs[i]));
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
}
const scrapedData = await Promise.all(asyncScraping);
@@ -198,16 +217,19 @@ class SaljicCrawler {
}
}
async scrapeAd(url) {
async scrapeAd(url, adType) {
console.log("[SALJIC] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
// ??? treba li nesto za status
let status = AD_STATUS.STATUS_NORMAL;
// No information for status ex. PRODAN
const status = AD_STATUS.STATUS_NORMAL;
//Extracting agency ID from url
const agencyObjectId = parseInt(url.substring(46, url.length));
//Extracting main properties
const propertySelectors = {
title:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
@@ -227,7 +249,6 @@ class SaljicCrawler {
.replace(/ {1,}/g, " ")
.trim();
console.log("Title:", title);
const priceText = $(propertySelectors.price)
.text()
.replace(/(\r\n|\n|\r)/gm, "")
@@ -240,18 +261,14 @@ class SaljicCrawler {
priceText.substring(8, priceText.length - 3).replace(",", "")
);
console.log("Price:", price);
const streetName = $(propertySelectors.streetName)
.text()
.replace(/(\r\n|\n|\r)/gm, "")
.trim();
console.log("Street:", streetName);
const descriptions = $(propertySelectors.descriptions)
.text()
.trim();
console.log("Description:", descriptions);
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
const latText = latAndLongSrc.substring(
@@ -264,8 +281,6 @@ class SaljicCrawler {
);
const locationLat = parseFloat(latText) || null;
const locationLong = parseFloat(longText) || null;
console.log("Lat:", locationLat);
console.log("Long:", locationLong);
//====== DETAIL INFORMATION FIELDS ==========
let area,
@@ -306,6 +321,8 @@ class SaljicCrawler {
distanceToRiver = null;
let publishedDate = null;
let renewedDate = null;
let realEstateType;
let numberOfViewsAgency = null;
//Extracting data - Glavne karakteristike
let mainFieldIndex = 1;
@@ -346,18 +363,15 @@ class SaljicCrawler {
recentlyAdapted = true;
break;
case "Broj parking mjesta":
`${month}/${day}/${year}`;
parking = true;
break;
case "Dostupno od":
const day = mainFieldValue.substring(0, 2);
const month = mainFieldValue.substring(3, 5);
const year = mainFieldValue.substring(6, mainFieldValue.length);
console.log(`${month}/${day}/${year}`);
publishedDate = new Date(`${month}/${day}/${year}`);
break;
default:
// console.log(fieldTitle, " = ", fieldValue);
break;
}
@@ -367,39 +381,121 @@ class SaljicCrawler {
mainFieldIndex++;
} while (true);
console.log("Area:", area);
console.log("Garden size:", gardenSize);
console.log("Number of rooms:", numberOfRooms);
console.log("Number of floors", numberOfFloors);
console.log("Floor:", floor);
console.log("Adapted:", recentlyAdapted);
console.log("Parking:", parking);
console.log("Published date:", publishedDate);
//Extracting data - Sadrzaji
let additionalFieldIndex = 1;
do {
const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`;
//const category = $(propertySelectors.category)
//.text()
//.trim();
const additionalField = $(additionalFieldSelector)
.text()
.trim();
if (additionalFieldIndex === 1) {
//Extracting data of real estate type
const categoryTmp = additionalField
.replace(/[\n\r\t]/gm, "")
.substring(
additionalField.indexOf("Kategorija") + 10,
additionalField.length
)
.trim();
realEstateType = this.getAdCategoryId(categoryTmp);
} else {
switch (additionalField) {
case "Internet":
internet = true;
break;
case "Garaža":
garage = true;
break;
case "Klima":
airCondition = true;
break;
case "Balkon":
balcony = true;
break;
case "Ostava":
storeRoom = true;
break;
case "Podrum":
basementAttic = true;
break;
case "Blindirana vrata":
antiTheftDoor = true;
break;
case "Voda":
water = true;
break;
case "Kablovska":
cableTV = true;
break;
case "Uknjiženo":
registeredInZkBooks = true;
break;
case "Grijanje - centralno":
heatingType = HEATING_TYPE.CENTRAL_CITY.id;
break;
case "Grijanje - plin":
heatingType = HEATING_TYPE.GAS.id;
break;
case "Grijanje - struja":
heatingType = HEATING_TYPE.ELECTRICITY.id;
break;
case "Grijanje":
heatingType = HEATING_TYPE.OTHER.id;
break;
case "Plin":
gas = true;
break;
case "Namješten":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case "Alarm":
alarm = true;
break;
case "Video nadzor":
videoSurveillance = true;
break;
case "Lift":
elevator = true;
break;
case "Novogradnja":
newBuilding = true;
break;
default:
break;
}
}
if (additionalField === "") {
break;
}
additionalFieldIndex++;
} while (true);
//If no published date it takes current date of crawling
if (publishedDate) {
renewedDate = new Date();
} else {
publishedDate = new Date();
renewedDate = new Date();
}
const data = {
url,
agencyObjectId: olxId,
originAgencyName: AD_AGENCY.OLX,
realEstateType: parsedCategory,
adType: parsedAdType,
agencyObjectId,
originAgencyName: AD_AGENCY.SALJIC,
realEstateType,
adType,
title,
price: parsedPrice,
area: parsedArea,
gardenSize: parsedGardenSize,
shortDescription: descriptions
.first()
.text()
.trim(),
longDescription: descriptions
.last()
.text()
.trim(),
price,
area,
gardenSize,
shortDescription: descriptions.substring(0, descriptions.indexOf(".")),
longDescription: descriptions,
streetNumber: 0,
streetName: "",
streetName,
locality: "",
municipality: "",
city: "",
@@ -409,8 +505,8 @@ class SaljicCrawler {
locationLat,
locationLong,
adStatus: status,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString(),
publishedDate,
renewedDate,
numberOfRooms,
numberOfFloors,
floor,
@@ -447,7 +543,7 @@ class SaljicCrawler {
distanceToRiver,
numberOfViewsAgency
};
console.log(data);
return data;
} catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
@@ -459,19 +555,25 @@ class SaljicCrawler {
getAdCategoryId(categoryText) {
switch (categoryText) {
case "Stanovi":
case "Stan":
return AD_CATEGORY.FLAT.id;
case "Zemljišta":
case "Građevinsko zemljiste":
return AD_CATEGORY.LAND.id;
case "Kuće":
case "Industrijsko zemljiste":
return AD_CATEGORY.LAND.id;
case "Poljoprivredno zemljiste":
return AD_CATEGORY.LAND.id;
case "Kuća":
return AD_CATEGORY.HOUSE.id;
case "Poslovni prostori":
case "Poslovni prostor":
return AD_CATEGORY.OFFICE.id;
case "Kancelarije":
return AD_CATEGORY.OFFICE.id;
case "Apartmani":
return AD_CATEGORY.APARTMENT.id;
case "Garaže":
case "Garaža":
return AD_CATEGORY.GARAGE.id;
case "Vikendice":
case "Vikendica":
return AD_CATEGORY.COTTAGE.id;
default:
return undefined;
@@ -480,191 +582,15 @@ class SaljicCrawler {
getAdTypeId(adTypeText) {
switch (adTypeText) {
case "Prodaja":
case "PRODAJA":
return AD_TYPE.AD_TYPE_SALE.stringId;
case "Izdavanje":
case "NAJAM":
return AD_TYPE.AD_TYPE_RENT.stringId;
case "Potražnja":
return AD_TYPE.AD_TYPE_REQUEST.stringId;
default:
return undefined;
}
}
getHeatingTypeId(heatingTypeText) {
switch (heatingTypeText) {
case "struja":
return HEATING_TYPE.ELECTRICITY.id;
case "plin":
return HEATING_TYPE.GAS.id;
case "drva":
return HEATING_TYPE.WOOD.id;
case "centralno (gradsko)":
return HEATING_TYPE.CENTRAL_CITY.id;
case "centralno (kotlovnica)":
return HEATING_TYPE.CENTRAL_BOILER.id;
case "centralno (plin)":
return HEATING_TYPE.CENTRAL_GAS.id;
case "nije uvedeno":
return HEATING_TYPE.NO_HEATING.id;
case "ostalo":
return HEATING_TYPE.OTHER.id;
case "drugo":
return HEATING_TYPE.OTHER.id;
default:
console.log("grijanje = NEPOZNATO [", heatingTypeText, "]");
return null;
}
}
getFurnishingTypeId(furnishingTypeText) {
switch (furnishingTypeText) {
case "namješten":
return FURNISHING_TYPE.FURNISHED.id;
case "polunamješten":
return FURNISHING_TYPE.HALF_FURNISHED.id;
case "nenamješten":
return FURNISHING_TYPE.NOT_FURNISHED.id;
case "":
return FURNISHING_TYPE.FURNISHED.id;
default:
console.log("namješten = NEPOZNATO [", furnishingTypeText, "]");
return null;
}
}
getAccessRoadTypeId(accessRoadTypeText) {
switch (accessRoadTypeText) {
case "asfalt":
return ACCESS_ROAD_TYPE.ASPHALT.id;
case "beton":
return ACCESS_ROAD_TYPE.CONCRETE.id;
case "makadam":
return ACCESS_ROAD_TYPE.MACADAM.id;
case "ostalo":
return ACCESS_ROAD_TYPE.OTHER.id;
default:
console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]");
return null;
}
}
parseArea(areaText) {
if (!areaText) {
return NaN;
}
const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g;
const textWithOnlyOneDecimalDot = areaText
.replace(",", ".")
.replace(removeDotsExceptLastOneRegex, "");
return parseFloat(textWithOnlyOneDecimalDot);
}
parsePrice(priceText) {
if (!priceText) {
return NaN;
}
const formattedPriceText = priceText.replace(".", "").replace(",", ".");
return parseFloat(formattedPriceText);
}
parseNumberOfRooms(numberOfRoomsText, categoryId) {
if (categoryId === AD_CATEGORY.FLAT.id) {
switch (numberOfRoomsText) {
case "garsonjera":
return 0;
case "jednosoban (1)":
return 1;
case "jednoiposoban (1.5)":
return 1.5;
case "dvosoban (2)":
return 2;
case "trosoban (3)":
return 3;
case "četverosoban (4)":
return 4;
case "petosoban i više":
return 5;
default:
console.log(
"broj soba [stan] = NEPOZNATO [",
numberOfRoomsText,
", ",
categoryId,
"]"
);
return null;
}
}
if (
categoryId === AD_CATEGORY.HOUSE.id ||
categoryId === AD_CATEGORY.COTTAGE.id ||
categoryId === AD_CATEGORY.APARTMENT.id ||
categoryId === AD_CATEGORY.OFFICE.id
) {
return parseInt(numberOfRoomsText) || null;
}
console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]");
return null;
}
parseNumberOfFloors(numberOfFloorsText, categoryId) {
if (
categoryId === AD_CATEGORY.HOUSE.id ||
categoryId === AD_CATEGORY.COTTAGE.id
) {
return parseInt(numberOfFloorsText) || null;
}
if (categoryId === AD_CATEGORY.OFFICE.id) {
if (
numberOfFloorsText === "suteren" ||
numberOfFloorsText === "prizemlje"
) {
return 0;
}
if (numberOfFloorsText === "6+") {
return 7;
}
return parseInt(numberOfFloorsText) || null;
}
console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]");
return null;
}
parseFloorNumber(floorText, categoryId) {
if (
categoryId === AD_CATEGORY.FLAT.id ||
categoryId === AD_CATEGORY.APARTMENT.id
) {
if (
floorText === "suteren" ||
floorText === "prizemlje" ||
floorText === "visoko prizemlje"
) {
return 0;
}
return parseInt(floorText) || null;
}
if (categoryId === AD_CATEGORY.OFFICE.id) {
if (floorText === "zaseban objekat") {
return null;
}
if (floorText === "prizemlje" || floorText === "visoko prizemlje") {
return 0;
}
return parseInt(floorText) || null;
}
console.log("sprat = NEPOZNATO [", floorText, "]");
return null;
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}