WIP: Scraped number of rooms, floors, etc.
This commit is contained in:
@@ -267,103 +267,6 @@ class SaljicCrawler {
|
||||
console.log("Lat:", locationLat);
|
||||
console.log("Long:", locationLong);
|
||||
|
||||
//const category = $(propertySelectors.category)
|
||||
//.text()
|
||||
//.trim();
|
||||
|
||||
//====== OTHER AD INFORMATION ===============
|
||||
let adType = null;
|
||||
let olxId = null;
|
||||
let numberOfViewsAgency = null;
|
||||
|
||||
let otherInformationDivId;
|
||||
//We need to locate DIV ID where other information are stored
|
||||
for (let possibleId = 10; possibleId <= 20; possibleId++) {
|
||||
const adTypeFieldTitle = $(
|
||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
if (adTypeFieldTitle === "Vrsta oglasa") {
|
||||
otherInformationDivId = possibleId;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!otherInformationDivId) {
|
||||
throw { message: "Other information DIV could not be found" };
|
||||
}
|
||||
|
||||
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
|
||||
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
|
||||
const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`;
|
||||
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
|
||||
|
||||
const publishedDate = $(publishedDateValueSelector)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
const publishedDateMoment = moment.tz(
|
||||
publishedDate,
|
||||
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
if (!publishedDateMoment.isValid()) {
|
||||
throw { message: "Invalid published date ! Check parsing format" };
|
||||
}
|
||||
|
||||
const renewedDate = $(renewedDateFullValueSelector)
|
||||
.data("content")
|
||||
.trim();
|
||||
|
||||
const renewedDateMoment = moment.tz(
|
||||
renewedDate,
|
||||
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
|
||||
DEFAULT_TIMEZONE
|
||||
);
|
||||
|
||||
if (!renewedDateMoment) {
|
||||
throw {
|
||||
message:
|
||||
"Invalid renewed date ! Check how parser parsed renewed date text"
|
||||
};
|
||||
}
|
||||
|
||||
adType = $(
|
||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
const parsedCategory = this.getAdCategoryId(category);
|
||||
if (!parsedCategory) {
|
||||
throw { message: `Unknown ad category [${category}]` };
|
||||
}
|
||||
|
||||
const parsedAdType = this.getAdTypeId(adType);
|
||||
if (!parsedAdType) {
|
||||
throw { message: "Unknown ad type" };
|
||||
}
|
||||
|
||||
const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`)
|
||||
.text()
|
||||
.trim();
|
||||
olxId = $(`${olxIdFieldSelector} > div.df2`)
|
||||
.text()
|
||||
.trim();
|
||||
numberOfViewsAgency = parseInt(
|
||||
$(numberOfViewsAgencyValueSelector)
|
||||
.text()
|
||||
.trim()
|
||||
);
|
||||
|
||||
if (olxIdFieldTitle !== "OLX ID") {
|
||||
throw { message: "Cannot find correct OLX ID" };
|
||||
}
|
||||
//===========================================
|
||||
|
||||
//====== DETAIL INFORMATION FIELDS ==========
|
||||
let area,
|
||||
gardenSize,
|
||||
@@ -401,177 +304,81 @@ class SaljicCrawler {
|
||||
buildingPermit = null,
|
||||
utilityConnection = null,
|
||||
distanceToRiver = null;
|
||||
let publishedDate = null;
|
||||
let renewedDate = null;
|
||||
|
||||
let fieldIndex = 1;
|
||||
//Extracting data - Glavne karakteristike
|
||||
let mainFieldIndex = 1;
|
||||
do {
|
||||
const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`;
|
||||
const fieldTitleSelector = `${fieldSelector} > div.df1`;
|
||||
const fieldValueSelector = `${fieldSelector} > div.df2`;
|
||||
const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`;
|
||||
|
||||
const fieldTitle = $(fieldTitleSelector)
|
||||
const mainField = $(mainFieldSelector)
|
||||
.text()
|
||||
.trim()
|
||||
.toLowerCase();
|
||||
const fieldValue = $(fieldValueSelector)
|
||||
.text()
|
||||
.trim()
|
||||
.toLowerCase();
|
||||
.replace(/[\n\r\t]/gm, "")
|
||||
.trim();
|
||||
|
||||
switch (fieldTitle) {
|
||||
case "kvadrata":
|
||||
area = fieldValue;
|
||||
break;
|
||||
case "okućnica (kvadratura)":
|
||||
gardenSize = fieldValue;
|
||||
break;
|
||||
case "broj soba":
|
||||
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
|
||||
break;
|
||||
case "broj prostorija":
|
||||
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
|
||||
break;
|
||||
case "broj spratova":
|
||||
numberOfFloors = this.parseNumberOfFloors(
|
||||
fieldValue,
|
||||
parsedCategory
|
||||
const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
|
||||
const mainFieldValue = mainField
|
||||
.substring(mainField.indexOf(" "), mainField.length)
|
||||
.trim();
|
||||
|
||||
switch (mainFieldTitle) {
|
||||
case "Površina":
|
||||
area = parseFloat(
|
||||
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
|
||||
);
|
||||
break;
|
||||
case "sprat":
|
||||
floor = this.parseFloorNumber(fieldValue, parsedCategory);
|
||||
case "Okućnica":
|
||||
gardenSize = parseFloat(
|
||||
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
|
||||
);
|
||||
break;
|
||||
case "vrsta grijanja":
|
||||
heatingType = this.getHeatingTypeId(fieldValue);
|
||||
case "Broj soba":
|
||||
numberOfRooms = parseInt(mainFieldValue);
|
||||
break;
|
||||
case "namješten?":
|
||||
furnishingType = this.getFurnishingTypeId(fieldValue);
|
||||
case "Broj spratova":
|
||||
numberOfFloors = parseInt(mainFieldValue);
|
||||
break;
|
||||
case "namješten":
|
||||
furnishingType = FURNISHING_TYPE.FURNISHED.id;
|
||||
case "Sprat":
|
||||
floor = parseInt(mainFieldValue);
|
||||
break;
|
||||
case "namještena":
|
||||
furnishingType = FURNISHING_TYPE.FURNISHED.id;
|
||||
break;
|
||||
case "voda":
|
||||
water = true;
|
||||
break;
|
||||
case "struja":
|
||||
electricity = true;
|
||||
break;
|
||||
case "kanalizacija":
|
||||
drainageSystem = fieldValue !== "nema";
|
||||
break;
|
||||
case "godina izgradnje":
|
||||
newBuilding = newBuilding || fieldValue === "novogradnja";
|
||||
break;
|
||||
case "kućni ljubimci":
|
||||
animalsAllowed = fieldValue === "da";
|
||||
break;
|
||||
case "uknjiženo / zk":
|
||||
registeredInZkBooks = true;
|
||||
break;
|
||||
case "uknjiženo (zk)":
|
||||
registeredInZkBooks = true;
|
||||
break;
|
||||
case "novogradnja":
|
||||
newBuilding = true;
|
||||
break;
|
||||
case "nedavno adaptiran":
|
||||
case "Godina renoviranja":
|
||||
recentlyAdapted = true;
|
||||
break;
|
||||
case "nedavno adaptirana":
|
||||
recentlyAdapted = true;
|
||||
break;
|
||||
case "balkon":
|
||||
balcony = true;
|
||||
break;
|
||||
case "lift":
|
||||
elevator = true;
|
||||
break;
|
||||
case "parking":
|
||||
case "Broj parking mjesta":
|
||||
`${month}/${day}/${year}`;
|
||||
parking = true;
|
||||
break;
|
||||
case "garaža":
|
||||
garage = true;
|
||||
break;
|
||||
case "plin":
|
||||
gas = true;
|
||||
break;
|
||||
case "blindirana vrata":
|
||||
antiTheftDoor = true;
|
||||
break;
|
||||
case "klima":
|
||||
airCondition = true;
|
||||
break;
|
||||
case "telefonski priključak":
|
||||
phoneConnection = true;
|
||||
break;
|
||||
case "kablovska tv":
|
||||
cableTV = true;
|
||||
break;
|
||||
case "internet":
|
||||
internet = true;
|
||||
break;
|
||||
case "podrum/tavan":
|
||||
basementAttic = true;
|
||||
break;
|
||||
case "ostava/špajz":
|
||||
storeRoom = true;
|
||||
break;
|
||||
case "video nadzor":
|
||||
videoSurveillance = true;
|
||||
break;
|
||||
case "alarm":
|
||||
alarm = true;
|
||||
break;
|
||||
case "za studente":
|
||||
suitableForStudents = true;
|
||||
break;
|
||||
case "uključen trošak režija":
|
||||
includingBills = true;
|
||||
break;
|
||||
case "građevinska dozvola":
|
||||
buildingPermit = true;
|
||||
break;
|
||||
case "komunalni priključak":
|
||||
utilityConnection = true;
|
||||
break;
|
||||
case "urbanistička dozvola":
|
||||
urbanPlanPermit = true;
|
||||
break;
|
||||
case "udaljenost od rijeke (m)":
|
||||
distanceToRiver = parseInt(fieldValue) || null;
|
||||
break;
|
||||
case "prilaz":
|
||||
accessRoadType = this.getAccessRoadTypeId(fieldValue);
|
||||
break;
|
||||
case "bazen":
|
||||
pool = true;
|
||||
break;
|
||||
case "iznajmljeno":
|
||||
status = AD_STATUS.STATUS_RENTED;
|
||||
case "Dostupno od":
|
||||
const day = mainFieldValue.substring(0, 2);
|
||||
const month = mainFieldValue.substring(3, 5);
|
||||
const year = mainFieldValue.substring(6, mainFieldValue.length);
|
||||
console.log(`${month}/${day}/${year}`);
|
||||
publishedDate = new Date(`${month}/${day}/${year}`);
|
||||
break;
|
||||
default:
|
||||
// console.log(fieldTitle, " = ", fieldValue);
|
||||
break;
|
||||
}
|
||||
|
||||
if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
|
||||
if (mainFieldTitle === "") {
|
||||
break;
|
||||
}
|
||||
mainFieldIndex++;
|
||||
} while (true);
|
||||
//===========================================
|
||||
|
||||
//=========================================
|
||||
const parsedArea = this.parseArea(area) || null;
|
||||
const parsedGardenSize = this.parseArea(gardenSize) || null;
|
||||
const parsedPrice = this.parsePrice(price) || null;
|
||||
console.log("Area:", area);
|
||||
console.log("Garden size:", gardenSize);
|
||||
console.log("Number of rooms:", numberOfRooms);
|
||||
console.log("Number of floors", numberOfFloors);
|
||||
console.log("Floor:", floor);
|
||||
console.log("Adapted:", recentlyAdapted);
|
||||
console.log("Parking:", parking);
|
||||
console.log("Published date:", publishedDate);
|
||||
|
||||
if (
|
||||
title.indexOf("[PRODANO]") !== -1 ||
|
||||
title.indexOf("[ZAVRŠENO]") !== -1
|
||||
) {
|
||||
status = AD_STATUS.STATUS_SOLD;
|
||||
}
|
||||
//const category = $(propertySelectors.category)
|
||||
//.text()
|
||||
//.trim();
|
||||
|
||||
const data = {
|
||||
url,
|
||||
|
||||
Reference in New Issue
Block a user