WIP Scraped no of rooms, floors etc.

This commit is contained in:
Naida Vatric
2020-01-31 00:55:24 +01:00
parent 78c4054cde
commit 7a7aecb3ee

View File

@@ -267,103 +267,6 @@ class SaljicCrawler {
console.log("Lat:", locationLat);
console.log("Long:", locationLong);
//const category = $(propertySelectors.category)
//.text()
//.trim();
//====== OTHER AD INFORMATION ===============
let adType = null;
let olxId = null;
let numberOfViewsAgency = null;
let otherInformationDivId;
//We need to locate DIV ID where other information are stored
for (let possibleId = 10; possibleId <= 20; possibleId++) {
const adTypeFieldTitle = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
)
.text()
.trim();
if (adTypeFieldTitle === "Vrsta oglasa") {
otherInformationDivId = possibleId;
break;
}
}
if (!otherInformationDivId) {
throw { message: "Other information DIV could not be found" };
}
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`;
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
const publishedDate = $(publishedDateValueSelector)
.text()
.trim();
const publishedDateMoment = moment.tz(
publishedDate,
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!publishedDateMoment.isValid()) {
throw { message: "Invalid published date ! Check parsing format" };
}
const renewedDate = $(renewedDateFullValueSelector)
.data("content")
.trim();
const renewedDateMoment = moment.tz(
renewedDate,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!renewedDateMoment) {
throw {
message:
"Invalid renewed date ! Check how parser parsed renewed date text"
};
}
adType = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
)
.text()
.trim();
const parsedCategory = this.getAdCategoryId(category);
if (!parsedCategory) {
throw { message: `Unknown ad category [${category}]` };
}
const parsedAdType = this.getAdTypeId(adType);
if (!parsedAdType) {
throw { message: "Unknown ad type" };
}
const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`)
.text()
.trim();
olxId = $(`${olxIdFieldSelector} > div.df2`)
.text()
.trim();
numberOfViewsAgency = parseInt(
$(numberOfViewsAgencyValueSelector)
.text()
.trim()
);
if (olxIdFieldTitle !== "OLX ID") {
throw { message: "Cannot find correct OLX ID" };
}
//===========================================
//====== DETAIL INFORMATION FIELDS ==========
let area,
gardenSize,
@@ -401,177 +304,81 @@ class SaljicCrawler {
buildingPermit = null,
utilityConnection = null,
distanceToRiver = null;
let publishedDate = null;
let renewedDate = null;
let fieldIndex = 1;
//Extracting data - Glavne karakteristike
let mainFieldIndex = 1;
do {
const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`;
const fieldTitleSelector = `${fieldSelector} > div.df1`;
const fieldValueSelector = `${fieldSelector} > div.df2`;
const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`;
const fieldTitle = $(fieldTitleSelector)
const mainField = $(mainFieldSelector)
.text()
.trim()
.toLowerCase();
const fieldValue = $(fieldValueSelector)
.text()
.trim()
.toLowerCase();
.replace(/[\n\r\t]/gm, "")
.trim();
switch (fieldTitle) {
case "kvadrata":
area = fieldValue;
break;
case "okućnica (kvadratura)":
gardenSize = fieldValue;
break;
case "broj soba":
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
break;
case "broj prostorija":
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
break;
case "broj spratova":
numberOfFloors = this.parseNumberOfFloors(
fieldValue,
parsedCategory
const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
const mainFieldValue = mainField
.substring(mainField.indexOf(" "), mainField.length)
.trim();
switch (mainFieldTitle) {
case "Površina":
area = parseFloat(
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
);
break;
case "sprat":
floor = this.parseFloorNumber(fieldValue, parsedCategory);
case "Okućnica":
gardenSize = parseFloat(
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
);
break;
case "vrsta grijanja":
heatingType = this.getHeatingTypeId(fieldValue);
case "Broj soba":
numberOfRooms = parseInt(mainFieldValue);
break;
case "namješten?":
furnishingType = this.getFurnishingTypeId(fieldValue);
case "Broj spratova":
numberOfFloors = parseInt(mainFieldValue);
break;
case "namješten":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
case "Sprat":
floor = parseInt(mainFieldValue);
break;
case "namještena":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case "voda":
water = true;
break;
case "struja":
electricity = true;
break;
case "kanalizacija":
drainageSystem = fieldValue !== "nema";
break;
case "godina izgradnje":
newBuilding = newBuilding || fieldValue === "novogradnja";
break;
case "kućni ljubimci":
animalsAllowed = fieldValue === "da";
break;
case "uknjiženo / zk":
registeredInZkBooks = true;
break;
case "uknjiženo (zk)":
registeredInZkBooks = true;
break;
case "novogradnja":
newBuilding = true;
break;
case "nedavno adaptiran":
case "Godina renoviranja":
recentlyAdapted = true;
break;
case "nedavno adaptirana":
recentlyAdapted = true;
break;
case "balkon":
balcony = true;
break;
case "lift":
elevator = true;
break;
case "parking":
case "Broj parking mjesta":
`${month}/${day}/${year}`;
parking = true;
break;
case "garaža":
garage = true;
break;
case "plin":
gas = true;
break;
case "blindirana vrata":
antiTheftDoor = true;
break;
case "klima":
airCondition = true;
break;
case "telefonski priključak":
phoneConnection = true;
break;
case "kablovska tv":
cableTV = true;
break;
case "internet":
internet = true;
break;
case "podrum/tavan":
basementAttic = true;
break;
case "ostava/špajz":
storeRoom = true;
break;
case "video nadzor":
videoSurveillance = true;
break;
case "alarm":
alarm = true;
break;
case "za studente":
suitableForStudents = true;
break;
case "uključen trošak režija":
includingBills = true;
break;
case "građevinska dozvola":
buildingPermit = true;
break;
case "komunalni priključak":
utilityConnection = true;
break;
case "urbanistička dozvola":
urbanPlanPermit = true;
break;
case "udaljenost od rijeke (m)":
distanceToRiver = parseInt(fieldValue) || null;
break;
case "prilaz":
accessRoadType = this.getAccessRoadTypeId(fieldValue);
break;
case "bazen":
pool = true;
break;
case "iznajmljeno":
status = AD_STATUS.STATUS_RENTED;
case "Dostupno od":
const day = mainFieldValue.substring(0, 2);
const month = mainFieldValue.substring(3, 5);
const year = mainFieldValue.substring(6, mainFieldValue.length);
console.log(`${month}/${day}/${year}`);
publishedDate = new Date(`${month}/${day}/${year}`);
break;
default:
// console.log(fieldTitle, " = ", fieldValue);
break;
}
if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
if (mainFieldTitle === "") {
break;
}
mainFieldIndex++;
} while (true);
//===========================================
//=========================================
const parsedArea = this.parseArea(area) || null;
const parsedGardenSize = this.parseArea(gardenSize) || null;
const parsedPrice = this.parsePrice(price) || null;
console.log("Area:", area);
console.log("Garden size:", gardenSize);
console.log("Number of rooms:", numberOfRooms);
console.log("Number of floors", numberOfFloors);
console.log("Floor:", floor);
console.log("Adapted:", recentlyAdapted);
console.log("Parking:", parking);
console.log("Published date:", publishedDate);
if (
title.indexOf("[PRODANO]") !== -1 ||
title.indexOf("[ZAVRŠENO]") !== -1
) {
status = AD_STATUS.STATUS_SOLD;
}
//const category = $(propertySelectors.category)
//.text()
//.trim();
const data = {
url,