WIP: Scraped number of rooms, floors, etc.

This commit is contained in:
Naida Vatric
2020-01-31 00:55:24 +01:00
parent 78c4054cde
commit 7a7aecb3ee

View File

@@ -267,103 +267,6 @@ class SaljicCrawler {
console.log("Lat:", locationLat); console.log("Lat:", locationLat);
console.log("Long:", locationLong); console.log("Long:", locationLong);
//const category = $(propertySelectors.category)
//.text()
//.trim();
//====== OTHER AD INFORMATION ===============
let adType = null;
let olxId = null;
let numberOfViewsAgency = null;
let otherInformationDivId;
//We need to locate DIV ID where other information are stored
for (let possibleId = 10; possibleId <= 20; possibleId++) {
const adTypeFieldTitle = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
)
.text()
.trim();
if (adTypeFieldTitle === "Vrsta oglasa") {
otherInformationDivId = possibleId;
break;
}
}
if (!otherInformationDivId) {
throw { message: "Other information DIV could not be found" };
}
const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`;
const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`;
const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`;
const publishedDate = $(publishedDateValueSelector)
.text()
.trim();
const publishedDateMoment = moment.tz(
publishedDate,
OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!publishedDateMoment.isValid()) {
throw { message: "Invalid published date ! Check parsing format" };
}
const renewedDate = $(renewedDateFullValueSelector)
.data("content")
.trim();
const renewedDateMoment = moment.tz(
renewedDate,
OLX_ENUMS.OLX_RENEWED_DATE_FORMAT,
DEFAULT_TIMEZONE
);
if (!renewedDateMoment) {
throw {
message:
"Invalid renewed date ! Check how parser parsed renewed date text"
};
}
adType = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
)
.text()
.trim();
const parsedCategory = this.getAdCategoryId(category);
if (!parsedCategory) {
throw { message: `Unknown ad category [${category}]` };
}
const parsedAdType = this.getAdTypeId(adType);
if (!parsedAdType) {
throw { message: "Unknown ad type" };
}
const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`)
.text()
.trim();
olxId = $(`${olxIdFieldSelector} > div.df2`)
.text()
.trim();
numberOfViewsAgency = parseInt(
$(numberOfViewsAgencyValueSelector)
.text()
.trim()
);
if (olxIdFieldTitle !== "OLX ID") {
throw { message: "Cannot find correct OLX ID" };
}
//===========================================
//====== DETAIL INFORMATION FIELDS ========== //====== DETAIL INFORMATION FIELDS ==========
let area, let area,
gardenSize, gardenSize,
@@ -401,177 +304,81 @@ class SaljicCrawler {
buildingPermit = null, buildingPermit = null,
utilityConnection = null, utilityConnection = null,
distanceToRiver = null; distanceToRiver = null;
let publishedDate = null;
let renewedDate = null;
let fieldIndex = 1; //Extracting data - Glavne karakteristike
let mainFieldIndex = 1;
do { do {
const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`;
const fieldTitleSelector = `${fieldSelector} > div.df1`;
const fieldValueSelector = `${fieldSelector} > div.df2`;
const fieldTitle = $(fieldTitleSelector) const mainField = $(mainFieldSelector)
.text() .text()
.trim() .replace(/[\n\r\t]/gm, "")
.toLowerCase(); .trim();
const fieldValue = $(fieldValueSelector)
.text()
.trim()
.toLowerCase();
switch (fieldTitle) { const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
case "kvadrata": const mainFieldValue = mainField
area = fieldValue; .substring(mainField.indexOf(" "), mainField.length)
break; .trim();
case "okućnica (kvadratura)":
gardenSize = fieldValue; switch (mainFieldTitle) {
break; case "Površina":
case "broj soba": area = parseFloat(
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
break;
case "broj prostorija":
numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory);
break;
case "broj spratova":
numberOfFloors = this.parseNumberOfFloors(
fieldValue,
parsedCategory
); );
break; break;
case "sprat": case "Okućnica":
floor = this.parseFloorNumber(fieldValue, parsedCategory); gardenSize = parseFloat(
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
);
break; break;
case "vrsta grijanja": case "Broj soba":
heatingType = this.getHeatingTypeId(fieldValue); numberOfRooms = parseInt(mainFieldValue);
break; break;
case "namješten?": case "Broj spratova":
furnishingType = this.getFurnishingTypeId(fieldValue); numberOfFloors = parseInt(mainFieldValue);
break; break;
case "namješten": case "Sprat":
furnishingType = FURNISHING_TYPE.FURNISHED.id; floor = parseInt(mainFieldValue);
break; break;
case "namještena": case "Godina renoviranja":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case "voda":
water = true;
break;
case "struja":
electricity = true;
break;
case "kanalizacija":
drainageSystem = fieldValue !== "nema";
break;
case "godina izgradnje":
newBuilding = newBuilding || fieldValue === "novogradnja";
break;
case "kućni ljubimci":
animalsAllowed = fieldValue === "da";
break;
case "uknjiženo / zk":
registeredInZkBooks = true;
break;
case "uknjiženo (zk)":
registeredInZkBooks = true;
break;
case "novogradnja":
newBuilding = true;
break;
case "nedavno adaptiran":
recentlyAdapted = true; recentlyAdapted = true;
break; break;
case "nedavno adaptirana": case "Broj parking mjesta":
recentlyAdapted = true; `${month}/${day}/${year}`;
break;
case "balkon":
balcony = true;
break;
case "lift":
elevator = true;
break;
case "parking":
parking = true; parking = true;
break; break;
case "garaža": case "Dostupno od":
garage = true; const day = mainFieldValue.substring(0, 2);
break; const month = mainFieldValue.substring(3, 5);
case "plin": const year = mainFieldValue.substring(6, mainFieldValue.length);
gas = true; console.log(`${month}/${day}/${year}`);
break; publishedDate = new Date(`${month}/${day}/${year}`);
case "blindirana vrata":
antiTheftDoor = true;
break;
case "klima":
airCondition = true;
break;
case "telefonski priključak":
phoneConnection = true;
break;
case "kablovska tv":
cableTV = true;
break;
case "internet":
internet = true;
break;
case "podrum/tavan":
basementAttic = true;
break;
case "ostava/špajz":
storeRoom = true;
break;
case "video nadzor":
videoSurveillance = true;
break;
case "alarm":
alarm = true;
break;
case "za studente":
suitableForStudents = true;
break;
case "uključen trošak režija":
includingBills = true;
break;
case "građevinska dozvola":
buildingPermit = true;
break;
case "komunalni priključak":
utilityConnection = true;
break;
case "urbanistička dozvola":
urbanPlanPermit = true;
break;
case "udaljenost od rijeke (m)":
distanceToRiver = parseInt(fieldValue) || null;
break;
case "prilaz":
accessRoadType = this.getAccessRoadTypeId(fieldValue);
break;
case "bazen":
pool = true;
break;
case "iznajmljeno":
status = AD_STATUS.STATUS_RENTED;
break; break;
default: default:
// console.log(fieldTitle, " = ", fieldValue); // console.log(fieldTitle, " = ", fieldValue);
break; break;
} }
if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { if (mainFieldTitle === "") {
break; break;
} }
mainFieldIndex++;
} while (true); } while (true);
//===========================================
//========================================= console.log("Area:", area);
const parsedArea = this.parseArea(area) || null; console.log("Garden size:", gardenSize);
const parsedGardenSize = this.parseArea(gardenSize) || null; console.log("Number of rooms:", numberOfRooms);
const parsedPrice = this.parsePrice(price) || null; console.log("Number of floors", numberOfFloors);
console.log("Floor:", floor);
console.log("Adapted:", recentlyAdapted);
console.log("Parking:", parking);
console.log("Published date:", publishedDate);
if ( //const category = $(propertySelectors.category)
title.indexOf("[PRODANO]") !== -1 || //.text()
title.indexOf("[ZAVRŠENO]") !== -1 //.trim();
) {
status = AD_STATUS.STATUS_SOLD;
}
const data = { const data = {
url, url,