diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 4156727..95c24bb 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -267,103 +267,6 @@ class SaljicCrawler { console.log("Lat:", locationLat); console.log("Long:", locationLong); - //const category = $(propertySelectors.category) - //.text() - //.trim(); - - //====== OTHER AD INFORMATION =============== - let adType = null; - let olxId = null; - let numberOfViewsAgency = null; - - let otherInformationDivId; - //We need to locate DIV ID where other information are stored - for (let possibleId = 10; possibleId <= 20; possibleId++) { - const adTypeFieldTitle = $( - `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` - ) - .text() - .trim(); - - if (adTypeFieldTitle === "Vrsta oglasa") { - otherInformationDivId = possibleId; - break; - } - } - - if (!otherInformationDivId) { - throw { message: "Other information DIV could not be found" }; - } - - const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; - const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; - const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; - const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; - - const publishedDate = $(publishedDateValueSelector) - .text() - .trim(); - - const publishedDateMoment = moment.tz( - publishedDate, - OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); - - if (!publishedDateMoment.isValid()) { - throw { message: "Invalid published date ! Check parsing format" }; - } - - const renewedDate = $(renewedDateFullValueSelector) - .data("content") - .trim(); - - const renewedDateMoment = moment.tz( - renewedDate, - OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); - - if (!renewedDateMoment) { - throw { - message: - "Invalid renewed date ! Check how parser parsed renewed date text" - }; - } - - adType = $( - `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` - ) - .text() - .trim(); - - const parsedCategory = this.getAdCategoryId(category); - if (!parsedCategory) { - throw { message: `Unknown ad category [${category}]` }; - } - - const parsedAdType = this.getAdTypeId(adType); - if (!parsedAdType) { - throw { message: "Unknown ad type" }; - } - - const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) - .text() - .trim(); - olxId = $(`${olxIdFieldSelector} > div.df2`) - .text() - .trim(); - numberOfViewsAgency = parseInt( - $(numberOfViewsAgencyValueSelector) - .text() - .trim() - ); - - if (olxIdFieldTitle !== "OLX ID") { - throw { message: "Cannot find correct OLX ID" }; - } - //=========================================== - //====== DETAIL INFORMATION FIELDS ========== let area, gardenSize, @@ -401,177 +304,81 @@ class SaljicCrawler { buildingPermit = null, utilityConnection = null, distanceToRiver = null; + let publishedDate = null; + let renewedDate = null; - let fieldIndex = 1; + //Extracting data - Glavne karakteristike + let mainFieldIndex = 1; do { - const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; - const fieldTitleSelector = `${fieldSelector} > div.df1`; - const fieldValueSelector = `${fieldSelector} > div.df2`; + const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`; - const fieldTitle = $(fieldTitleSelector) + const mainField = $(mainFieldSelector) .text() - .trim() - .toLowerCase(); - const fieldValue = $(fieldValueSelector) - .text() - .trim() - .toLowerCase(); + .replace(/[\n\r\t]/gm, "") + .trim(); - switch (fieldTitle) { - case "kvadrata": - area = fieldValue; - break; - case "okućnica (kvadratura)": - gardenSize = fieldValue; - break; - case "broj soba": - numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); - break; - case "broj prostorija": - numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); - break; - case "broj spratova": - numberOfFloors = this.parseNumberOfFloors( - fieldValue, - parsedCategory + const mainFieldTitle = mainField.substring(0, mainField.indexOf(" ")); + const mainFieldValue = mainField + .substring(mainField.indexOf(" "), mainField.length) + .trim(); + + switch (mainFieldTitle) { + case "Površina": + area = parseFloat( + mainFieldValue.substring(0, mainFieldValue.indexOf(" ")) ); break; - case "sprat": - floor = this.parseFloorNumber(fieldValue, parsedCategory); + case "Okućnica": + gardenSize = parseFloat( + mainFieldValue.substring(0, mainFieldValue.indexOf(" ")) + ); break; - case "vrsta grijanja": - heatingType = this.getHeatingTypeId(fieldValue); + case "Broj soba": + numberOfRooms = parseInt(mainFieldValue); break; - case "namješten?": - furnishingType = this.getFurnishingTypeId(fieldValue); + case "Broj spratova": + numberOfFloors = parseInt(mainFieldValue); break; - case "namješten": - furnishingType = FURNISHING_TYPE.FURNISHED.id; + case "Sprat": + floor = parseInt(mainFieldValue); break; - case "namještena": - furnishingType = FURNISHING_TYPE.FURNISHED.id; - break; - case "voda": - water = true; - break; - case "struja": - electricity = true; - break; - case "kanalizacija": - drainageSystem = fieldValue !== "nema"; - break; - case "godina izgradnje": - newBuilding = newBuilding || fieldValue === "novogradnja"; - break; - case "kućni ljubimci": - animalsAllowed = fieldValue === "da"; - break; - case "uknjiženo / zk": - registeredInZkBooks = true; - break; - case "uknjiženo (zk)": - registeredInZkBooks = true; - break; - case "novogradnja": - newBuilding = true; - break; - case "nedavno adaptiran": + case "Godina renoviranja": recentlyAdapted = true; break; - case "nedavno adaptirana": - recentlyAdapted = true; - break; - case "balkon": - balcony = true; - break; - case "lift": - elevator = true; - break; - case "parking": + case "Broj parking mjesta": + `${month}/${day}/${year}`; parking = true; break; - case "garaža": - garage = true; - break; - case "plin": - gas = true; - break; - case "blindirana vrata": - antiTheftDoor = true; - break; - case "klima": - airCondition = true; - break; - case "telefonski priključak": - phoneConnection = true; - break; - case "kablovska tv": - cableTV = true; - break; - case "internet": - internet = true; - break; - case "podrum/tavan": - basementAttic = true; - break; - case "ostava/špajz": - storeRoom = true; - break; - case "video nadzor": - videoSurveillance = true; - break; - case "alarm": - alarm = true; - break; - case "za studente": - suitableForStudents = true; - break; - case "uključen trošak režija": - includingBills = true; - break; - case "građevinska dozvola": - buildingPermit = true; - break; - case "komunalni priključak": - utilityConnection = true; - break; - case "urbanistička dozvola": - urbanPlanPermit = true; - break; - case "udaljenost od rijeke (m)": - distanceToRiver = parseInt(fieldValue) || null; - break; - case "prilaz": - accessRoadType = this.getAccessRoadTypeId(fieldValue); - break; - case "bazen": - pool = true; - break; - case "iznajmljeno": - status = AD_STATUS.STATUS_RENTED; + case "Dostupno od": + const day = mainFieldValue.substring(0, 2); + const month = mainFieldValue.substring(3, 5); + const year = mainFieldValue.substring(6, mainFieldValue.length); + console.log(`${month}/${day}/${year}`); + publishedDate = new Date(`${month}/${day}/${year}`); break; default: // console.log(fieldTitle, " = ", fieldValue); break; } - if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + if (mainFieldTitle === "") { break; } + mainFieldIndex++; } while (true); - //=========================================== - //========================================= - const parsedArea = this.parseArea(area) || null; - const parsedGardenSize = this.parseArea(gardenSize) || null; - const parsedPrice = this.parsePrice(price) || null; + console.log("Area:", area); + console.log("Garden size:", gardenSize); + console.log("Number of rooms:", numberOfRooms); + console.log("Number of floors", numberOfFloors); + console.log("Floor:", floor); + console.log("Adapted:", recentlyAdapted); + console.log("Parking:", parking); + console.log("Published date:", publishedDate); - if ( - title.indexOf("[PRODANO]") !== -1 || - title.indexOf("[ZAVRŠENO]") !== -1 - ) { - status = AD_STATUS.STATUS_SOLD; - } + //const category = $(propertySelectors.category) + //.text() + //.trim(); const data = { url,