From b6024af2cbd5d01fc030369747fd42c57476d98f Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 8 Nov 2019 17:05:51 +0100 Subject: [PATCH] add new fields for OLX crawler --- app/crawler/specificCrawlers/olx.js | 497 ++++++++++++++++++++++------ 1 file changed, 399 insertions(+), 98 deletions(-) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index d0bd0dd..952a8be 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -10,7 +10,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + FURNISHING_TYPE, + ACCESS_ROAD_TYPE } = require("../../common/enums"); const { @@ -271,6 +274,7 @@ class OlxCrawler { //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; + let numberOfViewsAgency = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored @@ -293,6 +297,7 @@ class OlxCrawler { const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; + const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) @@ -331,60 +336,7 @@ class OlxCrawler { ) .text() .trim(); - const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) - .text() - .trim(); - olxId = $(`${olxIdFieldSelector} > div.df2`) - .text() - .trim(); - if (olxIdFieldTitle !== "OLX ID") { - throw { message: "Cannot find correct OLX ID" }; - } - //=========================================== - - //====== DETAIL INFORMATION FIELDS ========== - let area = null; - let gardenSize = null; - - let fieldIndex = 1; - do { - const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; - const fieldTitleSelector = `${fieldSelector} > div.df1`; - const fieldValueSelector = `${fieldSelector} > div.df2`; - - const fieldTitle = $(fieldTitleSelector) - .text() - .trim(); - const fieldValue = $(fieldValueSelector) - .text() - .trim(); - - switch (fieldTitle) { - case "Kvadrata": - area = fieldValue; - break; - case "Okućnica (kvadratura)": - gardenSize = fieldValue; - break; - } - - if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { - break; - } - } while (true); - //=========================================== - - //====== UNUSED FIELDS FOR NOW ============== - const time = $("time").attr("datetime"); - const numberOfViews = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" - ) - .text() - .trim(); - //=========================================== - - //========================================= const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: `Unknown ad category [${category}]` }; @@ -395,6 +347,218 @@ class OlxCrawler { throw { message: "Unknown ad type" }; } + const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) + .text() + .trim(); + olxId = $(`${olxIdFieldSelector} > div.df2`) + .text() + .trim(); + numberOfViewsAgency = parseInt( + $(numberOfViewsAgencyValueSelector) + .text() + .trim() + ); + + if (olxIdFieldTitle !== "OLX ID") { + throw { message: "Cannot find correct OLX ID" }; + } + //=========================================== + + //====== DETAIL INFORMATION FIELDS ========== + let area, + gardenSize, + numberOfRooms = null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = null, + furnishingType = null, + balcony = null, + newBuilding = null, + elevator = null, + water = null, + electricity = null, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = null, + garage = null, + gas = null, + antiTheftDoor = null, + airCondition = null, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = null, + storeRoom = null, + videoSurveillance = null, + alarm = null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null; + + let fieldIndex = 1; + do { + const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; + const fieldTitleSelector = `${fieldSelector} > div.df1`; + const fieldValueSelector = `${fieldSelector} > div.df2`; + + const fieldTitle = $(fieldTitleSelector) + .text() + .trim() + .toLowerCase(); + const fieldValue = $(fieldValueSelector) + .text() + .trim() + .toLowerCase(); + + switch (fieldTitle) { + case "kvadrata": + area = fieldValue; + break; + case "okućnica (kvadratura)": + gardenSize = fieldValue; + break; + case "broj soba": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj prostorija": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj spratova": + numberOfFloors = this.parseNumberOfFloors( + fieldValue, + parsedCategory + ); + break; + case "sprat": + floor = this.parseFloorNumber(fieldValue, parsedCategory); + break; + case "vrsta grijanja": + heatingType = this.getHeatingTypeId(fieldValue); + break; + case "namješten?": + furnishingType = this.getFurnishingTypeId(fieldValue); + break; + case "namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "namještena": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "voda": + water = true; + break; + case "struja": + electricity = true; + break; + case "kanalizacija": + drainageSystem = fieldValue !== "nema"; + break; + case "godina izgradnje": + newBuilding = newBuilding || fieldValue === "novogradnja"; + break; + case "kućni ljubimci": + animalsAllowed = fieldValue === "da"; + break; + case "uknjiženo / zk": + registeredInZkBooks = true; + break; + case "uknjiženo (zk)": + registeredInZkBooks = true; + break; + case "novogradnja": + newBuilding = true; + break; + case "nedavno adaptiran": + recentlyAdapted = true; + break; + case "nedavno adaptirana": + recentlyAdapted = true; + break; + case "balkon": + balcony = true; + break; + case "lift": + elevator = true; + break; + case "parking": + parking = true; + break; + case "garaža": + garage = true; + break; + case "plin": + gas = true; + break; + case "blindirana vrata": + antiTheftDoor = true; + break; + case "klima": + airCondition = true; + break; + case "telefonski priključak": + phoneConnection = true; + break; + case "kablovska tv": + cableTV = true; + break; + case "internet": + internet = true; + break; + case "podrum/tavan": + basementAttic = true; + break; + case "ostava/špajz": + storeRoom = true; + break; + case "video nadzor": + videoSurveillance = true; + break; + case "alarm": + alarm = true; + break; + case "za studente": + suitableForStudents = true; + break; + case "uključen trošak režija": + includingBills = true; + break; + case "građevinska dozvola": + buildingPermit = true; + break; + case "komunalni priključak": + utilityConnection = true; + break; + case "urbanistička dozvola": + urbanPlanPermit = true; + break; + case "udaljenost od rijeke (m)": + distanceToRiver = parseInt(fieldValue) || null; + break; + case "prilaz": + accessRoadType = this.getAccessRoadTypeId(fieldValue); + break; + case "bazen": + pool = true; + break; + default: + // console.log(fieldTitle, " = ", fieldValue); + break; + } + + if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + break; + } + } while (true); + //=========================================== + + //========================================= const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; @@ -439,7 +603,42 @@ class OlxCrawler { locationLong, adStatus: status, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency }; return data; @@ -485,6 +684,64 @@ class OlxCrawler { } } + getHeatingTypeId(heatingTypeText) { + switch (heatingTypeText) { + case "struja": + return HEATING_TYPE.ELECTRICITY.id; + case "plin": + return HEATING_TYPE.GAS.id; + case "drva": + return HEATING_TYPE.WOOD.id; + case "centralno (gradsko)": + return HEATING_TYPE.CENTRAL_CITY.id; + case "centralno (kotlovnica)": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "centralno (plin)": + return HEATING_TYPE.CENTRAL_GAS.id; + case "nije uvedeno": + return HEATING_TYPE.NO_HEATING.id; + case "ostalo": + return HEATING_TYPE.OTHER.id; + case "drugo": + return HEATING_TYPE.OTHER.id; + default: + console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); + return null; + } + } + + getFurnishingTypeId(furnishingTypeText) { + switch (furnishingTypeText) { + case "namješten": + return FURNISHING_TYPE.FURNISHED.id; + case "polunamješten": + return FURNISHING_TYPE.HALF_FURNISHED.id; + case "nenamješten": + return FURNISHING_TYPE.NOT_FURNISHED.id; + case "": + return FURNISHING_TYPE.FURNISHED.id; + default: + console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); + return null; + } + } + + getAccessRoadTypeId(accessRoadTypeText) { + switch (accessRoadTypeText) { + case "asfalt": + return ACCESS_ROAD_TYPE.ASPHALT.id; + case "beton": + return ACCESS_ROAD_TYPE.CONCRETE.id; + case "makadam": + return ACCESS_ROAD_TYPE.MACADAM.id; + case "ostalo": + return ACCESS_ROAD_TYPE.OTHER.id; + default: + console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); + return null; + } + } + parseArea(areaText) { if (!areaText) { return NaN; @@ -505,56 +762,100 @@ class OlxCrawler { return parseFloat(formattedPriceText); } - parseRenewedDate(renewedDateText) { - const currentMoment = moment.tz(DEFAULT_TIMEZONE); - - if (renewedDateText.includes("Prije mjesec dana")) { - return currentMoment.add(-1, "month"); - } - - if (renewedDateText.includes("Jučer")) { - return currentMoment.add(-1, "day"); - } - - if (renewedDateText.includes("Prije sat")) { - return currentMoment.add(-1, "hour"); - } - - if (renewedDateText.includes("dan")) { - // format for this case should be "Prije N dana" or "Prije N dan" - const dateParts = renewedDateText.split(" "); - if (dateParts[0] === "Prije") { - const numberOfDays = parseInt(dateParts[1]); - return currentMoment.add(-1 * numberOfDays, "days"); - } else { - return undefined; + parseNumberOfRooms(numberOfRoomsText, categoryId) { + if (categoryId === AD_CATEGORY.FLAT.id) { + switch (numberOfRoomsText) { + case "garsonjera": + return 0; + case "jednosoban (1)": + return 1; + case "jednoiposoban (1.5)": + return 1.5; + case "dvosoban (2)": + return 2; + case "trosoban (3)": + return 3; + case "četverosoban (4)": + return 4; + case "petosoban i više": + return 5; + default: + console.log( + "broj soba [stan] = NEPOZNATO [", + numberOfRoomsText, + ", ", + categoryId, + "]" + ); + return null; } } - if (renewedDateText.includes("sat")) { - const dateParts = renewedDateText.split(" "); - const parsedHours = - dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined; - if (!parsedHours) { - return undefined; - } - return currentMoment.add(-1 * parsedHours, "hours"); + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id || + categoryId === AD_CATEGORY.APARTMENT.id || + categoryId === AD_CATEGORY.OFFICE.id + ) { + return parseInt(numberOfRoomsText) || null; } - const todayVariations = ["min", "sekund", "maloprije"]; - for (const todayVariation of todayVariations) { - if (renewedDateText.includes(todayVariation)) { - return currentMoment; - } + console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); + return null; + } + + parseNumberOfFloors(numberOfFloorsText, categoryId) { + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id + ) { + return parseInt(numberOfFloorsText) || null; } - const renewedDateMoment = moment.tz( - renewedDateText, - OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); + if (categoryId === AD_CATEGORY.OFFICE.id) { + if ( + numberOfFloorsText === "suteren" || + numberOfFloorsText === "prizemlje" + ) { + return 0; + } + if (numberOfFloorsText === "6+") { + return 7; + } + return parseInt(numberOfFloorsText) || null; + } - return renewedDateMoment.isValid() ? renewedDateMoment : undefined; + console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); + return null; + } + + parseFloorNumber(floorText, categoryId) { + if ( + categoryId === AD_CATEGORY.FLAT.id || + categoryId === AD_CATEGORY.APARTMENT.id + ) { + if ( + floorText === "suteren" || + floorText === "prizemlje" || + floorText === "visoko prizemlje" + ) { + return 0; + } + return parseInt(floorText) || null; + } + + if (categoryId === AD_CATEGORY.OFFICE.id) { + if (floorText === "zaseban objekat") { + return null; + } + if (floorText === "prizemlje" || floorText === "visoko prizemlje") { + return 0; + } + return parseInt(floorText) || null; + } + + console.log("sprat = NEPOZNATO [", floorText, "]"); + return null; } async sleep(ms) { @@ -569,7 +870,7 @@ class OlxCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } }