From 02f5b97e80f9cb221d141ee23f0bb62be15a3b33 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 7 Nov 2019 11:27:14 +0100 Subject: [PATCH 01/20] add migration for new real estate fields; update real estate model --- ...-additional-fields-to-realEstates-table.js | 163 ++++++++++++++++++ app/models/realEstate.js | 39 ++++- 2 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js diff --git a/app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js b/app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js new file mode 100644 index 0000000..134e6ad --- /dev/null +++ b/app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js @@ -0,0 +1,163 @@ +"use strict"; + +module.exports = { + up: (queryInterface, Sequelize) => { + return Promise.all([ + queryInterface.addColumn("RealEstates", "numberOfRooms", { + type: Sequelize.REAL + }), + queryInterface.addColumn("RealEstates", "numberOfFloors", { + type: Sequelize.INTEGER + }), + queryInterface.addColumn("RealEstates", "floor", { + type: Sequelize.INTEGER + }), + queryInterface.addColumn("RealEstates", "accessRoadType", { + type: Sequelize.TEXT + }), + queryInterface.addColumn("RealEstates", "heatingType", { + type: Sequelize.TEXT + }), + queryInterface.addColumn("RealEstates", "furnishingType", { + type: Sequelize.TEXT + }), + queryInterface.addColumn("RealEstates", "balcony", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "newBuilding", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "elevator", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "water", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "electricity", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "drainageSystem", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "registeredInZkBooks", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "recentlyAdapted", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "parking", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "garage", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "gas", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "antiTheftDoor", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "airCondition", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "phoneConnection", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "cableTV", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "internet", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "basementAttic", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "storeRoom", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "videoSurveillance", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "alarm", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "suitableForStudents", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "includingBills", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "animalsAllowed", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "pool", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "exchange", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "urbanPlanPermit", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "buildingPermit", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "utilityConnection", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "distanceToRiver", { + type: Sequelize.INTEGER + }), + queryInterface.addColumn("RealEstates", "numberOfViewsAgency", { + type: Sequelize.INTEGER, + defaultValue: 0 + }), + queryInterface.addColumn("RealEstates", "numberOfViewsKivi", { + type: Sequelize.INTEGER, + defaultValue: 0 + }) + ]); + }, + + down: (queryInterface, Sequelize) => { + return Promise.all([ + queryInterface.removeColumn("RealEstates", "numberOfRooms"), + queryInterface.removeColumn("RealEstates", "numberOfFloors"), + queryInterface.removeColumn("RealEstates", "floor"), + queryInterface.removeColumn("RealEstates", "accessRoadType"), + queryInterface.removeColumn("RealEstates", "heatingType"), + queryInterface.removeColumn("RealEstates", "furnishingType"), + queryInterface.removeColumn("RealEstates", "balcony"), + queryInterface.removeColumn("RealEstates", "newBuilding"), + queryInterface.removeColumn("RealEstates", "elevator"), + queryInterface.removeColumn("RealEstates", "water"), + queryInterface.removeColumn("RealEstates", "electricity"), + queryInterface.removeColumn("RealEstates", "drainageSystem"), + queryInterface.removeColumn("RealEstates", "registeredInZkBooks"), + queryInterface.removeColumn("RealEstates", "recentlyAdapted"), + queryInterface.removeColumn("RealEstates", "parking"), + queryInterface.removeColumn("RealEstates", "garage"), + queryInterface.removeColumn("RealEstates", "gas"), + queryInterface.removeColumn("RealEstates", "antiTheftDoor"), + queryInterface.removeColumn("RealEstates", "airCondition"), + queryInterface.removeColumn("RealEstates", "phoneConnection"), + queryInterface.removeColumn("RealEstates", "cableTV"), + queryInterface.removeColumn("RealEstates", "internet"), + queryInterface.removeColumn("RealEstates", "basementAttic"), + queryInterface.removeColumn("RealEstates", "storeRoom"), + queryInterface.removeColumn("RealEstates", "videoSurveillance"), + queryInterface.removeColumn("RealEstates", "alarm"), + queryInterface.removeColumn("RealEstates", "suitableForStudents"), + queryInterface.removeColumn("RealEstates", "includingBills"), + queryInterface.removeColumn("RealEstates", "animalsAllowed"), + queryInterface.removeColumn("RealEstates", "pool"), + queryInterface.removeColumn("RealEstates", "exchange"), + queryInterface.removeColumn("RealEstates", "urbanPlanPermit"), + queryInterface.removeColumn("RealEstates", "buildingPermit"), + queryInterface.removeColumn("RealEstates", "utilityConnection"), + queryInterface.removeColumn("RealEstates", "distanceToRiver"), + queryInterface.removeColumn("RealEstates", "numberOfViewsAgency"), + queryInterface.removeColumn("RealEstates", "numberOfViewsKivi") + ]); + } +}; diff --git a/app/models/realEstate.js b/app/models/realEstate.js index 93b82c4..0cb9374 100644 --- a/app/models/realEstate.js +++ b/app/models/realEstate.js @@ -48,7 +48,44 @@ module.exports = (sequelize, DataTypes) => { longDescription: DataTypes.TEXT, adStatus: DataTypes.INTEGER, publishedDate: DataTypes.DATE, - renewedDate: DataTypes.DATE + renewedDate: DataTypes.DATE, + numberOfRooms: DataTypes.INTEGER, + numberOfFloors: DataTypes.INTEGER, + floor: DataTypes.INTEGER, + accessRoadType: DataTypes.TEXT, + heatingType: DataTypes.TEXT, + furnishingType: DataTypes.TEXT, + balcony: DataTypes.BOOLEAN, + newBuilding: DataTypes.BOOLEAN, + elevator: DataTypes.BOOLEAN, + water: DataTypes.BOOLEAN, + electricity: DataTypes.BOOLEAN, + drainageSystem: DataTypes.BOOLEAN, + registeredInZkBooks: DataTypes.BOOLEAN, + recentlyAdapted: DataTypes.BOOLEAN, + parking: DataTypes.BOOLEAN, + garage: DataTypes.BOOLEAN, + gas: DataTypes.BOOLEAN, + antiTheftDoor: DataTypes.BOOLEAN, + airCondition: DataTypes.BOOLEAN, + phoneConnection: DataTypes.BOOLEAN, + cableTV: DataTypes.BOOLEAN, + internet: DataTypes.BOOLEAN, + basementAttic: DataTypes.BOOLEAN, + storeRoom: DataTypes.BOOLEAN, + videoSurveillance: DataTypes.BOOLEAN, + alarm: DataTypes.BOOLEAN, + suitableForStudents: DataTypes.BOOLEAN, + includingBills: DataTypes.BOOLEAN, + animalsAllowed: DataTypes.BOOLEAN, + pool: DataTypes.BOOLEAN, + exchange: DataTypes.BOOLEAN, + urbanPlanPermit: DataTypes.BOOLEAN, + buildingPermit: DataTypes.BOOLEAN, + utilityConnection: DataTypes.BOOLEAN, + distanceToRiver: DataTypes.INTEGER, + numberOfViewsAgency: DataTypes.INTEGER, + numberOfViewsKivi: DataTypes.INTEGER }); return RealEstate; From 9ba41dd7f7120487040b2f2c7c799c54f5a2f6bb Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 8 Nov 2019 16:39:37 +0100 Subject: [PATCH 02/20] add columns for update on duplicate real estate --- app/helpers/db/realEstate.js | 37 +++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index fd9f086..0282645 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -26,7 +26,42 @@ const bulkUpsertRealEstates = async realEstateData => { "gardenSize", "adStatus", "updatedAt", - "renewedDate" + "renewedDate", + "numberOfRooms", + "numberOfFloors", + "floor", + "accessRoadType", + "heatingType", + "furnishingType", + "balcony", + "newBuilding", + "elevator", + "water", + "electricity", + "drainageSystem", + "registeredInZkBooks", + "recentlyAdapted", + "parking", + "garage", + "gas", + "antiTheftDoor", + "airCondition", + "phoneConnection", + "cableTV", + "internet", + "basementAttic", + "storeRoom", + "videoSurveillance", + "alarm", + "suitableForStudents", + "includingBills", + "animalsAllowed", + "pool", + "urbanPlanPermit", + "buildingPermit", + "utilityConnection", + "distanceToRiver", + "numberOfViewsAgency" ]; const order = [["updatedAt", "desc"]]; From 50514aaf03dfa5eae10c54d3c54b386c8e0e5c25 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 8 Nov 2019 16:40:15 +0100 Subject: [PATCH 03/20] add new ENUMS for real estate properties --- app/common/enums.js | 74 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/app/common/enums.js b/app/common/enums.js index 942fb47..cc74cd9 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -187,11 +187,83 @@ const EMAIL_FREQUENCY = { } }; +const HEATING_TYPE = { + NO_HEATING: { + id: "NO_HEATING", + title: "Nije uvedeno" + }, + ELECTRICITY: { + id: "ELECTRICITY", + title: "Struja" + }, + GAS: { + id: "GAS", + title: "Plin" + }, + WOOD: { + id: "WOOD", + title: "Drva" + }, + CENTRAL_CITY: { + id: "CENTRAL_CITY", + title: "Centralno (gradsko)" + }, + CENTRAL_BOILER: { + id: "CENTRAL_BOILER", + title: "Centralno (kotlovnica)" + }, + CENTRAL_GAS: { + id: "CENTRAL_GAS", + title: "Centralno (plin)" + }, + OTHER: { + id: "OTHER", + title: "Drugo" + } +}; + +const ACCESS_ROAD_TYPE = { + ASPHALT: { + id: "ASPHALT", + title: "Asfalt" + }, + CONCRETE: { + id: "CONCRETE", + title: "Beton" + }, + MACADAM: { + id: "MACADAM", + title: "Makadam" + }, + OTHER: { + id: "OTHER", + title: "Drugo" + } +}; + +const FURNISHING_TYPE = { + NOT_FURNISHED: { + id: "NOT_FURNISHED", + title: "Nenamješten" + }, + HALF_FURNISHED: { + id: "HALF_FURNISHED", + title: "Polunamješten" + }, + FURNISHED: { + id: "FURNISHED", + title: "Namješten" + } +}; + module.exports = { AD_TYPE, AD_CATEGORY, AD_STATUS, AD_AGENCY, CRAWLER_AD_TYPE, - EMAIL_FREQUENCY + EMAIL_FREQUENCY, + HEATING_TYPE, + ACCESS_ROAD_TYPE, + FURNISHING_TYPE }; From b6024af2cbd5d01fc030369747fd42c57476d98f Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 8 Nov 2019 17:05:51 +0100 Subject: [PATCH 04/20] add new fields for OLX crawler --- app/crawler/specificCrawlers/olx.js | 497 ++++++++++++++++++++++------ 1 file changed, 399 insertions(+), 98 deletions(-) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index d0bd0dd..952a8be 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -10,7 +10,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + FURNISHING_TYPE, + ACCESS_ROAD_TYPE } = require("../../common/enums"); const { @@ -271,6 +274,7 @@ class OlxCrawler { //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; + let numberOfViewsAgency = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored @@ -293,6 +297,7 @@ class OlxCrawler { const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; + const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) @@ -331,60 +336,7 @@ class OlxCrawler { ) .text() .trim(); - const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) - .text() - .trim(); - olxId = $(`${olxIdFieldSelector} > div.df2`) - .text() - .trim(); - if (olxIdFieldTitle !== "OLX ID") { - throw { message: "Cannot find correct OLX ID" }; - } - //=========================================== - - //====== DETAIL INFORMATION FIELDS ========== - let area = null; - let gardenSize = null; - - let fieldIndex = 1; - do { - const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; - const fieldTitleSelector = `${fieldSelector} > div.df1`; - const fieldValueSelector = `${fieldSelector} > div.df2`; - - const fieldTitle = $(fieldTitleSelector) - .text() - .trim(); - const fieldValue = $(fieldValueSelector) - .text() - .trim(); - - switch (fieldTitle) { - case "Kvadrata": - area = fieldValue; - break; - case "Okućnica (kvadratura)": - gardenSize = fieldValue; - break; - } - - if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { - break; - } - } while (true); - //=========================================== - - //====== UNUSED FIELDS FOR NOW ============== - const time = $("time").attr("datetime"); - const numberOfViews = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" - ) - .text() - .trim(); - //=========================================== - - //========================================= const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: `Unknown ad category [${category}]` }; @@ -395,6 +347,218 @@ class OlxCrawler { throw { message: "Unknown ad type" }; } + const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) + .text() + .trim(); + olxId = $(`${olxIdFieldSelector} > div.df2`) + .text() + .trim(); + numberOfViewsAgency = parseInt( + $(numberOfViewsAgencyValueSelector) + .text() + .trim() + ); + + if (olxIdFieldTitle !== "OLX ID") { + throw { message: "Cannot find correct OLX ID" }; + } + //=========================================== + + //====== DETAIL INFORMATION FIELDS ========== + let area, + gardenSize, + numberOfRooms = null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = null, + furnishingType = null, + balcony = null, + newBuilding = null, + elevator = null, + water = null, + electricity = null, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = null, + garage = null, + gas = null, + antiTheftDoor = null, + airCondition = null, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = null, + storeRoom = null, + videoSurveillance = null, + alarm = null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null; + + let fieldIndex = 1; + do { + const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; + const fieldTitleSelector = `${fieldSelector} > div.df1`; + const fieldValueSelector = `${fieldSelector} > div.df2`; + + const fieldTitle = $(fieldTitleSelector) + .text() + .trim() + .toLowerCase(); + const fieldValue = $(fieldValueSelector) + .text() + .trim() + .toLowerCase(); + + switch (fieldTitle) { + case "kvadrata": + area = fieldValue; + break; + case "okućnica (kvadratura)": + gardenSize = fieldValue; + break; + case "broj soba": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj prostorija": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj spratova": + numberOfFloors = this.parseNumberOfFloors( + fieldValue, + parsedCategory + ); + break; + case "sprat": + floor = this.parseFloorNumber(fieldValue, parsedCategory); + break; + case "vrsta grijanja": + heatingType = this.getHeatingTypeId(fieldValue); + break; + case "namješten?": + furnishingType = this.getFurnishingTypeId(fieldValue); + break; + case "namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "namještena": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "voda": + water = true; + break; + case "struja": + electricity = true; + break; + case "kanalizacija": + drainageSystem = fieldValue !== "nema"; + break; + case "godina izgradnje": + newBuilding = newBuilding || fieldValue === "novogradnja"; + break; + case "kućni ljubimci": + animalsAllowed = fieldValue === "da"; + break; + case "uknjiženo / zk": + registeredInZkBooks = true; + break; + case "uknjiženo (zk)": + registeredInZkBooks = true; + break; + case "novogradnja": + newBuilding = true; + break; + case "nedavno adaptiran": + recentlyAdapted = true; + break; + case "nedavno adaptirana": + recentlyAdapted = true; + break; + case "balkon": + balcony = true; + break; + case "lift": + elevator = true; + break; + case "parking": + parking = true; + break; + case "garaža": + garage = true; + break; + case "plin": + gas = true; + break; + case "blindirana vrata": + antiTheftDoor = true; + break; + case "klima": + airCondition = true; + break; + case "telefonski priključak": + phoneConnection = true; + break; + case "kablovska tv": + cableTV = true; + break; + case "internet": + internet = true; + break; + case "podrum/tavan": + basementAttic = true; + break; + case "ostava/špajz": + storeRoom = true; + break; + case "video nadzor": + videoSurveillance = true; + break; + case "alarm": + alarm = true; + break; + case "za studente": + suitableForStudents = true; + break; + case "uključen trošak režija": + includingBills = true; + break; + case "građevinska dozvola": + buildingPermit = true; + break; + case "komunalni priključak": + utilityConnection = true; + break; + case "urbanistička dozvola": + urbanPlanPermit = true; + break; + case "udaljenost od rijeke (m)": + distanceToRiver = parseInt(fieldValue) || null; + break; + case "prilaz": + accessRoadType = this.getAccessRoadTypeId(fieldValue); + break; + case "bazen": + pool = true; + break; + default: + // console.log(fieldTitle, " = ", fieldValue); + break; + } + + if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + break; + } + } while (true); + //=========================================== + + //========================================= const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; @@ -439,7 +603,42 @@ class OlxCrawler { locationLong, adStatus: status, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency }; return data; @@ -485,6 +684,64 @@ class OlxCrawler { } } + getHeatingTypeId(heatingTypeText) { + switch (heatingTypeText) { + case "struja": + return HEATING_TYPE.ELECTRICITY.id; + case "plin": + return HEATING_TYPE.GAS.id; + case "drva": + return HEATING_TYPE.WOOD.id; + case "centralno (gradsko)": + return HEATING_TYPE.CENTRAL_CITY.id; + case "centralno (kotlovnica)": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "centralno (plin)": + return HEATING_TYPE.CENTRAL_GAS.id; + case "nije uvedeno": + return HEATING_TYPE.NO_HEATING.id; + case "ostalo": + return HEATING_TYPE.OTHER.id; + case "drugo": + return HEATING_TYPE.OTHER.id; + default: + console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); + return null; + } + } + + getFurnishingTypeId(furnishingTypeText) { + switch (furnishingTypeText) { + case "namješten": + return FURNISHING_TYPE.FURNISHED.id; + case "polunamješten": + return FURNISHING_TYPE.HALF_FURNISHED.id; + case "nenamješten": + return FURNISHING_TYPE.NOT_FURNISHED.id; + case "": + return FURNISHING_TYPE.FURNISHED.id; + default: + console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); + return null; + } + } + + getAccessRoadTypeId(accessRoadTypeText) { + switch (accessRoadTypeText) { + case "asfalt": + return ACCESS_ROAD_TYPE.ASPHALT.id; + case "beton": + return ACCESS_ROAD_TYPE.CONCRETE.id; + case "makadam": + return ACCESS_ROAD_TYPE.MACADAM.id; + case "ostalo": + return ACCESS_ROAD_TYPE.OTHER.id; + default: + console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); + return null; + } + } + parseArea(areaText) { if (!areaText) { return NaN; @@ -505,56 +762,100 @@ class OlxCrawler { return parseFloat(formattedPriceText); } - parseRenewedDate(renewedDateText) { - const currentMoment = moment.tz(DEFAULT_TIMEZONE); - - if (renewedDateText.includes("Prije mjesec dana")) { - return currentMoment.add(-1, "month"); - } - - if (renewedDateText.includes("Jučer")) { - return currentMoment.add(-1, "day"); - } - - if (renewedDateText.includes("Prije sat")) { - return currentMoment.add(-1, "hour"); - } - - if (renewedDateText.includes("dan")) { - // format for this case should be "Prije N dana" or "Prije N dan" - const dateParts = renewedDateText.split(" "); - if (dateParts[0] === "Prije") { - const numberOfDays = parseInt(dateParts[1]); - return currentMoment.add(-1 * numberOfDays, "days"); - } else { - return undefined; + parseNumberOfRooms(numberOfRoomsText, categoryId) { + if (categoryId === AD_CATEGORY.FLAT.id) { + switch (numberOfRoomsText) { + case "garsonjera": + return 0; + case "jednosoban (1)": + return 1; + case "jednoiposoban (1.5)": + return 1.5; + case "dvosoban (2)": + return 2; + case "trosoban (3)": + return 3; + case "četverosoban (4)": + return 4; + case "petosoban i više": + return 5; + default: + console.log( + "broj soba [stan] = NEPOZNATO [", + numberOfRoomsText, + ", ", + categoryId, + "]" + ); + return null; } } - if (renewedDateText.includes("sat")) { - const dateParts = renewedDateText.split(" "); - const parsedHours = - dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined; - if (!parsedHours) { - return undefined; - } - return currentMoment.add(-1 * parsedHours, "hours"); + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id || + categoryId === AD_CATEGORY.APARTMENT.id || + categoryId === AD_CATEGORY.OFFICE.id + ) { + return parseInt(numberOfRoomsText) || null; } - const todayVariations = ["min", "sekund", "maloprije"]; - for (const todayVariation of todayVariations) { - if (renewedDateText.includes(todayVariation)) { - return currentMoment; - } + console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); + return null; + } + + parseNumberOfFloors(numberOfFloorsText, categoryId) { + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id + ) { + return parseInt(numberOfFloorsText) || null; } - const renewedDateMoment = moment.tz( - renewedDateText, - OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); + if (categoryId === AD_CATEGORY.OFFICE.id) { + if ( + numberOfFloorsText === "suteren" || + numberOfFloorsText === "prizemlje" + ) { + return 0; + } + if (numberOfFloorsText === "6+") { + return 7; + } + return parseInt(numberOfFloorsText) || null; + } - return renewedDateMoment.isValid() ? renewedDateMoment : undefined; + console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); + return null; + } + + parseFloorNumber(floorText, categoryId) { + if ( + categoryId === AD_CATEGORY.FLAT.id || + categoryId === AD_CATEGORY.APARTMENT.id + ) { + if ( + floorText === "suteren" || + floorText === "prizemlje" || + floorText === "visoko prizemlje" + ) { + return 0; + } + return parseInt(floorText) || null; + } + + if (categoryId === AD_CATEGORY.OFFICE.id) { + if (floorText === "zaseban objekat") { + return null; + } + if (floorText === "prizemlje" || floorText === "visoko prizemlje") { + return 0; + } + return parseInt(floorText) || null; + } + + console.log("sprat = NEPOZNATO [", floorText, "]"); + return null; } async sleep(ms) { @@ -569,7 +870,7 @@ class OlxCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } From cb9bb9e5668707d8b09999d6b2908c73fc5073ce Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 11 Nov 2019 03:34:15 +0100 Subject: [PATCH 05/20] add rental scraper test script --- package.json | 3 ++- test/rentalScrapeTest.js | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 test/rentalScrapeTest.js diff --git a/package.json b/package.json index 6c00483..ca18f22 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,8 @@ "crawl": "cd app/crawler && node npmCrawl.js", "daily-notify": "cd app/npmScripts && node npmDailyNotify.js", "test-search": "cd test && node searchTest.js", - "test-olx-scraper": "cd test && node olxScrapeTest.js" + "test-olx-scraper": "cd test && node olxScrapeTest.js", + "test-rental-scraper": "cd test && node rentalScrapeTest.js" }, "repository": { "type": "git", diff --git a/test/rentalScrapeTest.js b/test/rentalScrapeTest.js new file mode 100644 index 0000000..9828f2d --- /dev/null +++ b/test/rentalScrapeTest.js @@ -0,0 +1,17 @@ +"use strict"; + +const rentalCrawler = require("../app/crawler/specificCrawlers/rental"); + +const urlToScrape = process.argv[2] || undefined; + +if (urlToScrape) { + const crawler = new rentalCrawler(); + + (async () => { + const data = await crawler.scrapeAd(urlToScrape); + console.log(data); + })(); +} else { + console.log("No URL to scrape. Use like this : "); + console.log("npm run test-olx-scraper -- URL_TO_SCRAPE"); +} From 9e10800b0294028ad0739fb9db91ab6aceca48d6 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 11 Nov 2019 17:15:14 +0100 Subject: [PATCH 06/20] add new heating type ENUM --- app/common/enums.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/app/common/enums.js b/app/common/enums.js index cc74cd9..a419c42 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -216,6 +216,10 @@ const HEATING_TYPE = { id: "CENTRAL_GAS", title: "Centralno (plin)" }, + HEAT_PUMP: { + id: "HEAT_PUMP", + title: "Toplotna pumpa" + }, OTHER: { id: "OTHER", title: "Drugo" From debdd01b2866d71eb75c09da8c1a7b9ec4481091 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 11 Nov 2019 17:15:46 +0100 Subject: [PATCH 07/20] add new fields to the Rental crawler --- app/crawler/specificCrawlers/rental.js | 294 ++++++++++++++++++++++++- 1 file changed, 291 insertions(+), 3 deletions(-) diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index 668cff2..38a2f5e 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -11,7 +11,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + ACCESS_ROAD_TYPE, + FURNISHING_TYPE } = require("../../common/enums"); const { @@ -215,6 +218,7 @@ class RentalCrawler { const jsonData = scriptElement[0].children[0].data.substring(20); const parsedJsonData = JSON.parse(jsonData); extractedData = parsedJsonData[0]; + // console.log(extractedData); } catch (e) { throw { message: "Can't find ad data JSON" }; } @@ -237,6 +241,97 @@ class RentalCrawler { }; } + const descriptionIds = extractedData["re_descriptions_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(descriptionIds)) { + throw { + message: + 'Expected array od descriptions but "re_descriptions_id" not found !' + }; + } + + const spaceIds = extractedData["re_spaces_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(spaceIds)) { + throw { + message: 'Expected array od spaces but "re_spaces_id" not found !' + }; + } + + const numberOfViewsAgencySelector = $( + "body > div > div.container > div.row.content-top > div.col-xs-12.col-sm-12.col-md-9 > div > div.box-viewcount" + ); + + // number of views is written as : "Broj pregledavanja: NNN" + const numberOfViewsAgencyFullText = numberOfViewsAgencySelector + .text() + .trim(); + + const numberOfViewsAgencyParts = numberOfViewsAgencyFullText.split(":"); + + const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions( + descriptionIds + ); + const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces( + spaceIds + ); + + let numberOfRooms = + parseInt(extractedData["re_realEstates_roomsNO"]) + + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, + numberOfFloors = + parseInt(extractedData["re_realEstates_floorsNO"]) || null, // Check this for HOUSE + floor = parseInt(extractedData["re_realEstates_floorNO"]) || null, + accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, + heatingType = + this.getHeatingTypeId(extractedData["re_heating_id"]) || null, + furnishingType = realEstatePropertiesFromDescriptions.furnishingType, + balcony = + realEstatePropertiesFromDescriptions.balcony || + realEstatePropertiesFromSpaces.balcony, + newBuilding = extractedData["op_realEstates_newBuilding"] + ? extractedData["op_realEstates_newBuilding"] === "1" + : null, + elevator = realEstatePropertiesFromDescriptions.elevator, + water = realEstatePropertiesFromDescriptions.water, + electricity = realEstatePropertiesFromDescriptions.electricity, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = + realEstatePropertiesFromDescriptions.parking || + realEstatePropertiesFromSpaces.parking, + garage = realEstatePropertiesFromSpaces.garage, + gas = null, + antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, + airCondition = realEstatePropertiesFromDescriptions.airCondition, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = realEstatePropertiesFromSpaces.basementAttic, + storeRoom = realEstatePropertiesFromSpaces.storeRoom, + videoSurveillance = + realEstatePropertiesFromDescriptions.videoSurveillance, + alarm = realEstatePropertiesFromDescriptions.alarm, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = realEstatePropertiesFromDescriptions.pool, + urbanPlanPermit = + realEstatePropertiesFromDescriptions.urbanPlanPermit, + buildingPermit = null, + utilityConnection = + realEstatePropertiesFromDescriptions.utilityConnection, + distanceToRiver = null, + numberOfViewsAgency = + numberOfViewsAgencyParts.length > 1 + ? parseInt(numberOfViewsAgencyParts[1]) + : null; + const title = extractedData["re_realEstates_portalName"]; const extractedPrice = parseFloat( extractedData["re_realEstates_price"] @@ -303,7 +398,42 @@ class RentalCrawler { locationLong, adStatus, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency }; return data; @@ -350,6 +480,164 @@ class RentalCrawler { } } + getPropertiesFromDescriptions(descriptionIds) { + const result = { + accessRoadType: null, + furnishingType: null, + balcony: null, + elevator: null, + parking: null, + antiTheftDoor: null, + airCondition: null, + videoSurveillance: null, + alarm: null, + pool: null, + urbanPlanPermit: null, + utilityConnection: null, + water: null, + electricity: null + }; + + for (const descriptionId of descriptionIds) { + switch (descriptionId) { + case 16: + result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + break; + case 17: + result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + break; + case 1: + case 28: + result.furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case 14: + result.elevator = true; + break; + case 39: + result.electricity = true; + break; + case 40: + result.water = true; + break; + case 41: + case 58: + result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id; + break; + case 26: + result.balcony = true; + break; + case 62: + result.parking = true; + break; + case 3: + result.antiTheftDoor = true; + break; + case 2: + case 21: + result.airCondition = true; + break; + case 4: + result.alarm = true; + break; + case 55: + result.videoSurveillance = true; + break; + case 9: + result.pool = true; + break; + case 60: + result.urbanPlanPermit = true; + break; + case 38: + result.utilityConnection = true; + break; + } + } + + return result; + } + + getPropertiesFromSpaces(spaceIds) { + const result = { + balcony: null, + parking: null, + garage: null, + basementAttic: null, + storeRoom: null + }; + + for (const spaceId of spaceIds) { + switch (spaceId) { + case 36: + case 12: + result.parking = true; + break; + case 1: + case 2: + case 3: + result.balcony = true; + break; + case 4: + case 30: + result.garage = true; + break; + case 9: + case 10: + result.storeRoom = true; + break; + case 18: + case 34: + case 37: + case 27: + result.basementAttic = true; + break; + } + } + + return result; + } + + getHeatingTypeId(heatingRentalId) { + // heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value + const heatingId = parseInt(heatingRentalId); + switch (heatingId) { + case 27: + case 16: + return HEATING_TYPE.GAS.id; + case 4: + return HEATING_TYPE.CENTRAL_GAS.id; + case 3: + case 23: + case 7: + case 8: + case 9: + case 10: + return HEATING_TYPE.CENTRAL_BOILER.id; + case 2: + case 13: + case 30: + case 17: + case 29: + case 31: + return HEATING_TYPE.ELECTRICITY.id; + case 24: + case 25: + return HEATING_TYPE.CENTRAL_CITY.id; + case 26: + case 21: + case 20: + return HEATING_TYPE.WOOD.id; + case 28: + case 19: + return HEATING_TYPE.HEAT_PUMP.id; + case 14: + case 32: + return HEATING_TYPE.OTHER.id; + default: + return null; + } + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } @@ -362,7 +650,7 @@ class RentalCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } From e871550ba60ebd6ef1e73f0203f3b795c65d9501 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 11 Nov 2019 18:46:01 +0100 Subject: [PATCH 08/20] add two more heating types for Rental crawler --- app/crawler/specificCrawlers/rental.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index 38a2f5e..de9618e 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -608,6 +608,7 @@ class RentalCrawler { return HEATING_TYPE.CENTRAL_GAS.id; case 3: case 23: + case 6: case 7: case 8: case 9: @@ -622,6 +623,7 @@ class RentalCrawler { return HEATING_TYPE.ELECTRICITY.id; case 24: case 25: + case 12: return HEATING_TYPE.CENTRAL_CITY.id; case 26: case 21: From c91e56c46e9d3edb71b0c380b07604569f7d8e0f Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 11 Nov 2019 19:34:43 +0100 Subject: [PATCH 09/20] add additional real estate fields for Aktido crawler --- app/crawler/specificCrawlers/aktido.js | 279 ++++++++++++++++++++++++- 1 file changed, 277 insertions(+), 2 deletions(-) diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index 373a6ef..d8fb517 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -11,7 +11,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + ACCESS_ROAD_TYPE, + FURNISHING_TYPE } = require("../../common/enums"); const { @@ -237,6 +240,83 @@ class AktidoCrawler { }; } + const descriptionIds = extractedData["re_descriptions_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(descriptionIds)) { + throw { + message: + 'Expected array od descriptions but "re_descriptions_id" not found !' + }; + } + + const spaceIds = extractedData["re_spaces_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(spaceIds)) { + throw { + message: 'Expected array od spaces but "re_spaces_id" not found !' + }; + } + + const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions( + descriptionIds + ); + const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces( + spaceIds + ); + + let numberOfRooms = + parseInt(extractedData["re_realEstates_roomsNO"]) + + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, + numberOfFloors = + parseInt(extractedData["re_realEstates_floorsNO"]) || null, // Check this for HOUSE + floor = parseInt(extractedData["re_realEstates_floorNO"]) || null, + accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, + heatingType = + this.getHeatingTypeId(extractedData["re_heating_id"]) || null, + furnishingType = realEstatePropertiesFromDescriptions.furnishingType, + balcony = + realEstatePropertiesFromDescriptions.balcony || + realEstatePropertiesFromSpaces.balcony, + newBuilding = extractedData["op_realEstates_newBuilding"] + ? extractedData["op_realEstates_newBuilding"] === "1" + : null, + elevator = realEstatePropertiesFromDescriptions.elevator, + water = realEstatePropertiesFromDescriptions.water, + electricity = realEstatePropertiesFromDescriptions.electricity, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = + realEstatePropertiesFromDescriptions.parking || + realEstatePropertiesFromSpaces.parking, + garage = realEstatePropertiesFromSpaces.garage, + gas = null, + antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, + airCondition = realEstatePropertiesFromDescriptions.airCondition, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = realEstatePropertiesFromSpaces.basementAttic, + storeRoom = realEstatePropertiesFromSpaces.storeRoom, + videoSurveillance = + realEstatePropertiesFromDescriptions.videoSurveillance, + alarm = realEstatePropertiesFromDescriptions.alarm, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = realEstatePropertiesFromDescriptions.pool, + urbanPlanPermit = + realEstatePropertiesFromDescriptions.urbanPlanPermit, + buildingPermit = null, + utilityConnection = + realEstatePropertiesFromDescriptions.utilityConnection, + distanceToRiver = null, + numberOfViewsAgency = null; + const title = extractedData["re_realEstates_portalName"]; const extractedPrice = parseFloat( extractedData["re_realEstates_price"] @@ -303,7 +383,42 @@ class AktidoCrawler { locationLong, adStatus, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency }; return data; @@ -350,6 +465,166 @@ class AktidoCrawler { } } + getPropertiesFromDescriptions(descriptionIds) { + const result = { + accessRoadType: null, + furnishingType: null, + balcony: null, + elevator: null, + parking: null, + antiTheftDoor: null, + airCondition: null, + videoSurveillance: null, + alarm: null, + pool: null, + urbanPlanPermit: null, + utilityConnection: null, + water: null, + electricity: null + }; + + for (const descriptionId of descriptionIds) { + switch (descriptionId) { + case 16: + result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + break; + case 17: + result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + break; + case 1: + case 28: + result.furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case 14: + result.elevator = true; + break; + case 39: + result.electricity = true; + break; + case 40: + result.water = true; + break; + case 41: + case 58: + result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id; + break; + case 26: + result.balcony = true; + break; + case 62: + result.parking = true; + break; + case 3: + result.antiTheftDoor = true; + break; + case 2: + case 21: + result.airCondition = true; + break; + case 4: + result.alarm = true; + break; + case 55: + result.videoSurveillance = true; + break; + case 9: + result.pool = true; + break; + case 60: + result.urbanPlanPermit = true; + break; + case 38: + result.utilityConnection = true; + break; + } + } + + return result; + } + + getPropertiesFromSpaces(spaceIds) { + const result = { + balcony: null, + parking: null, + garage: null, + basementAttic: null, + storeRoom: null + }; + + for (const spaceId of spaceIds) { + switch (spaceId) { + case 36: + case 12: + result.parking = true; + break; + case 1: + case 2: + case 3: + result.balcony = true; + break; + case 4: + case 30: + result.garage = true; + break; + case 9: + case 10: + result.storeRoom = true; + break; + case 18: + case 34: + case 37: + case 27: + result.basementAttic = true; + break; + } + } + + return result; + } + + getHeatingTypeId(heatingRentalId) { + // heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value + const heatingId = parseInt(heatingRentalId); + switch (heatingId) { + case 27: + case 16: + return HEATING_TYPE.GAS.id; + case 4: + return HEATING_TYPE.CENTRAL_GAS.id; + case 3: + case 23: + case 6: + case 7: + case 8: + case 9: + case 10: + return HEATING_TYPE.CENTRAL_BOILER.id; + case 2: + case 13: + case 30: + case 17: + case 29: + case 31: + return HEATING_TYPE.ELECTRICITY.id; + case 24: + case 25: + case 12: + return HEATING_TYPE.CENTRAL_CITY.id; + case 26: + case 21: + case 20: + return HEATING_TYPE.WOOD.id; + case 28: + case 19: + return HEATING_TYPE.HEAT_PUMP.id; + case 14: + case 32: + return HEATING_TYPE.OTHER.id; + default: + return null; + } + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } From b6d68db3a3b1c23eb7a9609a6e283cef23507117 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Tue, 12 Nov 2019 21:39:28 +0100 Subject: [PATCH 10/20] improve real estate properties detection for aktido --- app/crawler/specificCrawlers/aktido.js | 226 +++++++++++++++++++++++-- 1 file changed, 212 insertions(+), 14 deletions(-) diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index d8fb517..2445566 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -261,6 +261,82 @@ class AktidoCrawler { }; } + const infrastructureIds = extractedData["re_infrastructure_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(infrastructureIds)) { + throw { + message: + 'Expected array od infrastructures but "re_infrastructure_id" not found !' + }; + } + + const floorNoIds = extractedData["re_floorNO_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(floorNoIds)) { + throw { + message: + 'Expected array od infrastructures but "re_floorNO_id" not found !' + }; + } + + // counting floor enums + // for (let i = 1; i < 10; i++) { + // const floorEnumsTitle = $( + // `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(1)` + // ) + // .text() + // .trim(); + // if (floorEnumsTitle === "Spratnost:") { + // const floorEnumsValue = $( + // `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(2)` + // ) + // .text() + // .trim() + // .split(","); + // + // console.log("=========="); + // floorNoIds.forEach((id, index) => { + // console.log("\t", id, " = ", floorEnumsValue[index]); + // }); + // break; + // } + // } + + // enumerating infrastructure - relation between id and infrastructure title + // let found = false; + // let infrastructureDescriptions = {}; + // for (let i = 1; i < 5; i++) { + // found = false; + // for (let j = 1; j < 10; j++) { + // const infrastructureTitle = $( + // `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > strong` + // ) + // .text() + // .trim(); + // if (infrastructureTitle === "Osnovna infrastruktura:") { + // found = true; + // + // const infrastructureValues = $( + // `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > div` + // ) + // .text() + // .trim() + // .split(","); + // + // infrastructureIds.forEach((id, index) => { + // infrastructureDescriptions[id] = infrastructureValues[index]; + // }); + // } + // } + // if (found) { + // break; + // } + // } + const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions( descriptionIds ); @@ -268,12 +344,19 @@ class AktidoCrawler { spaceIds ); + const realEstatePropertiesFromInfrastructure = this.getPropertiesFromInfrastructure( + infrastructureIds + ); + let numberOfRooms = parseInt(extractedData["re_realEstates_roomsNO"]) + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, numberOfFloors = - parseInt(extractedData["re_realEstates_floorsNO"]) || null, // Check this for HOUSE - floor = parseInt(extractedData["re_realEstates_floorNO"]) || null, + parseInt(extractedData["re_realEstates_floorsNO"]) || + this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]), + floor = + parseInt(extractedData["re_realEstates_floorNO"]) || + this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]), accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, heatingType = this.getHeatingTypeId(extractedData["re_heating_id"]) || null, @@ -285,33 +368,44 @@ class AktidoCrawler { ? extractedData["op_realEstates_newBuilding"] === "1" : null, elevator = realEstatePropertiesFromDescriptions.elevator, - water = realEstatePropertiesFromDescriptions.water, - electricity = realEstatePropertiesFromDescriptions.electricity, - drainageSystem = null, - registeredInZkBooks = null, + water = + realEstatePropertiesFromDescriptions.water || + realEstatePropertiesFromInfrastructure.water, + electricity = + realEstatePropertiesFromDescriptions.electricity || + realEstatePropertiesFromInfrastructure.electricity, + drainageSystem = + realEstatePropertiesFromInfrastructure.drainageSystem, + registeredInZkBooks = + extractedData["op_realEstates_ownerPermit"] === 1 || null, recentlyAdapted = null, parking = realEstatePropertiesFromDescriptions.parking || realEstatePropertiesFromSpaces.parking, garage = realEstatePropertiesFromSpaces.garage, - gas = null, + gas = realEstatePropertiesFromInfrastructure.gas, antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, airCondition = realEstatePropertiesFromDescriptions.airCondition, - phoneConnection = null, - cableTV = null, - internet = null, + phoneConnection = + realEstatePropertiesFromInfrastructure.phoneConnection, + cableTV = realEstatePropertiesFromInfrastructure.cableTV, + internet = realEstatePropertiesFromInfrastructure.internet, basementAttic = realEstatePropertiesFromSpaces.basementAttic, storeRoom = realEstatePropertiesFromSpaces.storeRoom, videoSurveillance = - realEstatePropertiesFromDescriptions.videoSurveillance, + realEstatePropertiesFromDescriptions.videoSurveillance || + realEstatePropertiesFromInfrastructure.videoSurveillance, alarm = realEstatePropertiesFromDescriptions.alarm, suitableForStudents = null, - includingBills = null, + includingBills = + extractedData["op_realEstates_utilitiesIncluded"] === "1" || null, animalsAllowed = null, pool = realEstatePropertiesFromDescriptions.pool, urbanPlanPermit = + extractedData["op_realEstates_locationPermit"] === "1" || realEstatePropertiesFromDescriptions.urbanPlanPermit, - buildingPermit = null, + buildingPermit = + extractedData["op_realEstates_buildingPermit"] === "1" || null, utilityConnection = realEstatePropertiesFromDescriptions.utilityConnection, distanceToRiver = null, @@ -625,6 +719,110 @@ class AktidoCrawler { } } + getPropertiesFromInfrastructure(infrastructureIds) { + const result = { + electricity: null, + water: null, + gas: null, + drainageSystem: null, + phoneConnection: null, + internet: null, + videoSurveillance: null, + cableTV: null + }; + + for (const infrastructureId of infrastructureIds) { + switch (infrastructureId) { + case 1: + result.electricity = true; + break; + case 2: + result.water = true; + break; + case 4: + result.gas = true; + break; + case 5: + result.drainageSystem = true; + break; + case 7: + case 8: + result.phoneConnection = true; + break; + case 10: + result.internet = true; + break; + case 11: + result.cableTV = true; + break; + case 16: + case 17: + result.videoSurveillance = true; + break; + } + } + + return result; + } + + getFloorNumberFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + // just extracting floor number from first element + + const floorsId = floorsIdText.split(","); + if (floorsId.length === 0) { + return null; + } + + const firstFloorId = parseInt(floorsId[0]); + + // 1 pod + // 2 sut + // 3 raz + // 4 pri + // 5 vpri + // 6 prv + // 7 dru + // 8 tre + // 9 čet + // 10 man + // 11 + // 12 pot + // 13 vpot + // 14 tav + // 15 pet + const floorNumber = [ + -1, + -1, + 0, + 0, + 1, + 1, + 2, + 3, + 4, + null, + null, + null, + null, + null, + 5 + ]; + + return floorNumber[firstFloorId - 1] || null; + } + + getNumberOfFloorsFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + + const floorIds = floorsIdText.split(","); + if (floorIds.length === 0) { + return null; + } + + return floorIds.length; + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } @@ -637,7 +835,7 @@ class AktidoCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } From a63671959b7588f37620271235ba84f6c6f47c23 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Tue, 12 Nov 2019 22:53:16 +0100 Subject: [PATCH 11/20] improve real estate properties detection for Rental --- app/crawler/specificCrawlers/rental.js | 171 +++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 14 deletions(-) diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index de9618e..020fa10 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -218,7 +218,6 @@ class RentalCrawler { const jsonData = scriptElement[0].children[0].data.substring(20); const parsedJsonData = JSON.parse(jsonData); extractedData = parsedJsonData[0]; - // console.log(extractedData); } catch (e) { throw { message: "Can't find ad data JSON" }; } @@ -262,6 +261,28 @@ class RentalCrawler { }; } + const infrastructureIds = extractedData["re_infrastructure_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(infrastructureIds)) { + throw { + message: + 'Expected array od infrastructures but "re_infrastructure_id" not found !' + }; + } + + const floorNoIds = extractedData["re_floorNO_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(floorNoIds)) { + throw { + message: + 'Expected array od infrastructures but "re_floorNO_id" not found !' + }; + } + const numberOfViewsAgencySelector = $( "body > div > div.container > div.row.content-top > div.col-xs-12.col-sm-12.col-md-9 > div > div.box-viewcount" ); @@ -280,12 +301,19 @@ class RentalCrawler { spaceIds ); + const realEstatePropertiesFromInfrastructure = this.getPropertiesFromInfrastructure( + infrastructureIds + ); + let numberOfRooms = parseInt(extractedData["re_realEstates_roomsNO"]) + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, numberOfFloors = - parseInt(extractedData["re_realEstates_floorsNO"]) || null, // Check this for HOUSE - floor = parseInt(extractedData["re_realEstates_floorNO"]) || null, + parseInt(extractedData["re_realEstates_floorsNO"]) || + this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]), + floor = + parseInt(extractedData["re_realEstates_floorNO"]) || + this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]), accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, heatingType = this.getHeatingTypeId(extractedData["re_heating_id"]) || null, @@ -297,33 +325,44 @@ class RentalCrawler { ? extractedData["op_realEstates_newBuilding"] === "1" : null, elevator = realEstatePropertiesFromDescriptions.elevator, - water = realEstatePropertiesFromDescriptions.water, - electricity = realEstatePropertiesFromDescriptions.electricity, - drainageSystem = null, - registeredInZkBooks = null, + water = + realEstatePropertiesFromDescriptions.water || + realEstatePropertiesFromInfrastructure.water, + electricity = + realEstatePropertiesFromDescriptions.electricity || + realEstatePropertiesFromInfrastructure.electricity, + drainageSystem = + realEstatePropertiesFromInfrastructure.drainageSystem, + registeredInZkBooks = + extractedData["op_realEstates_ownerPermit"] === 1 || null, recentlyAdapted = null, parking = realEstatePropertiesFromDescriptions.parking || realEstatePropertiesFromSpaces.parking, garage = realEstatePropertiesFromSpaces.garage, - gas = null, + gas = realEstatePropertiesFromInfrastructure.gas, antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, airCondition = realEstatePropertiesFromDescriptions.airCondition, - phoneConnection = null, - cableTV = null, - internet = null, + phoneConnection = + realEstatePropertiesFromInfrastructure.phoneConnection, + cableTV = realEstatePropertiesFromInfrastructure.cableTV, + internet = realEstatePropertiesFromInfrastructure.internet, basementAttic = realEstatePropertiesFromSpaces.basementAttic, storeRoom = realEstatePropertiesFromSpaces.storeRoom, videoSurveillance = - realEstatePropertiesFromDescriptions.videoSurveillance, + realEstatePropertiesFromDescriptions.videoSurveillance || + realEstatePropertiesFromInfrastructure.videoSurveillance, alarm = realEstatePropertiesFromDescriptions.alarm, suitableForStudents = null, - includingBills = null, + includingBills = + extractedData["op_realEstates_utilitiesIncluded"] === "1" || null, animalsAllowed = null, pool = realEstatePropertiesFromDescriptions.pool, urbanPlanPermit = + extractedData["op_realEstates_locationPermit"] === "1" || realEstatePropertiesFromDescriptions.urbanPlanPermit, - buildingPermit = null, + buildingPermit = + extractedData["op_realEstates_buildingPermit"] === "1" || null, utilityConnection = realEstatePropertiesFromDescriptions.utilityConnection, distanceToRiver = null, @@ -640,6 +679,110 @@ class RentalCrawler { } } + getPropertiesFromInfrastructure(infrastructureIds) { + const result = { + electricity: null, + water: null, + gas: null, + drainageSystem: null, + phoneConnection: null, + internet: null, + videoSurveillance: null, + cableTV: null + }; + + for (const infrastructureId of infrastructureIds) { + switch (infrastructureId) { + case 1: + result.electricity = true; + break; + case 2: + result.water = true; + break; + case 4: + result.gas = true; + break; + case 5: + result.drainageSystem = true; + break; + case 7: + case 8: + result.phoneConnection = true; + break; + case 10: + result.internet = true; + break; + case 11: + result.cableTV = true; + break; + case 16: + case 17: + result.videoSurveillance = true; + break; + } + } + + return result; + } + + getFloorNumberFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + // just extracting floor number from first element + + const floorsId = floorsIdText.split(","); + if (floorsId.length === 0) { + return null; + } + + const firstFloorId = parseInt(floorsId[0]); + + // 1 pod + // 2 sut + // 3 raz + // 4 pri + // 5 vpri + // 6 prv + // 7 dru + // 8 tre + // 9 čet + // 10 man + // 11 + // 12 pot + // 13 vpot + // 14 tav + // 15 pet + const floorNumber = [ + -1, + -1, + 0, + 0, + 1, + 1, + 2, + 3, + 4, + null, + null, + null, + null, + null, + 5 + ]; + + return floorNumber[firstFloorId - 1] || null; + } + + getNumberOfFloorsFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + + const floorIds = floorsIdText.split(","); + if (floorIds.length === 0) { + return null; + } + + return floorIds.length; + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } From ae93d2f03d519222070d250d1b75c9f6bb9adb9e Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 13 Nov 2019 16:52:55 +0100 Subject: [PATCH 12/20] update ENV variable description --- development.env | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/development.env b/development.env index 901c309..795b5d9 100644 --- a/development.env +++ b/development.env @@ -41,8 +41,8 @@ RENTAL_IGNORED_USERNAMES=!!! This is not used for rental crawler !!! RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page RENTAL_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found #==PROSTOR== -PROSTOR_MAX_PAGES=!!! This is not used for prostor crawler !!! -PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents MAX RESULTS in total +PROSTOR_MAX_PAGES=For Prostor crawler, this is MAX number of ads to crawl in total +PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents how many ads are crawled at once PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! From 3b3e2eda071f7282db44eaca116b4e9e32f8c8ed Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 13 Nov 2019 16:54:16 +0100 Subject: [PATCH 13/20] refactor Prostor crawler --- app/crawler/specificConfigs/prostor.js | 3 +- app/crawler/specificCrawlers/prostor.js | 332 ++++++++++++++++++++++-- 2 files changed, 317 insertions(+), 18 deletions(-) diff --git a/app/crawler/specificConfigs/prostor.js b/app/crawler/specificConfigs/prostor.js index 098fc95..aebdb4d 100644 --- a/app/crawler/specificConfigs/prostor.js +++ b/app/crawler/specificConfigs/prostor.js @@ -29,5 +29,6 @@ module.exports = { PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories, PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [], PROSTOR_DELAY_BETWEEN_PAGES: - parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000 + parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000, + PROSTOR_FORCE_CRAWL: !!parseInt(process.env.PROSTOR_FORCE_CRAWL) }; diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index 7b970e0..dbd1b7e 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -2,6 +2,7 @@ const fetch = require("node-fetch"); const cheerio = require("cheerio"); +const moment = require("moment-timezone"); const { AD_TYPE, @@ -11,7 +12,11 @@ const { CRAWLER_AD_TYPE } = require("../../common/enums"); -const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); +const { + PRINT_CRAWLER_DEBUG, + DEFAULT_TIMEZONE +} = require("../../config/appConfig"); +const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor"); const PROSTOR_ENUMS = { PROSTOR_AD_TYPE: { @@ -48,9 +53,10 @@ class ProstorCrawler { this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; this.maxResultsPerPage = maxResultsPerPage; + this.delayBetweenPages = delayBetweenPages; } - async crawl() { + async crawlOld() { const crawlAdCategories = this.crawlerAdCategories; const newRealEstates = []; @@ -79,6 +85,290 @@ class ProstorCrawler { return newRealEstates; } + async crawl() { + const crawlAdCategories = this.crawlerAdCategories; + + const newRealEstates = []; + + if (crawlAdCategories) { + const indexGenerators = []; + for (const adCategory of crawlAdCategories) { + indexGenerators.push(this.categoryIndexer(adCategory)); + } + + let done = false; + while (!done) { + const categoryIndexerPromises = []; + const generatorsToRemove = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); + } + + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); + + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; + + newRealEstates.push(...newRecords); + + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !PROSTOR_FORCE_CRAWL + ) { + generatorsToRemove[index] = true; + } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); + } + } + + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + await this.sleep(this.delayBetweenPages); + } + } + return newRealEstates; + } + + async *categoryIndexer(adCategory) { + const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { + const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`; + const listOfAllRealEstates = await this.extractRealEstates( + urlPageToCrawl + ); + + let elementToStartIndexFrom = 0; + while (true) { + const realEstatesForSinglePage = listOfAllRealEstates.slice( + elementToStartIndexFrom, + elementToStartIndexFrom + this.maxResultsPerPage + ); + + if (realEstatesForSinglePage.length > 0) { + elementToStartIndexFrom += realEstatesForSinglePage.length; + + const singlePageResults = await this.indexSinglePage( + realEstatesForSinglePage + ); + + const filteredSinglePageResults = singlePageResults.filter( + singleResult => !!singleResult + ); + + if ( + Array.isArray(filteredSinglePageResults) && + filteredSinglePageResults.length > 0 + ) { + yield filteredSinglePageResults; + } else { + return undefined; + } + } else { + return undefined; + } + } + } else { + return undefined; + } + } + + async indexSinglePage(realEstatesList) { + const asyncActions = []; + for (const realEstate of realEstatesList) { + asyncActions.push(this.scrapeAd(realEstate)); + } + + try { + return await Promise.all(asyncActions); + } catch (e) { + console.log( + "[PROSTOR] Error crawling ads : ", + e.message || "UNKNOWN ERROR" + ); + return []; + } + } + + async scrapeAd(realEstate) { + const { lat, lng, property_name, price, size, link } = realEstate; + const url = `https://prostor.ba${link}`; + console.log("[PROSTOR] Scraping : ", url); + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + + let numberOfRooms = null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = null, + furnishingType = null, + balcony = null, + newBuilding = null, + elevator = null, + water = null, + electricity = null, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = null, + garage = null, + gas = null, + antiTheftDoor = null, + airCondition = null, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = null, + storeRoom = null, + videoSurveillance = null, + alarm = null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null, + numberOfViewsAgency = null; + + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = ProstorCrawler.getAdTypeId(linkParts[1]); + const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + + if (!adType || !realEstateType || !prostorId) { + console.log( + "adType: ", + adType, + " reType: ", + realEstateType, + " prostorId: ", + prostorId, + "url: ", + url + ); + return null; + } + + const adStatus = AD_STATUS.STATUS_NORMAL; + const title = property_name; + const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; + const parsedArea = parseFloat(size); + const gardenSize = null; + const longDescription = null; + const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE); + if (!publishedDateMoment.isValid()) { + throw { + message: `Invalid published date` + }; + } + + const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE); + if (!renewedDateMoment.isValid()) { + throw { + message: `Invalid renewed date` + }; + } + + const data = { + url, + agencyObjectId: prostorId, + originAgencyName: AD_AGENCY.PROSTOR, + realEstateType, + adType, + title, + price: parsedPrice, + area: parsedArea, + gardenSize, + shortDescription: "", + longDescription: longDescription, + streetNumber: 0, + streetName: "", + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat: lat, + locationLong: lng, + adStatus, + publishedDate: publishedDateMoment.toISOString(), + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency + }; + + return data; + } catch (e) { + console.error( + "[PROSTOR] Exception caught: " + e.message, + "\r\nURL:", + url + ); + return null; + } + } + async extractRealEstates(url) { if (PRINT_CRAWLER_DEBUG) { console.log("[PROSTOR] Index page : ", url); @@ -115,18 +405,19 @@ class ProstorCrawler { const jsonData = scriptData.substring(23, jsonEndIndex) + "]"; const realEstates = JSON.parse(jsonData); - const transformedRealEstates = []; - - for (const realEstate of realEstates) { - const transformedRealEstate = ProstorCrawler.transformRealEstateData( - realEstate - ); - if (transformedRealEstate) { - transformedRealEstates.push(transformedRealEstate); - } - } - - return transformedRealEstates; + // const transformedRealEstates = []; + // + // for (const realEstate of realEstates) { + // const transformedRealEstate = ProstorCrawler.transformRealEstateData( + // realEstate + // ); + // if (transformedRealEstate) { + // transformedRealEstates.push(transformedRealEstate); + // } + // } + // + // return transformedRealEstates; + return realEstates; } else { throw { message: "Something is wrong with JSON data or data is moved" @@ -134,11 +425,14 @@ class ProstorCrawler { } } catch (e) { console.log(e); - throw { message: "Can't find ad data JSON" }; + throw e; } } } catch (e) { - console.error("[PROSTOR] Exception caught:", e.message); + console.error( + "[PROSTOR] Exception caught:", + e.message || "UNKNOWN MESSAGE" + ); return []; } } @@ -236,6 +530,10 @@ class ProstorCrawler { } } + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + async saveCrawledResults(results) { const savers = this.savers; @@ -244,7 +542,7 @@ class ProstorCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } From 618dcd217e48c8cb4a85f8ed00fb68346693a5db Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 02:09:22 +0100 Subject: [PATCH 14/20] update ENV variables template file --- development.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/development.env b/development.env index 795b5d9..3a63a34 100644 --- a/development.env +++ b/development.env @@ -41,7 +41,7 @@ RENTAL_IGNORED_USERNAMES=!!! This is not used for rental crawler !!! RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page RENTAL_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found #==PROSTOR== -PROSTOR_MAX_PAGES=For Prostor crawler, this is MAX number of ads to crawl in total +PROSTOR_MAX_PAGES=!!! This is not used for prostor crawler !!! PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents how many ads are crawled at once PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values From c13857bc096616fe4061b1bf8a4cc37dad08a340 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 02:09:42 +0100 Subject: [PATCH 15/20] add additional fields to the Prostor crawler --- app/crawler/specificCrawlers/prostor.js | 220 ++++++++++-------------- 1 file changed, 91 insertions(+), 129 deletions(-) diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index dbd1b7e..7737591 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -56,35 +56,6 @@ class ProstorCrawler { this.delayBetweenPages = delayBetweenPages; } - async crawlOld() { - const crawlAdCategories = this.crawlerAdCategories; - const newRealEstates = []; - - if (crawlAdCategories) { - for (const adCategory of crawlAdCategories) { - const urlAdTypePart = - PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; - const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; - if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { - const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`; - const singleCategoryResults = await this.extractRealEstates( - urlPageToCrawl - ); - - const resultsSubset = singleCategoryResults.slice( - 0, - this.maxResultsPerPage - ); - - const saveResults = await this.saveCrawledResults(resultsSubset); - const { newRecords } = saveResults; - newRealEstates.push(...newRecords); - } - } - } - return newRealEstates; - } - async crawl() { const crawlAdCategories = this.crawlerAdCategories; @@ -210,20 +181,67 @@ class ProstorCrawler { async scrapeAd(realEstate) { const { lat, lng, property_name, price, size, link } = realEstate; const url = `https://prostor.ba${link}`; - console.log("[PROSTOR] Scraping : ", url); + // console.log("[PROSTOR] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); - let numberOfRooms = null, + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = ProstorCrawler.getAdTypeId(linkParts[1]); + const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + + if (!adType || !realEstateType || !prostorId) { + console.log( + "adType: ", + adType, + " reType: ", + realEstateType, + " prostorId: ", + prostorId, + "url: ", + url + ); + return null; + } + + const allDataSelector = + "body > div > div.container-fluid > div > div.column-right > table > tbody"; + + const realEstateProperties = {}; + + $(allDataSelector) + .find("p") + .each((i, elem) => { + const propertyElement = $(elem) + .text() + .split(":") + .map(text => text.trim()); + + const propertyTitle = propertyElement[0]; + realEstateProperties[propertyTitle] = propertyElement[1]; + }); + + if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { + return null; + } + + let numberOfRooms = + parseFloat(realEstateProperties["Broj soba"]) + + parseFloat(realEstateProperties["Broj spavaćih soba"]) || null, numberOfFloors = null, floor = null, accessRoadType = null, heatingType = null, furnishingType = null, balcony = null, - newBuilding = null, + newBuilding = linkParts[1] === "novogradnja", elevator = null, water = null, electricity = null, @@ -252,28 +270,46 @@ class ProstorCrawler { distanceToRiver = null, numberOfViewsAgency = null; - // link contains part of the URL in the format of : /prodaja/stan/stup/9556 - // general form is : /actionType/realEstateType/location/realEstateID - // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + // Floor versions (there are possibly more versions) : + // Sprat: 3/3 + // Sprat: 1 - 2/2 + // Sprat: Pr - 7/7 + // Sprat: -2/0 + // If there are two parts, that represents more real estates are sold + // numberOfFloors is contained in second part, after / sign - const linkParts = link.split("/"); + const floorsArray = realEstateProperties["Sprat"].split(" - "); + let floorText = ""; + if (floorsArray.length === 1) { + const floorDescription = floorsArray[0].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorDescription[0]; + floor = Math.round(parseFloat(floorText)); + } else if (floorsArray.length === 2) { + const floorDescription = floorsArray[1].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorsArray[0]; + floor = Math.round(parseFloat(floorText)); + } else { + // This is something strange + } - const adType = ProstorCrawler.getAdTypeId(linkParts[1]); - const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); - const prostorId = linkParts[4]; - - if (!adType || !realEstateType || !prostorId) { - console.log( - "adType: ", - adType, - " reType: ", - realEstateType, - " prostorId: ", - prostorId, - "url: ", - url - ); - return null; + if (isNaN(floor)) { + // It was textual representation of floor, like "Pr", "Su" or similar + switch (floorText.toLowerCase()) { + case "pr": + floor = 0; + break; + case "su": + floor = -1; + break; + default: + console.log( + "[PROSTOR] Unknown textual representation of floor : ", + floorText + ); + floor = null; + } } const adStatus = AD_STATUS.STATUS_NORMAL; @@ -282,19 +318,6 @@ class ProstorCrawler { const parsedArea = parseFloat(size); const gardenSize = null; const longDescription = null; - const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE); - if (!publishedDateMoment.isValid()) { - throw { - message: `Invalid published date` - }; - } - - const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE); - if (!renewedDateMoment.isValid()) { - throw { - message: `Invalid renewed date` - }; - } const data = { url, @@ -309,7 +332,7 @@ class ProstorCrawler { shortDescription: "", longDescription: longDescription, streetNumber: 0, - streetName: "", + streetName: realEstateProperties["Adresa"], locality: "", municipality: "", city: "", @@ -319,8 +342,6 @@ class ProstorCrawler { locationLat: lat, locationLong: lng, adStatus, - publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString(), numberOfRooms, numberOfFloors, floor, @@ -437,67 +458,6 @@ class ProstorCrawler { } } - static transformRealEstateData(realEstateData) { - try { - const { lat, lng, property_name, price, size, link } = realEstateData; - - // link contains part of the URL in the format of : /prodaja/stan/stup/9556 - // general form is : /actionType/realEstateType/location/realEstateID - // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] - - const linkParts = link.split("/"); - - const adType = ProstorCrawler.getAdTypeId(linkParts[1]); - const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); - const prostorId = linkParts[4]; - const url = `https://prostor.ba${link}`; - - if (!adType || !realEstateType || !prostorId) { - return null; - } - - const adStatus = AD_STATUS.STATUS_NORMAL; - const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; - const parsedArea = parseFloat(size); - - const data = { - url, - agencyObjectId: prostorId, - originAgencyName: AD_AGENCY.PROSTOR, - realEstateType, - adType, - title: property_name, - price: parsedPrice, - area: parsedArea, - gardenSize: null, - shortDescription: "", - longDescription: "", - streetNumber: 0, - streetName: "", - locality: "", - municipality: "", - city: "", - region: "", - entity: "", - country: "", - locationLat: lat, - locationLong: lng, - adStatus, - publishedDate: null, - renewedDate: null - }; - - return data; - } catch (e) { - console.error( - "[PROSTOR] Exception caught: " + e.message, - "\r\nURL:", - url - ); - return null; - } - } - //======= HELPER FUNCTIONS ============= static getAdCategoryId(categoryText) { @@ -525,6 +485,8 @@ class ProstorCrawler { return AD_TYPE.AD_TYPE_SALE.stringId; case "najam": return AD_TYPE.AD_TYPE_RENT.stringId; + case "novogradnja": + return AD_TYPE.AD_TYPE_SALE.stringId; default: return undefined; } From 1e68d640e298bc180188513635bb49ce76472cff Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 07:22:54 +0100 Subject: [PATCH 16/20] add RENTED enum status --- app/common/enums.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/common/enums.js b/app/common/enums.js index a419c42..53bedfa 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -156,7 +156,8 @@ const AD_STATUS = { STATUS_SOLD: 3, STATUS_DELETED: 4, STATUS_URGENT: 5, - STATUS_DISCOUNTED: 6 + STATUS_DISCOUNTED: 6, + STATUS_RENTED: 7 }; const AD_AGENCY = { From 168b2186e72373d8d166c50c2a4f2f19f646adcb Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 07:23:23 +0100 Subject: [PATCH 17/20] add more fields to the Prostor real estates crawler --- app/crawler/specificCrawlers/prostor.js | 144 +++++++++++++++++------- 1 file changed, 103 insertions(+), 41 deletions(-) diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index 7737591..cb1d3f9 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -9,7 +9,9 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + FURNISHING_TYPE, + HEATING_TYPE } = require("../../common/enums"); const { @@ -121,7 +123,7 @@ class ProstorCrawler { const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { - const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`; + const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`; const listOfAllRealEstates = await this.extractRealEstates( urlPageToCrawl ); @@ -179,7 +181,7 @@ class ProstorCrawler { } async scrapeAd(realEstate) { - const { lat, lng, property_name, price, size, link } = realEstate; + const { lat, lng, property_name, price, size, link, status } = realEstate; const url = `https://prostor.ba${link}`; // console.log("[PROSTOR] Scraping : ", url); try { @@ -198,16 +200,6 @@ class ProstorCrawler { const prostorId = linkParts[4]; if (!adType || !realEstateType || !prostorId) { - console.log( - "adType: ", - adType, - " reType: ", - realEstateType, - " prostorId: ", - prostorId, - "url: ", - url - ); return null; } @@ -218,52 +210,70 @@ class ProstorCrawler { $(allDataSelector) .find("p") - .each((i, elem) => { - const propertyElement = $(elem) + .each((i, element) => { + const propertyElement = $(element) .text() .split(":") - .map(text => text.trim()); + .map(text => text.trim().toLowerCase()); const propertyTitle = propertyElement[0]; realEstateProperties[propertyTitle] = propertyElement[1]; }); + $(allDataSelector) + .find("div.mb-2") + .each((i, element) => { + const propertyElement = $(element) + .text() + .trim() + .toLowerCase(); + + realEstateProperties[propertyElement] = true; + }); + if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { return null; } let numberOfRooms = - parseFloat(realEstateProperties["Broj soba"]) + - parseFloat(realEstateProperties["Broj spavaćih soba"]) || null, + parseFloat(realEstateProperties["broj soba"]) + + parseFloat(realEstateProperties["broj spavaćih soba"]) || null, numberOfFloors = null, floor = null, accessRoadType = null, - heatingType = null, + heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties), furnishingType = null, - balcony = null, + balcony = + realEstateProperties["balkon"] || + realEstateProperties["terasa"] || + realEstateProperties["lođa"] || + null, newBuilding = linkParts[1] === "novogradnja", - elevator = null, - water = null, - electricity = null, - drainageSystem = null, + elevator = realEstateProperties["lift"] || null, + water = realEstateProperties["voda"] || null, + electricity = realEstateProperties["električna energija"] || null, + drainageSystem = realEstateProperties["kanalizacija"] || null, registeredInZkBooks = null, recentlyAdapted = null, - parking = null, - garage = null, - gas = null, - antiTheftDoor = null, - airCondition = null, - phoneConnection = null, - cableTV = null, - internet = null, - basementAttic = null, - storeRoom = null, - videoSurveillance = null, - alarm = null, + parking = realEstateProperties["parking"] || null, + garage = realEstateProperties["garaža"] || null, + gas = realEstateProperties["plin"] || null, + antiTheftDoor = realEstateProperties["blindo vrata"] || null, + airCondition = realEstateProperties["klima"] || null, + phoneConnection = realEstateProperties["telefon"] || null, + cableTV = realEstateProperties["kablovksa tv"] || null, + internet = + realEstateProperties["internet"] || + realEstateProperties["adsl"] || + null, + basementAttic = realEstateProperties["podrum"] || null, + storeRoom = realEstateProperties["ostava"] || null, + videoSurveillance = realEstateProperties["video nadzor"], + alarm = realEstateProperties["alarm"] || null, suitableForStudents = null, includingBills = null, animalsAllowed = null, - pool = null, + pool = realEstateProperties["bazen"] || null, urbanPlanPermit = null, buildingPermit = null, utilityConnection = null, @@ -278,7 +288,7 @@ class ProstorCrawler { // If there are two parts, that represents more real estates are sold // numberOfFloors is contained in second part, after / sign - const floorsArray = realEstateProperties["Sprat"].split(" - "); + const floorsArray = realEstateProperties["sprat"].split(" - "); let floorText = ""; if (floorsArray.length === 1) { const floorDescription = floorsArray[0].split("/"); @@ -296,7 +306,7 @@ class ProstorCrawler { if (isNaN(floor)) { // It was textual representation of floor, like "Pr", "Su" or similar - switch (floorText.toLowerCase()) { + switch (floorText) { case "pr": floor = 0; break; @@ -312,7 +322,15 @@ class ProstorCrawler { } } - const adStatus = AD_STATUS.STATUS_NORMAL; + if (realEstateProperties["namješteno"]) { + furnishingType = FURNISHING_TYPE.FURNISHED.id; + } else if (realEstateProperties["polunamješteno"]) { + furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + } else { + furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + } + + const adStatus = ProstorCrawler.getStatusId(status); const title = property_name; const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; const parsedArea = parseFloat(size); @@ -332,7 +350,7 @@ class ProstorCrawler { shortDescription: "", longDescription: longDescription, streetNumber: 0, - streetName: realEstateProperties["Adresa"], + streetName: realEstateProperties["adresa"], locality: "", municipality: "", city: "", @@ -492,6 +510,50 @@ class ProstorCrawler { } } + static getHeatingTypeId(realEstateProperties) { + const realEstatePropertiesKeys = Object.keys(realEstateProperties); + for (const property of realEstatePropertiesKeys) { + switch (property) { + case "centralno toplane": + return HEATING_TYPE.CENTRAL_CITY.id; + case "etažno plinsko": + return HEATING_TYPE.CENTRAL_GAS.id; + case "termo blok": + case "podno grijanje": + return HEATING_TYPE.OTHER.id; + case "etažno električno": + case "konvektori": + return HEATING_TYPE.ELECTRICITY.id; + case "plinske peći": + return HEATING_TYPE.GAS.id; + case "vlastita kotlovnica": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "toplotna pumpa": + return HEATING_TYPE.HEAT_PUMP.id; + case "kamin": + return HEATING_TYPE.WOOD.id; + default: + //console.log("[PROSTOR] Nepoznato >>> [", property, "]"); + } + } + } + + static getStatusId(statusText) { + switch (statusText) { + case "": + return AD_STATUS.STATUS_NORMAL; + case "Rezervisano": + return AD_STATUS.STATUS_RESERVED; + case "Prodano": + return AD_STATUS.STATUS_SOLD; + case "Iznajmljeno": + return AD_STATUS.STATUS_RENTED; + default: + console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]"); + return AD_STATUS.STATUS_NORMAL; + } + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } From a7cd75653d7e9c83b7719d29406b4e23003dd7b9 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 08:04:58 +0100 Subject: [PATCH 18/20] improve OLX ad status detection --- app/crawler/specificCrawlers/olx.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 952a8be..6ea9e8a 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -547,6 +547,9 @@ class OlxCrawler { case "bazen": pool = true; break; + case "iznajmljeno": + status = AD_STATUS.STATUS_RENTED; + break; default: // console.log(fieldTitle, " = ", fieldValue); break; @@ -573,6 +576,10 @@ class OlxCrawler { locationLong = parseFloat(locationLatLngMatches[2]) || null; } + if (title.indexOf("[PRODANO]") !== -1) { + status = AD_STATUS.STATUS_SOLD; + } + const data = { url, agencyObjectId: olxId, From 5148f88a626e35615de0dafb90151c4f77924fa7 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 08:31:57 +0100 Subject: [PATCH 19/20] improve Rental and Aktido ad status detection --- app/crawler/specificCrawlers/aktido.js | 7 +++++-- app/crawler/specificCrawlers/rental.js | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index 2445566..74bcba7 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -222,6 +222,7 @@ class AktidoCrawler { throw { message: "Can't find ad data JSON" }; } + let adStatus = AD_STATUS.STATUS_NORMAL; const aktidoId = extractedData["re_realEstates_id"]; const adCategory = this.getKiviCategoryIdFromAktidoId( parseInt(extractedData["re_types_id"]) @@ -348,6 +349,10 @@ class AktidoCrawler { infrastructureIds ); + if (extractedData["adm_realEstates_discount"] === "1") { + adStatus = AD_STATUS.STATUS_DISCOUNTED; + } + let numberOfRooms = parseInt(extractedData["re_realEstates_roomsNO"]) + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, @@ -451,8 +456,6 @@ class AktidoCrawler { }; } - const adStatus = AD_STATUS.STATUS_NORMAL; - const data = { url, agencyObjectId: aktidoId, diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index 020fa10..8f38dc8 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -222,6 +222,7 @@ class RentalCrawler { throw { message: "Can't find ad data JSON" }; } + let adStatus = AD_STATUS.STATUS_NORMAL; const rentalId = extractedData["re_realEstates_id"]; const adCategory = this.getKiviCategoryIdFromRentalId( parseInt(extractedData["re_types_id"]) @@ -305,6 +306,10 @@ class RentalCrawler { infrastructureIds ); + if (extractedData["adm_realEstates_discount"] === "1") { + adStatus = AD_STATUS.STATUS_DISCOUNTED; + } + let numberOfRooms = parseInt(extractedData["re_realEstates_roomsNO"]) + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, @@ -411,8 +416,6 @@ class RentalCrawler { }; } - const adStatus = AD_STATUS.STATUS_NORMAL; - const data = { url, agencyObjectId: rentalId, From af42d2c4482a04a7e2cf8d124efb4f77a502acc7 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 08:47:48 +0100 Subject: [PATCH 20/20] improve OLX ad status detection --- app/crawler/specificCrawlers/olx.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 6ea9e8a..ede9734 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -576,7 +576,10 @@ class OlxCrawler { locationLong = parseFloat(locationLatLngMatches[2]) || null; } - if (title.indexOf("[PRODANO]") !== -1) { + if ( + title.indexOf("[PRODANO]") !== -1 || + title.indexOf("[ZAVRŠENO]") !== -1 + ) { status = AD_STATUS.STATUS_SOLD; }