diff --git a/app/common/enums.js b/app/common/enums.js index 942fb47..53bedfa 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -156,7 +156,8 @@ const AD_STATUS = { STATUS_SOLD: 3, STATUS_DELETED: 4, STATUS_URGENT: 5, - STATUS_DISCOUNTED: 6 + STATUS_DISCOUNTED: 6, + STATUS_RENTED: 7 }; const AD_AGENCY = { @@ -187,11 +188,87 @@ const EMAIL_FREQUENCY = { } }; +const HEATING_TYPE = { + NO_HEATING: { + id: "NO_HEATING", + title: "Nije uvedeno" + }, + ELECTRICITY: { + id: "ELECTRICITY", + title: "Struja" + }, + GAS: { + id: "GAS", + title: "Plin" + }, + WOOD: { + id: "WOOD", + title: "Drva" + }, + CENTRAL_CITY: { + id: "CENTRAL_CITY", + title: "Centralno (gradsko)" + }, + CENTRAL_BOILER: { + id: "CENTRAL_BOILER", + title: "Centralno (kotlovnica)" + }, + CENTRAL_GAS: { + id: "CENTRAL_GAS", + title: "Centralno (plin)" + }, + HEAT_PUMP: { + id: "HEAT_PUMP", + title: "Toplotna pumpa" + }, + OTHER: { + id: "OTHER", + title: "Drugo" + } +}; + +const ACCESS_ROAD_TYPE = { + ASPHALT: { + id: "ASPHALT", + title: "Asfalt" + }, + CONCRETE: { + id: "CONCRETE", + title: "Beton" + }, + MACADAM: { + id: "MACADAM", + title: "Makadam" + }, + OTHER: { + id: "OTHER", + title: "Drugo" + } +}; + +const FURNISHING_TYPE = { + NOT_FURNISHED: { + id: "NOT_FURNISHED", + title: "Nenamješten" + }, + HALF_FURNISHED: { + id: "HALF_FURNISHED", + title: "Polunamješten" + }, + FURNISHED: { + id: "FURNISHED", + title: "Namješten" + } +}; + module.exports = { AD_TYPE, AD_CATEGORY, AD_STATUS, AD_AGENCY, CRAWLER_AD_TYPE, - EMAIL_FREQUENCY + EMAIL_FREQUENCY, + HEATING_TYPE, + ACCESS_ROAD_TYPE, + FURNISHING_TYPE }; diff --git a/app/crawler/specificConfigs/prostor.js b/app/crawler/specificConfigs/prostor.js index 098fc95..aebdb4d 100644 --- a/app/crawler/specificConfigs/prostor.js +++ b/app/crawler/specificConfigs/prostor.js @@ -29,5 +29,6 @@ module.exports = { PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories, PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [], 
PROSTOR_DELAY_BETWEEN_PAGES: - parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000 + parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000, + PROSTOR_FORCE_CRAWL: !!parseInt(process.env.PROSTOR_FORCE_CRAWL) }; diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index 373a6ef..74bcba7 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -11,7 +11,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + ACCESS_ROAD_TYPE, + FURNISHING_TYPE } = require("../../common/enums"); const { @@ -219,6 +222,7 @@ class AktidoCrawler { throw { message: "Can't find ad data JSON" }; } + let adStatus = AD_STATUS.STATUS_NORMAL; const aktidoId = extractedData["re_realEstates_id"]; const adCategory = this.getKiviCategoryIdFromAktidoId( parseInt(extractedData["re_types_id"]) @@ -237,6 +241,181 @@ class AktidoCrawler { }; } + const descriptionIds = extractedData["re_descriptions_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(descriptionIds)) { + throw { + message: + 'Expected array od descriptions but "re_descriptions_id" not found !' + }; + } + + const spaceIds = extractedData["re_spaces_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(spaceIds)) { + throw { + message: 'Expected array od spaces but "re_spaces_id" not found !' + }; + } + + const infrastructureIds = extractedData["re_infrastructure_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(infrastructureIds)) { + throw { + message: + 'Expected array od infrastructures but "re_infrastructure_id" not found !' + }; + } + + const floorNoIds = extractedData["re_floorNO_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(floorNoIds)) { + throw { + message: + 'Expected array od infrastructures but "re_floorNO_id" not found !' 
+ }; + } + + // counting floor enums + // for (let i = 1; i < 10; i++) { + // const floorEnumsTitle = $( + // `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(1)` + // ) + // .text() + // .trim(); + // if (floorEnumsTitle === "Spratnost:") { + // const floorEnumsValue = $( + // `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(2)` + // ) + // .text() + // .trim() + // .split(","); + // + // console.log("=========="); + // floorNoIds.forEach((id, index) => { + // console.log("\t", id, " = ", floorEnumsValue[index]); + // }); + // break; + // } + // } + + // enumerating infrastructure - relation between id and infrastructure title + // let found = false; + // let infrastructureDescriptions = {}; + // for (let i = 1; i < 5; i++) { + // found = false; + // for (let j = 1; j < 10; j++) { + // const infrastructureTitle = $( + // `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > strong` + // ) + // .text() + // .trim(); + // if (infrastructureTitle === "Osnovna infrastruktura:") { + // found = true; + // + // const infrastructureValues = $( + // `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > div` + // ) + // .text() + // .trim() + // .split(","); + // + // infrastructureIds.forEach((id, index) => { + // infrastructureDescriptions[id] = infrastructureValues[index]; + // }); + // } + // } + // if (found) { + // break; + // } + // } + + const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions( + descriptionIds + ); + const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces( + spaceIds + ); + + const realEstatePropertiesFromInfrastructure 
= this.getPropertiesFromInfrastructure( + infrastructureIds + ); + + if (extractedData["adm_realEstates_discount"] === "1") { + adStatus = AD_STATUS.STATUS_DISCOUNTED; + } + + let numberOfRooms = + parseInt(extractedData["re_realEstates_roomsNO"]) + + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, + numberOfFloors = + parseInt(extractedData["re_realEstates_floorsNO"]) || + this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]), + floor = + parseInt(extractedData["re_realEstates_floorNO"]) || + this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]), + accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, + heatingType = + this.getHeatingTypeId(extractedData["re_heating_id"]) || null, + furnishingType = realEstatePropertiesFromDescriptions.furnishingType, + balcony = + realEstatePropertiesFromDescriptions.balcony || + realEstatePropertiesFromSpaces.balcony, + newBuilding = extractedData["op_realEstates_newBuilding"] + ? extractedData["op_realEstates_newBuilding"] === "1" + : null, + elevator = realEstatePropertiesFromDescriptions.elevator, + water = + realEstatePropertiesFromDescriptions.water || + realEstatePropertiesFromInfrastructure.water, + electricity = + realEstatePropertiesFromDescriptions.electricity || + realEstatePropertiesFromInfrastructure.electricity, + drainageSystem = + realEstatePropertiesFromInfrastructure.drainageSystem, + registeredInZkBooks = + extractedData["op_realEstates_ownerPermit"] === 1 || null, + recentlyAdapted = null, + parking = + realEstatePropertiesFromDescriptions.parking || + realEstatePropertiesFromSpaces.parking, + garage = realEstatePropertiesFromSpaces.garage, + gas = realEstatePropertiesFromInfrastructure.gas, + antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, + airCondition = realEstatePropertiesFromDescriptions.airCondition, + phoneConnection = + realEstatePropertiesFromInfrastructure.phoneConnection, + cableTV = 
realEstatePropertiesFromInfrastructure.cableTV, + internet = realEstatePropertiesFromInfrastructure.internet, + basementAttic = realEstatePropertiesFromSpaces.basementAttic, + storeRoom = realEstatePropertiesFromSpaces.storeRoom, + videoSurveillance = + realEstatePropertiesFromDescriptions.videoSurveillance || + realEstatePropertiesFromInfrastructure.videoSurveillance, + alarm = realEstatePropertiesFromDescriptions.alarm, + suitableForStudents = null, + includingBills = + extractedData["op_realEstates_utilitiesIncluded"] === "1" || null, + animalsAllowed = null, + pool = realEstatePropertiesFromDescriptions.pool, + urbanPlanPermit = + extractedData["op_realEstates_locationPermit"] === "1" || + realEstatePropertiesFromDescriptions.urbanPlanPermit, + buildingPermit = + extractedData["op_realEstates_buildingPermit"] === "1" || null, + utilityConnection = + realEstatePropertiesFromDescriptions.utilityConnection, + distanceToRiver = null, + numberOfViewsAgency = null; + const title = extractedData["re_realEstates_portalName"]; const extractedPrice = parseFloat( extractedData["re_realEstates_price"] @@ -277,8 +456,6 @@ class AktidoCrawler { }; } - const adStatus = AD_STATUS.STATUS_NORMAL; - const data = { url, agencyObjectId: aktidoId, @@ -303,7 +480,42 @@ class AktidoCrawler { locationLong, adStatus, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + 
numberOfViewsAgency }; return data; @@ -350,6 +562,270 @@ class AktidoCrawler { } } + getPropertiesFromDescriptions(descriptionIds) { + const result = { + accessRoadType: null, + furnishingType: null, + balcony: null, + elevator: null, + parking: null, + antiTheftDoor: null, + airCondition: null, + videoSurveillance: null, + alarm: null, + pool: null, + urbanPlanPermit: null, + utilityConnection: null, + water: null, + electricity: null + }; + + for (const descriptionId of descriptionIds) { + switch (descriptionId) { + case 16: + result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + break; + case 17: + result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + break; + case 1: + case 28: + result.furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case 14: + result.elevator = true; + break; + case 39: + result.electricity = true; + break; + case 40: + result.water = true; + break; + case 41: + case 58: + result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id; + break; + case 26: + result.balcony = true; + break; + case 62: + result.parking = true; + break; + case 3: + result.antiTheftDoor = true; + break; + case 2: + case 21: + result.airCondition = true; + break; + case 4: + result.alarm = true; + break; + case 55: + result.videoSurveillance = true; + break; + case 9: + result.pool = true; + break; + case 60: + result.urbanPlanPermit = true; + break; + case 38: + result.utilityConnection = true; + break; + } + } + + return result; + } + + getPropertiesFromSpaces(spaceIds) { + const result = { + balcony: null, + parking: null, + garage: null, + basementAttic: null, + storeRoom: null + }; + + for (const spaceId of spaceIds) { + switch (spaceId) { + case 36: + case 12: + result.parking = true; + break; + case 1: + case 2: + case 3: + result.balcony = true; + break; + case 4: + case 30: + result.garage = true; + break; + case 9: + case 10: + result.storeRoom = true; + break; + case 18: + case 34: + case 37: + case 27: + result.basementAttic = true; + break; 
+ } + } + + return result; + } + + getHeatingTypeId(heatingRentalId) { + // heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value + const heatingId = parseInt(heatingRentalId); + switch (heatingId) { + case 27: + case 16: + return HEATING_TYPE.GAS.id; + case 4: + return HEATING_TYPE.CENTRAL_GAS.id; + case 3: + case 23: + case 6: + case 7: + case 8: + case 9: + case 10: + return HEATING_TYPE.CENTRAL_BOILER.id; + case 2: + case 13: + case 30: + case 17: + case 29: + case 31: + return HEATING_TYPE.ELECTRICITY.id; + case 24: + case 25: + case 12: + return HEATING_TYPE.CENTRAL_CITY.id; + case 26: + case 21: + case 20: + return HEATING_TYPE.WOOD.id; + case 28: + case 19: + return HEATING_TYPE.HEAT_PUMP.id; + case 14: + case 32: + return HEATING_TYPE.OTHER.id; + default: + return null; + } + } + + getPropertiesFromInfrastructure(infrastructureIds) { + const result = { + electricity: null, + water: null, + gas: null, + drainageSystem: null, + phoneConnection: null, + internet: null, + videoSurveillance: null, + cableTV: null + }; + + for (const infrastructureId of infrastructureIds) { + switch (infrastructureId) { + case 1: + result.electricity = true; + break; + case 2: + result.water = true; + break; + case 4: + result.gas = true; + break; + case 5: + result.drainageSystem = true; + break; + case 7: + case 8: + result.phoneConnection = true; + break; + case 10: + result.internet = true; + break; + case 11: + result.cableTV = true; + break; + case 16: + case 17: + result.videoSurveillance = true; + break; + } + } + + return result; + } + + getFloorNumberFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + // just extracting floor number from first element + + const floorsId = floorsIdText.split(","); + if (floorsId.length === 0) { + return null; + } + + const firstFloorId = parseInt(floorsId[0]); + + // 1 pod + // 2 sut + // 3 raz + // 4 pri + // 5 vpri + // 6 prv + // 7 dru + // 8 
tre + // 9 čet + // 10 man + // 11 + // 12 pot + // 13 vpot + // 14 tav + // 15 pet + const floorNumber = [ + -1, + -1, + 0, + 0, + 1, + 1, + 2, + 3, + 4, + null, + null, + null, + null, + null, + 5 + ]; + + return floorNumber[firstFloorId - 1] || null; + } + + getNumberOfFloorsFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + + const floorIds = floorsIdText.split(","); + if (floorIds.length === 0) { + return null; + } + + return floorIds.length; + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } @@ -362,7 +838,7 @@ class AktidoCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index d0bd0dd..ede9734 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -10,7 +10,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + FURNISHING_TYPE, + ACCESS_ROAD_TYPE } = require("../../common/enums"); const { @@ -271,6 +274,7 @@ class OlxCrawler { //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; + let numberOfViewsAgency = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored @@ -293,6 +297,7 @@ class OlxCrawler { const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; + const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; const 
renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) @@ -331,60 +336,7 @@ class OlxCrawler { ) .text() .trim(); - const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) - .text() - .trim(); - olxId = $(`${olxIdFieldSelector} > div.df2`) - .text() - .trim(); - if (olxIdFieldTitle !== "OLX ID") { - throw { message: "Cannot find correct OLX ID" }; - } - //=========================================== - - //====== DETAIL INFORMATION FIELDS ========== - let area = null; - let gardenSize = null; - - let fieldIndex = 1; - do { - const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; - const fieldTitleSelector = `${fieldSelector} > div.df1`; - const fieldValueSelector = `${fieldSelector} > div.df2`; - - const fieldTitle = $(fieldTitleSelector) - .text() - .trim(); - const fieldValue = $(fieldValueSelector) - .text() - .trim(); - - switch (fieldTitle) { - case "Kvadrata": - area = fieldValue; - break; - case "Okućnica (kvadratura)": - gardenSize = fieldValue; - break; - } - - if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { - break; - } - } while (true); - //=========================================== - - //====== UNUSED FIELDS FOR NOW ============== - const time = $("time").attr("datetime"); - const numberOfViews = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" - ) - .text() - .trim(); - //=========================================== - - //========================================= const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: `Unknown ad category [${category}]` }; @@ -395,6 +347,221 @@ class OlxCrawler { throw { message: "Unknown ad type" }; } + const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) + .text() + .trim(); + olxId = $(`${olxIdFieldSelector} > div.df2`) + .text() + .trim(); + numberOfViewsAgency = parseInt( + 
$(numberOfViewsAgencyValueSelector) + .text() + .trim() + ); + + if (olxIdFieldTitle !== "OLX ID") { + throw { message: "Cannot find correct OLX ID" }; + } + //=========================================== + + //====== DETAIL INFORMATION FIELDS ========== + let area, + gardenSize, + numberOfRooms = null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = null, + furnishingType = null, + balcony = null, + newBuilding = null, + elevator = null, + water = null, + electricity = null, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = null, + garage = null, + gas = null, + antiTheftDoor = null, + airCondition = null, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = null, + storeRoom = null, + videoSurveillance = null, + alarm = null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null; + + let fieldIndex = 1; + do { + const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; + const fieldTitleSelector = `${fieldSelector} > div.df1`; + const fieldValueSelector = `${fieldSelector} > div.df2`; + + const fieldTitle = $(fieldTitleSelector) + .text() + .trim() + .toLowerCase(); + const fieldValue = $(fieldValueSelector) + .text() + .trim() + .toLowerCase(); + + switch (fieldTitle) { + case "kvadrata": + area = fieldValue; + break; + case "okućnica (kvadratura)": + gardenSize = fieldValue; + break; + case "broj soba": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj prostorija": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj spratova": + numberOfFloors = this.parseNumberOfFloors( + fieldValue, + parsedCategory + ); + break; + case "sprat": + floor = this.parseFloorNumber(fieldValue, parsedCategory); + break; + case "vrsta 
grijanja": + heatingType = this.getHeatingTypeId(fieldValue); + break; + case "namješten?": + furnishingType = this.getFurnishingTypeId(fieldValue); + break; + case "namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "namještena": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "voda": + water = true; + break; + case "struja": + electricity = true; + break; + case "kanalizacija": + drainageSystem = fieldValue !== "nema"; + break; + case "godina izgradnje": + newBuilding = newBuilding || fieldValue === "novogradnja"; + break; + case "kućni ljubimci": + animalsAllowed = fieldValue === "da"; + break; + case "uknjiženo / zk": + registeredInZkBooks = true; + break; + case "uknjiženo (zk)": + registeredInZkBooks = true; + break; + case "novogradnja": + newBuilding = true; + break; + case "nedavno adaptiran": + recentlyAdapted = true; + break; + case "nedavno adaptirana": + recentlyAdapted = true; + break; + case "balkon": + balcony = true; + break; + case "lift": + elevator = true; + break; + case "parking": + parking = true; + break; + case "garaža": + garage = true; + break; + case "plin": + gas = true; + break; + case "blindirana vrata": + antiTheftDoor = true; + break; + case "klima": + airCondition = true; + break; + case "telefonski priključak": + phoneConnection = true; + break; + case "kablovska tv": + cableTV = true; + break; + case "internet": + internet = true; + break; + case "podrum/tavan": + basementAttic = true; + break; + case "ostava/špajz": + storeRoom = true; + break; + case "video nadzor": + videoSurveillance = true; + break; + case "alarm": + alarm = true; + break; + case "za studente": + suitableForStudents = true; + break; + case "uključen trošak režija": + includingBills = true; + break; + case "građevinska dozvola": + buildingPermit = true; + break; + case "komunalni priključak": + utilityConnection = true; + break; + case "urbanistička dozvola": + urbanPlanPermit = true; + break; + case "udaljenost od 
rijeke (m)": + distanceToRiver = parseInt(fieldValue) || null; + break; + case "prilaz": + accessRoadType = this.getAccessRoadTypeId(fieldValue); + break; + case "bazen": + pool = true; + break; + case "iznajmljeno": + status = AD_STATUS.STATUS_RENTED; + break; + default: + // console.log(fieldTitle, " = ", fieldValue); + break; + } + + if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + break; + } + } while (true); + //=========================================== + + //========================================= const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; @@ -409,6 +576,13 @@ class OlxCrawler { locationLong = parseFloat(locationLatLngMatches[2]) || null; } + if ( + title.indexOf("[PRODANO]") !== -1 || + title.indexOf("[ZAVRŠENO]") !== -1 + ) { + status = AD_STATUS.STATUS_SOLD; + } + const data = { url, agencyObjectId: olxId, @@ -439,7 +613,42 @@ class OlxCrawler { locationLong, adStatus: status, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency }; return data; @@ -485,6 +694,64 @@ class OlxCrawler { } } + getHeatingTypeId(heatingTypeText) { + switch (heatingTypeText) { + case "struja": + return HEATING_TYPE.ELECTRICITY.id; + case "plin": + return HEATING_TYPE.GAS.id; + case "drva": + return 
HEATING_TYPE.WOOD.id; + case "centralno (gradsko)": + return HEATING_TYPE.CENTRAL_CITY.id; + case "centralno (kotlovnica)": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "centralno (plin)": + return HEATING_TYPE.CENTRAL_GAS.id; + case "nije uvedeno": + return HEATING_TYPE.NO_HEATING.id; + case "ostalo": + return HEATING_TYPE.OTHER.id; + case "drugo": + return HEATING_TYPE.OTHER.id; + default: + console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); + return null; + } + } + + getFurnishingTypeId(furnishingTypeText) { + switch (furnishingTypeText) { + case "namješten": + return FURNISHING_TYPE.FURNISHED.id; + case "polunamješten": + return FURNISHING_TYPE.HALF_FURNISHED.id; + case "nenamješten": + return FURNISHING_TYPE.NOT_FURNISHED.id; + case "": + return FURNISHING_TYPE.FURNISHED.id; + default: + console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); + return null; + } + } + + getAccessRoadTypeId(accessRoadTypeText) { + switch (accessRoadTypeText) { + case "asfalt": + return ACCESS_ROAD_TYPE.ASPHALT.id; + case "beton": + return ACCESS_ROAD_TYPE.CONCRETE.id; + case "makadam": + return ACCESS_ROAD_TYPE.MACADAM.id; + case "ostalo": + return ACCESS_ROAD_TYPE.OTHER.id; + default: + console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); + return null; + } + } + parseArea(areaText) { if (!areaText) { return NaN; @@ -505,56 +772,100 @@ class OlxCrawler { return parseFloat(formattedPriceText); } - parseRenewedDate(renewedDateText) { - const currentMoment = moment.tz(DEFAULT_TIMEZONE); - - if (renewedDateText.includes("Prije mjesec dana")) { - return currentMoment.add(-1, "month"); - } - - if (renewedDateText.includes("Jučer")) { - return currentMoment.add(-1, "day"); - } - - if (renewedDateText.includes("Prije sat")) { - return currentMoment.add(-1, "hour"); - } - - if (renewedDateText.includes("dan")) { - // format for this case should be "Prije N dana" or "Prije N dan" - const dateParts = renewedDateText.split(" "); - if (dateParts[0] === 
"Prije") { - const numberOfDays = parseInt(dateParts[1]); - return currentMoment.add(-1 * numberOfDays, "days"); - } else { - return undefined; + parseNumberOfRooms(numberOfRoomsText, categoryId) { + if (categoryId === AD_CATEGORY.FLAT.id) { + switch (numberOfRoomsText) { + case "garsonjera": + return 0; + case "jednosoban (1)": + return 1; + case "jednoiposoban (1.5)": + return 1.5; + case "dvosoban (2)": + return 2; + case "trosoban (3)": + return 3; + case "četverosoban (4)": + return 4; + case "petosoban i više": + return 5; + default: + console.log( + "broj soba [stan] = NEPOZNATO [", + numberOfRoomsText, + ", ", + categoryId, + "]" + ); + return null; } } - if (renewedDateText.includes("sat")) { - const dateParts = renewedDateText.split(" "); - const parsedHours = - dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined; - if (!parsedHours) { - return undefined; - } - return currentMoment.add(-1 * parsedHours, "hours"); + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id || + categoryId === AD_CATEGORY.APARTMENT.id || + categoryId === AD_CATEGORY.OFFICE.id + ) { + return parseInt(numberOfRoomsText) || null; } - const todayVariations = ["min", "sekund", "maloprije"]; - for (const todayVariation of todayVariations) { - if (renewedDateText.includes(todayVariation)) { - return currentMoment; - } + console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); + return null; + } + + parseNumberOfFloors(numberOfFloorsText, categoryId) { + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id + ) { + return parseInt(numberOfFloorsText) || null; } - const renewedDateMoment = moment.tz( - renewedDateText, - OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); + if (categoryId === AD_CATEGORY.OFFICE.id) { + if ( + numberOfFloorsText === "suteren" || + numberOfFloorsText === "prizemlje" + ) { + return 0; + } + if (numberOfFloorsText === "6+") { + return 7; + } + return 
parseInt(numberOfFloorsText) || null; + } - return renewedDateMoment.isValid() ? renewedDateMoment : undefined; + console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); + return null; + } + + parseFloorNumber(floorText, categoryId) { + if ( + categoryId === AD_CATEGORY.FLAT.id || + categoryId === AD_CATEGORY.APARTMENT.id + ) { + if ( + floorText === "suteren" || + floorText === "prizemlje" || + floorText === "visoko prizemlje" + ) { + return 0; + } + return parseInt(floorText) || null; + } + + if (categoryId === AD_CATEGORY.OFFICE.id) { + if (floorText === "zaseban objekat") { + return null; + } + if (floorText === "prizemlje" || floorText === "visoko prizemlje") { + return 0; + } + return parseInt(floorText) || null; + } + + console.log("sprat = NEPOZNATO [", floorText, "]"); + return null; } async sleep(ms) { @@ -569,7 +880,7 @@ class OlxCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index 7b970e0..cb1d3f9 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -2,16 +2,23 @@ const fetch = require("node-fetch"); const cheerio = require("cheerio"); +const moment = require("moment-timezone"); const { AD_TYPE, AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + FURNISHING_TYPE, + HEATING_TYPE } = require("../../common/enums"); -const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); +const { + PRINT_CRAWLER_DEBUG, + DEFAULT_TIMEZONE +} = require("../../config/appConfig"); +const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor"); const PROSTOR_ENUMS = { PROSTOR_AD_TYPE: { @@ -48,37 +55,359 @@ class ProstorCrawler { this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = 
crawlerAdCategories; this.maxResultsPerPage = maxResultsPerPage; + this.delayBetweenPages = delayBetweenPages; } async crawl() { const crawlAdCategories = this.crawlerAdCategories; + const newRealEstates = []; if (crawlAdCategories) { + const indexGenerators = []; for (const adCategory of crawlAdCategories) { - const urlAdTypePart = - PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; - const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; - if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { - const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`; - const singleCategoryResults = await this.extractRealEstates( - urlPageToCrawl - ); + indexGenerators.push(this.categoryIndexer(adCategory)); + } - const resultsSubset = singleCategoryResults.slice( - 0, - this.maxResultsPerPage - ); - - const saveResults = await this.saveCrawledResults(resultsSubset); - const { newRecords } = saveResults; - newRealEstates.push(...newRecords); + let done = false; + while (!done) { + const categoryIndexerPromises = []; + const generatorsToRemove = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); } + + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); + + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; + + newRealEstates.push(...newRecords); + + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !PROSTOR_FORCE_CRAWL + ) { + generatorsToRemove[index] = true; + } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); + } + } + + // console.log("Generators state : ", 
generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + await this.sleep(this.delayBetweenPages); } } return newRealEstates; } + async *categoryIndexer(adCategory) { + const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { + const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`; + const listOfAllRealEstates = await this.extractRealEstates( + urlPageToCrawl + ); + + let elementToStartIndexFrom = 0; + while (true) { + const realEstatesForSinglePage = listOfAllRealEstates.slice( + elementToStartIndexFrom, + elementToStartIndexFrom + this.maxResultsPerPage + ); + + if (realEstatesForSinglePage.length > 0) { + elementToStartIndexFrom += realEstatesForSinglePage.length; + + const singlePageResults = await this.indexSinglePage( + realEstatesForSinglePage + ); + + const filteredSinglePageResults = singlePageResults.filter( + singleResult => !!singleResult + ); + + if ( + Array.isArray(filteredSinglePageResults) && + filteredSinglePageResults.length > 0 + ) { + yield filteredSinglePageResults; + } else { + return undefined; + } + } else { + return undefined; + } + } + } else { + return undefined; + } + } + + async indexSinglePage(realEstatesList) { + const asyncActions = []; + for (const realEstate of realEstatesList) { + asyncActions.push(this.scrapeAd(realEstate)); + } + + try { + return await Promise.all(asyncActions); + } catch (e) { + console.log( + "[PROSTOR] Error crawling ads : ", + e.message || "UNKNOWN ERROR" + ); + return []; + } + } + + async scrapeAd(realEstate) { + const { lat, lng, property_name, price, size, link, status } = realEstate; + const url = 
`https://prostor.ba${link}`; + // console.log("[PROSTOR] Scraping : ", url); + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = ProstorCrawler.getAdTypeId(linkParts[1]); + const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + + if (!adType || !realEstateType || !prostorId) { + return null; + } + + const allDataSelector = + "body > div > div.container-fluid > div > div.column-right > table > tbody"; + + const realEstateProperties = {}; + + $(allDataSelector) + .find("p") + .each((i, element) => { + const propertyElement = $(element) + .text() + .split(":") + .map(text => text.trim().toLowerCase()); + + const propertyTitle = propertyElement[0]; + realEstateProperties[propertyTitle] = propertyElement[1]; + }); + + $(allDataSelector) + .find("div.mb-2") + .each((i, element) => { + const propertyElement = $(element) + .text() + .trim() + .toLowerCase(); + + realEstateProperties[propertyElement] = true; + }); + + if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { + return null; + } + + let numberOfRooms = + parseFloat(realEstateProperties["broj soba"]) + + parseFloat(realEstateProperties["broj spavaćih soba"]) || null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties), + furnishingType = null, + balcony = + realEstateProperties["balkon"] || + realEstateProperties["terasa"] || + realEstateProperties["lođa"] || + null, + newBuilding = linkParts[1] === "novogradnja", + elevator = realEstateProperties["lift"] || null, + water = realEstateProperties["voda"] || 
null, + electricity = realEstateProperties["električna energija"] || null, + drainageSystem = realEstateProperties["kanalizacija"] || null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = realEstateProperties["parking"] || null, + garage = realEstateProperties["garaža"] || null, + gas = realEstateProperties["plin"] || null, + antiTheftDoor = realEstateProperties["blindo vrata"] || null, + airCondition = realEstateProperties["klima"] || null, + phoneConnection = realEstateProperties["telefon"] || null, + cableTV = realEstateProperties["kablovksa tv"] || null, + internet = + realEstateProperties["internet"] || + realEstateProperties["adsl"] || + null, + basementAttic = realEstateProperties["podrum"] || null, + storeRoom = realEstateProperties["ostava"] || null, + videoSurveillance = realEstateProperties["video nadzor"], + alarm = realEstateProperties["alarm"] || null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = realEstateProperties["bazen"] || null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null, + numberOfViewsAgency = null; + + // Floor versions (there are possibly more versions) : + // Sprat: 3/3 + // Sprat: 1 - 2/2 + // Sprat: Pr - 7/7 + // Sprat: -2/0 + // If there are two parts, that represents more real estates are sold + // numberOfFloors is contained in second part, after / sign + + const floorsArray = realEstateProperties["sprat"].split(" - "); + let floorText = ""; + if (floorsArray.length === 1) { + const floorDescription = floorsArray[0].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorDescription[0]; + floor = Math.round(parseFloat(floorText)); + } else if (floorsArray.length === 2) { + const floorDescription = floorsArray[1].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorsArray[0]; + floor = Math.round(parseFloat(floorText)); + } else { + // This is 
something strange + } + + if (isNaN(floor)) { + // It was textual representation of floor, like "Pr", "Su" or similar + switch (floorText) { + case "pr": + floor = 0; + break; + case "su": + floor = -1; + break; + default: + console.log( + "[PROSTOR] Unknown textual representation of floor : ", + floorText + ); + floor = null; + } + } + + if (realEstateProperties["namješteno"]) { + furnishingType = FURNISHING_TYPE.FURNISHED.id; + } else if (realEstateProperties["polunamješteno"]) { + furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + } else { + furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + } + + const adStatus = ProstorCrawler.getStatusId(status); + const title = property_name; + const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; + const parsedArea = parseFloat(size); + const gardenSize = null; + const longDescription = null; + + const data = { + url, + agencyObjectId: prostorId, + originAgencyName: AD_AGENCY.PROSTOR, + realEstateType, + adType, + title, + price: parsedPrice, + area: parsedArea, + gardenSize, + shortDescription: "", + longDescription: longDescription, + streetNumber: 0, + streetName: realEstateProperties["adresa"], + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat: lat, + locationLong: lng, + adStatus, + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency + }; + + return data; + } catch (e) { + console.error( + "[PROSTOR] Exception caught: " + e.message, + "\r\nURL:", + url + ); + return 
null; + } + } + async extractRealEstates(url) { if (PRINT_CRAWLER_DEBUG) { console.log("[PROSTOR] Index page : ", url); @@ -115,18 +444,19 @@ class ProstorCrawler { const jsonData = scriptData.substring(23, jsonEndIndex) + "]"; const realEstates = JSON.parse(jsonData); - const transformedRealEstates = []; - - for (const realEstate of realEstates) { - const transformedRealEstate = ProstorCrawler.transformRealEstateData( - realEstate - ); - if (transformedRealEstate) { - transformedRealEstates.push(transformedRealEstate); - } - } - - return transformedRealEstates; + // const transformedRealEstates = []; + // + // for (const realEstate of realEstates) { + // const transformedRealEstate = ProstorCrawler.transformRealEstateData( + // realEstate + // ); + // if (transformedRealEstate) { + // transformedRealEstates.push(transformedRealEstate); + // } + // } + // + // return transformedRealEstates; + return realEstates; } else { throw { message: "Something is wrong with JSON data or data is moved" @@ -134,73 +464,15 @@ class ProstorCrawler { } } catch (e) { console.log(e); - throw { message: "Can't find ad data JSON" }; + throw e; } } - } catch (e) { - console.error("[PROSTOR] Exception caught:", e.message); - return []; - } - } - - static transformRealEstateData(realEstateData) { - try { - const { lat, lng, property_name, price, size, link } = realEstateData; - - // link contains part of the URL in the format of : /prodaja/stan/stup/9556 - // general form is : /actionType/realEstateType/location/realEstateID - // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] - - const linkParts = link.split("/"); - - const adType = ProstorCrawler.getAdTypeId(linkParts[1]); - const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); - const prostorId = linkParts[4]; - const url = `https://prostor.ba${link}`; - - if (!adType || !realEstateType || !prostorId) { - return null; - } - - const adStatus = AD_STATUS.STATUS_NORMAL; - const 
parsedPrice = parseFloat(price.replace(/\./g, "")) || null; - const parsedArea = parseFloat(size); - - const data = { - url, - agencyObjectId: prostorId, - originAgencyName: AD_AGENCY.PROSTOR, - realEstateType, - adType, - title: property_name, - price: parsedPrice, - area: parsedArea, - gardenSize: null, - shortDescription: "", - longDescription: "", - streetNumber: 0, - streetName: "", - locality: "", - municipality: "", - city: "", - region: "", - entity: "", - country: "", - locationLat: lat, - locationLong: lng, - adStatus, - publishedDate: null, - renewedDate: null - }; - - return data; } catch (e) { console.error( - "[PROSTOR] Exception caught: " + e.message, - "\r\nURL:", - url + "[PROSTOR] Exception caught:", + e.message || "UNKNOWN MESSAGE" ); - return null; + return []; } } @@ -231,11 +503,61 @@ class ProstorCrawler { return AD_TYPE.AD_TYPE_SALE.stringId; case "najam": return AD_TYPE.AD_TYPE_RENT.stringId; + case "novogradnja": + return AD_TYPE.AD_TYPE_SALE.stringId; default: return undefined; } } + static getHeatingTypeId(realEstateProperties) { + const realEstatePropertiesKeys = Object.keys(realEstateProperties); + for (const property of realEstatePropertiesKeys) { + switch (property) { + case "centralno toplane": + return HEATING_TYPE.CENTRAL_CITY.id; + case "etažno plinsko": + return HEATING_TYPE.CENTRAL_GAS.id; + case "termo blok": + case "podno grijanje": + return HEATING_TYPE.OTHER.id; + case "etažno električno": + case "konvektori": + return HEATING_TYPE.ELECTRICITY.id; + case "plinske peći": + return HEATING_TYPE.GAS.id; + case "vlastita kotlovnica": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "toplotna pumpa": + return HEATING_TYPE.HEAT_PUMP.id; + case "kamin": + return HEATING_TYPE.WOOD.id; + default: + //console.log("[PROSTOR] Nepoznato >>> [", property, "]"); + } + } + } + + static getStatusId(statusText) { + switch (statusText) { + case "": + return AD_STATUS.STATUS_NORMAL; + case "Rezervisano": + return AD_STATUS.STATUS_RESERVED; + 
case "Prodano": + return AD_STATUS.STATUS_SOLD; + case "Iznajmljeno": + return AD_STATUS.STATUS_RENTED; + default: + console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]"); + return AD_STATUS.STATUS_NORMAL; + } + } + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + async saveCrawledResults(results) { const savers = this.savers; @@ -244,7 +566,7 @@ class ProstorCrawler { // } //For now, we use only Postgres saver, so ... - return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index 668cff2..8f38dc8 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -11,7 +11,10 @@ const { AD_CATEGORY, AD_AGENCY, AD_STATUS, - CRAWLER_AD_TYPE + CRAWLER_AD_TYPE, + HEATING_TYPE, + ACCESS_ROAD_TYPE, + FURNISHING_TYPE } = require("../../common/enums"); const { @@ -219,6 +222,7 @@ class RentalCrawler { throw { message: "Can't find ad data JSON" }; } + let adStatus = AD_STATUS.STATUS_NORMAL; const rentalId = extractedData["re_realEstates_id"]; const adCategory = this.getKiviCategoryIdFromRentalId( parseInt(extractedData["re_types_id"]) @@ -237,6 +241,141 @@ class RentalCrawler { }; } + const descriptionIds = extractedData["re_descriptions_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(descriptionIds)) { + throw { + message: + 'Expected array od descriptions but "re_descriptions_id" not found !' + }; + } + + const spaceIds = extractedData["re_spaces_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(spaceIds)) { + throw { + message: 'Expected array od spaces but "re_spaces_id" not found !' 
+ }; + } + + const infrastructureIds = extractedData["re_infrastructure_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(infrastructureIds)) { + throw { + message: + 'Expected array od infrastructures but "re_infrastructure_id" not found !' + }; + } + + const floorNoIds = extractedData["re_floorNO_id"] + .split(",") + .map(stringNumber => parseInt(stringNumber)); + + if (!Array.isArray(floorNoIds)) { + throw { + message: + 'Expected array od infrastructures but "re_floorNO_id" not found !' + }; + } + + const numberOfViewsAgencySelector = $( + "body > div > div.container > div.row.content-top > div.col-xs-12.col-sm-12.col-md-9 > div > div.box-viewcount" + ); + + // number of views is written as : "Broj pregledavanja: NNN" + const numberOfViewsAgencyFullText = numberOfViewsAgencySelector + .text() + .trim(); + + const numberOfViewsAgencyParts = numberOfViewsAgencyFullText.split(":"); + + const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions( + descriptionIds + ); + const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces( + spaceIds + ); + + const realEstatePropertiesFromInfrastructure = this.getPropertiesFromInfrastructure( + infrastructureIds + ); + + if (extractedData["adm_realEstates_discount"] === "1") { + adStatus = AD_STATUS.STATUS_DISCOUNTED; + } + + let numberOfRooms = + parseInt(extractedData["re_realEstates_roomsNO"]) + + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, + numberOfFloors = + parseInt(extractedData["re_realEstates_floorsNO"]) || + this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]), + floor = + parseInt(extractedData["re_realEstates_floorNO"]) || + this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]), + accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, + heatingType = + this.getHeatingTypeId(extractedData["re_heating_id"]) || null, + furnishingType = realEstatePropertiesFromDescriptions.furnishingType, + 
balcony = + realEstatePropertiesFromDescriptions.balcony || + realEstatePropertiesFromSpaces.balcony, + newBuilding = extractedData["op_realEstates_newBuilding"] + ? extractedData["op_realEstates_newBuilding"] === "1" + : null, + elevator = realEstatePropertiesFromDescriptions.elevator, + water = + realEstatePropertiesFromDescriptions.water || + realEstatePropertiesFromInfrastructure.water, + electricity = + realEstatePropertiesFromDescriptions.electricity || + realEstatePropertiesFromInfrastructure.electricity, + drainageSystem = + realEstatePropertiesFromInfrastructure.drainageSystem, + registeredInZkBooks = + extractedData["op_realEstates_ownerPermit"] === 1 || null, + recentlyAdapted = null, + parking = + realEstatePropertiesFromDescriptions.parking || + realEstatePropertiesFromSpaces.parking, + garage = realEstatePropertiesFromSpaces.garage, + gas = realEstatePropertiesFromInfrastructure.gas, + antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, + airCondition = realEstatePropertiesFromDescriptions.airCondition, + phoneConnection = + realEstatePropertiesFromInfrastructure.phoneConnection, + cableTV = realEstatePropertiesFromInfrastructure.cableTV, + internet = realEstatePropertiesFromInfrastructure.internet, + basementAttic = realEstatePropertiesFromSpaces.basementAttic, + storeRoom = realEstatePropertiesFromSpaces.storeRoom, + videoSurveillance = + realEstatePropertiesFromDescriptions.videoSurveillance || + realEstatePropertiesFromInfrastructure.videoSurveillance, + alarm = realEstatePropertiesFromDescriptions.alarm, + suitableForStudents = null, + includingBills = + extractedData["op_realEstates_utilitiesIncluded"] === "1" || null, + animalsAllowed = null, + pool = realEstatePropertiesFromDescriptions.pool, + urbanPlanPermit = + extractedData["op_realEstates_locationPermit"] === "1" || + realEstatePropertiesFromDescriptions.urbanPlanPermit, + buildingPermit = + extractedData["op_realEstates_buildingPermit"] === "1" || null, + 
utilityConnection = + realEstatePropertiesFromDescriptions.utilityConnection, + distanceToRiver = null, + numberOfViewsAgency = + numberOfViewsAgencyParts.length > 1 + ? parseInt(numberOfViewsAgencyParts[1]) + : null; + const title = extractedData["re_realEstates_portalName"]; const extractedPrice = parseFloat( extractedData["re_realEstates_price"] @@ -277,8 +416,6 @@ class RentalCrawler { }; } - const adStatus = AD_STATUS.STATUS_NORMAL; - const data = { url, agencyObjectId: rentalId, @@ -303,7 +440,42 @@ class RentalCrawler { locationLong, adStatus, publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString() + renewedDate: renewedDateMoment.toISOString(), + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency }; return data; @@ -350,6 +522,270 @@ class RentalCrawler { } } + getPropertiesFromDescriptions(descriptionIds) { + const result = { + accessRoadType: null, + furnishingType: null, + balcony: null, + elevator: null, + parking: null, + antiTheftDoor: null, + airCondition: null, + videoSurveillance: null, + alarm: null, + pool: null, + urbanPlanPermit: null, + utilityConnection: null, + water: null, + electricity: null + }; + + for (const descriptionId of descriptionIds) { + switch (descriptionId) { + case 16: + result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + break; + case 17: + result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + break; + case 1: + case 28: + result.furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + 
case 14: + result.elevator = true; + break; + case 39: + result.electricity = true; + break; + case 40: + result.water = true; + break; + case 41: + case 58: + result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id; + break; + case 26: + result.balcony = true; + break; + case 62: + result.parking = true; + break; + case 3: + result.antiTheftDoor = true; + break; + case 2: + case 21: + result.airCondition = true; + break; + case 4: + result.alarm = true; + break; + case 55: + result.videoSurveillance = true; + break; + case 9: + result.pool = true; + break; + case 60: + result.urbanPlanPermit = true; + break; + case 38: + result.utilityConnection = true; + break; + } + } + + return result; + } + + getPropertiesFromSpaces(spaceIds) { + const result = { + balcony: null, + parking: null, + garage: null, + basementAttic: null, + storeRoom: null + }; + + for (const spaceId of spaceIds) { + switch (spaceId) { + case 36: + case 12: + result.parking = true; + break; + case 1: + case 2: + case 3: + result.balcony = true; + break; + case 4: + case 30: + result.garage = true; + break; + case 9: + case 10: + result.storeRoom = true; + break; + case 18: + case 34: + case 37: + case 27: + result.basementAttic = true; + break; + } + } + + return result; + } + + getHeatingTypeId(heatingRentalId) { + // heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value + const heatingId = parseInt(heatingRentalId); + switch (heatingId) { + case 27: + case 16: + return HEATING_TYPE.GAS.id; + case 4: + return HEATING_TYPE.CENTRAL_GAS.id; + case 3: + case 23: + case 6: + case 7: + case 8: + case 9: + case 10: + return HEATING_TYPE.CENTRAL_BOILER.id; + case 2: + case 13: + case 30: + case 17: + case 29: + case 31: + return HEATING_TYPE.ELECTRICITY.id; + case 24: + case 25: + case 12: + return HEATING_TYPE.CENTRAL_CITY.id; + case 26: + case 21: + case 20: + return HEATING_TYPE.WOOD.id; + case 28: + case 19: + return HEATING_TYPE.HEAT_PUMP.id; + case 14: + case 
32: + return HEATING_TYPE.OTHER.id; + default: + return null; + } + } + + getPropertiesFromInfrastructure(infrastructureIds) { + const result = { + electricity: null, + water: null, + gas: null, + drainageSystem: null, + phoneConnection: null, + internet: null, + videoSurveillance: null, + cableTV: null + }; + + for (const infrastructureId of infrastructureIds) { + switch (infrastructureId) { + case 1: + result.electricity = true; + break; + case 2: + result.water = true; + break; + case 4: + result.gas = true; + break; + case 5: + result.drainageSystem = true; + break; + case 7: + case 8: + result.phoneConnection = true; + break; + case 10: + result.internet = true; + break; + case 11: + result.cableTV = true; + break; + case 16: + case 17: + result.videoSurveillance = true; + break; + } + } + + return result; + } + + getFloorNumberFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + // just extracting floor number from first element + + const floorsId = floorsIdText.split(","); + if (floorsId.length === 0) { + return null; + } + + const firstFloorId = parseInt(floorsId[0]); + + // 1 pod + // 2 sut + // 3 raz + // 4 pri + // 5 vpri + // 6 prv + // 7 dru + // 8 tre + // 9 čet + // 10 man + // 11 + // 12 pot + // 13 vpot + // 14 tav + // 15 pet + const floorNumber = [ + -1, + -1, + 0, + 0, + 1, + 1, + 2, + 3, + 4, + null, + null, + null, + null, + null, + 5 + ]; + + return floorNumber[firstFloorId - 1] || null; + } + + getNumberOfFloorsFromFloorId(floorsIdText) { + // floorIdText can be array of numbers, separated by comma or number + + const floorIds = floorsIdText.split(","); + if (floorIds.length === 0) { + return null; + } + + return floorIds.length; + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } @@ -362,7 +798,7 @@ class RentalCrawler { // } //For now, we use only Postgres saver, so ... 
- return await savers[0].save(results); + return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index fd9f086..0282645 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -26,7 +26,42 @@ const bulkUpsertRealEstates = async realEstateData => { "gardenSize", "adStatus", "updatedAt", - "renewedDate" + "renewedDate", + "numberOfRooms", + "numberOfFloors", + "floor", + "accessRoadType", + "heatingType", + "furnishingType", + "balcony", + "newBuilding", + "elevator", + "water", + "electricity", + "drainageSystem", + "registeredInZkBooks", + "recentlyAdapted", + "parking", + "garage", + "gas", + "antiTheftDoor", + "airCondition", + "phoneConnection", + "cableTV", + "internet", + "basementAttic", + "storeRoom", + "videoSurveillance", + "alarm", + "suitableForStudents", + "includingBills", + "animalsAllowed", + "pool", + "urbanPlanPermit", + "buildingPermit", + "utilityConnection", + "distanceToRiver", + "numberOfViewsAgency" ]; const order = [["updatedAt", "desc"]]; diff --git a/app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js b/app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js new file mode 100644 index 0000000..134e6ad --- /dev/null +++ b/app/migrations/20191105174319-add-additional-fields-to-realEstates-table.js @@ -0,0 +1,163 @@ +"use strict"; + +module.exports = { + up: (queryInterface, Sequelize) => { + return Promise.all([ + queryInterface.addColumn("RealEstates", "numberOfRooms", { + type: Sequelize.REAL + }), + queryInterface.addColumn("RealEstates", "numberOfFloors", { + type: Sequelize.INTEGER + }), + queryInterface.addColumn("RealEstates", "floor", { + type: Sequelize.INTEGER + }), + queryInterface.addColumn("RealEstates", "accessRoadType", { + type: Sequelize.TEXT + }), + queryInterface.addColumn("RealEstates", "heatingType", { + type: 
Sequelize.TEXT + }), + queryInterface.addColumn("RealEstates", "furnishingType", { + type: Sequelize.TEXT + }), + queryInterface.addColumn("RealEstates", "balcony", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "newBuilding", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "elevator", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "water", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "electricity", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "drainageSystem", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "registeredInZkBooks", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "recentlyAdapted", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "parking", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "garage", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "gas", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "antiTheftDoor", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "airCondition", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "phoneConnection", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "cableTV", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "internet", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "basementAttic", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "storeRoom", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "videoSurveillance", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "alarm", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "suitableForStudents", { + type: 
Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "includingBills", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "animalsAllowed", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "pool", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "exchange", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "urbanPlanPermit", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "buildingPermit", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "utilityConnection", { + type: Sequelize.BOOLEAN + }), + queryInterface.addColumn("RealEstates", "distanceToRiver", { + type: Sequelize.INTEGER + }), + queryInterface.addColumn("RealEstates", "numberOfViewsAgency", { + type: Sequelize.INTEGER, + defaultValue: 0 + }), + queryInterface.addColumn("RealEstates", "numberOfViewsKivi", { + type: Sequelize.INTEGER, + defaultValue: 0 + }) + ]); + }, + + down: (queryInterface, Sequelize) => { + return Promise.all([ + queryInterface.removeColumn("RealEstates", "numberOfRooms"), + queryInterface.removeColumn("RealEstates", "numberOfFloors"), + queryInterface.removeColumn("RealEstates", "floor"), + queryInterface.removeColumn("RealEstates", "accessRoadType"), + queryInterface.removeColumn("RealEstates", "heatingType"), + queryInterface.removeColumn("RealEstates", "furnishingType"), + queryInterface.removeColumn("RealEstates", "balcony"), + queryInterface.removeColumn("RealEstates", "newBuilding"), + queryInterface.removeColumn("RealEstates", "elevator"), + queryInterface.removeColumn("RealEstates", "water"), + queryInterface.removeColumn("RealEstates", "electricity"), + queryInterface.removeColumn("RealEstates", "drainageSystem"), + queryInterface.removeColumn("RealEstates", "registeredInZkBooks"), + queryInterface.removeColumn("RealEstates", "recentlyAdapted"), + queryInterface.removeColumn("RealEstates", "parking"), + 
queryInterface.removeColumn("RealEstates", "garage"), + queryInterface.removeColumn("RealEstates", "gas"), + queryInterface.removeColumn("RealEstates", "antiTheftDoor"), + queryInterface.removeColumn("RealEstates", "airCondition"), + queryInterface.removeColumn("RealEstates", "phoneConnection"), + queryInterface.removeColumn("RealEstates", "cableTV"), + queryInterface.removeColumn("RealEstates", "internet"), + queryInterface.removeColumn("RealEstates", "basementAttic"), + queryInterface.removeColumn("RealEstates", "storeRoom"), + queryInterface.removeColumn("RealEstates", "videoSurveillance"), + queryInterface.removeColumn("RealEstates", "alarm"), + queryInterface.removeColumn("RealEstates", "suitableForStudents"), + queryInterface.removeColumn("RealEstates", "includingBills"), + queryInterface.removeColumn("RealEstates", "animalsAllowed"), + queryInterface.removeColumn("RealEstates", "pool"), + queryInterface.removeColumn("RealEstates", "exchange"), + queryInterface.removeColumn("RealEstates", "urbanPlanPermit"), + queryInterface.removeColumn("RealEstates", "buildingPermit"), + queryInterface.removeColumn("RealEstates", "utilityConnection"), + queryInterface.removeColumn("RealEstates", "distanceToRiver"), + queryInterface.removeColumn("RealEstates", "numberOfViewsAgency"), + queryInterface.removeColumn("RealEstates", "numberOfViewsKivi") + ]); + } +}; diff --git a/app/models/realEstate.js b/app/models/realEstate.js index 93b82c4..0cb9374 100644 --- a/app/models/realEstate.js +++ b/app/models/realEstate.js @@ -48,7 +48,44 @@ module.exports = (sequelize, DataTypes) => { longDescription: DataTypes.TEXT, adStatus: DataTypes.INTEGER, publishedDate: DataTypes.DATE, - renewedDate: DataTypes.DATE + renewedDate: DataTypes.DATE, + numberOfRooms: DataTypes.INTEGER, + numberOfFloors: DataTypes.INTEGER, + floor: DataTypes.INTEGER, + accessRoadType: DataTypes.TEXT, + heatingType: DataTypes.TEXT, + furnishingType: DataTypes.TEXT, + balcony: DataTypes.BOOLEAN, + newBuilding: 
DataTypes.BOOLEAN, + elevator: DataTypes.BOOLEAN, + water: DataTypes.BOOLEAN, + electricity: DataTypes.BOOLEAN, + drainageSystem: DataTypes.BOOLEAN, + registeredInZkBooks: DataTypes.BOOLEAN, + recentlyAdapted: DataTypes.BOOLEAN, + parking: DataTypes.BOOLEAN, + garage: DataTypes.BOOLEAN, + gas: DataTypes.BOOLEAN, + antiTheftDoor: DataTypes.BOOLEAN, + airCondition: DataTypes.BOOLEAN, + phoneConnection: DataTypes.BOOLEAN, + cableTV: DataTypes.BOOLEAN, + internet: DataTypes.BOOLEAN, + basementAttic: DataTypes.BOOLEAN, + storeRoom: DataTypes.BOOLEAN, + videoSurveillance: DataTypes.BOOLEAN, + alarm: DataTypes.BOOLEAN, + suitableForStudents: DataTypes.BOOLEAN, + includingBills: DataTypes.BOOLEAN, + animalsAllowed: DataTypes.BOOLEAN, + pool: DataTypes.BOOLEAN, + exchange: DataTypes.BOOLEAN, + urbanPlanPermit: DataTypes.BOOLEAN, + buildingPermit: DataTypes.BOOLEAN, + utilityConnection: DataTypes.BOOLEAN, + distanceToRiver: DataTypes.INTEGER, + numberOfViewsAgency: DataTypes.INTEGER, + numberOfViewsKivi: DataTypes.INTEGER }); return RealEstate; diff --git a/development.env b/development.env index 901c309..3a63a34 100644 --- a/development.env +++ b/development.env @@ -42,7 +42,7 @@ RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page RENTAL_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found #==PROSTOR== PROSTOR_MAX_PAGES=!!! This is not used for prostor crawler !!! -PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents MAX RESULTS in total +PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents how many ads are crawled at once PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! 
diff --git a/package.json b/package.json index 6c00483..ca18f22 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,8 @@ "crawl": "cd app/crawler && node npmCrawl.js", "daily-notify": "cd app/npmScripts && node npmDailyNotify.js", "test-search": "cd test && node searchTest.js", - "test-olx-scraper": "cd test && node olxScrapeTest.js" + "test-olx-scraper": "cd test && node olxScrapeTest.js", + "test-rental-scraper": "cd test && node rentalScrapeTest.js" }, "repository": { "type": "git", diff --git a/test/rentalScrapeTest.js b/test/rentalScrapeTest.js new file mode 100644 index 0000000..9828f2d --- /dev/null +++ b/test/rentalScrapeTest.js @@ -0,0 +1,17 @@ +"use strict"; + +const rentalCrawler = require("../app/crawler/specificCrawlers/rental"); + +const urlToScrape = process.argv[2] || undefined; + +if (urlToScrape) { + const crawler = new rentalCrawler(); + + (async () => { + const data = await crawler.scrapeAd(urlToScrape); + console.log(data); + })(); +} else { + console.log("No URL to scrape. Use like this : "); + console.log("npm run test-rental-scraper -- URL_TO_SCRAPE"); +}