"use strict"; const fetch = require("node-fetch"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); const htmlToText = require("html-to-text"); const { AD_TYPE, AD_CATEGORY, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE, HEATING_TYPE, ACCESS_ROAD_TYPE, FURNISHING_TYPE } = require("../../common/enums"); const { DEFAULT_TIMEZONE, PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); const AKTIDO_ENUMS = { AKTIDO_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "/prodaja-1/najam-2", [CRAWLER_AD_TYPE.ONLY_SELL]: "/prodaja-1", [CRAWLER_AD_TYPE.ONLY_RENT]: "/najam-2" }, AKTIDO_AD_CATEGORY: { [AD_CATEGORY.ALL.id]: "", [AD_CATEGORY.FLAT.id]: "/tip-2", [AD_CATEGORY.HOUSE.id]: "/tip-1", [AD_CATEGORY.LAND.id]: "/tip-5", [AD_CATEGORY.OFFICE.id]: "/tip-4", [AD_CATEGORY.APARTMENT.id]: "/tip-3", [AD_CATEGORY.GARAGE.id]: "/tip-6" //[AD_CATEGORY.COTTAGE.id]: "" }, AKTIDO_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss", AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" }; const { AKTIDO_FORCE_CRAWL } = require("../specificConfigs/aktido"); class AktidoCrawler { constructor( savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], maxPages = 1000, maxResultsPerPage = 100, ignoredUsernames = [], delayBetweenPages = 1000 ) { this.savers = savers; this.baseUrl = "https://www.aktido.ba/pretraga/sortiraj-date_DESC"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; this.delayBetweenPages = delayBetweenPages; } async crawl() { const crawlAdCategories = this.crawlerAdCategories; const newRealEstates = []; if (crawlAdCategories) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { indexGenerators.push(this.categoryIndexer(adCategory)); } let done = false; while (!done) { const categoryIndexerPromises = []; const generatorsToRemove = []; for (const indexGenerator of indexGenerators) { categoryIndexerPromises.push(indexGenerator.next()); generatorsToRemove.push(false); } const singlePageResults = await Promise.all(categoryIndexerPromises); const entries = singlePageResults.entries(); for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { const saveResults = await this.saveCrawledResults(singlePageResult); const { newRecords } = saveResults; newRealEstates.push(...newRecords); if ( Array.isArray(newRecords) && newRecords.length === 0 && !AKTIDO_FORCE_CRAWL ) { generatorsToRemove[index] = true; } } else { //Generator returned undefined, remove this generator from array generatorsToRemove[index] = true; // console.log("Generator ", index + 1, "has no more pages"); } } // console.log("Generators state : ", generatorsToRemove); for (let i = generatorsToRemove.length - 1; i >= 0; i--) { if (generatorsToRemove[i]) { // console.log("\tRemove generator ", i + 1); indexGenerators.splice(i, 1); } } if (indexGenerators.length === 0) { done = true; } await this.sleep(this.delayBetweenPages); } } return newRealEstates; } async *categoryIndexer(adCategory) { let pageToIndex = 1; const urlAdTypePart = AKTIDO_ENUMS.AKTIDO_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = AKTIDO_ENUMS.AKTIDO_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}/stranica-${pageToIndex}`; const singlePageResults = await this.indexSinglePage( urlPageToCrawl, this.maxResultsPerPage ); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; } else { return undefined; } ++pageToIndex; if (pageToIndex === this.maxPages) { return undefined; } } } else { return undefined; } } async indexSinglePage(url, maxResultsPerPage) { if (PRINT_CRAWLER_DEBUG) { console.log("[AKTIDO] Index page : ", url); } try { const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; $( "body > div > div.container > div.row > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div.row.box-items.group-grid-view" ) .find(".moreInfo") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); if (href) { hrefs.push(href); } }); let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { asyncScraping.push(this.scrapeAd(hrefs[i])); } const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); return filteredScrapedData; } catch (e) { console.error("[AKTIDO] Exception caught:" + e); return []; } } async scrapeAd(url) { // console.log("[AKTIDO] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); const mapElementParent = $(".box-map").parent(); const scriptElement = $("script", mapElementParent); if ( scriptElement[0] && scriptElement[0].children && scriptElement[0].children[0] && scriptElement[0].children[0].data ) { let extractedData; try { //data string starts with : var json_map_data = [{"r ... //so we remove first 20 characters const jsonData = scriptElement[0].children[0].data.substring(20); const parsedJsonData = JSON.parse(jsonData); extractedData = parsedJsonData[0]; } catch (e) { throw { message: "Can't find ad data JSON" }; } let adStatus = AD_STATUS.STATUS_NORMAL; const aktidoId = extractedData["re_realEstates_id"]; const adCategory = this.getKiviCategoryIdFromAktidoId( parseInt(extractedData["re_types_id"]) ); if (!adCategory) { throw { message: `Invalid category : ${extractedData["re_types_id"]}` }; } const adType = this.getKiviAdTypeFromAktidoActionId( parseInt(extractedData["re_action_id"]) ); if (!adType) { throw { message: `Invalid ad type : ${extractedData["re_action_id"]}` }; } const descriptionIds = extractedData["re_descriptions_id"] .split(",") .map(stringNumber => parseInt(stringNumber)); if (!Array.isArray(descriptionIds)) { throw { message: 'Expected array od descriptions but "re_descriptions_id" not found !' }; } const spaceIds = extractedData["re_spaces_id"] .split(",") .map(stringNumber => parseInt(stringNumber)); if (!Array.isArray(spaceIds)) { throw { message: 'Expected array od spaces but "re_spaces_id" not found !' }; } const infrastructureIds = extractedData["re_infrastructure_id"] .split(",") .map(stringNumber => parseInt(stringNumber)); if (!Array.isArray(infrastructureIds)) { throw { message: 'Expected array od infrastructures but "re_infrastructure_id" not found !' }; } const floorNoIds = extractedData["re_floorNO_id"] .split(",") .map(stringNumber => parseInt(stringNumber)); if (!Array.isArray(floorNoIds)) { throw { message: 'Expected array od infrastructures but "re_floorNO_id" not found !' }; } // counting floor enums // for (let i = 1; i < 10; i++) { // const floorEnumsTitle = $( // `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(1)` // ) // .text() // .trim(); // if (floorEnumsTitle === "Spratnost:") { // const floorEnumsValue = $( // `body > div.container-fluid > div.container > div:nth-child(2) > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div:nth-child(1) > div > div > div.col-xs-12.col-sm-4.box-details > div.body > p:nth-child(${i}) > span:nth-child(2)` // ) // .text() // .trim() // .split(","); // // console.log("=========="); // floorNoIds.forEach((id, index) => { // console.log("\t", id, " = ", floorEnumsValue[index]); // }); // break; // } // } // enumerating infrastructure - relation between id and infrastructure title // let found = false; // let infrastructureDescriptions = {}; // for (let i = 1; i < 5; i++) { // found = false; // for (let j = 1; j < 10; j++) { // const infrastructureTitle = $( // `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > strong` // ) // .text() // .trim(); // if (infrastructureTitle === "Osnovna infrastruktura:") { // found = true; // // const infrastructureValues = $( // `#b2 > div > div:nth-child(${i}) > div > ul > li:nth-child(${j}) > div` // ) // .text() // .trim() // .split(","); // // infrastructureIds.forEach((id, index) => { // infrastructureDescriptions[id] = infrastructureValues[index]; // }); // } // } // if (found) { // break; // } // } const realEstatePropertiesFromDescriptions = this.getPropertiesFromDescriptions( descriptionIds ); const realEstatePropertiesFromSpaces = this.getPropertiesFromSpaces( spaceIds ); const realEstatePropertiesFromInfrastructure = this.getPropertiesFromInfrastructure( infrastructureIds ); if (extractedData["adm_realEstates_discount"] === "1") { adStatus = AD_STATUS.STATUS_DISCOUNTED; } let numberOfRooms = parseInt(extractedData["re_realEstates_roomsNO"]) + parseInt(extractedData["re_realEstates_bedroomNO"]) || null, numberOfFloors = parseInt(extractedData["re_realEstates_floorsNO"]) || this.getNumberOfFloorsFromFloorId(extractedData["re_floorNO_id"]), floor = parseInt(extractedData["re_realEstates_floorNO"]) || this.getFloorNumberFromFloorId(extractedData["re_floorNO_id"]), accessRoadType = realEstatePropertiesFromDescriptions.accessRoadType, heatingType = this.getHeatingTypeId(extractedData["re_heating_id"]) || null, furnishingType = realEstatePropertiesFromDescriptions.furnishingType, balcony = realEstatePropertiesFromDescriptions.balcony || realEstatePropertiesFromSpaces.balcony, newBuilding = extractedData["op_realEstates_newBuilding"] ? extractedData["op_realEstates_newBuilding"] === "1" : null, elevator = realEstatePropertiesFromDescriptions.elevator, water = realEstatePropertiesFromDescriptions.water || realEstatePropertiesFromInfrastructure.water, electricity = realEstatePropertiesFromDescriptions.electricity || realEstatePropertiesFromInfrastructure.electricity, drainageSystem = realEstatePropertiesFromInfrastructure.drainageSystem, registeredInZkBooks = extractedData["op_realEstates_ownerPermit"] === 1 || null, recentlyAdapted = null, parking = realEstatePropertiesFromDescriptions.parking || realEstatePropertiesFromSpaces.parking, garage = realEstatePropertiesFromSpaces.garage, gas = realEstatePropertiesFromInfrastructure.gas, antiTheftDoor = realEstatePropertiesFromDescriptions.antiTheftDoor, airCondition = realEstatePropertiesFromDescriptions.airCondition, phoneConnection = realEstatePropertiesFromInfrastructure.phoneConnection, cableTV = realEstatePropertiesFromInfrastructure.cableTV, internet = realEstatePropertiesFromInfrastructure.internet, basementAttic = realEstatePropertiesFromSpaces.basementAttic, storeRoom = realEstatePropertiesFromSpaces.storeRoom, videoSurveillance = realEstatePropertiesFromDescriptions.videoSurveillance || realEstatePropertiesFromInfrastructure.videoSurveillance, alarm = realEstatePropertiesFromDescriptions.alarm, suitableForStudents = null, includingBills = extractedData["op_realEstates_utilitiesIncluded"] === "1" || null, animalsAllowed = null, pool = realEstatePropertiesFromDescriptions.pool, urbanPlanPermit = extractedData["op_realEstates_locationPermit"] === "1" || realEstatePropertiesFromDescriptions.urbanPlanPermit, buildingPermit = extractedData["op_realEstates_buildingPermit"] === "1" || null, utilityConnection = realEstatePropertiesFromDescriptions.utilityConnection, distanceToRiver = null, numberOfViewsAgency = null; const title = extractedData["re_realEstates_portalName"]; const extractedPrice = parseFloat( extractedData["re_realEstates_price"] ); const price = extractedPrice ? extractedPrice : null; const area = parseFloat(extractedData["re_realEstates_area"]); const gardenSize = parseFloat( extractedData["re_realEstates_fieldArea"] ); const longDescription = htmlToText.fromString( extractedData["re_realEstates_description"] ); const locationLong = extractedData["re_realEstates_longitude"]; const locationLat = extractedData["re_realEstates_latitude"]; const publishedDateMoment = moment.tz( extractedData["re_realEstates_inserted"], AKTIDO_ENUMS.AKTIDO_PUBLISHED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!publishedDateMoment.isValid()) { throw { message: `Invalid published date : ${ extractedData["re_realEstates_inserted"] }` }; } const renewedDateMoment = moment.tz( extractedData["re_realEstates_edited"], AKTIDO_ENUMS.AKTIDO_RENEWED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!renewedDateMoment.isValid()) { throw { message: `Invalid renewed date : ${ extractedData["re_realEstates_edited"] }` }; } const data = { url, agencyObjectId: aktidoId, originAgencyName: AD_AGENCY.AKTIDO, realEstateType: adCategory, adType, title, price, area, gardenSize, shortDescription: "", longDescription: longDescription, streetNumber: 0, streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", locationLat, locationLong, adStatus, publishedDate: publishedDateMoment.toISOString(), renewedDate: renewedDateMoment.toISOString(), numberOfRooms, numberOfFloors, floor, accessRoadType, heatingType, furnishingType, balcony, newBuilding, elevator, water, electricity, drainageSystem, registeredInZkBooks, recentlyAdapted, parking, garage, gas, antiTheftDoor, airCondition, phoneConnection, cableTV, internet, basementAttic, storeRoom, videoSurveillance, alarm, suitableForStudents, includingBills, animalsAllowed, pool, urbanPlanPermit, buildingPermit, utilityConnection, distanceToRiver, numberOfViewsAgency }; return data; } else { console.log("[AKTIDO] No JSON data for this ad : ", url); return null; } } catch (e) { console.error("[AKTIDO] Exception caught: " + e.message, "\r\nURL:", url); return null; } return null; } //======= HELPER FUNCTIONS ============= getKiviCategoryIdFromAktidoId(aktidoCategoryId) { switch (aktidoCategoryId) { case 1: return AD_CATEGORY.HOUSE.id; case 2: return AD_CATEGORY.FLAT.id; case 3: return AD_CATEGORY.APARTMENT.id; case 4: return AD_CATEGORY.OFFICE.id; case 5: return AD_CATEGORY.LAND.id; case 6: return AD_CATEGORY.GARAGE.id; default: return undefined; } } getKiviAdTypeFromAktidoActionId(actionId) { switch (actionId) { case 1: return AD_TYPE.AD_TYPE_SALE.stringId; case 2: return AD_TYPE.AD_TYPE_RENT.stringId; default: return undefined; } } getPropertiesFromDescriptions(descriptionIds) { const result = { accessRoadType: null, furnishingType: null, balcony: null, elevator: null, parking: null, antiTheftDoor: null, airCondition: null, videoSurveillance: null, alarm: null, pool: null, urbanPlanPermit: null, utilityConnection: null, water: null, electricity: null }; for (const descriptionId of descriptionIds) { switch (descriptionId) { case 16: result.furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; break; case 17: result.furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; break; case 1: case 28: result.furnishingType = FURNISHING_TYPE.FURNISHED.id; break; case 14: result.elevator = true; break; case 39: result.electricity = true; break; case 40: result.water = true; break; case 41: case 58: result.accessRoadType = ACCESS_ROAD_TYPE.ASPHALT.id; break; case 26: result.balcony = true; break; case 62: result.parking = true; break; case 3: result.antiTheftDoor = true; break; case 2: case 21: result.airCondition = true; break; case 4: result.alarm = true; break; case 55: result.videoSurveillance = true; break; case 9: result.pool = true; break; case 60: result.urbanPlanPermit = true; break; case 38: result.utilityConnection = true; break; } } return result; } getPropertiesFromSpaces(spaceIds) { const result = { balcony: null, parking: null, garage: null, basementAttic: null, storeRoom: null }; for (const spaceId of spaceIds) { switch (spaceId) { case 36: case 12: result.parking = true; break; case 1: case 2: case 3: result.balcony = true; break; case 4: case 30: result.garage = true; break; case 9: case 10: result.storeRoom = true; break; case 18: case 34: case 37: case 27: result.basementAttic = true; break; } } return result; } getHeatingTypeId(heatingRentalId) { // heatingRentalId can have multiple values, like: "1, 2, 3", parseInt will take first integer value const heatingId = parseInt(heatingRentalId); switch (heatingId) { case 27: case 16: return HEATING_TYPE.GAS.id; case 4: return HEATING_TYPE.CENTRAL_GAS.id; case 3: case 23: case 6: case 7: case 8: case 9: case 10: return HEATING_TYPE.CENTRAL_BOILER.id; case 2: case 13: case 30: case 17: case 29: case 31: return HEATING_TYPE.ELECTRICITY.id; case 24: case 25: case 12: return HEATING_TYPE.CENTRAL_CITY.id; case 26: case 21: case 20: return HEATING_TYPE.WOOD.id; case 28: case 19: return HEATING_TYPE.HEAT_PUMP.id; case 14: case 32: return HEATING_TYPE.OTHER.id; default: return null; } } getPropertiesFromInfrastructure(infrastructureIds) { const result = { electricity: null, water: null, gas: null, drainageSystem: null, phoneConnection: null, internet: null, videoSurveillance: null, cableTV: null }; for (const infrastructureId of infrastructureIds) { switch (infrastructureId) { case 1: result.electricity = true; break; case 2: result.water = true; break; case 4: result.gas = true; break; case 5: result.drainageSystem = true; break; case 7: case 8: result.phoneConnection = true; break; case 10: result.internet = true; break; case 11: result.cableTV = true; break; case 16: case 17: result.videoSurveillance = true; break; } } return result; } getFloorNumberFromFloorId(floorsIdText) { // floorIdText can be array of numbers, separated by comma or number // just extracting floor number from first element const floorsId = floorsIdText.split(","); if (floorsId.length === 0) { return null; } const firstFloorId = parseInt(floorsId[0]); // 1 pod // 2 sut // 3 raz // 4 pri // 5 vpri // 6 prv // 7 dru // 8 tre // 9 čet // 10 man // 11 // 12 pot // 13 vpot // 14 tav // 15 pet const floorNumber = [ -1, -1, 0, 0, 1, 1, 2, 3, 4, null, null, null, null, null, 5 ]; return floorNumber[firstFloorId - 1] || null; } getNumberOfFloorsFromFloorId(floorsIdText) { // floorIdText can be array of numbers, separated by comma or number const floorIds = floorsIdText.split(","); if (floorIds.length === 0) { return null; } return floorIds.length; } async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } async saveCrawledResults(results) { const savers = this.savers; // for (const saver of savers) { // await saver.save(results); // } //For now, we use only Postgres saver, so ... return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } module.exports = AktidoCrawler;