"use strict";

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const FormData = require("form-data");
const {
  AD_TYPE,
  AD_CATEGORY,
  AD_AGENCY,
  AD_STATUS,
  CRAWLER_AD_TYPE,
  FURNISHING_TYPE,
  HEATING_TYPE
} = require("../../common/enums");
const {
  PRINT_CRAWLER_DEBUG,
  DEFAULT_TIMEZONE,
  PROSTOR_LOGIN
} = require("../../config/appConfig");
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");

/**
 * Lookup tables that translate the crawler's own enums into prostor.ba
 * query-string fragments, plus the date formats associated with the site.
 */
const PROSTOR_ENUMS = {
  PROSTOR_AD_TYPE: {
    [CRAWLER_AD_TYPE.ALL]: "&action=0",
    [CRAWLER_AD_TYPE.ONLY_SELL]: "&action=1",
    [CRAWLER_AD_TYPE.ONLY_RENT]: "&action=2"
  },
  PROSTOR_AD_CATEGORY: {
    [AD_CATEGORY.ALL.id]: "",
    [AD_CATEGORY.FLAT.id]: "&type=7",
    [AD_CATEGORY.HOUSE.id]: "&type=8",
    [AD_CATEGORY.LAND.id]: "&type=10",
    [AD_CATEGORY.OFFICE.id]: "&type=9",
    [AD_CATEGORY.APARTMENT.id]: "&type=11",
    [AD_CATEGORY.GARAGE.id]: "&type=14"
    // AD_CATEGORY.COTTAGE is deliberately left unmapped (no URL filter is
    // defined for it here).
  },
  PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss",
  PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
};

// Login page; also used to obtain the initial session cookie.
const PROSTOR_LOGIN_URL = "https://prostor.ba/moj-prostor/prijava";

// CSS selector of the table that holds all properties on an ad detail page.
const AD_DATA_SELECTOR =
  "body > div > div.container-fluid > div > div.column-right > table > tbody";

// The index page embeds its data in an inline script that looks like
//   var map; var markers = [{...}, {...}, ]; map = new ...
// The JSON array starts right after the prefix and ends before the suffix.
const MARKERS_SCRIPT_PREFIX = "var map; var markers = ";
const MARKERS_SCRIPT_SUFFIX = ", ]; map = new";

/**
 * Crawler for prostor.ba real-estate ads.
 *
 * It logs in, downloads the list of ads for every configured category from the
 * search page, scrapes each ad's detail page and hands the results to a saver.
 */
class ProstorCrawler {
  /**
   * @param {Array} savers - Savers exposing `save(results)`; only the first one is used.
   * @param {*} crawlerAdTypes - One of the CRAWLER_AD_TYPE values.
   * @param {Array} crawlerAdCategories - Categories to crawl; either AD_CATEGORY
   *   entries or their `id` values.
   * @param {number} maxPages - Accepted for interface compatibility; not used by this crawler.
   * @param {number} maxResultsPerPage - Number of ads scraped (in parallel) per batch.
   * @param {Array} ignoredUsernames - Accepted for interface compatibility; not used by this crawler.
   * @param {number} delayBetweenPages - Pause in milliseconds between two batches.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://prostor.ba/pretraga";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    this.maxResultsPerPage = maxResultsPerPage;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Runs the crawl for all configured categories.
   *
   * All category generators are advanced in lock-step, one batch at a time. A
   * generator is dropped when it is exhausted, or when the saver reports no new
   * records for its batch (unless PROSTOR_FORCE_CRAWL is set).
   *
   * @returns {Promise<Array>} All records the saver reported as new; empty when
   *   login fails or no categories are configured.
   */
  async crawl() {
    const crawlAdCategories = this.crawlerAdCategories;
    const newRealEstates = [];

    // The session cookie is required to use login privileges.
    const prostorCookie = await this.getCookies();
    const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);

    // Crawl only if login was successful.
    if (!crawlAdCategories || !login) {
      return newRealEstates;
    }

    let indexGenerators = [];
    for (const adCategory of crawlAdCategories) {
      indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
    }

    while (indexGenerators.length > 0) {
      const singlePageResults = await Promise.all(
        indexGenerators.map(indexGenerator => indexGenerator.next())
      );
      const generatorsToRemove = singlePageResults.map(() => false);

      for (const [index, { value: singlePageResult }] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // Generator returned undefined: it has no more pages.
          generatorsToRemove[index] = true;
          continue;
        }

        const saveResults = await this.saveCrawledResults(singlePageResult);
        const newRecords = saveResults ? saveResults.newRecords : undefined;

        // Validate before using: a saver that reports nothing must not crash the crawl.
        if (Array.isArray(newRecords)) {
          for (const record of newRecords) {
            newRealEstates.push(record);
          }
          if (newRecords.length === 0 && !PROSTOR_FORCE_CRAWL) {
            // Nothing new in this batch: stop crawling this category.
            generatorsToRemove[index] = true;
          }
        }
      }

      indexGenerators = indexGenerators.filter(
        (generator, index) => !generatorsToRemove[index]
      );

      // Only pause when there is another batch to fetch.
      if (indexGenerators.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }

    return newRealEstates;
  }

  /**
   * Async generator that yields scraped ads for one category, batch by batch.
   *
   * The complete ad list is downloaded once from the search page and then cut
   * into batches of `maxResultsPerPage`. The generator finishes when the list is
   * exhausted or when a batch produces no usable result.
   *
   * @param {*} adCategory - AD_CATEGORY entry or its `id`.
   * @param {string} prostorCookie - Cookie header value of the logged-in session.
   * @yields {Array<Object>} Non-empty array of scraped ad records.
   */
  async *categoryIndexer(adCategory, prostorCookie) {
    // PROSTOR_AD_CATEGORY is keyed by `id`, while the constructor default passes
    // whole AD_CATEGORY entries, so accept both forms.
    const categoryId =
      adCategory !== null && typeof adCategory === "object"
        ? adCategory.id
        : adCategory;

    const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
    const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[categoryId];
    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return undefined;
    }

    const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
    const listOfAllRealEstates = await this.extractRealEstates(
      urlPageToCrawl,
      prostorCookie
    );
    if (!Array.isArray(listOfAllRealEstates)) {
      return undefined;
    }

    let elementToStartIndexFrom = 0;
    while (true) {
      const realEstatesForSinglePage = listOfAllRealEstates.slice(
        elementToStartIndexFrom,
        elementToStartIndexFrom + this.maxResultsPerPage
      );
      if (realEstatesForSinglePage.length === 0) {
        return undefined;
      }
      elementToStartIndexFrom += realEstatesForSinglePage.length;

      const singlePageResults = await this.indexSinglePage(
        realEstatesForSinglePage,
        prostorCookie
      );
      // scrapeAd() resolves to null for ads it could not process.
      const filteredSinglePageResults = singlePageResults.filter(
        singleResult => !!singleResult
      );
      if (filteredSinglePageResults.length === 0) {
        return undefined;
      }
      yield filteredSinglePageResults;
    }
  }

  /**
   * Scrapes all ads of one batch in parallel.
   *
   * @param {Array<Object>} realEstatesList - Entries taken from the index-page list.
   * @param {string} prostorCookie - Cookie header value of the logged-in session.
   * @returns {Promise<Array>} One entry per ad (null for ads that could not be
   *   scraped); an empty array if any scrape rejected.
   */
  async indexSinglePage(realEstatesList, prostorCookie) {
    const asyncActions = realEstatesList.map(realEstate =>
      this.scrapeAd(realEstate, prostorCookie)
    );
    try {
      return await Promise.all(asyncActions);
    } catch (e) {
      console.log(
        "[PROSTOR] Error crawling ads : ",
        e.message || "UNKNOWN ERROR"
      );
      return [];
    }
  }

  /**
   * Scrapes a single ad detail page and maps it to the crawler's record format.
   *
   * @param {Object} realEstate - Index-page entry; `lat`, `lng`, `property_name`,
   *   `price`, `size`, `link` and `status` are read from it.
   * @param {string} prostorCookie - Cookie header value of the logged-in session.
   * @returns {Promise<Object|null>} The ad record, or null when the link cannot be
   *   interpreted, the page holds no properties, or an error occurs.
   */
  async scrapeAd(realEstate, prostorCookie) {
    const { lat, lng, property_name, price, size, link, status } = realEstate;
    // Status information is already given in the real-estate list.
    const adStatus = ProstorCrawler.getStatusId(status);
    const url = `https://prostor.ba${link}`;

    try {
      // link contains part of the URL in the format of : /prodaja/stan/stup/9556
      // general form is : /actionType/realEstateType/location/realEstateID
      // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
      const linkParts = link.split("/");
      const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
      const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
      const prostorId = linkParts[4];
      // Validate the link before fetching so rejected ads cost no request.
      if (!adType || !realEstateType || !prostorId) {
        return null;
      }

      const adPageSource = await fetch(url, {
        headers: { Cookie: prostorCookie }
      });
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

      const realEstateProperties = ProstorCrawler.collectAdProperties($);
      // A page is treated as empty when no property carries a defined value.
      const hasAnyProperty = Object.values(realEstateProperties).some(
        value => value !== undefined
      );
      if (!hasAnyProperty) {
        return null;
      }

      // Returns the scraped value of a property, or null when it is absent.
      const propertyOrNull = key => realEstateProperties[key] || null;

      // NOTE(review): this is null unless BOTH room counts are present on the
      // page - confirm that this is the intended meaning.
      const numberOfRooms =
        parseFloat(realEstateProperties["broj soba"]) +
          parseFloat(realEstateProperties["broj spavaćih soba"]) || null;

      // Ads without a "sprat" entry simply have no floor information.
      const { floor, numberOfFloors } = ProstorCrawler.parseFloorInfo(
        realEstateProperties["sprat"]
      );

      let furnishingType;
      if (realEstateProperties["namješteno"]) {
        furnishingType = FURNISHING_TYPE.FURNISHED.id;
      } else if (realEstateProperties["polunamješteno"]) {
        furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
      } else {
        furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
      }

      // Prices use "." as thousands separator when given as text.
      const parsedPrice =
        (typeof price === "number"
          ? price
          : parseFloat(String(price).replace(/\./g, ""))) || null;

      return {
        url,
        agencyObjectId: prostorId,
        originAgencyName: AD_AGENCY.PROSTOR,
        realEstateType,
        adType,
        title: property_name,
        price: parsedPrice,
        area: parseFloat(size),
        gardenSize: null,
        shortDescription: "",
        longDescription: null,
        streetNumber: 0,
        streetName: realEstateProperties["adresa"],
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat: lat,
        locationLong: lng,
        adStatus,
        numberOfRooms,
        numberOfFloors,
        floor,
        accessRoadType: null,
        heatingType: ProstorCrawler.getHeatingTypeId(realEstateProperties),
        furnishingType,
        balcony:
          realEstateProperties["balkon"] ||
          realEstateProperties["terasa"] ||
          realEstateProperties["lođa"] ||
          null,
        newBuilding: linkParts[1] === "novogradnja",
        elevator: propertyOrNull("lift"),
        water: propertyOrNull("voda"),
        electricity: propertyOrNull("električna energija"),
        drainageSystem: propertyOrNull("kanalizacija"),
        registeredInZkBooks: null,
        recentlyAdapted: null,
        parking: propertyOrNull("parking"),
        garage: propertyOrNull("garaža"),
        gas: propertyOrNull("plin"),
        antiTheftDoor: propertyOrNull("blindo vrata"),
        airCondition: propertyOrNull("klima"),
        phoneConnection: propertyOrNull("telefon"),
        // The original key was spelled "kablovksa tv"; the corrected spelling is
        // accepted too. TODO confirm which one the site actually emits.
        cableTV:
          realEstateProperties["kablovska tv"] ||
          realEstateProperties["kablovksa tv"] ||
          null,
        internet:
          realEstateProperties["internet"] ||
          realEstateProperties["adsl"] ||
          null,
        basementAttic: propertyOrNull("podrum"),
        storeRoom: propertyOrNull("ostava"),
        videoSurveillance: propertyOrNull("video nadzor"),
        alarm: propertyOrNull("alarm"),
        suitableForStudents: null,
        includingBills: null,
        animalsAllowed: null,
        pool: propertyOrNull("bazen"),
        urbanPlanPermit: null,
        buildingPermit: null,
        utilityConnection: null,
        distanceToRiver: null,
        numberOfViewsAgency: null
      };
    } catch (e) {
      console.error(
        "[PROSTOR] Exception caught: " + e.message,
        "\r\nURL:",
        url
      );
      return null;
    }
  }

  /**
   * Downloads a search page and extracts the embedded list of real estates.
   *
   * @param {string} url - Search page URL.
   * @param {string} prostorCookie - Cookie header value of the logged-in session.
   * @returns {Promise<Array<Object>>} Parsed list; an empty array on any failure
   *   (network error, missing script element, unexpected script layout, bad JSON).
   */
  async extractRealEstates(url, prostorCookie) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[PROSTOR] Index page : ", url);
    }
    try {
      const res = await fetch(url, {
        headers: { Cookie: prostorCookie }
      });
      const body = await res.text();
      const $ = cheerio.load(body);
      const scriptElement = $(
        "body > div > div.container-fluid > script:nth-child(7)"
      );
      const scriptNode =
        scriptElement[0] &&
        scriptElement[0].children &&
        scriptElement[0].children[0];
      const scriptData = scriptNode && scriptNode.data;
      if (!scriptData) {
        throw new Error("Script element with real estate data was not found");
      }

      // The trailing comma before "]" breaks JSON.parse, so the array is cut
      // before it and the closing bracket is appended manually.
      const jsonEndIndex = scriptData.indexOf(MARKERS_SCRIPT_SUFFIX);
      if (jsonEndIndex === -1) {
        throw new Error("Something is wrong with JSON data or data is moved");
      }
      const jsonData =
        scriptData.substring(MARKERS_SCRIPT_PREFIX.length, jsonEndIndex) + "]";
      const realEstates = JSON.parse(jsonData);
      return Array.isArray(realEstates) ? realEstates : [];
    } catch (e) {
      console.error(
        "[PROSTOR] Exception caught:",
        e.message || "UNKNOWN MESSAGE"
      );
      return [];
    }
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Reads all properties from a loaded ad detail page.
   *
   * `<p>` elements of the form "Title: value" become `title -> value` (both
   * trimmed and lower-cased); `div.mb-2` elements become `text -> true`.
   *
   * @param {Function} $ - Cheerio instance loaded with the ad page.
   * @returns {Object} Map of lower-cased property names to their values.
   */
  static collectAdProperties($) {
    // Null-prototype object: the keys come from scraped page text.
    const properties = Object.create(null);
    const dataTable = $(AD_DATA_SELECTOR);

    dataTable.find("p").each((i, element) => {
      const propertyElement = $(element)
        .text()
        .split(":")
        .map(text => text.trim().toLowerCase());
      properties[propertyElement[0]] = propertyElement[1];
    });

    dataTable.find("div.mb-2").each((i, element) => {
      const propertyName = $(element)
        .text()
        .trim()
        .toLowerCase();
      properties[propertyName] = true;
    });

    return properties;
  }

  /**
   * Parses the lower-cased "sprat" (floor) property of an ad.
   *
   * Known versions (there are possibly more):
   *   3/3        -> floor 3, 3 floors
   *   1 - 2/2    -> floor 1, 2 floors (two parts: several real estates are sold)
   *   pr - 7/7   -> floor 0, 7 floors
   *   -2/0       -> floor -2, number of floors unknown
   * The number of floors is the part after the "/" sign.
   *
   * @param {*} floorProperty - Raw property value; anything but a string yields nulls.
   * @returns {{floor: (number|null), numberOfFloors: (number|null)}}
   */
  static parseFloorInfo(floorProperty) {
    const unknown = { floor: null, numberOfFloors: null };
    if (typeof floorProperty !== "string") {
      return unknown;
    }

    const floorsArray = floorProperty.split(" - ");
    let floorText;
    let floorDescription;
    if (floorsArray.length === 1) {
      floorDescription = floorsArray[0].split("/");
      floorText = floorDescription[0];
    } else if (floorsArray.length === 2) {
      floorDescription = floorsArray[1].split("/");
      floorText = floorsArray[0];
    } else {
      // Unexpected format.
      return unknown;
    }

    const numberOfFloors = Number.parseInt(floorDescription[1], 10) || null;
    let floor = Math.round(parseFloat(floorText));
    if (Number.isNaN(floor)) {
      // It was a textual representation of the floor, like "Pr", "Su" or similar.
      switch (floorText) {
        case "pr":
          floor = 0;
          break;
        case "su":
          floor = -1;
          break;
        default:
          console.log(
            "[PROSTOR] Unknown textual representation of floor : ",
            floorText
          );
          floor = null;
      }
    }
    return { floor, numberOfFloors };
  }

  /**
   * Maps the category segment of an ad link to an AD_CATEGORY id.
   *
   * @param {string} categoryText - Second path segment of the ad link.
   * @returns {*} The category id, or undefined for unknown segments.
   */
  static getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "stan":
        return AD_CATEGORY.FLAT.id;
      case "kuca":
        return AD_CATEGORY.HOUSE.id;
      case "apartman":
        return AD_CATEGORY.APARTMENT.id;
      case "poslovni-prostor":
        return AD_CATEGORY.OFFICE.id;
      case "garaza":
        return AD_CATEGORY.GARAGE.id;
      case "zemljiste":
        return AD_CATEGORY.LAND.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the action segment of an ad link to an AD_TYPE string id.
   *
   * @param {string} adTypeText - First path segment of the ad link.
   * @returns {*} The ad type string id, or undefined for unknown segments.
   */
  static getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "prodaja":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "najam":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      case "novogradnja":
        // New buildings are listed as sales.
        return AD_TYPE.AD_TYPE_SALE.stringId;
      default:
        return undefined;
    }
  }

  /**
   * Finds the heating type among the scraped property names.
   *
   * @param {Object} realEstateProperties - Map of lower-cased property names.
   * @returns {*} Id of the first recognised heating property (in key order), or
   *   undefined when none is present.
   */
  static getHeatingTypeId(realEstateProperties) {
    for (const property of Object.keys(realEstateProperties)) {
      switch (property) {
        case "centralno toplane":
          return HEATING_TYPE.CENTRAL_CITY.id;
        case "etažno plinsko":
          return HEATING_TYPE.CENTRAL_GAS.id;
        case "termo blok":
        case "podno grijanje":
          return HEATING_TYPE.OTHER.id;
        case "etažno električno":
        case "konvektori":
          return HEATING_TYPE.ELECTRICITY.id;
        case "plinske peći":
          return HEATING_TYPE.GAS.id;
        case "vlastita kotlovnica":
          return HEATING_TYPE.CENTRAL_BOILER.id;
        case "toplotna pumpa":
          return HEATING_TYPE.HEAT_PUMP.id;
        case "kamin":
          return HEATING_TYPE.WOOD.id;
        default:
        // Not a heating property; keep looking.
      }
    }
    return undefined;
  }

  /**
   * Maps the status text from the index-page list to an AD_STATUS value.
   *
   * @param {string} statusText - Status as given by the site.
   * @returns {*} Matching AD_STATUS value; unknown texts are logged and treated
   *   as STATUS_NORMAL.
   */
  static getStatusId(statusText) {
    switch (statusText) {
      case "":
        return AD_STATUS.STATUS_NORMAL;
      case "Rezervisano":
        return AD_STATUS.STATUS_RESERVED;
      case "Prodano":
        return AD_STATUS.STATUS_SOLD;
      case "Iznajmljeno":
        return AD_STATUS.STATUS_RENTED;
      case "VIP ponuda":
        return AD_STATUS.STATUS_VIP;
      default:
        console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
        return AD_STATUS.STATUS_NORMAL;
    }
  }

  /**
   * Resolves after the given delay.
   *
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Passes one batch of scraped ads to the saver.
   *
   * Only the first saver is used, so that its return value (read by crawl() as
   * `{ newRecords }`) is available to the caller.
   *
   * @param {Array<Object>} results - Scraped ad records.
   * @returns {Promise<*>} Whatever the saver's `save` resolves to.
   * @throws {Error} When no saver is configured.
   */
  async saveCrawledResults(results) {
    const primarySaver = this.savers && this.savers[0];
    if (!primarySaver) {
      throw new Error("[PROSTOR] No saver configured, cannot save crawled results");
    }
    return primarySaver.save(results);
  }

  /**
   * Logs in to prostor.ba within the session identified by the given cookie.
   *
   * Success is detected by the presence of "Dobrodošli" in the `<h1>` text of
   * the response page.
   *
   * @param {Object} PROSTOR_LOGIN - Credentials; `EMAIL` and `PASSWORD` are read.
   * @param {string} prostorCookie - Cookie header value of the session.
   * @returns {Promise<boolean>} true when logged in; false on failed login or on
   *   any error (which is logged).
   */
  async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
    try {
      const formData = new FormData();
      formData.append("email", PROSTOR_LOGIN.EMAIL);
      formData.append("password", PROSTOR_LOGIN.PASSWORD);

      const page = await fetch(PROSTOR_LOGIN_URL, {
        method: "POST",
        body: formData,
        headers: { Cookie: prostorCookie }
      });
      const resp = await page.text();
      const $ = cheerio.load(resp);

      if ($("h1").text().indexOf("Dobrodošli") !== -1) {
        console.log("[PROSTOR]: Crawler loged in!");
        return true;
      }
      console.log("[PROSTOR]: Crawler login failed - wrong credentials!");
      return false;
    } catch (err) {
      console.log("[PROSTOR]: Crawler login error ", err);
      return false;
    }
  }

  /**
   * Requests the login page and collects the cookies it sets.
   *
   * Only the `name=value` part of every Set-Cookie header is kept.
   *
   * @returns {Promise<string>} Cookie pairs joined with ";"; an empty string when
   *   the response sets no cookies. Network errors are not caught here.
   */
  async getCookies() {
    const getResponse = await fetch(PROSTOR_LOGIN_URL, {
      headers: { Cookie: "" }
    });
    // The header is absent from raw() when the server sets no cookie.
    const raw = getResponse.headers.raw()["set-cookie"] || [];
    return raw.map(datastring => datastring.split(";")[0]).join(";");
  }
}

module.exports = ProstorCrawler;