"use strict";

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const {
  AD_TYPE,
  AD_CATEGORY,
  AD_AGENCY,
  AD_STATUS,
  CRAWLER_AD_TYPE
} = require("../../common/enums");
const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig");

/**
 * Lookup tables used to build prostor.ba search URLs.
 *
 * PROSTOR_AD_TYPE and PROSTOR_AD_CATEGORY map crawler enum values to the
 * query-string fragments that crawl() appends to the search URL.
 * The two date-format strings are not referenced anywhere in this file.
 */
const PROSTOR_ENUMS = {
  PROSTOR_AD_TYPE: {
    [CRAWLER_AD_TYPE.ALL]: "&action=0",
    [CRAWLER_AD_TYPE.ONLY_SELL]: "&action=1",
    [CRAWLER_AD_TYPE.ONLY_RENT]: "&action=2"
  },
  PROSTOR_AD_CATEGORY: {
    [AD_CATEGORY.ALL.id]: "",
    [AD_CATEGORY.FLAT.id]: "&type=7",
    [AD_CATEGORY.HOUSE.id]: "&type=8",
    [AD_CATEGORY.LAND.id]: "&type=10",
    [AD_CATEGORY.OFFICE.id]: "&type=9",
    [AD_CATEGORY.APARTMENT.id]: "&type=11",
    [AD_CATEGORY.GARAGE.id]: "&type=14"
    //[AD_CATEGORY.COTTAGE.id]: ""
  },
  PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss",
  PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
};

// The inline script that carries the ad data starts with
//   var map; var markers = [{"r ...
// i.e. 23 characters of JavaScript precede the opening "[" of the array.
const MARKERS_PREFIX_LENGTH = 23;

// The array ends with "...}, ]; map = new...". The cut is made BEFORE the
// trailing comma (which would break JSON.parse), so "]" is re-appended by hand.
const MARKERS_END_TOKEN = ", ]; map = new";

/**
 * Crawler for real-estate ads listed on prostor.ba.
 */
class ProstorCrawler {
  /**
   * @param {Array} savers - Objects exposing `save(results)`; only the first
   *   one is used (see saveCrawledResults).
   * @param {*} crawlerAdTypes - A CRAWLER_AD_TYPE value selecting sale/rent/all.
   * @param {Array} crawlerAdCategories - Categories to crawl; each item may be
   *   an AD_CATEGORY entry (object with an `id`) or a bare category id.
   * @param {number} maxPages - Accepted for signature compatibility; not used
   *   by this class.
   * @param {number} maxResultsPerPage - Maximum number of ads kept per category.
   * @param {Array} ignoredUsernames - Accepted for signature compatibility; not
   *   used by this class.
   * @param {number} delayBetweenPages - Accepted for signature compatibility;
   *   not used by this class.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://prostor.ba/pretraga";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    this.maxResultsPerPage = maxResultsPerPage;
  }

  /**
   * Crawls every configured category, saves the results and collects the
   * records the saver reports as new.
   *
   * Categories (or an ad type) without a URL mapping in PROSTOR_ENUMS are
   * skipped silently.
   *
   * @returns {Promise<Array>} All `newRecords` returned by the saver.
   */
  async crawl() {
    const newRealEstates = [];
    const crawlAdCategories = this.crawlerAdCategories;
    if (!crawlAdCategories) {
      return newRealEstates;
    }

    // The ad-type fragment does not depend on the category; look it up once.
    const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];

    for (const adCategory of crawlAdCategories) {
      // PROSTOR_AD_CATEGORY is keyed by category id, while the constructor
      // default supplies whole AD_CATEGORY entries. Accept both forms so the
      // lookup does not miss when an entry object is passed.
      const categoryId =
        adCategory !== null && typeof adCategory === "object"
          ? adCategory.id
          : adCategory;
      const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[categoryId];

      if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
        continue;
      }

      const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
      const singleCategoryResults = await this.extractRealEstates(
        urlPageToCrawl
      );
      const resultsSubset = singleCategoryResults.slice(
        0,
        this.maxResultsPerPage
      );

      const saveResults = await this.saveCrawledResults(resultsSubset);
      const { newRecords } = saveResults;
      newRealEstates.push(...newRecords);
    }

    return newRealEstates;
  }

  /**
   * Downloads a search page and extracts the ads embedded in its inline
   * map script.
   *
   * Never rejects: any failure (network, missing script, malformed data) is
   * logged and yields an empty array.
   *
   * @param {string} url - Search page URL.
   * @returns {Promise<Array<Object>>} Transformed ads; entries that
   *   transformRealEstateData rejects are left out.
   */
  async extractRealEstates(url) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[PROSTOR] Index page : ", url);
    }

    try {
      const res = await fetch(url);
      const body = await res.text();
      const $ = cheerio.load(body);

      const scriptElement = $(
        "body > div > div.container-fluid > script:nth-child(7)"
      );
      const scriptNode = scriptElement[0];
      const scriptData =
        scriptNode &&
        scriptNode.children &&
        scriptNode.children[0] &&
        scriptNode.children[0].data;

      if (!scriptData) {
        // Throw instead of falling through: an implicit `undefined` return
        // would make crawl() fail on `.slice`.
        throw new Error("Can't find ad data script element");
      }

      try {
        // The script holds JavaScript, not JSON, so cut out just the array
        // literal assigned to `markers`.
        const jsonEndIndex = scriptData.indexOf(MARKERS_END_TOKEN);
        if (jsonEndIndex === -1) {
          throw new Error("Something is wrong with JSON data or data is moved");
        }

        const jsonData =
          scriptData.substring(MARKERS_PREFIX_LENGTH, jsonEndIndex) + "]";
        const realEstates = JSON.parse(jsonData);

        const transformedRealEstates = [];
        for (const realEstate of realEstates) {
          const transformedRealEstate = ProstorCrawler.transformRealEstateData(
            realEstate
          );
          if (transformedRealEstate) {
            transformedRealEstates.push(transformedRealEstate);
          }
        }
        return transformedRealEstates;
      } catch (e) {
        // Log the underlying cause, then surface a single generic message.
        console.log(e);
        throw new Error("Can't find ad data JSON");
      }
    } catch (e) {
      console.error("[PROSTOR] Exception caught:", e.message);
      return [];
    }
  }

  /**
   * Converts one raw marker object from the page into the crawler's ad record.
   *
   * @param {Object} realEstateData - Raw marker; the fields read are `lat`,
   *   `lng`, `property_name`, `price`, `size` and `link`.
   * @returns {Object|null} The ad record, or null when the ad type, category
   *   or id cannot be derived from `link`, or when transformation throws.
   */
  static transformRealEstateData(realEstateData) {
    try {
      const { lat, lng, property_name, price, size, link } = realEstateData;

      // link contains part of the URL in the format of : /prodaja/stan/stup/9556
      // general form is : /actionType/realEstateType/location/realEstateID
      // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
      const linkParts = link.split("/");
      const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
      const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
      const prostorId = linkParts[4];
      const url = `https://prostor.ba${link}`;

      if (!adType || !realEstateType || !prostorId) {
        return null;
      }

      const adStatus = AD_STATUS.STATUS_NORMAL;
      // Dots are stripped before parsing (presumably thousands separators).
      // String() keeps a numeric or missing price from throwing on .replace;
      // an unparsable (or zero) price becomes null.
      const parsedPrice = parseFloat(String(price).replace(/\./g, "")) || null;
      const parsedArea = parseFloat(size);

      const data = {
        url,
        agencyObjectId: prostorId,
        originAgencyName: AD_AGENCY.PROSTOR,
        realEstateType,
        adType,
        title: property_name,
        price: parsedPrice,
        area: parsedArea,
        gardenSize: null,
        shortDescription: "",
        longDescription: "",
        streetNumber: 0,
        streetName: "",
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat: lat,
        locationLong: lng,
        adStatus,
        publishedDate: null,
        renewedDate: null
      };

      return data;
    } catch (e) {
      // Names declared inside the `try` block are out of scope here, so the
      // URL is rebuilt from the input for the log line.
      const link = realEstateData && realEstateData.link;
      const url = link ? `https://prostor.ba${link}` : undefined;
      console.error(
        "[PROSTOR] Exception caught: " + e.message,
        "\r\nURL:",
        url
      );
      return null;
    }
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Maps the category segment of an ad link to an AD_CATEGORY id.
   *
   * @param {string} categoryText - Link segment such as "stan" or "kuca".
   * @returns {*} The matching category id, or undefined when unrecognised.
   */
  static getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "stan":
        return AD_CATEGORY.FLAT.id;
      case "kuca":
        return AD_CATEGORY.HOUSE.id;
      case "apartman":
        return AD_CATEGORY.APARTMENT.id;
      case "poslovni-prostor":
        return AD_CATEGORY.OFFICE.id;
      case "garaza":
        return AD_CATEGORY.GARAGE.id;
      case "zemljiste":
        return AD_CATEGORY.LAND.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the action segment of an ad link to an AD_TYPE string id.
   *
   * @param {string} adTypeText - Link segment, "prodaja" or "najam".
   * @returns {*} The matching ad type string id, or undefined when unrecognised.
   */
  static getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "prodaja":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "najam":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return undefined;
    }
  }

  /**
   * Persists crawled ads through the first configured saver.
   *
   * Only savers[0] is used; its return value is handed back unchanged so the
   * caller can read what the saver reports (crawl() reads `newRecords`).
   *
   * @param {Array<Object>} results - Ad records to persist.
   * @returns {Promise<*>} Whatever the saver's `save` resolves to.
   * @throws {TypeError} When no saver is configured.
   */
  async saveCrawledResults(results) {
    const savers = this.savers;
    if (!savers || !savers[0]) {
      // Same exception type as the unguarded property access would raise,
      // but with a message that names the actual problem.
      throw new TypeError("[PROSTOR] No saver configured for crawled results");
    }

    return await savers[0].save(results);
  }
}

module.exports = ProstorCrawler;