From b11f18696f01d3ca9aee29739677bea55b1bbdd0 Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Wed, 29 Jan 2020 01:09:53 +0100 Subject: [PATCH 1/5] Prepared config files. --- app/common/enums.js | 3 +- app/crawler/crawl.js | 13 +++- app/crawler/crawlerConfig.js | 4 +- app/crawler/specificConfigs/saljic.js | 34 +++++++++++ app/crawler/specificCrawlers/saljic.js | 83 ++++++++++++++++++++++++++ development.env | 5 ++ 6 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 app/crawler/specificConfigs/saljic.js create mode 100644 app/crawler/specificCrawlers/saljic.js diff --git a/app/common/enums.js b/app/common/enums.js index 33cb41e..47c36c3 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -223,7 +223,8 @@ const AD_AGENCY = { OLX: "OLX", RENTAL: "RENTAL", PROSTOR: "PROSTOR", - AKTIDO: "AKTIDO" + AKTIDO: "AKTIDO", + SALJIC: "SALJIC" }; const CRAWLER_AD_TYPE = { diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 82411b6..d4c335e 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx"); const RentalCrawler = require("./specificCrawlers/rental"); const ProstorCrawler = require("./specificCrawlers/prostor"); const AktidoCrawler = require("./specificCrawlers/aktido"); +const SaljicCrawler = require("./specificCrawlers/saljic"); const { OLX_CONFIG, RENTAL_CONFIG, PROSTOR_CONFIG, - AKTIDO_CONFIG + AKTIDO_CONFIG, + SALJIC_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); @@ -57,6 +59,15 @@ async function crawlAll() { AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE, AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES, AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES + ), + new SaljicCrawler( + [postgresSaver], + SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE, + SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES, + SALJIC_CONFIG.SALJIC_MAX_PAGES, + SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE, + SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES, + SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES ) ]; diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index ee98e44..4853d53 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx"); const RENTAL_CONFIG = require("./specificConfigs/rental"); const PROSTOR_CONFIG = require("./specificConfigs/prostor"); const AKTIDO_CONFIG = require("./specificConfigs/aktido"); +const SALJIC_CONFIG = require("./specificConfigs/saljic"); module.exports = { OLX_CONFIG, RENTAL_CONFIG, PROSTOR_CONFIG, - AKTIDO_CONFIG + AKTIDO_CONFIG, + SALJIC_CONFIG }; diff --git a/app/crawler/specificConfigs/saljic.js b/app/crawler/specificConfigs/saljic.js new file mode 100644 index 0000000..2e39ffe --- /dev/null +++ b/app/crawler/specificConfigs/saljic.js @@ -0,0 +1,34 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const saljicCrawlerAdType = + process.env.SALJIC_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE] + : null; + +const saljicParsedCrawlerAdCategories = + process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const saljicIgnoredUsernames = []; + +const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100, + SALJIC_MAX_RESULTS_PER_PAGE: + parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000, + SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE, + SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories, + SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [], + SALJIC_DELAY_BETWEEN_PAGES: + parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000, + SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL) +}; diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js new file mode 100644 index 0000000..407a558 --- /dev/null +++ b/app/crawler/specificCrawlers/saljic.js @@ -0,0 +1,83 @@ +"use strict"; + +const fetch = require("node-fetch"); +const cheerio = require("cheerio"); +const moment = require("moment-timezone"); + +const { + AD_TYPE, + AD_CATEGORY, + AD_AGENCY, + AD_STATUS, + CRAWLER_AD_TYPE, + FURNISHING_TYPE, + HEATING_TYPE +} = require("../../common/enums"); + +const { + PRINT_CRAWLER_DEBUG, + DEFAULT_TIMEZONE +} = require("../../config/appConfig"); +const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic"); + +const SALJIC_ENUMS = { + SALJIC_AD_TYPE: { + [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=", + [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1", + [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2" + }, + SALJIC_AD_CATEGORY: { + [AD_CATEGORY.ALL.id]: "&input_kategorija=", + [AD_CATEGORY.FLAT.id]: "&input_kategorija=15", + [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9", + [AD_CATEGORY.LAND.id]: "&input_kategorija=5", //3 and 4 also gradjevinsko + [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8", + [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1", + [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2" + //[AD_CATEGORY.COTTAGE.id]: "" + } +}; + +class SaljicCrawler { + constructor( + savers = [], + crawlerAdTypes = CRAWLER_AD_TYPE.ALL, + crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], + maxPages = 5000, + maxResultsPerPage = 5000, + ignoredUsernames = [], + delayBetweenPages = 1000 + ) { + this.savers = savers; + this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search"; + this.crawlerAdTypes = crawlerAdTypes; + this.crawlerAdCategories = crawlerAdCategories; + this.maxResultsPerPage = maxResultsPerPage; + this.delayBetweenPages = delayBetweenPages; + } + + async crawl() { + // + console.log("Saljic URL: ", this.baseUrl); + } + + //======= HELPER FUNCTIONS ============= + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async saveCrawledResults(results) { + const savers = this.savers; + + // for (const saver of savers) { + // await saver.save(results); + // } + + //For now, we use only Postgres saver, so ... + return savers[0].save(results); + //so that we can use some sequelize options and information when data is inserted + } +} + +module.exports = SaljicCrawler; diff --git a/development.env b/development.env index 89f0a1e..a18b79d 100644 --- a/development.env +++ b/development.env @@ -59,3 +59,8 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!! AKTIDO_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found +#==SALJIC NEKRETNINE== +SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once +SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values +SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values +SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found \ No newline at end of file -- 2.47.3 From 94ffc2d6d210946c0536cc9b93617cb8c078e2a0 Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Wed, 29 Jan 2020 23:22:39 +0100 Subject: [PATCH 2/5] WIP Started saljic crawler. --- app/crawler/specificCrawlers/saljic.js | 496 ++++++++++++++++++++++++- 1 file changed, 494 insertions(+), 2 deletions(-) diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 407a558..0e912fc 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -57,12 +57,504 @@ class SaljicCrawler { } async crawl() { - // - console.log("Saljic URL: ", this.baseUrl); + const crawlAdCategories = this.crawlerAdCategories; + + const newRealEstates = []; + + if (crawlAdCategories) { + const indexGenerators = []; + for (const adCategory of crawlAdCategories) { + indexGenerators.push(this.categoryIndexer(adCategory)); + } + // + console.log(indexGenerators); + // + let done = false; + while (!done) { + const categoryIndexerPromises = []; + const generatorsToRemove = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); + } + + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); + + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; + + newRealEstates.push(...newRecords); + + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !SALJIC_FORCE_CRAWL + ) { + generatorsToRemove[index] = true; + } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); + } + } + + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + await this.sleep(this.delayBetweenPages); + } + } + return newRealEstates; + } + + async *categoryIndexer(adCategory) { + const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory]; + + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { + const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}`; + const listOfAllRealEstates = await this.extractRealEstates( + urlPageToCrawl + ); + + let elementToStartIndexFrom = 0; + while (true) { + const realEstatesForSinglePage = listOfAllRealEstates.slice( + elementToStartIndexFrom, + elementToStartIndexFrom + this.maxResultsPerPage + ); + + if (realEstatesForSinglePage.length > 0) { + elementToStartIndexFrom += realEstatesForSinglePage.length; + + const singlePageResults = await this.indexSinglePage( + realEstatesForSinglePage + ); + + const filteredSinglePageResults = singlePageResults.filter( + singleResult => !!singleResult + ); + + if ( + Array.isArray(filteredSinglePageResults) && + filteredSinglePageResults.length > 0 + ) { + yield filteredSinglePageResults; + } else { + return undefined; + } + } else { + return undefined; + } + } + } else { + return undefined; + } + } + + async indexSinglePage(realEstatesList) { + const asyncActions = []; + for (const realEstate of realEstatesList) { + asyncActions.push(this.scrapeAd(realEstate)); + } + + try { + return await Promise.all(asyncActions); + } catch (e) { + console.log( + "[SALJIC] Error crawling ads : ", + e.message || "UNKNOWN ERROR" + ); + return []; + } + } + + async scrapeAd(realEstate) { + const { lat, lng, property_name, price, size, link, status } = realEstate; + const url = `https://www.saljicnekretnine.ba/v2/${link}`; + // console.log("[SALJIC] Scraping : ", url); + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + // ?? Ovo se mora promijeniti + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = SaljicCrawler.getAdTypeId(linkParts[1]); + const realEstateType = SaljicCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + + if (!adType || !realEstateType || !prostorId) { + return null; + } + + const allDataSelector = + "body > div > div.container-fluid > div > div.column-right > table > tbody"; + + const realEstateProperties = {}; + + $(allDataSelector) + .find("p") + .each((i, element) => { + const propertyElement = $(element) + .text() + .split(":") + .map(text => text.trim().toLowerCase()); + + const propertyTitle = propertyElement[0]; + realEstateProperties[propertyTitle] = propertyElement[1]; + }); + + $(allDataSelector) + .find("div.mb-2") + .each((i, element) => { + const propertyElement = $(element) + .text() + .trim() + .toLowerCase(); + + realEstateProperties[propertyElement] = true; + }); + + if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { + return null; + } + + let numberOfRooms = + parseFloat(realEstateProperties["broj soba"]) + + parseFloat(realEstateProperties["broj spavaćih soba"]) || null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties), + furnishingType = null, + balcony = + realEstateProperties["balkon"] || + realEstateProperties["terasa"] || + realEstateProperties["lođa"] || + null, + newBuilding = linkParts[1] === "novogradnja", + elevator = realEstateProperties["lift"] || null, + water = realEstateProperties["voda"] || null, + electricity = realEstateProperties["električna energija"] || null, + drainageSystem = realEstateProperties["kanalizacija"] || null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = realEstateProperties["parking"] || null, + garage = realEstateProperties["garaža"] || null, + gas = realEstateProperties["plin"] || null, + antiTheftDoor = realEstateProperties["blindo vrata"] || null, + airCondition = realEstateProperties["klima"] || null, + phoneConnection = realEstateProperties["telefon"] || null, + cableTV = realEstateProperties["kablovksa tv"] || null, + internet = + realEstateProperties["internet"] || + realEstateProperties["adsl"] || + null, + basementAttic = realEstateProperties["podrum"] || null, + storeRoom = realEstateProperties["ostava"] || null, + videoSurveillance = realEstateProperties["video nadzor"], + alarm = realEstateProperties["alarm"] || null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = realEstateProperties["bazen"] || null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null, + numberOfViewsAgency = null; + + // Floor versions (there are possibly more versions) : + // Sprat: 3/3 + // Sprat: 1 - 2/2 + // Sprat: Pr - 7/7 + // Sprat: -2/0 + // If there are two parts, that represents more real estates are sold + // numberOfFloors is contained in second part, after / sign + + const floorsArray = realEstateProperties["sprat"].split(" - "); + let floorText = ""; + if (floorsArray.length === 1) { + const floorDescription = floorsArray[0].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorDescription[0]; + floor = Math.round(parseFloat(floorText)); + } else if (floorsArray.length === 2) { + const floorDescription = floorsArray[1].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorsArray[0]; + floor = Math.round(parseFloat(floorText)); + } else { + // This is something strange + } + + if (isNaN(floor)) { + // It was textual representation of floor, like "Pr", "Su" or similar + switch (floorText) { + case "pr": + floor = 0; + break; + case "su": + floor = -1; + break; + default: + console.log( + "[SALJIC] Unknown textual representation of floor : ", + floorText + ); + floor = null; + } + } + + if (realEstateProperties["namješteno"]) { + furnishingType = FURNISHING_TYPE.FURNISHED.id; + } else if (realEstateProperties["polunamješteno"]) { + furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + } else { + furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + } + + const adStatus = SaljicCrawler.getStatusId(status); + const title = property_name; + const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; + const parsedArea = parseFloat(size); + const gardenSize = null; + const longDescription = null; + + const data = { + url, + agencyObjectId: prostorId, + originAgencyName: AD_AGENCY.SALJIC, + realEstateType, + adType, + title, + price: parsedPrice, + area: parsedArea, + gardenSize, + shortDescription: "", + longDescription: longDescription, + streetNumber: 0, + streetName: realEstateProperties["adresa"], + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat: lat, + locationLong: lng, + adStatus, + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency + }; + + return data; + } catch (e) { + console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url); + return null; + } + } + + async extractRealEstates(url) { + if (PRINT_CRAWLER_DEBUG) { + console.log("[SALJIC] Index page : ", url); + } + + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + console.log("SALJIC: $", $); + + const scriptElement = $( + "body > div > div.container-fluid > script:nth-child(7)" + ); + // + //console.log(scriptElement[0]); + //console.log(scriptElement[0].children); + if ( + scriptElement[0] && + scriptElement[0].children && + scriptElement[0].children[0] && + scriptElement[0].children[0].data + ) { + const scriptData = scriptElement[0].children[0].data; + // + console.log(scriptData); + try { + // script element data contains JS code and we need to extract only data for realEstates + // data string starts with : var map; var markers = [{"r ... + // so we remove first 23 characters + // + // real estate JSON data ends with ...}, ]; map = new... + // so we need to find index of that substring to know where to stop + // we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually + + const jsonEndIndex = scriptData.indexOf(", ]; map = new"); + if (jsonEndIndex > -1) { + const jsonData = scriptData.substring(23, jsonEndIndex) + "]"; + const realEstates = JSON.parse(jsonData); + + // const transformedRealEstates = []; + // + // for (const realEstate of realEstates) { + // const transformedRealEstate = SaljicCrawler.transformRealEstateData( + // realEstate + // ); + // if (transformedRealEstate) { + // transformedRealEstates.push(transformedRealEstate); + // } + // } + // + // return transformedRealEstates; + return realEstates; + } else { + throw { + message: "Something is wrong with JSON data or data is moved" + }; + } + } catch (e) { + console.log(e); + throw e; + } + } + } catch (e) { + console.error( + "[SALJIC] Exception caught:", + e.message || "UNKNOWN MESSAGE" + ); + return []; + } } //======= HELPER FUNCTIONS ============= + static getAdCategoryId(categoryText) { + switch (categoryText) { + case "stan": + return AD_CATEGORY.FLAT.id; + case "kuca": + return AD_CATEGORY.HOUSE.id; + case "apartman": + return AD_CATEGORY.APARTMENT.id; + case "poslovni-prostor": + return AD_CATEGORY.OFFICE.id; + case "garaza": + return AD_CATEGORY.GARAGE.id; + case "zemljiste": + return AD_CATEGORY.LAND.id; + default: + return undefined; + } + } + + static getAdTypeId(adTypeText) { + switch (adTypeText) { + case "prodaja": + return AD_TYPE.AD_TYPE_SALE.stringId; + case "najam": + return AD_TYPE.AD_TYPE_RENT.stringId; + case "novogradnja": + return AD_TYPE.AD_TYPE_SALE.stringId; + default: + return undefined; + } + } + + static getHeatingTypeId(realEstateProperties) { + const realEstatePropertiesKeys = Object.keys(realEstateProperties); + for (const property of realEstatePropertiesKeys) { + switch (property) { + case "centralno toplane": + return HEATING_TYPE.CENTRAL_CITY.id; + case "etažno plinsko": + return HEATING_TYPE.CENTRAL_GAS.id; + case "termo blok": + case "podno grijanje": + return HEATING_TYPE.OTHER.id; + case "etažno električno": + case "konvektori": + return HEATING_TYPE.ELECTRICITY.id; + case "plinske peći": + return HEATING_TYPE.GAS.id; + case "vlastita kotlovnica": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "toplotna pumpa": + return HEATING_TYPE.HEAT_PUMP.id; + case "kamin": + return HEATING_TYPE.WOOD.id; + default: + //console.log("[SALJIC] Nepoznato >>> [", property, "]"); + } + } + } + + static getStatusId(statusText) { + switch (statusText) { + case "": + return AD_STATUS.STATUS_NORMAL; + case "Rezervisano": + return AD_STATUS.STATUS_RESERVED; + case "Prodano": + return AD_STATUS.STATUS_SOLD; + case "Iznajmljeno": + return AD_STATUS.STATUS_RENTED; + default: + console.log("[SALJIC] Unknown AD_STATUS : [", statusText, "]"); + return AD_STATUS.STATUS_NORMAL; + } + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } -- 2.47.3 From 78c4054cde264225ea53e57a6bdc802bb30896d2 Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Thu, 30 Jan 2020 16:24:34 +0100 Subject: [PATCH 3/5] WIP Scraped title, price and location. --- app/crawler/specificCrawlers/saljic.js | 937 ++++++++++++++++--------- 1 file changed, 620 insertions(+), 317 deletions(-) diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 0e912fc..4156727 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -67,7 +67,7 @@ class SaljicCrawler { indexGenerators.push(this.categoryIndexer(adCategory)); } // - console.log(indexGenerators); + //console.log(indexGenerators); // let done = false; while (!done) { @@ -120,247 +120,490 @@ class SaljicCrawler { } async *categoryIndexer(adCategory) { + let pageToIndex = 1; + const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { - const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}`; - const listOfAllRealEstates = await this.extractRealEstates( - urlPageToCrawl - ); - - let elementToStartIndexFrom = 0; while (true) { - const realEstatesForSinglePage = listOfAllRealEstates.slice( - elementToStartIndexFrom, - elementToStartIndexFrom + this.maxResultsPerPage + const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11; + const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`; + + const singlePageResults = await this.indexSinglePage( + urlPageToCrawl, + this.maxResultsPerPage ); - if (realEstatesForSinglePage.length > 0) { - elementToStartIndexFrom += realEstatesForSinglePage.length; - - const singlePageResults = await this.indexSinglePage( - realEstatesForSinglePage - ); - - const filteredSinglePageResults = singlePageResults.filter( - singleResult => !!singleResult - ); - - if ( - Array.isArray(filteredSinglePageResults) && - filteredSinglePageResults.length > 0 - ) { - yield filteredSinglePageResults; - } else { - return undefined; - } + if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { + yield singlePageResults; } else { return undefined; } + + ++pageToIndex; + if (pageToIndex === this.maxPages) { + return undefined; + } } } else { return undefined; } } - async indexSinglePage(realEstatesList) { - const asyncActions = []; - for (const realEstate of realEstatesList) { - asyncActions.push(this.scrapeAd(realEstate)); + async indexSinglePage(url, maxResultsPerPage) { + if (PRINT_CRAWLER_DEBUG) { + console.log("[SALJIC] Index page : ", url); } try { - return await Promise.all(asyncActions); + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + let hrefs = []; + + $("#shop") + .find(".product") + .each((i, elem) => { + const href = $(elem) + .find("a") + .first() + .attr("href"); + if (href) { + hrefs.push(href); + } + }); + + //Converting to absolute URLs + const hrefsAbs = hrefs.map(link => { + return "https://www.saljicnekretnine.ba" + link; + }); + + let actualNoOfResults = + hrefsAbs.length <= maxResultsPerPage + ? hrefsAbs.length + : maxResultsPerPage; + + const asyncScraping = []; + for (let i = 0; i < actualNoOfResults; i++) { + asyncScraping.push(this.scrapeAd(hrefsAbs[i])); + } + + const scrapedData = await Promise.all(asyncScraping); + const filteredScrapedData = scrapedData.filter(adData => !!adData); + return filteredScrapedData; } catch (e) { - console.log( - "[SALJIC] Error crawling ads : ", - e.message || "UNKNOWN ERROR" - ); + console.error("[SALJIC] Exception caught:" + e); return []; } } - async scrapeAd(realEstate) { - const { lat, lng, property_name, price, size, link, status } = realEstate; - const url = `https://www.saljicnekretnine.ba/v2/${link}`; - // console.log("[SALJIC] Scraping : ", url); + async scrapeAd(url) { + console.log("[SALJIC] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); - // ?? Ovo se mora promijeniti - // link contains part of the URL in the format of : /prodaja/stan/stup/9556 - // general form is : /actionType/realEstateType/location/realEstateID - // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] - const linkParts = link.split("/"); + // ??? treba li nesto za status + let status = AD_STATUS.STATUS_NORMAL; - const adType = SaljicCrawler.getAdTypeId(linkParts[1]); - const realEstateType = SaljicCrawler.getAdCategoryId(linkParts[2]); - const prostorId = linkParts[4]; + const propertySelectors = { + title: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2", + price: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", + streetName: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p", - if (!adType || !realEstateType || !prostorId) { - return null; - } + descriptions: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)", + latAndLong: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe" + }; + const title = $(propertySelectors.title) + .text() + .replace(/(\r\n|\n|\r)/gm, "") + .replace(/ {1,}/g, " ") + .trim(); - const allDataSelector = - "body > div > div.container-fluid > div > div.column-right > table > tbody"; - - const realEstateProperties = {}; - - $(allDataSelector) - .find("p") - .each((i, element) => { - const propertyElement = $(element) - .text() - .split(":") - .map(text => text.trim().toLowerCase()); - - const propertyTitle = propertyElement[0]; - realEstateProperties[propertyTitle] = propertyElement[1]; - }); - - $(allDataSelector) - .find("div.mb-2") - .each((i, element) => { - const propertyElement = $(element) - .text() - .trim() - .toLowerCase(); - - realEstateProperties[propertyElement] = true; - }); - - if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { - return null; - } - - let numberOfRooms = - parseFloat(realEstateProperties["broj soba"]) + - parseFloat(realEstateProperties["broj spavaćih soba"]) || null, - numberOfFloors = null, - floor = null, - accessRoadType = null, - heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties), - furnishingType = null, - balcony = - realEstateProperties["balkon"] || - realEstateProperties["terasa"] || - realEstateProperties["lođa"] || - null, - newBuilding = linkParts[1] === "novogradnja", - elevator = realEstateProperties["lift"] || null, - water = realEstateProperties["voda"] || null, - electricity = realEstateProperties["električna energija"] || null, - drainageSystem = realEstateProperties["kanalizacija"] || null, - registeredInZkBooks = null, - recentlyAdapted = null, - parking = realEstateProperties["parking"] || null, - garage = realEstateProperties["garaža"] || null, - gas = realEstateProperties["plin"] || null, - antiTheftDoor = realEstateProperties["blindo vrata"] || null, - airCondition = realEstateProperties["klima"] || null, - phoneConnection = realEstateProperties["telefon"] || null, - cableTV = realEstateProperties["kablovksa tv"] || null, - internet = - realEstateProperties["internet"] || - realEstateProperties["adsl"] || - null, - basementAttic = realEstateProperties["podrum"] || null, - storeRoom = realEstateProperties["ostava"] || null, - videoSurveillance = realEstateProperties["video nadzor"], - alarm = realEstateProperties["alarm"] || null, - suitableForStudents = null, - includingBills = null, - animalsAllowed = null, - pool = realEstateProperties["bazen"] || null, - urbanPlanPermit = null, - buildingPermit = null, - utilityConnection = null, - distanceToRiver = null, - numberOfViewsAgency = null; - - // Floor versions (there are possibly more versions) : - // Sprat: 3/3 - // Sprat: 1 - 2/2 - // Sprat: Pr - 7/7 - // Sprat: -2/0 - // If there are two parts, that represents more real estates are sold - // numberOfFloors is contained in second part, after / sign - - const floorsArray = realEstateProperties["sprat"].split(" - "); - let floorText = ""; - if (floorsArray.length === 1) { - const floorDescription = floorsArray[0].split("/"); - numberOfFloors = parseInt(floorDescription[1]) || null; - floorText = floorDescription[0]; - floor = Math.round(parseFloat(floorText)); - } else if (floorsArray.length === 2) { - const floorDescription = floorsArray[1].split("/"); - numberOfFloors = parseInt(floorDescription[1]) || null; - floorText = floorsArray[0]; - floor = Math.round(parseFloat(floorText)); - } else { - // This is something strange - } - - if (isNaN(floor)) { - // It was textual representation of floor, like "Pr", "Su" or similar - switch (floorText) { - case "pr": - floor = 0; - break; - case "su": - floor = -1; - break; - default: - console.log( - "[SALJIC] Unknown textual representation of floor : ", - floorText + console.log("Title:", title); + const priceText = $(propertySelectors.price) + .text() + .replace(/(\r\n|\n|\r)/gm, "") + .replace(/ {1,}/g, " ") + .trim(); + const price = + priceText === "CIJENA NA UPIT" + ? null + : parseFloat( + priceText.substring(8, priceText.length - 3).replace(",", "") ); - floor = null; + + console.log("Price:", price); + + const streetName = $(propertySelectors.streetName) + .text() + .replace(/(\r\n|\n|\r)/gm, "") + .trim(); + console.log("Street:", streetName); + + const descriptions = $(propertySelectors.descriptions) + .text() + .trim(); + console.log("Description:", descriptions); + + const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); + const latText = latAndLongSrc.substring( + latAndLongSrc.indexOf("marker=") + 7, + latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + ); + const longText = latAndLongSrc.substring( + latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + 3, + latAndLongSrc.length + ); + const locationLat = parseFloat(latText) || null; + const locationLong = parseFloat(longText) || null; + console.log("Lat:", locationLat); + console.log("Long:", locationLong); + + //const category = $(propertySelectors.category) + //.text() + //.trim(); + + //====== OTHER AD INFORMATION =============== + let adType = null; + let olxId = null; + let numberOfViewsAgency = null; + + let otherInformationDivId; + //We need to locate DIV ID where other information are stored + for (let possibleId = 10; possibleId <= 20; possibleId++) { + const adTypeFieldTitle = $( + `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` + ) + .text() + .trim(); + + if (adTypeFieldTitle === "Vrsta oglasa") { + otherInformationDivId = possibleId; + break; } } - if (realEstateProperties["namješteno"]) { - furnishingType = FURNISHING_TYPE.FURNISHED.id; - } else if (realEstateProperties["polunamješteno"]) { - furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; - } else { - furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + if (!otherInformationDivId) { + throw { message: "Other information DIV could not be found" }; } - const adStatus = SaljicCrawler.getStatusId(status); - const title = property_name; - const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; - const parsedArea = parseFloat(size); - const gardenSize = null; - const longDescription = null; + const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; + const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; + const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; + const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; + + const publishedDate = $(publishedDateValueSelector) + .text() + .trim(); + + const publishedDateMoment = moment.tz( + publishedDate, + OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + + if (!publishedDateMoment.isValid()) { + throw { message: "Invalid published date ! Check parsing format" }; + } + + const renewedDate = $(renewedDateFullValueSelector) + .data("content") + .trim(); + + const renewedDateMoment = moment.tz( + renewedDate, + OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + + if (!renewedDateMoment) { + throw { + message: + "Invalid renewed date ! Check how parser parsed renewed date text" + }; + } + + adType = $( + `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` + ) + .text() + .trim(); + + const parsedCategory = this.getAdCategoryId(category); + if (!parsedCategory) { + throw { message: `Unknown ad category [${category}]` }; + } + + const parsedAdType = this.getAdTypeId(adType); + if (!parsedAdType) { + throw { message: "Unknown ad type" }; + } + + const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) + .text() + .trim(); + olxId = $(`${olxIdFieldSelector} > div.df2`) + .text() + .trim(); + numberOfViewsAgency = parseInt( + $(numberOfViewsAgencyValueSelector) + .text() + .trim() + ); + + if (olxIdFieldTitle !== "OLX ID") { + throw { message: "Cannot find correct OLX ID" }; + } + //=========================================== + + //====== DETAIL INFORMATION FIELDS ========== + let area, + gardenSize, + numberOfRooms = null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = null, + furnishingType = null, + balcony = null, + newBuilding = null, + elevator = null, + water = null, + electricity = null, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = null, + garage = null, + gas = null, + antiTheftDoor = null, + airCondition = null, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = null, + storeRoom = null, + videoSurveillance = null, + alarm = null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null; + + let fieldIndex = 1; + do { + const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; + const fieldTitleSelector = `${fieldSelector} > div.df1`; + const fieldValueSelector = `${fieldSelector} > div.df2`; + + const fieldTitle = $(fieldTitleSelector) + .text() + .trim() + .toLowerCase(); + const fieldValue = $(fieldValueSelector) + .text() + .trim() + .toLowerCase(); + + switch (fieldTitle) { + case "kvadrata": + area = fieldValue; + break; + case "okućnica (kvadratura)": + gardenSize = fieldValue; + break; + case "broj soba": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj prostorija": + numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); + break; + case "broj spratova": + numberOfFloors = this.parseNumberOfFloors( + fieldValue, + parsedCategory + ); + break; + case "sprat": + floor = this.parseFloorNumber(fieldValue, parsedCategory); + break; + case "vrsta grijanja": + heatingType = this.getHeatingTypeId(fieldValue); + break; + case "namješten?": + furnishingType = this.getFurnishingTypeId(fieldValue); + break; + case "namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "namještena": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "voda": + water = true; + break; + case "struja": + electricity = true; + break; + case "kanalizacija": + drainageSystem = fieldValue !== "nema"; + break; + case "godina izgradnje": + newBuilding = newBuilding || fieldValue === "novogradnja"; + break; + case "kućni ljubimci": + animalsAllowed = fieldValue === "da"; + break; + case "uknjiženo / zk": + registeredInZkBooks = true; + break; + case "uknjiženo (zk)": + registeredInZkBooks = true; + break; + case "novogradnja": + newBuilding = true; + break; + case "nedavno adaptiran": + recentlyAdapted = true; + break; + case "nedavno adaptirana": + recentlyAdapted = true; + break; + case "balkon": + balcony = true; + break; + case "lift": + elevator = true; + break; + case "parking": + parking = true; + break; + case "garaža": + garage = true; + break; + case "plin": + gas = true; + break; + case "blindirana vrata": + antiTheftDoor = true; + break; + case "klima": + airCondition = true; + break; + case "telefonski priključak": + phoneConnection = true; + break; + case "kablovska tv": + cableTV = true; + break; + case "internet": + internet = true; + break; + case "podrum/tavan": + basementAttic = true; + break; + case "ostava/špajz": + storeRoom = true; + break; + case "video nadzor": + videoSurveillance = true; + break; + case "alarm": + alarm = true; + break; + case "za studente": + suitableForStudents = true; + break; + case "uključen trošak režija": + includingBills = true; + break; + case "građevinska dozvola": + buildingPermit = true; + break; + case "komunalni priključak": + utilityConnection = true; + break; + case "urbanistička dozvola": + urbanPlanPermit = true; + break; + case "udaljenost od rijeke (m)": + distanceToRiver = parseInt(fieldValue) || null; + break; + case "prilaz": + accessRoadType = this.getAccessRoadTypeId(fieldValue); + break; + case "bazen": + pool = true; + break; + case "iznajmljeno": + status = AD_STATUS.STATUS_RENTED; + break; + default: + // console.log(fieldTitle, " = ", fieldValue); + break; + } + + if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + break; + } + } while (true); + //=========================================== + + //========================================= + const parsedArea = this.parseArea(area) || null; + const parsedGardenSize = this.parseArea(gardenSize) || null; + const parsedPrice = this.parsePrice(price) || null; + + if ( + title.indexOf("[PRODANO]") !== -1 || + title.indexOf("[ZAVRŠENO]") !== -1 + ) { + status = AD_STATUS.STATUS_SOLD; + } const data = { url, - agencyObjectId: prostorId, - originAgencyName: AD_AGENCY.SALJIC, - realEstateType, - adType, + agencyObjectId: olxId, + originAgencyName: AD_AGENCY.OLX, + realEstateType: parsedCategory, + adType: parsedAdType, title, price: parsedPrice, area: parsedArea, - gardenSize, - shortDescription: "", - longDescription: longDescription, + gardenSize: parsedGardenSize, + shortDescription: descriptions + .first() + .text() + .trim(), + longDescription: descriptions + .last() + .text() + .trim(), streetNumber: 0, - streetName: realEstateProperties["adresa"], + streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", - locationLat: lat, - locationLong: lng, - adStatus, + locationLat, + locationLong, + adStatus: status, + publishedDate: publishedDateMoment.toISOString(), + renewedDate: renewedDateMoment.toISOString(), numberOfRooms, numberOfFloors, floor, @@ -400,159 +643,219 @@ class SaljicCrawler { return data; } catch (e) { - console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url); - return null; - } - } - - async extractRealEstates(url) { - if (PRINT_CRAWLER_DEBUG) { - console.log("[SALJIC] Index page : ", url); - } - - try { - const res = await fetch(url); - const body = await res.text(); - const $ = cheerio.load(body); - console.log("SALJIC: $", $); - - const scriptElement = $( - "body > div > div.container-fluid > script:nth-child(7)" - ); - // - //console.log(scriptElement[0]); - //console.log(scriptElement[0].children); - if ( - scriptElement[0] && - scriptElement[0].children && - scriptElement[0].children[0] && - scriptElement[0].children[0].data - ) { - const scriptData = scriptElement[0].children[0].data; - // - console.log(scriptData); - try { - // script element data contains JS code and we need to extract only data for realEstates - // data string starts with : var map; var markers = [{"r ... - // so we remove first 23 characters - // - // real estate JSON data ends with ...}, ]; map = new... - // so we need to find index of that substring to know where to stop - // we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually - - const jsonEndIndex = scriptData.indexOf(", ]; map = new"); - if (jsonEndIndex > -1) { - const jsonData = scriptData.substring(23, jsonEndIndex) + "]"; - const realEstates = JSON.parse(jsonData); - - // const transformedRealEstates = []; - // - // for (const realEstate of realEstates) { - // const transformedRealEstate = SaljicCrawler.transformRealEstateData( - // realEstate - // ); - // if (transformedRealEstate) { - // transformedRealEstates.push(transformedRealEstate); - // } - // } - // - // return transformedRealEstates; - return realEstates; - } else { - throw { - message: "Something is wrong with JSON data or data is moved" - }; - } - } catch (e) { - console.log(e); - throw e; - } - } - } catch (e) { - console.error( - "[SALJIC] Exception caught:", - e.message || "UNKNOWN MESSAGE" - ); - return []; + console.error("Exception caught: " + e.message, "\r\nURL:", url); } + return null; } //======= HELPER FUNCTIONS ============= - static getAdCategoryId(categoryText) { + getAdCategoryId(categoryText) { switch (categoryText) { - case "stan": + case "Stanovi": return AD_CATEGORY.FLAT.id; - case "kuca": - return AD_CATEGORY.HOUSE.id; - case "apartman": - return AD_CATEGORY.APARTMENT.id; - case "poslovni-prostor": - return AD_CATEGORY.OFFICE.id; - case "garaza": - return AD_CATEGORY.GARAGE.id; - case "zemljiste": + case "Zemljišta": return AD_CATEGORY.LAND.id; + case "Kuće": + return AD_CATEGORY.HOUSE.id; + case "Poslovni prostori": + return AD_CATEGORY.OFFICE.id; + case "Apartmani": + return AD_CATEGORY.APARTMENT.id; + case "Garaže": + return AD_CATEGORY.GARAGE.id; + case "Vikendice": + return AD_CATEGORY.COTTAGE.id; default: return undefined; } } - static getAdTypeId(adTypeText) { + getAdTypeId(adTypeText) { switch (adTypeText) { - case "prodaja": + case "Prodaja": return AD_TYPE.AD_TYPE_SALE.stringId; - case "najam": + case "Izdavanje": return AD_TYPE.AD_TYPE_RENT.stringId; - case "novogradnja": - return AD_TYPE.AD_TYPE_SALE.stringId; + case "Potražnja": + return AD_TYPE.AD_TYPE_REQUEST.stringId; default: return undefined; } } - static getHeatingTypeId(realEstateProperties) { - const realEstatePropertiesKeys = Object.keys(realEstateProperties); - for (const property of realEstatePropertiesKeys) { - switch (property) { - case "centralno toplane": - return HEATING_TYPE.CENTRAL_CITY.id; - case "etažno plinsko": - return HEATING_TYPE.CENTRAL_GAS.id; - case "termo blok": - case "podno grijanje": - return HEATING_TYPE.OTHER.id; - case "etažno električno": - case "konvektori": - return HEATING_TYPE.ELECTRICITY.id; - case "plinske peći": - return HEATING_TYPE.GAS.id; - case "vlastita kotlovnica": - return HEATING_TYPE.CENTRAL_BOILER.id; - case "toplotna pumpa": - return HEATING_TYPE.HEAT_PUMP.id; - case "kamin": - return HEATING_TYPE.WOOD.id; + getHeatingTypeId(heatingTypeText) { + switch (heatingTypeText) { + case "struja": + return HEATING_TYPE.ELECTRICITY.id; + case "plin": + return HEATING_TYPE.GAS.id; + case "drva": + return HEATING_TYPE.WOOD.id; + case "centralno (gradsko)": + return HEATING_TYPE.CENTRAL_CITY.id; + case "centralno (kotlovnica)": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "centralno (plin)": + return HEATING_TYPE.CENTRAL_GAS.id; + case "nije uvedeno": + return HEATING_TYPE.NO_HEATING.id; + case "ostalo": + return HEATING_TYPE.OTHER.id; + case "drugo": + return HEATING_TYPE.OTHER.id; + default: + console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); + return null; + } + } + + getFurnishingTypeId(furnishingTypeText) { + switch (furnishingTypeText) { + case "namješten": + return FURNISHING_TYPE.FURNISHED.id; + case "polunamješten": + return FURNISHING_TYPE.HALF_FURNISHED.id; + case "nenamješten": + return FURNISHING_TYPE.NOT_FURNISHED.id; + case "": + return FURNISHING_TYPE.FURNISHED.id; + default: + console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); + return null; + } + } + + getAccessRoadTypeId(accessRoadTypeText) { + switch (accessRoadTypeText) { + case "asfalt": + return ACCESS_ROAD_TYPE.ASPHALT.id; + case "beton": + return ACCESS_ROAD_TYPE.CONCRETE.id; + case "makadam": + return ACCESS_ROAD_TYPE.MACADAM.id; + case "ostalo": + return ACCESS_ROAD_TYPE.OTHER.id; + default: + console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); + return null; + } + } + + parseArea(areaText) { + if (!areaText) { + return NaN; + } + const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; + const textWithOnlyOneDecimalDot = areaText + .replace(",", ".") + .replace(removeDotsExceptLastOneRegex, ""); + + return parseFloat(textWithOnlyOneDecimalDot); + } + + parsePrice(priceText) { + if (!priceText) { + return NaN; + } + const formattedPriceText = priceText.replace(".", "").replace(",", "."); + return parseFloat(formattedPriceText); + } + + parseNumberOfRooms(numberOfRoomsText, categoryId) { + if (categoryId === AD_CATEGORY.FLAT.id) { + switch (numberOfRoomsText) { + case "garsonjera": + return 0; + case "jednosoban (1)": + return 1; + case "jednoiposoban (1.5)": + return 1.5; + case "dvosoban (2)": + return 2; + case "trosoban (3)": + return 3; + case "četverosoban (4)": + return 4; + case "petosoban i više": + return 5; default: - //console.log("[SALJIC] Nepoznato >>> [", property, "]"); + console.log( + "broj soba [stan] = NEPOZNATO [", + numberOfRoomsText, + ", ", + categoryId, + "]" + ); + return null; } } + + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id || + categoryId === AD_CATEGORY.APARTMENT.id || + categoryId === AD_CATEGORY.OFFICE.id + ) { + return parseInt(numberOfRoomsText) || null; + } + + console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); + return null; } - static getStatusId(statusText) { - switch (statusText) { - case "": - return AD_STATUS.STATUS_NORMAL; - case "Rezervisano": - return AD_STATUS.STATUS_RESERVED; - case "Prodano": - return AD_STATUS.STATUS_SOLD; - case "Iznajmljeno": - return AD_STATUS.STATUS_RENTED; - default: - console.log("[SALJIC] Unknown AD_STATUS : [", statusText, "]"); - return AD_STATUS.STATUS_NORMAL; + parseNumberOfFloors(numberOfFloorsText, categoryId) { + if ( + categoryId === AD_CATEGORY.HOUSE.id || + categoryId === AD_CATEGORY.COTTAGE.id + ) { + return parseInt(numberOfFloorsText) || null; } + + if (categoryId === AD_CATEGORY.OFFICE.id) { + if ( + numberOfFloorsText === "suteren" || + numberOfFloorsText === "prizemlje" + ) { + return 0; + } + if (numberOfFloorsText === "6+") { + return 7; + } + return parseInt(numberOfFloorsText) || null; + } + + console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); + return null; + } + + parseFloorNumber(floorText, categoryId) { + if ( + categoryId === AD_CATEGORY.FLAT.id || + categoryId === AD_CATEGORY.APARTMENT.id + ) { + if ( + floorText === "suteren" || + floorText === "prizemlje" || + floorText === "visoko prizemlje" + ) { + return 0; + } + return parseInt(floorText) || null; + } + + if (categoryId === AD_CATEGORY.OFFICE.id) { + if (floorText === "zaseban objekat") { + return null; + } + if (floorText === "prizemlje" || floorText === "visoko prizemlje") { + return 0; + } + return parseInt(floorText) || null; + } + + console.log("sprat = NEPOZNATO [", floorText, "]"); + return null; } async sleep(ms) { -- 2.47.3 From 7a7aecb3ee21c950d0f006e425fabd53166cefff Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Fri, 31 Jan 2020 00:55:24 +0100 Subject: [PATCH 4/5] WIP Scraped no of rooms, floors etc. --- app/crawler/specificCrawlers/saljic.js | 291 +++++-------------------- 1 file changed, 49 insertions(+), 242 deletions(-) diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 4156727..95c24bb 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -267,103 +267,6 @@ class SaljicCrawler { console.log("Lat:", locationLat); console.log("Long:", locationLong); - //const category = $(propertySelectors.category) - //.text() - //.trim(); - - //====== OTHER AD INFORMATION =============== - let adType = null; - let olxId = null; - let numberOfViewsAgency = null; - - let otherInformationDivId; - //We need to locate DIV ID where other information are stored - for (let possibleId = 10; possibleId <= 20; possibleId++) { - const adTypeFieldTitle = $( - `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` - ) - .text() - .trim(); - - if (adTypeFieldTitle === "Vrsta oglasa") { - otherInformationDivId = possibleId; - break; - } - } - - if (!otherInformationDivId) { - throw { message: "Other information DIV could not be found" }; - } - - const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; - const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; - const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; - const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; - - const publishedDate = $(publishedDateValueSelector) - .text() - .trim(); - - const publishedDateMoment = moment.tz( - publishedDate, - OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); - - if (!publishedDateMoment.isValid()) { - throw { message: "Invalid published date ! Check parsing format" }; - } - - const renewedDate = $(renewedDateFullValueSelector) - .data("content") - .trim(); - - const renewedDateMoment = moment.tz( - renewedDate, - OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, - DEFAULT_TIMEZONE - ); - - if (!renewedDateMoment) { - throw { - message: - "Invalid renewed date ! Check how parser parsed renewed date text" - }; - } - - adType = $( - `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` - ) - .text() - .trim(); - - const parsedCategory = this.getAdCategoryId(category); - if (!parsedCategory) { - throw { message: `Unknown ad category [${category}]` }; - } - - const parsedAdType = this.getAdTypeId(adType); - if (!parsedAdType) { - throw { message: "Unknown ad type" }; - } - - const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) - .text() - .trim(); - olxId = $(`${olxIdFieldSelector} > div.df2`) - .text() - .trim(); - numberOfViewsAgency = parseInt( - $(numberOfViewsAgencyValueSelector) - .text() - .trim() - ); - - if (olxIdFieldTitle !== "OLX ID") { - throw { message: "Cannot find correct OLX ID" }; - } - //=========================================== - //====== DETAIL INFORMATION FIELDS ========== let area, gardenSize, @@ -401,177 +304,81 @@ class SaljicCrawler { buildingPermit = null, utilityConnection = null, distanceToRiver = null; + let publishedDate = null; + let renewedDate = null; - let fieldIndex = 1; + //Extracting data - Glavne karakteristike + let mainFieldIndex = 1; do { - const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; - const fieldTitleSelector = `${fieldSelector} > div.df1`; - const fieldValueSelector = `${fieldSelector} > div.df2`; + const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`; - const fieldTitle = $(fieldTitleSelector) + const mainField = $(mainFieldSelector) .text() - .trim() - .toLowerCase(); - const fieldValue = $(fieldValueSelector) - .text() - .trim() - .toLowerCase(); + .replace(/[\n\r\t]/gm, "") + .trim(); - switch (fieldTitle) { - case "kvadrata": - area = fieldValue; - break; - case "okućnica (kvadratura)": - gardenSize = fieldValue; - break; - case "broj soba": - numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); - break; - case "broj prostorija": - numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); - break; - case "broj spratova": - numberOfFloors = this.parseNumberOfFloors( - fieldValue, - parsedCategory + const mainFieldTitle = mainField.substring(0, mainField.indexOf(" ")); + const mainFieldValue = mainField + .substring(mainField.indexOf(" "), mainField.length) + .trim(); + + switch (mainFieldTitle) { + case "Površina": + area = parseFloat( + mainFieldValue.substring(0, mainFieldValue.indexOf(" ")) ); break; - case "sprat": - floor = this.parseFloorNumber(fieldValue, parsedCategory); + case "Okućnica": + gardenSize = parseFloat( + mainFieldValue.substring(0, mainFieldValue.indexOf(" ")) + ); break; - case "vrsta grijanja": - heatingType = this.getHeatingTypeId(fieldValue); + case "Broj soba": + numberOfRooms = parseInt(mainFieldValue); break; - case "namješten?": - furnishingType = this.getFurnishingTypeId(fieldValue); + case "Broj spratova": + numberOfFloors = parseInt(mainFieldValue); break; - case "namješten": - furnishingType = FURNISHING_TYPE.FURNISHED.id; + case "Sprat": + floor = parseInt(mainFieldValue); break; - case "namještena": - furnishingType = FURNISHING_TYPE.FURNISHED.id; - break; - case "voda": - water = true; - break; - case "struja": - electricity = true; - break; - case "kanalizacija": - drainageSystem = fieldValue !== "nema"; - break; - case "godina izgradnje": - newBuilding = newBuilding || fieldValue === "novogradnja"; - break; - case "kućni ljubimci": - animalsAllowed = fieldValue === "da"; - break; - case "uknjiženo / zk": - registeredInZkBooks = true; - break; - case "uknjiženo (zk)": - registeredInZkBooks = true; - break; - case "novogradnja": - newBuilding = true; - break; - case "nedavno adaptiran": + case "Godina renoviranja": recentlyAdapted = true; break; - case "nedavno adaptirana": - recentlyAdapted = true; - break; - case "balkon": - balcony = true; - break; - case "lift": - elevator = true; - break; - case "parking": + case "Broj parking mjesta": + `${month}/${day}/${year}`; parking = true; break; - case "garaža": - garage = true; - break; - case "plin": - gas = true; - break; - case "blindirana vrata": - antiTheftDoor = true; - break; - case "klima": - airCondition = true; - break; - case "telefonski priključak": - phoneConnection = true; - break; - case "kablovska tv": - cableTV = true; - break; - case "internet": - internet = true; - break; - case "podrum/tavan": - basementAttic = true; - break; - case "ostava/špajz": - storeRoom = true; - break; - case "video nadzor": - videoSurveillance = true; - break; - case "alarm": - alarm = true; - break; - case "za studente": - suitableForStudents = true; - break; - case "uključen trošak režija": - includingBills = true; - break; - case "građevinska dozvola": - buildingPermit = true; - break; - case "komunalni priključak": - utilityConnection = true; - break; - case "urbanistička dozvola": - urbanPlanPermit = true; - break; - case "udaljenost od rijeke (m)": - distanceToRiver = parseInt(fieldValue) || null; - break; - case "prilaz": - accessRoadType = this.getAccessRoadTypeId(fieldValue); - break; - case "bazen": - pool = true; - break; - case "iznajmljeno": - status = AD_STATUS.STATUS_RENTED; + case "Dostupno od": + const day = mainFieldValue.substring(0, 2); + const month = mainFieldValue.substring(3, 5); + const year = mainFieldValue.substring(6, mainFieldValue.length); + console.log(`${month}/${day}/${year}`); + publishedDate = new Date(`${month}/${day}/${year}`); break; default: // console.log(fieldTitle, " = ", fieldValue); break; } - if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + if (mainFieldTitle === "") { break; } + mainFieldIndex++; } while (true); - //=========================================== - //========================================= - const parsedArea = this.parseArea(area) || null; - const parsedGardenSize = this.parseArea(gardenSize) || null; - const parsedPrice = this.parsePrice(price) || null; + console.log("Area:", area); + console.log("Garden size:", gardenSize); + console.log("Number of rooms:", numberOfRooms); + console.log("Number of floors", numberOfFloors); + console.log("Floor:", floor); + console.log("Adapted:", recentlyAdapted); + console.log("Parking:", parking); + console.log("Published date:", publishedDate); - if ( - title.indexOf("[PRODANO]") !== -1 || - title.indexOf("[ZAVRŠENO]") !== -1 - ) { - status = AD_STATUS.STATUS_SOLD; - } + //const category = $(propertySelectors.category) + //.text() + //.trim(); const data = { url, -- 2.47.3 From 1ba7cf8531057ae25d911fe76f8f0d8f31d77b4b Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Fri, 31 Jan 2020 22:03:39 +0100 Subject: [PATCH 5/5] Added crawler for Saljic nekretnine. --- app/crawler/specificCrawlers/saljic.js | 382 ++++++++++--------------- 1 file changed, 154 insertions(+), 228 deletions(-) diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 95c24bb..37c569e 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -174,6 +174,25 @@ class SaljicCrawler { } }); + let adTypesTmp = []; + + $("#shop") + .find(".product") + .each((i, elem) => { + const adType = $(elem) + .find(".trakica-search-page") + .text() + .trim(); + if (adType) { + adTypesTmp.push(adType); + } + }); + + //Converting to AD_TYPE + const adTypes = adTypesTmp.map(adTypeText => { + return this.getAdTypeId(adTypeText); + }); + //Converting to absolute URLs const hrefsAbs = hrefs.map(link => { return "https://www.saljicnekretnine.ba" + link; @@ -186,7 +205,7 @@ class SaljicCrawler { const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { - asyncScraping.push(this.scrapeAd(hrefsAbs[i])); + asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i])); } const scrapedData = await Promise.all(asyncScraping); @@ -198,16 +217,19 @@ class SaljicCrawler { } } - async scrapeAd(url) { + async scrapeAd(url, adType) { console.log("[SALJIC] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); - // ??? treba li nesto za status - let status = AD_STATUS.STATUS_NORMAL; + // No information for status ex. PRODAN + const status = AD_STATUS.STATUS_NORMAL; + //Extracting agency ID from url + const agencyObjectId = parseInt(url.substring(46, url.length)); + //Extracting main properties const propertySelectors = { title: "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2", @@ -227,7 +249,6 @@ class SaljicCrawler { .replace(/ {1,}/g, " ") .trim(); - console.log("Title:", title); const priceText = $(propertySelectors.price) .text() .replace(/(\r\n|\n|\r)/gm, "") @@ -240,18 +261,14 @@ class SaljicCrawler { priceText.substring(8, priceText.length - 3).replace(",", "") ); - console.log("Price:", price); - const streetName = $(propertySelectors.streetName) .text() .replace(/(\r\n|\n|\r)/gm, "") .trim(); - console.log("Street:", streetName); const descriptions = $(propertySelectors.descriptions) .text() .trim(); - console.log("Description:", descriptions); const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); const latText = latAndLongSrc.substring( @@ -264,8 +281,6 @@ class SaljicCrawler { ); const locationLat = parseFloat(latText) || null; const locationLong = parseFloat(longText) || null; - console.log("Lat:", locationLat); - console.log("Long:", locationLong); //====== DETAIL INFORMATION FIELDS ========== let area, @@ -306,6 +321,8 @@ class SaljicCrawler { distanceToRiver = null; let publishedDate = null; let renewedDate = null; + let realEstateType; + let numberOfViewsAgency = null; //Extracting data - Glavne karakteristike let mainFieldIndex = 1; @@ -346,18 +363,15 @@ class SaljicCrawler { recentlyAdapted = true; break; case "Broj parking mjesta": - `${month}/${day}/${year}`; parking = true; break; case "Dostupno od": const day = mainFieldValue.substring(0, 2); const month = mainFieldValue.substring(3, 5); const year = mainFieldValue.substring(6, mainFieldValue.length); - console.log(`${month}/${day}/${year}`); publishedDate = new Date(`${month}/${day}/${year}`); break; default: - // console.log(fieldTitle, " = ", fieldValue); break; } @@ -367,39 +381,121 @@ class SaljicCrawler { mainFieldIndex++; } while (true); - console.log("Area:", area); - console.log("Garden size:", gardenSize); - console.log("Number of rooms:", numberOfRooms); - console.log("Number of floors", numberOfFloors); - console.log("Floor:", floor); - console.log("Adapted:", recentlyAdapted); - console.log("Parking:", parking); - console.log("Published date:", publishedDate); + //Extracting data - Sadrzaji + let additionalFieldIndex = 1; + do { + const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`; - //const category = $(propertySelectors.category) - //.text() - //.trim(); + const additionalField = $(additionalFieldSelector) + .text() + .trim(); + + if (additionalFieldIndex === 1) { + //Extracting data of real estate type + const categoryTmp = additionalField + .replace(/[\n\r\t]/gm, "") + .substring( + additionalField.indexOf("Kategorija") + 10, + additionalField.length + ) + .trim(); + realEstateType = this.getAdCategoryId(categoryTmp); + } else { + switch (additionalField) { + case "Internet": + internet = true; + break; + case "Garaža": + garage = true; + break; + case "Klima": + airCondition = true; + break; + case "Balkon": + balcony = true; + break; + case "Ostava": + storeRoom = true; + break; + case "Podrum": + basementAttic = true; + break; + case "Blindirana vrata": + antiTheftDoor = true; + break; + case "Voda": + water = true; + break; + case "Kablovska": + cableTV = true; + break; + case "Uknjiženo": + registeredInZkBooks = true; + break; + case "Grijanje - centralno": + heatingType = HEATING_TYPE.CENTRAL_CITY.id; + break; + case "Grijanje - plin": + heatingType = HEATING_TYPE.GAS.id; + break; + case "Grijanje - struja": + heatingType = HEATING_TYPE.ELECTRICITY.id; + break; + case "Grijanje": + heatingType = HEATING_TYPE.OTHER.id; + break; + case "Plin": + gas = true; + break; + case "Namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "Alarm": + alarm = true; + break; + case "Video nadzor": + videoSurveillance = true; + break; + case "Lift": + elevator = true; + break; + case "Novogradnja": + newBuilding = true; + break; + + default: + break; + } + } + + if (additionalField === "") { + break; + } + additionalFieldIndex++; + } while (true); + + //If no published date it takes current date of crawling + if (publishedDate) { + renewedDate = new Date(); + } else { + publishedDate = new Date(); + renewedDate = new Date(); + } const data = { url, - agencyObjectId: olxId, - originAgencyName: AD_AGENCY.OLX, - realEstateType: parsedCategory, - adType: parsedAdType, + agencyObjectId, + originAgencyName: AD_AGENCY.SALJIC, + realEstateType, + adType, title, - price: parsedPrice, - area: parsedArea, - gardenSize: parsedGardenSize, - shortDescription: descriptions - .first() - .text() - .trim(), - longDescription: descriptions - .last() - .text() - .trim(), + price, + area, + gardenSize, + shortDescription: descriptions.substring(0, descriptions.indexOf(".")), + longDescription: descriptions, streetNumber: 0, - streetName: "", + streetName, locality: "", municipality: "", city: "", @@ -409,8 +505,8 @@ class SaljicCrawler { locationLat, locationLong, adStatus: status, - publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString(), + publishedDate, + renewedDate, numberOfRooms, numberOfFloors, floor, @@ -447,7 +543,7 @@ class SaljicCrawler { distanceToRiver, numberOfViewsAgency }; - + console.log(data); return data; } catch (e) { console.error("Exception caught: " + e.message, "\r\nURL:", url); @@ -459,19 +555,25 @@ class SaljicCrawler { getAdCategoryId(categoryText) { switch (categoryText) { - case "Stanovi": + case "Stan": return AD_CATEGORY.FLAT.id; - case "Zemljišta": + case "Građevinsko zemljiste": return AD_CATEGORY.LAND.id; - case "Kuće": + case "Industrijsko zemljiste": + return AD_CATEGORY.LAND.id; + case "Poljoprivredno zemljiste": + return AD_CATEGORY.LAND.id; + case "Kuća": return AD_CATEGORY.HOUSE.id; - case "Poslovni prostori": + case "Poslovni prostor": + return AD_CATEGORY.OFFICE.id; + case "Kancelarije": return AD_CATEGORY.OFFICE.id; case "Apartmani": return AD_CATEGORY.APARTMENT.id; - case "Garaže": + case "Garaža": return AD_CATEGORY.GARAGE.id; - case "Vikendice": + case "Vikendica": return AD_CATEGORY.COTTAGE.id; default: return undefined; @@ -480,191 +582,15 @@ class SaljicCrawler { getAdTypeId(adTypeText) { switch (adTypeText) { - case "Prodaja": + case "PRODAJA": return AD_TYPE.AD_TYPE_SALE.stringId; - case "Izdavanje": + case "NAJAM": return AD_TYPE.AD_TYPE_RENT.stringId; - case "Potražnja": - return AD_TYPE.AD_TYPE_REQUEST.stringId; default: return undefined; } } - getHeatingTypeId(heatingTypeText) { - switch (heatingTypeText) { - case "struja": - return HEATING_TYPE.ELECTRICITY.id; - case "plin": - return HEATING_TYPE.GAS.id; - case "drva": - return HEATING_TYPE.WOOD.id; - case "centralno (gradsko)": - return HEATING_TYPE.CENTRAL_CITY.id; - case "centralno (kotlovnica)": - return HEATING_TYPE.CENTRAL_BOILER.id; - case "centralno (plin)": - return HEATING_TYPE.CENTRAL_GAS.id; - case "nije uvedeno": - return HEATING_TYPE.NO_HEATING.id; - case "ostalo": - return HEATING_TYPE.OTHER.id; - case "drugo": - return HEATING_TYPE.OTHER.id; - default: - console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); - return null; - } - } - - getFurnishingTypeId(furnishingTypeText) { - switch (furnishingTypeText) { - case "namješten": - return FURNISHING_TYPE.FURNISHED.id; - case "polunamješten": - return FURNISHING_TYPE.HALF_FURNISHED.id; - case "nenamješten": - return FURNISHING_TYPE.NOT_FURNISHED.id; - case "": - return FURNISHING_TYPE.FURNISHED.id; - default: - console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); - return null; - } - } - - getAccessRoadTypeId(accessRoadTypeText) { - switch (accessRoadTypeText) { - case "asfalt": - return ACCESS_ROAD_TYPE.ASPHALT.id; - case "beton": - return ACCESS_ROAD_TYPE.CONCRETE.id; - case "makadam": - return ACCESS_ROAD_TYPE.MACADAM.id; - case "ostalo": - return ACCESS_ROAD_TYPE.OTHER.id; - default: - console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); - return null; - } - } - - parseArea(areaText) { - if (!areaText) { - return NaN; - } - const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; - const textWithOnlyOneDecimalDot = areaText - .replace(",", ".") - .replace(removeDotsExceptLastOneRegex, ""); - - return parseFloat(textWithOnlyOneDecimalDot); - } - - parsePrice(priceText) { - if (!priceText) { - return NaN; - } - const formattedPriceText = priceText.replace(".", "").replace(",", "."); - return parseFloat(formattedPriceText); - } - - parseNumberOfRooms(numberOfRoomsText, categoryId) { - if (categoryId === AD_CATEGORY.FLAT.id) { - switch (numberOfRoomsText) { - case "garsonjera": - return 0; - case "jednosoban (1)": - return 1; - case "jednoiposoban (1.5)": - return 1.5; - case "dvosoban (2)": - return 2; - case "trosoban (3)": - return 3; - case "četverosoban (4)": - return 4; - case "petosoban i više": - return 5; - default: - console.log( - "broj soba [stan] = NEPOZNATO [", - numberOfRoomsText, - ", ", - categoryId, - "]" - ); - return null; - } - } - - if ( - categoryId === AD_CATEGORY.HOUSE.id || - categoryId === AD_CATEGORY.COTTAGE.id || - categoryId === AD_CATEGORY.APARTMENT.id || - categoryId === AD_CATEGORY.OFFICE.id - ) { - return parseInt(numberOfRoomsText) || null; - } - - console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); - return null; - } - - parseNumberOfFloors(numberOfFloorsText, categoryId) { - if ( - categoryId === AD_CATEGORY.HOUSE.id || - categoryId === AD_CATEGORY.COTTAGE.id - ) { - return parseInt(numberOfFloorsText) || null; - } - - if (categoryId === AD_CATEGORY.OFFICE.id) { - if ( - numberOfFloorsText === "suteren" || - numberOfFloorsText === "prizemlje" - ) { - return 0; - } - if (numberOfFloorsText === "6+") { - return 7; - } - return parseInt(numberOfFloorsText) || null; - } - - console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); - return null; - } - - parseFloorNumber(floorText, categoryId) { - if ( - categoryId === AD_CATEGORY.FLAT.id || - categoryId === AD_CATEGORY.APARTMENT.id - ) { - if ( - floorText === "suteren" || - floorText === "prizemlje" || - floorText === "visoko prizemlje" - ) { - return 0; - } - return parseInt(floorText) || null; - } - - if (categoryId === AD_CATEGORY.OFFICE.id) { - if (floorText === "zaseban objekat") { - return null; - } - if (floorText === "prizemlje" || floorText === "visoko prizemlje") { - return 0; - } - return parseInt(floorText) || null; - } - - console.log("sprat = NEPOZNATO [", floorText, "]"); - return null; - } - async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } -- 2.47.3