"use strict";

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
  AD_TYPE,
  AD_CATEGORY,
  AD_AGENCY,
  AD_STATUS,
  CRAWLER_AD_TYPE,
  FURNISHING_TYPE,
  HEATING_TYPE
} = require("../../common/enums");
const {
  PRINT_CRAWLER_DEBUG,
  DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");

/** Query-string fragments understood by the site's search page. */
const SALJIC_ENUMS = {
  SALJIC_AD_TYPE: {
    [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
    [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
    [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
  },
  SALJIC_AD_CATEGORY: {
    [AD_CATEGORY.ALL.id]: "&input_kategorija=",
    [AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
    [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
    // Site categories 3 and 4 are also (construction) land.
    [AD_CATEGORY.LAND.id]: "&input_kategorija=5",
    [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
    [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
    [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
    // No mapping is defined for AD_CATEGORY.COTTAGE.
  }
};

const SITE_ORIGIN = "https://www.saljicnekretnine.ba";

// The search page is paginated through a result offset that grows by this
// amount per page (the first page uses an empty offset).
const INDEX_PAGE_OFFSET_STEP = 2 * 11;

// The ad id is read from a fixed character offset of the absolute ad URL.
// NOTE(review): 46 presumably equals the length of
// "https://www.saljicnekretnine.ba/v2/nekretnina/" - confirm against live URLs.
const AGENCY_ID_URL_OFFSET = 46;

// Price label shown by the site when no price is published.
const PRICE_ON_REQUEST = "CIJENA NA UPIT";

const AD_ENTRY_SELECTOR =
  "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry";
const AD_CONTENT_SELECTOR = `${AD_ENTRY_SELECTOR} > div.entry-content.topmargin`;

/** CSS selectors used on a single ad page. */
const AD_SELECTORS = {
  title: `${AD_ENTRY_SELECTOR} > div.entry-title > h2`,
  price: `${AD_ENTRY_SELECTOR} > div.topmargin-sm.single-product > div.product > div.product-price > ins`,
  streetName: `${AD_CONTENT_SELECTOR} > p`,
  descriptions: `${AD_CONTENT_SELECTOR} > div.toggle.toggle-bg > div.togglec >p:nth-child(1)`,
  latAndLong: `${AD_CONTENT_SELECTOR} > div.gmap.bottommargin > iframe`,
  // The two list selectors get ":nth-child(n)" appended by the extractors.
  mainField: `${AD_CONTENT_SELECTOR} > div.col-md-12.bottommargin > ul > li.list-group-item`,
  additionalField: `${AD_CONTENT_SELECTOR} > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom`
};

/** Labels recognised in the "main characteristics" list of an ad page. */
const MAIN_FIELD_LABELS = [
  "Površina",
  "Okućnica",
  "Broj soba",
  "Broj spratova",
  "Sprat",
  "Godina renoviranja",
  "Broj parking mjesta",
  "Dostupno od"
];

/**
 * Converts a failed numeric parse into `null`.
 *
 * @param {number} value - Result of `parseFloat` / `parseInt`.
 * @returns {number|null} `value`, or `null` when it is `NaN`.
 */
function toNumberOrNull(value) {
  return Number.isNaN(value) ? null : value;
}

/**
 * Removes line breaks, collapses runs of spaces and trims the text.
 *
 * @param {string} text - Raw text read from the page.
 * @returns {string} Single-line text.
 */
function toSingleLine(text) {
  return text
    .replace(/(\r\n|\n|\r)/gm, "")
    .replace(/ {1,}/g, " ")
    .trim();
}

/**
 * Parses the price label of an ad page.
 *
 * The numeric part is taken between the first 8 and the last 3 characters of
 * the label and ALL commas are dropped before parsing (removing only the
 * first one would truncate values such as "1,250,000" to 1250).
 *
 * @param {string} priceText - Single-line price label.
 * @returns {number|null} The price, or `null` when it is on request or
 *   cannot be parsed.
 */
function parsePrice(priceText) {
  if (priceText === PRICE_ON_REQUEST) {
    return null;
  }
  const numericPart = priceText
    .substring(8, priceText.length - 3)
    .replace(/,/g, "");
  return toNumberOrNull(parseFloat(numericPart));
}

/**
 * Reads "marker=<lat>%2C<long>" from the `src` of the embedded map.
 *
 * @param {string|undefined} mapSrc - `src` attribute of the map iframe.
 * @returns {{locationLat: (number|null), locationLong: (number|null)}}
 *   Coordinates, `null` for every part that is missing or not numeric.
 */
function parseCoordinates(mapSrc) {
  const missing = { locationLat: null, locationLong: null };
  if (typeof mapSrc !== "string") {
    // The ad has no map; that must not discard the whole ad.
    return missing;
  }
  const markerPos = mapSrc.indexOf("marker=");
  if (markerPos === -1) {
    return missing;
  }
  const separatorPos = mapSrc.indexOf("%2C", markerPos);
  if (separatorPos === -1) {
    return missing;
  }
  const latText = mapSrc.substring(markerPos + 7, separatorPos);
  const longText = mapSrc.substring(separatorPos + 3);
  return {
    locationLat: parseFloat(latText) || null,
    locationLong: parseFloat(longText) || null
  };
}

/**
 * Splits a "main characteristics" list item into its label and value.
 *
 * Known labels are matched as a whole prefix, so labels that contain spaces
 * (e.g. "Broj soba") are recognised as well.
 *
 * @param {string} text - Cleaned text of the list item.
 * @returns {{label: (string|null), value: string}} `label` is `null` when
 *   the item does not start with a known label.
 */
function splitMainField(text) {
  for (const label of MAIN_FIELD_LABELS) {
    if (text.startsWith(label)) {
      const rest = text.slice(label.length);
      // Require a separator so that "Sprat" does not match a longer word.
      if (rest === "" || /^\s/.test(rest)) {
        return { label, value: rest.trim() };
      }
    }
  }
  return { label: null, value: text };
}

/**
 * Builds the detail fields of an ad, all set to `null`.
 *
 * @returns {Object} Detail fields in the order they appear in the saved record.
 */
function createEmptyDetails() {
  return {
    area: null,
    gardenSize: null,
    numberOfRooms: null,
    numberOfFloors: null,
    floor: null,
    accessRoadType: null,
    heatingType: null,
    furnishingType: null,
    balcony: null,
    newBuilding: null,
    elevator: null,
    water: null,
    electricity: null,
    drainageSystem: null,
    registeredInZkBooks: null,
    recentlyAdapted: null,
    parking: null,
    garage: null,
    gas: null,
    antiTheftDoor: null,
    airCondition: null,
    phoneConnection: null,
    cableTV: null,
    internet: null,
    basementAttic: null,
    storeRoom: null,
    videoSurveillance: null,
    alarm: null,
    suitableForStudents: null,
    includingBills: null,
    animalsAllowed: null,
    pool: null,
    exchange: null,
    urbanPlanPermit: null,
    buildingPermit: null,
    utilityConnection: null,
    distanceToRiver: null
  };
}

/**
 * Crawler for saljicnekretnine.ba: walks the search result pages of every
 * configured category, scrapes each listed ad and hands the scraped ads to
 * the configured saver.
 */
class SaljicCrawler {
  /**
   * @param {Array} savers - Savers; only the first one is used (see
   *   `saveCrawledResults`).
   * @param {*} crawlerAdTypes - One of the `CRAWLER_AD_TYPE` values.
   * @param {Array} crawlerAdCategories - `AD_CATEGORY` entries (or their ids)
   *   to crawl.
   * @param {number} maxPages - Maximum number of result pages per category.
   * @param {number} maxResultsPerPage - Maximum number of ads scraped per
   *   result page.
   * @param {Array} ignoredUsernames - Stored as-is; not used by this crawler.
   * @param {number} delayBetweenPages - Pause in ms between crawl rounds.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = `${SITE_ORIGIN}/v2/nekretnine_search`;
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    // Was never stored before, which silently disabled the page limit.
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Crawls all configured categories in lock-step, one result page per
   * category per round, and saves every page as soon as it is scraped.
   *
   * A category is dropped when it has no more pages, or when saving a page
   * produced no new records and `SALJIC_FORCE_CRAWL` is off.
   *
   * @returns {Promise<Array>} All records the saver reported as new.
   */
  async crawl() {
    const newRealEstates = [];
    const configuredCategories = this.crawlerAdCategories;
    if (!configuredCategories) {
      return newRealEstates;
    }

    const categories = Array.isArray(configuredCategories)
      ? configuredCategories
      : [configuredCategories];
    let activeIndexers = categories.map(adCategory =>
      this.categoryIndexer(adCategory)
    );

    while (activeIndexers.length > 0) {
      const singlePageResults = await Promise.all(
        activeIndexers.map(indexer => indexer.next())
      );

      const finishedIndexes = new Set();
      for (const [
        index,
        { value: singlePageResult }
      ] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // The generator returned undefined: no more pages for this category.
          finishedIndexes.add(index);
          continue;
        }

        const saveResults = await this.saveCrawledResults(singlePageResult);
        const newRecords = saveResults ? saveResults.newRecords : undefined;
        if (!Array.isArray(newRecords)) {
          // Nothing usable was reported; keep crawling this category.
          continue;
        }
        newRealEstates.push(...newRecords);
        if (newRecords.length === 0 && !SALJIC_FORCE_CRAWL) {
          finishedIndexes.add(index);
        }
      }

      activeIndexers = activeIndexers.filter(
        (indexer, index) => !finishedIndexes.has(index)
      );
      if (activeIndexers.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }

    return newRealEstates;
  }

  /**
   * Async generator that yields the scraped ads of one search result page
   * at a time for a single category.
   *
   * It finishes when the ad type or category has no URL mapping, when a page
   * yields no ads, or after `maxPages` pages.
   *
   * @param {Object|number|string} adCategory - `AD_CATEGORY` entry or its id.
   * @yields {Array<Object>} Scraped ads of one result page.
   */
  async *categoryIndexer(adCategory) {
    const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];

    // The category table is keyed by id; accept the enum entry as well.
    let urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory];
    if (
      urlCategoryPart === undefined &&
      adCategory !== null &&
      typeof adCategory === "object"
    ) {
      urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory.id];
    }

    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return;
    }

    const pageLimit = Number.isFinite(this.maxPages) ? this.maxPages : Infinity;
    for (let pageToIndex = 1; pageToIndex <= pageLimit; pageToIndex++) {
      const urlPagePart =
        pageToIndex === 1 ? "" : (pageToIndex - 1) * INDEX_PAGE_OFFSET_STEP;
      const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;

      const singlePageResults = await this.indexSinglePage(
        urlPageToCrawl,
        this.maxResultsPerPage
      );
      if (!Array.isArray(singlePageResults) || singlePageResults.length === 0) {
        return;
      }
      yield singlePageResults;
    }
  }

  /**
   * Loads one search result page and scrapes the ads listed on it.
   *
   * @param {string} url - URL of the result page.
   * @param {number} maxResultsPerPage - Maximum number of ads to scrape.
   * @returns {Promise<Array<Object>>} Successfully scraped ads; an empty
   *   array when the page cannot be loaded or processed.
   */
  async indexSinglePage(url, maxResultsPerPage) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[SALJIC] Index page : ", url);
    }

    try {
      const res = await fetch(url);
      if (res.ok === false) {
        throw new Error(`Unexpected HTTP status ${res.status}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);

      // Read link and ad type from the same listing element so that a
      // listing missing one of them cannot shift the pairing of the others.
      const listings = [];
      $("#shop")
        .find(".product")
        .each((i, elem) => {
          const product = $(elem);
          const href = product
            .find("a")
            .first()
            .attr("href");
          if (!href) {
            return;
          }
          const adTypeText = product
            .find(".trakica-search-page")
            .text()
            .trim();
          listings.push({
            url: SITE_ORIGIN + href,
            adType: this.getAdTypeId(adTypeText)
          });
        });

      const resultCount = Math.min(listings.length, maxResultsPerPage);
      const asyncScraping = [];
      for (let i = 0; i < resultCount; i++) {
        asyncScraping.push(this.scrapeAd(listings[i].url, listings[i].adType));
      }

      const scrapedData = await Promise.all(asyncScraping);
      // scrapeAd resolves to null for ads that could not be scraped.
      return scrapedData.filter(adData => !!adData);
    } catch (e) {
      console.error("[SALJIC] Exception caught:" + e);
      return [];
    }
  }

  /**
   * Scrapes a single ad page into a flat record.
   *
   * @param {string} url - Absolute URL of the ad.
   * @param {string|undefined} adType - Ad type id read from the result page.
   * @returns {Promise<Object|null>} The ad record, or `null` when the page
   *   cannot be loaded or processed.
   */
  async scrapeAd(url, adType) {
    try {
      const adPageSource = await fetch(url);
      if (adPageSource.ok === false) {
        // Do not turn an error page into an (empty) ad record.
        throw new Error(`Unexpected HTTP status ${adPageSource.status}`);
      }
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

      // The page carries no status information (e.g. "sold").
      const adStatus = AD_STATUS.STATUS_NORMAL;

      const agencyObjectId = parseInt(url.substring(AGENCY_ID_URL_OFFSET), 10);

      const title = toSingleLine($(AD_SELECTORS.title).text());
      const price = parsePrice(toSingleLine($(AD_SELECTORS.price).text()));
      const streetName = $(AD_SELECTORS.streetName)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .trim();
      const longDescription = $(AD_SELECTORS.descriptions)
        .text()
        .replace(/"/g, "")
        .trim();

      // Short description is the first sentence; without a period the whole
      // description is used instead of an empty string.
      const firstPeriodPos = longDescription.indexOf(".");
      const shortDescription =
        firstPeriodPos === -1
          ? longDescription
          : longDescription.substring(0, firstPeriodPos);

      const { locationLat, locationLong } = parseCoordinates(
        $(AD_SELECTORS.latAndLong).attr("src")
      );

      const mainFields = this.extractMainFields($);
      const additionalFields = this.extractAdditionalFields($);
      const { area, gardenSize, ...remainingDetails } = {
        ...createEmptyDetails(),
        ...mainFields.fields,
        ...additionalFields.fields
      };

      // Without a date on the page the time of crawling is used.
      const renewedDate = new Date();
      const publishedDate = mainFields.availableFrom || new Date();

      return {
        url,
        agencyObjectId,
        originAgencyName: AD_AGENCY.SALJIC,
        realEstateType: additionalFields.realEstateType,
        adType,
        title,
        price,
        area,
        gardenSize,
        shortDescription,
        longDescription,
        streetNumber: 0,
        streetName,
        // Address parts are not read from the page.
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat,
        locationLong,
        adStatus,
        publishedDate,
        renewedDate,
        ...remainingDetails,
        numberOfViewsAgency: null,
        numberOfViewsKivi: null
      };
    } catch (e) {
      console.error("Exception caught: " + e.message, "\r\nURL:", url);
    }
    return null;
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Reads the "main characteristics" list of an ad page.
   *
   * @param {Function} $ - cheerio instance loaded with the ad page.
   * @returns {{fields: Object, availableFrom: (Date|null)}} The detail fields
   *   found on the page and the parsed "Dostupno od" date, if any.
   */
  extractMainFields($) {
    const fields = {};
    let availableFrom = null;

    for (let index = 1; ; index++) {
      const item = $(`${AD_SELECTORS.mainField}:nth-child(${index})`);
      if (item.length === 0) {
        break;
      }

      const text = item
        .text()
        .replace(/[\n\r\t]/gm, "")
        .trim();
      const { label, value } = splitMainField(text);

      switch (label) {
        case "Površina":
          // parseFloat stops at the unit that follows the number.
          fields.area = toNumberOrNull(parseFloat(value));
          break;
        case "Okućnica":
          fields.gardenSize = toNumberOrNull(parseFloat(value));
          break;
        case "Broj soba":
          fields.numberOfRooms = toNumberOrNull(parseInt(value, 10));
          break;
        case "Broj spratova":
          fields.numberOfFloors = toNumberOrNull(parseInt(value, 10));
          break;
        case "Sprat":
          fields.floor = toNumberOrNull(parseInt(value, 10));
          break;
        case "Godina renoviranja":
          fields.recentlyAdapted = true;
          break;
        case "Broj parking mjesta":
          fields.parking = true;
          break;
        case "Dostupno od": {
          // The value is read as day/month/year at fixed positions.
          const day = value.substring(0, 2);
          const month = value.substring(3, 5);
          const year = value.substring(6);
          const parsedDate = new Date(`${month}/${day}/${year}`);
          if (!Number.isNaN(parsedDate.getTime())) {
            availableFrom = parsedDate;
          }
          break;
        }
        default:
          break;
      }
    }

    return { fields, availableFrom };
  }

  /**
   * Reads the "additional content" list of an ad page. The first item holds
   * the category, every further item is a single amenity label.
   *
   * @param {Function} $ - cheerio instance loaded with the ad page.
   * @returns {{fields: Object, realEstateType: *}} The detail fields found on
   *   the page and the category id (`undefined` when not recognised).
   */
  extractAdditionalFields($) {
    const fields = {};
    let realEstateType;

    for (let index = 1; ; index++) {
      const item = $(`${AD_SELECTORS.additionalField}:nth-child(${index})`);
      if (item.length === 0) {
        break;
      }

      const text = item.text().trim();
      if (index === 1) {
        // Locate the label in the same string the category is cut from.
        const cleanedText = text.replace(/[\n\r\t]/gm, "");
        const labelPos = cleanedText.indexOf("Kategorija");
        const categoryText =
          labelPos === -1 ? "" : cleanedText.substring(labelPos + 10).trim();
        realEstateType = this.getAdCategoryId(categoryText);
      } else {
        Object.assign(fields, this.getAmenityFields(text));
      }
    }

    return { fields, realEstateType };
  }

  /**
   * Maps an amenity label of the ad page to the detail field(s) it sets.
   *
   * @param {string} amenityText - Trimmed amenity label.
   * @returns {Object} Fields to set; empty for unknown labels.
   */
  getAmenityFields(amenityText) {
    switch (amenityText) {
      case "Internet":
        return { internet: true };
      case "Garaža":
        return { garage: true };
      case "Klima":
        return { airCondition: true };
      case "Balkon":
        return { balcony: true };
      case "Ostava":
        return { storeRoom: true };
      case "Podrum":
        return { basementAttic: true };
      case "Blindirana vrata":
        return { antiTheftDoor: true };
      case "Voda":
        return { water: true };
      case "Kablovska":
        return { cableTV: true };
      case "Uknjiženo":
        return { registeredInZkBooks: true };
      case "Grijanje - centralno":
        return { heatingType: HEATING_TYPE.CENTRAL_CITY.id };
      case "Grijanje - plin":
        return { heatingType: HEATING_TYPE.GAS.id };
      case "Grijanje - struja":
        return { heatingType: HEATING_TYPE.ELECTRICITY.id };
      case "Grijanje":
        return { heatingType: HEATING_TYPE.OTHER.id };
      case "Plin":
        return { gas: true };
      case "Namješten":
        return { furnishingType: FURNISHING_TYPE.FURNISHED.id };
      case "Alarm":
        return { alarm: true };
      case "Video nadzor":
        return { videoSurveillance: true };
      case "Lift":
        return { elevator: true };
      case "Novogradnja":
        return { newBuilding: true };
      default:
        return {};
    }
  }

  /**
   * Maps the category text of an ad page to an `AD_CATEGORY` id.
   *
   * @param {string} categoryText - Category as shown on the ad page.
   * @returns {*} The category id, or `undefined` for unknown text.
   */
  getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "Stan":
        return AD_CATEGORY.FLAT.id;
      case "Građevinsko zemljiste":
      case "Industrijsko zemljiste":
      case "Poljoprivredno zemljiste":
        return AD_CATEGORY.LAND.id;
      case "Kuća":
        return AD_CATEGORY.HOUSE.id;
      case "Poslovni prostor":
      case "Kancelarije":
        return AD_CATEGORY.OFFICE.id;
      case "Apartmani":
        return AD_CATEGORY.APARTMENT.id;
      case "Garaža":
        return AD_CATEGORY.GARAGE.id;
      case "Vikendica":
        return AD_CATEGORY.COTTAGE.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the ad type label of a search result to an `AD_TYPE` string id.
   *
   * @param {string} adTypeText - Label as shown on the result page.
   * @returns {*} The ad type string id, or `undefined` for unknown labels.
   */
  getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "PRODAJA":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "NAJAM":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return undefined;
    }
  }

  /**
   * Resolves after the given delay.
   *
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Saves the scraped ads of one result page.
   *
   * Only the first saver is used, so that its result (which `crawl` reads
   * `newRecords` from) can be handed back to the caller.
   *
   * @param {Array<Object>} results - Scraped ads.
   * @returns {Promise<*>} Whatever the saver's `save` resolves to.
   * @throws {TypeError} When no saver is configured.
   */
  async saveCrawledResults(results) {
    const [primarySaver] = this.savers || [];
    if (!primarySaver) {
      throw new TypeError("[SALJIC] No saver configured for crawled results");
    }
    return primarySaver.save(results);
  }
}

module.exports = SaljicCrawler;