"use strict";

const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
  AD_TYPE,
  AD_CATEGORY,
  AD_AGENCY,
  AD_STATUS,
  CRAWLER_AD_TYPE,
  FURNISHING_TYPE,
  HEATING_TYPE,
  // NOTE(review): getAccessRoadTypeId() uses this enum but it was never
  // imported; it is assumed to be exported by common/enums like the other
  // enums - confirm.
  ACCESS_ROAD_TYPE
} = require("../../common/enums");
const {
  PRINT_CRAWLER_DEBUG,
  DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");

// Query-string fragments understood by the Saljic search page, keyed by our
// own crawler ad type / ad category id.
const SALJIC_ENUMS = {
  SALJIC_AD_TYPE: {
    [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
    [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
    [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
  },
  SALJIC_AD_CATEGORY: {
    [AD_CATEGORY.ALL.id]: "&input_kategorija=",
    [AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
    [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
    // Site categories 3 and 4 are also building land ("gradjevinsko").
    [AD_CATEGORY.LAND.id]: "&input_kategorija=5",
    [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
    [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
    [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
    // AD_CATEGORY.COTTAGE has no mapping yet.
  }
};

const SITE_ORIGIN = "https://www.saljicnekretnine.ba";

// The search page is paged by result offset ("per_page" query parameter); the
// offset grows by this amount per page.
// NOTE(review): presumably the site lists 22 ads per page - confirm.
const PAGE_OFFSET_STEP = 2 * 11;

// Common ancestor of every element scraped from an ad detail page.
const AD_ENTRY_SELECTOR =
  "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry";
const AD_CONTENT_SELECTOR = `${AD_ENTRY_SELECTOR} > div.entry-content.topmargin`;

const AD_SELECTORS = {
  title: `${AD_ENTRY_SELECTOR} > div.entry-title > h2`,
  price: `${AD_ENTRY_SELECTOR} > div.topmargin-sm.single-product > div.product > div.product-price > ins`,
  streetName: `${AD_CONTENT_SELECTOR} > p`,
  descriptions: `${AD_CONTENT_SELECTOR} > div.toggle.toggle-bg > div.togglec >p:nth-child(1)`,
  latAndLong: `${AD_CONTENT_SELECTOR} > div.gmap.bottommargin > iframe`,
  mainFields: `${AD_CONTENT_SELECTOR} > div.col-md-12.bottommargin > ul > li.list-group-item`
};

// Labels of the "main characteristics" list items that we know how to parse.
const MAIN_FIELD_LABELS = [
  "Površina",
  "Okućnica",
  "Broj soba",
  "Broj spratova",
  "Sprat",
  "Godina renoviranja",
  "Broj parking mjesta",
  "Dostupno od"
];

/** Prints to the console only when crawler debugging is switched on. */
function debugLog(...args) {
  if (PRINT_CRAWLER_DEBUG) {
    console.log(...args);
  }
}

/** Removes line breaks, collapses runs of spaces and trims the text. */
function cleanText(text) {
  return text
    .replace(/(\r\n|\n|\r)/gm, "")
    .replace(/ {1,}/g, " ")
    .trim();
}

/** parseFloat that yields null instead of NaN. */
function floatOrNull(text) {
  const parsed = parseFloat(text);
  return Number.isNaN(parsed) ? null : parsed;
}

/** Base-10 parseInt that yields null instead of NaN. */
function intOrNull(text) {
  const parsed = parseInt(text, 10);
  return Number.isNaN(parsed) ? null : parsed;
}

/**
 * Resolves an ad link found on a result page against the site origin, so both
 * relative and already-absolute hrefs end up as absolute URLs.
 */
function toAbsoluteUrl(href) {
  try {
    return new URL(href, SITE_ORIGIN).href;
  } catch (e) {
    // Unparseable href: fall back to plain prefixing.
    return SITE_ORIGIN + href;
  }
}

/**
 * Converts the price text of an ad into a number.
 *
 * @param {string} priceText - cleaned text of the price element
 * @returns {number|null} the price, or null for "CIJENA NA UPIT" (price on
 *   request) and for text that does not contain a number
 */
function parseAdPrice(priceText) {
  if (priceText === "CIJENA NA UPIT") {
    return null;
  }
  // The text carries an 8 character prefix and a 3 character suffix around
  // the amount; commas inside the amount are removed before parsing (all of
  // them, not just the first, so that "1,250,000" is not read as 1250).
  const amountText = priceText
    .substring(8, priceText.length - 3)
    .replace(/,/g, "");
  return floatOrNull(amountText);
}

/**
 * Reads the coordinates from the "marker=<lat>%2C<long>" part of the map
 * iframe URL.
 *
 * @param {string|undefined} iframeSrc - "src" attribute of the map iframe
 * @returns {{locationLat: (number|null), locationLong: (number|null)}}
 */
function parseMarkerCoordinates(iframeSrc) {
  const unknown = { locationLat: null, locationLong: null };
  if (typeof iframeSrc !== "string") {
    return unknown; // the ad has no map
  }
  const markerKey = "marker=";
  const markerStart = iframeSrc.indexOf(markerKey);
  if (markerStart === -1) {
    return unknown;
  }
  const latStart = markerStart + markerKey.length;
  const separatorStart = iframeSrc.indexOf("%2C", latStart);
  if (separatorStart === -1) {
    return unknown;
  }
  const lat = parseFloat(iframeSrc.substring(latStart, separatorStart));
  const long = parseFloat(iframeSrc.substring(separatorStart + 3));
  // "||" on purpose: NaN and 0 both mean "no usable coordinate".
  return { locationLat: lat || null, locationLong: long || null };
}

/**
 * Parses a day-month-year date (any single separator, e.g. "01.02.2020").
 *
 * @param {string} dateText
 * @returns {Object|null} a moment instance (in DEFAULT_TIMEZONE when that is
 *   configured), or null when the text is not a valid date
 */
function parseDayMonthYear(dateText) {
  const match = /^(\d{1,2})\D(\d{1,2})\D(\d{4})/.exec(dateText);
  if (!match) {
    return null;
  }
  const [, day, month, year] = match;
  const isoDate = `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}`;
  const parsed = DEFAULT_TIMEZONE
    ? moment.tz(isoDate, "YYYY-MM-DD", DEFAULT_TIMEZONE)
    : moment(isoDate, "YYYY-MM-DD");
  return parsed.isValid() ? parsed : null;
}

/**
 * Splits the text of one "main characteristics" list item into a known label
 * and the value that follows it.
 *
 * Labels are matched as prefixes because several of them contain spaces, so
 * cutting the text at the first space cannot recognise them.
 *
 * @param {string} fieldText - cleaned text of the list item
 * @returns {{label: (string|null), value: string}} label is null when the
 *   item is not one of MAIN_FIELD_LABELS
 */
function splitMainField(fieldText) {
  for (const label of MAIN_FIELD_LABELS) {
    if (!fieldText.startsWith(label)) {
      continue;
    }
    const rest = fieldText.slice(label.length);
    // A letter directly after the label means this is a different, longer
    // label that merely starts with the same characters.
    if (/^[A-Za-zČĆŽŠĐčćžšđ]/.test(rest)) {
      continue;
    }
    return { label, value: rest.replace(/^[\s:]+/, "").trim() };
  }
  return { label: null, value: "" };
}

/** Every detail field of an ad, initialised to "unknown" (null). */
function createEmptyMainFields() {
  return {
    area: null,
    gardenSize: null,
    publishedDate: null,
    numberOfRooms: null,
    numberOfFloors: null,
    floor: null,
    accessRoadType: null,
    heatingType: null,
    furnishingType: null,
    balcony: null,
    newBuilding: null,
    elevator: null,
    water: null,
    electricity: null,
    drainageSystem: null,
    registeredInZkBooks: null,
    recentlyAdapted: null,
    parking: null,
    garage: null,
    gas: null,
    antiTheftDoor: null,
    airCondition: null,
    phoneConnection: null,
    cableTV: null,
    internet: null,
    basementAttic: null,
    storeRoom: null,
    videoSurveillance: null,
    alarm: null,
    suitableForStudents: null,
    includingBills: null,
    animalsAllowed: null,
    pool: null,
    urbanPlanPermit: null,
    buildingPermit: null,
    utilityConnection: null,
    distanceToRiver: null
  };
}

/**
 * Reads the "main characteristics" ("Glavne karakteristike") list of an ad
 * page.
 *
 * @param {Function} $ - cheerio instance loaded with the ad page
 * @returns {Object} the object from createEmptyMainFields() with every
 *   recognised field filled in
 */
function extractMainFields($) {
  const fields = createEmptyMainFields();

  $(AD_SELECTORS.mainFields).each((i, elem) => {
    const fieldText = $(elem)
      .text()
      .replace(/[\n\r\t]/gm, "")
      .trim();
    const { label, value } = splitMainField(fieldText);

    switch (label) {
      case "Površina":
        fields.area = floatOrNull(value);
        break;
      case "Okućnica":
        fields.gardenSize = floatOrNull(value);
        break;
      case "Broj soba":
        fields.numberOfRooms = intOrNull(value);
        break;
      case "Broj spratova":
        fields.numberOfFloors = intOrNull(value);
        break;
      case "Sprat":
        fields.floor = intOrNull(value);
        break;
      case "Godina renoviranja":
        // Only the presence of a renovation year is recorded.
        fields.recentlyAdapted = true;
        break;
      case "Broj parking mjesta":
        // Only the presence of parking places is recorded.
        fields.parking = true;
        break;
      case "Dostupno od":
        fields.publishedDate = parseDayMonthYear(value);
        break;
      default:
        break;
    }
  });

  return fields;
}

/**
 * Crawler for the saljicnekretnine.ba real-estate listings.
 *
 * For every configured category it walks the paged search results, scrapes
 * each ad's detail page and hands the scraped ads to the first saver.
 */
class SaljicCrawler {
  /**
   * @param {Array} savers - savers; only the first one is used, and it must
   *   provide `save(results)` resolving to an object with `newRecords`
   * @param {*} crawlerAdTypes - one of CRAWLER_AD_TYPE
   * @param {Array} crawlerAdCategories - AD_CATEGORY entries (or their ids)
   * @param {number} maxPages - maximum number of result pages per category
   * @param {number} maxResultsPerPage - maximum number of ads scraped per page
   * @param {Array} ignoredUsernames - stored, not used by this crawler
   * @param {number} delayBetweenPages - pause between page rounds, in ms
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Crawls all configured categories, one result page per category per round.
   *
   * A category is dropped once it has no more pages, or once a page yields no
   * new records (unless SALJIC_FORCE_CRAWL is set).
   *
   * @returns {Promise<Array>} all records the saver reported as new
   */
  async crawl() {
    const newRealEstates = [];
    const crawlAdCategories = this.crawlerAdCategories;
    if (!crawlAdCategories) {
      return newRealEstates;
    }

    let activeIndexers = [];
    for (const adCategory of crawlAdCategories) {
      activeIndexers.push(this.categoryIndexer(adCategory));
    }

    while (activeIndexers.length > 0) {
      const singlePageResults = await Promise.all(
        activeIndexers.map(indexer => indexer.next())
      );

      const remainingIndexers = [];
      for (const [index, { value: singlePageResult }] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // The generator returned undefined: this category has no more pages.
          continue;
        }

        const saveResults = await this.saveCrawledResults(singlePageResult);
        const newRecords = saveResults ? saveResults.newRecords : undefined;
        const hasRecordList = Array.isArray(newRecords);
        if (hasRecordList) {
          newRealEstates.push(...newRecords);
        }

        const nothingNew = hasRecordList && newRecords.length === 0;
        if (nothingNew && !SALJIC_FORCE_CRAWL) {
          continue;
        }
        remainingIndexers.push(activeIndexers[index]);
      }

      activeIndexers = remainingIndexers;
      if (activeIndexers.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }

    return newRealEstates;
  }

  /**
   * Async generator yielding the scraped ads of one category, page by page.
   * It finishes when a page has no ads, when maxPages pages were crawled, or
   * immediately when the ad type / category has no Saljic mapping.
   *
   * @param {Object|*} adCategory - an AD_CATEGORY entry or its id
   */
  async *categoryIndexer(adCategory) {
    // SALJIC_AD_CATEGORY is keyed by category id, while callers (including
    // the constructor default) pass whole AD_CATEGORY entries.
    const adCategoryId =
      adCategory !== null && typeof adCategory === "object"
        ? adCategory.id
        : adCategory;

    const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
    const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategoryId];
    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return undefined;
    }

    const pageLimit = Number.isFinite(this.maxPages) ? this.maxPages : Infinity;
    for (let pageToIndex = 1; pageToIndex <= pageLimit; pageToIndex++) {
      // The first page is requested without an offset.
      const urlPagePart =
        pageToIndex === 1 ? "" : (pageToIndex - 1) * PAGE_OFFSET_STEP;
      const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;

      const singlePageResults = await this.indexSinglePage(
        urlPageToCrawl,
        this.maxResultsPerPage,
        adCategoryId
      );
      if (!Array.isArray(singlePageResults) || singlePageResults.length === 0) {
        return undefined;
      }
      yield singlePageResults;
    }
    return undefined;
  }

  /**
   * Scrapes every ad linked from one search result page.
   *
   * @param {string} url - URL of the result page
   * @param {number} maxResultsPerPage - maximum number of ads to scrape
   * @param {*} [adCategoryId=null] - category id passed on to scrapeAd
   * @returns {Promise<Array>} successfully scraped ads; empty on any error
   */
  async indexSinglePage(url, maxResultsPerPage, adCategoryId = null) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[SALJIC] Index page : ", url);
    }
    try {
      const res = await fetch(url);
      if (!res.ok) {
        throw new Error(`HTTP ${res.status} for ${url}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);

      const adUrls = [];
      $("#shop")
        .find(".product")
        .each((i, elem) => {
          const href = $(elem)
            .find("a")
            .first()
            .attr("href");
          if (href) {
            adUrls.push(toAbsoluteUrl(href));
          }
        });

      const resultLimit = Number.isFinite(maxResultsPerPage)
        ? Math.max(0, maxResultsPerPage)
        : adUrls.length;
      const scrapedData = await Promise.all(
        adUrls
          .slice(0, resultLimit)
          .map(adUrl => this.scrapeAd(adUrl, adCategoryId))
      );
      // scrapeAd resolves to null for ads that could not be scraped.
      return scrapedData.filter(adData => !!adData);
    } catch (e) {
      console.error("[SALJIC] Exception caught:" + e);
      return [];
    }
  }

  /**
   * Scrapes a single ad detail page.
   *
   * @param {string} url - absolute URL of the ad
   * @param {*} [adCategoryId=null] - id of the category being crawled; stored
   *   as the ad's realEstateType
   * @returns {Promise<Object|null>} the scraped ad, or null when the page
   *   could not be fetched or parsed
   */
  async scrapeAd(url, adCategoryId = null) {
    debugLog("[SALJIC] Scraping : ", url);
    try {
      const adPageSource = await fetch(url);
      if (!adPageSource.ok) {
        throw new Error(`HTTP ${adPageSource.status}`);
      }
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

      // TODO: is anything needed to detect the ad status?
      const status = AD_STATUS.STATUS_NORMAL;

      const title = cleanText($(AD_SELECTORS.title).text());
      const price = parseAdPrice(cleanText($(AD_SELECTORS.price).text()));
      const streetName = $(AD_SELECTORS.streetName)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .trim();

      const descriptions = $(AD_SELECTORS.descriptions);
      const shortDescription = descriptions
        .first()
        .text()
        .trim();
      const longDescription = descriptions
        .last()
        .text()
        .trim();

      const { locationLat, locationLong } = parseMarkerCoordinates(
        $(AD_SELECTORS.latAndLong).attr("src")
      );

      const {
        area,
        gardenSize,
        publishedDate,
        ...detailFields
      } = extractMainFields($);

      debugLog("Title:", title);
      debugLog("Price:", price);
      debugLog("Street:", streetName);
      debugLog("Description:", shortDescription);
      debugLog("Lat:", locationLat);
      debugLog("Long:", locationLong);
      debugLog("Area:", area);
      debugLog("Garden size:", gardenSize);
      debugLog("Number of rooms:", detailFields.numberOfRooms);
      debugLog("Number of floors", detailFields.numberOfFloors);
      debugLog("Floor:", detailFields.floor);
      debugLog("Adapted:", detailFields.recentlyAdapted);
      debugLog("Parking:", detailFields.parking);
      debugLog("Published date:", publishedDate);

      return {
        url,
        // NOTE(review): no site-specific ad id is scraped yet; the ad URL is
        // used as the identifier - confirm it fits what the saver expects.
        agencyObjectId: url,
        // NOTE(review): carried over from the OLX crawler; switch to the
        // Saljic agency constant once AD_AGENCY is confirmed to have one.
        originAgencyName: AD_AGENCY.OLX,
        realEstateType: adCategoryId,
        adType: this.getCrawledAdTypeId(),
        title,
        price,
        area,
        gardenSize,
        shortDescription,
        longDescription,
        streetNumber: 0,
        streetName,
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat,
        locationLong,
        adStatus: status,
        publishedDate: publishedDate ? publishedDate.toISOString() : null,
        // The ad page exposes no renewal date that we scrape.
        renewedDate: null,
        ...detailFields,
        // The ad page exposes no view counter that we scrape.
        numberOfViewsAgency: null
      };
    } catch (e) {
      console.error("Exception caught: " + e.message, "\r\nURL:", url);
    }
    return null;
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Derives the ad type of scraped ads from the crawler configuration.
   *
   * @returns {*} the AD_TYPE stringId for sale-only or rent-only crawls, or
   *   null when all ad types are crawled
   *   (NOTE(review): the type is not read from the ad page itself)
   */
  getCrawledAdTypeId() {
    switch (this.crawlerAdTypes) {
      case CRAWLER_AD_TYPE.ONLY_SELL:
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case CRAWLER_AD_TYPE.ONLY_RENT:
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return null;
    }
  }

  /** Maps a category caption to an AD_CATEGORY id (undefined if unknown). */
  getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "Stanovi":
        return AD_CATEGORY.FLAT.id;
      case "Zemljišta":
        return AD_CATEGORY.LAND.id;
      case "Kuće":
        return AD_CATEGORY.HOUSE.id;
      case "Poslovni prostori":
        return AD_CATEGORY.OFFICE.id;
      case "Apartmani":
        return AD_CATEGORY.APARTMENT.id;
      case "Garaže":
        return AD_CATEGORY.GARAGE.id;
      case "Vikendice":
        return AD_CATEGORY.COTTAGE.id;
      default:
        return undefined;
    }
  }

  /** Maps an ad type caption to an AD_TYPE stringId (undefined if unknown). */
  getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "Prodaja":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "Izdavanje":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      case "Potražnja":
        return AD_TYPE.AD_TYPE_REQUEST.stringId;
      default:
        return undefined;
    }
  }

  /** Maps a heating caption to a HEATING_TYPE id (null if unknown). */
  getHeatingTypeId(heatingTypeText) {
    switch (heatingTypeText) {
      case "struja":
        return HEATING_TYPE.ELECTRICITY.id;
      case "plin":
        return HEATING_TYPE.GAS.id;
      case "drva":
        return HEATING_TYPE.WOOD.id;
      case "centralno (gradsko)":
        return HEATING_TYPE.CENTRAL_CITY.id;
      case "centralno (kotlovnica)":
        return HEATING_TYPE.CENTRAL_BOILER.id;
      case "centralno (plin)":
        return HEATING_TYPE.CENTRAL_GAS.id;
      case "nije uvedeno":
        return HEATING_TYPE.NO_HEATING.id;
      case "ostalo":
      case "drugo":
        return HEATING_TYPE.OTHER.id;
      default:
        console.log("grijanje = NEPOZNATO [", heatingTypeText, "]");
        return null;
    }
  }

  /**
   * Maps a furnishing caption to a FURNISHING_TYPE id (null if unknown).
   * An empty caption is treated as furnished.
   */
  getFurnishingTypeId(furnishingTypeText) {
    switch (furnishingTypeText) {
      case "namješten":
      case "":
        return FURNISHING_TYPE.FURNISHED.id;
      case "polunamješten":
        return FURNISHING_TYPE.HALF_FURNISHED.id;
      case "nenamješten":
        return FURNISHING_TYPE.NOT_FURNISHED.id;
      default:
        console.log("namješten = NEPOZNATO [", furnishingTypeText, "]");
        return null;
    }
  }

  /** Maps an access road caption to an ACCESS_ROAD_TYPE id (null if unknown). */
  getAccessRoadTypeId(accessRoadTypeText) {
    switch (accessRoadTypeText) {
      case "asfalt":
        return ACCESS_ROAD_TYPE.ASPHALT.id;
      case "beton":
        return ACCESS_ROAD_TYPE.CONCRETE.id;
      case "makadam":
        return ACCESS_ROAD_TYPE.MACADAM.id;
      case "ostalo":
        return ACCESS_ROAD_TYPE.OTHER.id;
      default:
        console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]");
        return null;
    }
  }

  /**
   * Parses an area written with a decimal comma and optional "." thousands
   * separators.
   *
   * @param {string} areaText
   * @returns {number} the area, or NaN for empty / non-numeric text
   */
  parseArea(areaText) {
    if (!areaText) {
      return NaN;
    }
    // After the comma became a dot, only the last dot is the decimal point.
    const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g;
    const textWithOnlyOneDecimalDot = areaText
      .replace(",", ".")
      .replace(removeDotsExceptLastOneRegex, "");
    return parseFloat(textWithOnlyOneDecimalDot);
  }

  /**
   * Parses a price written with "." thousands separators and a decimal comma
   * (e.g. "1.250.000,50").
   *
   * @param {string} priceText
   * @returns {number} the price, or NaN for empty / non-numeric text
   */
  parsePrice(priceText) {
    if (!priceText) {
      return NaN;
    }
    // Every thousands separator has to go, not only the first one.
    const formattedPriceText = priceText.replace(/\./g, "").replace(",", ".");
    return parseFloat(formattedPriceText);
  }

  /**
   * Converts a "number of rooms" caption to a number.
   *
   * @param {string} numberOfRoomsText
   * @param {*} categoryId - AD_CATEGORY id the ad belongs to
   * @returns {number|null} null when the caption or category is not handled
   */
  parseNumberOfRooms(numberOfRoomsText, categoryId) {
    if (categoryId === AD_CATEGORY.FLAT.id) {
      switch (numberOfRoomsText) {
        case "garsonjera":
          return 0;
        case "jednosoban (1)":
          return 1;
        case "jednoiposoban (1.5)":
          return 1.5;
        case "dvosoban (2)":
          return 2;
        case "trosoban (3)":
          return 3;
        case "četverosoban (4)":
          return 4;
        case "petosoban i više":
          return 5;
        default:
          console.log(
            "broj soba [stan] = NEPOZNATO [",
            numberOfRoomsText,
            ", ",
            categoryId,
            "]"
          );
          return null;
      }
    }

    if (
      categoryId === AD_CATEGORY.HOUSE.id ||
      categoryId === AD_CATEGORY.COTTAGE.id ||
      categoryId === AD_CATEGORY.APARTMENT.id ||
      categoryId === AD_CATEGORY.OFFICE.id
    ) {
      return parseInt(numberOfRoomsText, 10) || null;
    }

    console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]");
    return null;
  }

  /**
   * Converts a "number of floors" caption to a number.
   *
   * @param {string} numberOfFloorsText
   * @param {*} categoryId - AD_CATEGORY id the ad belongs to
   * @returns {number|null} null when the caption or category is not handled
   */
  parseNumberOfFloors(numberOfFloorsText, categoryId) {
    if (
      categoryId === AD_CATEGORY.HOUSE.id ||
      categoryId === AD_CATEGORY.COTTAGE.id
    ) {
      return parseInt(numberOfFloorsText, 10) || null;
    }

    if (categoryId === AD_CATEGORY.OFFICE.id) {
      if (
        numberOfFloorsText === "suteren" ||
        numberOfFloorsText === "prizemlje"
      ) {
        return 0;
      }
      if (numberOfFloorsText === "6+") {
        return 7;
      }
      return parseInt(numberOfFloorsText, 10) || null;
    }

    console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]");
    return null;
  }

  /**
   * Converts a "floor" caption to a floor number.
   *
   * @param {string} floorText
   * @param {*} categoryId - AD_CATEGORY id the ad belongs to
   * @returns {number|null} null when the caption or category is not handled
   */
  parseFloorNumber(floorText, categoryId) {
    if (
      categoryId === AD_CATEGORY.FLAT.id ||
      categoryId === AD_CATEGORY.APARTMENT.id
    ) {
      if (
        floorText === "suteren" ||
        floorText === "prizemlje" ||
        floorText === "visoko prizemlje"
      ) {
        return 0;
      }
      return parseInt(floorText, 10) || null;
    }

    if (categoryId === AD_CATEGORY.OFFICE.id) {
      if (floorText === "zaseban objekat") {
        return null;
      }
      if (floorText === "prizemlje" || floorText === "visoko prizemlje") {
        return 0;
      }
      return parseInt(floorText, 10) || null;
    }

    console.log("sprat = NEPOZNATO [", floorText, "]");
    return null;
  }

  /** Resolves after `ms` milliseconds. */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Hands scraped ads to the saver.
   *
   * Only the first saver is used (for now that is the Postgres saver), so
   * that its result - which carries the inserted records - can be returned.
   *
   * @param {Array} results - scraped ads
   * @returns {Promise<*>} whatever the saver's `save` resolves to
   * @throws {Error} when no saver is configured
   */
  async saveCrawledResults(results) {
    const [primarySaver] = this.savers || [];
    if (!primarySaver) {
      throw new Error("[SALJIC] No saver configured, cannot save results");
    }
    return primarySaver.save(results);
  }
}

module.exports = SaljicCrawler;