diff --git a/app/common/enums.js b/app/common/enums.js index c89b21e..3f3ab4e 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -304,7 +304,8 @@ const AD_AGENCY = { RENTAL: "RENTAL", PROSTOR: "PROSTOR", AKTIDO: "AKTIDO", - KIVI: "KIVI" + KIVI: "KIVI", + SALJIC: "SALJIC" }; const CRAWLER_AD_TYPE = { diff --git a/app/config/appConfig.js b/app/config/appConfig.js index 3ccedbe..3e6003f 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -9,6 +9,8 @@ const APP_URL = ? process.env.APP_URL || "http://market-alarm" : process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`; +const STAGING = process.env.SETTINGS !== "production"; + const DEFAULT_TIMEZONE = "Europe/Sarajevo"; const CRAWLER_INTERVAL = parseInt(process.env.CRAWLER_INTERVAL) || 60; @@ -50,6 +52,8 @@ module.exports = { MAX_REAL_ESTATES_IN_FIRST_EMAIL, PRINT_CRAWLER_DEBUG, API_MAP_KEY, + STAGING, CHECK_UP_DAYS, PROSTOR_LOGIN + }; diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 82411b6..d4c335e 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx"); const RentalCrawler = require("./specificCrawlers/rental"); const ProstorCrawler = require("./specificCrawlers/prostor"); const AktidoCrawler = require("./specificCrawlers/aktido"); +const SaljicCrawler = require("./specificCrawlers/saljic"); const { OLX_CONFIG, RENTAL_CONFIG, PROSTOR_CONFIG, - AKTIDO_CONFIG + AKTIDO_CONFIG, + SALJIC_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); @@ -57,6 +59,15 @@ async function crawlAll() { AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE, AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES, AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES + ), + new SaljicCrawler( + [postgresSaver], + SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE, + SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES, + SALJIC_CONFIG.SALJIC_MAX_PAGES, + SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE, + SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES, + SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES ) ]; diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index ee98e44..4853d53 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx"); const RENTAL_CONFIG = require("./specificConfigs/rental"); const PROSTOR_CONFIG = require("./specificConfigs/prostor"); const AKTIDO_CONFIG = require("./specificConfigs/aktido"); +const SALJIC_CONFIG = require("./specificConfigs/saljic"); module.exports = { OLX_CONFIG, RENTAL_CONFIG, PROSTOR_CONFIG, - AKTIDO_CONFIG + AKTIDO_CONFIG, + SALJIC_CONFIG }; diff --git a/app/crawler/specificConfigs/saljic.js b/app/crawler/specificConfigs/saljic.js new file mode 100644 index 0000000..2e39ffe --- /dev/null +++ b/app/crawler/specificConfigs/saljic.js @@ -0,0 +1,34 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const saljicCrawlerAdType = + process.env.SALJIC_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE] + : null; + +const saljicParsedCrawlerAdCategories = + process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const saljicIgnoredUsernames = []; + +const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100, + SALJIC_MAX_RESULTS_PER_PAGE: + parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000, + SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE, + SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories, + SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [], + SALJIC_DELAY_BETWEEN_PAGES: + parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000, + SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL) +}; diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js new file mode 100644 index 0000000..267dcde --- /dev/null +++ b/app/crawler/specificCrawlers/saljic.js @@ -0,0 +1,610 @@ +"use strict"; + +const fetch = require("node-fetch"); +const cheerio = require("cheerio"); +const moment = require("moment-timezone"); + +const { + AD_TYPE, + AD_CATEGORY, + AD_AGENCY, + AD_STATUS, + CRAWLER_AD_TYPE, + FURNISHING_TYPE, + HEATING_TYPE +} = require("../../common/enums"); + +const { + PRINT_CRAWLER_DEBUG, + DEFAULT_TIMEZONE +} = require("../../config/appConfig"); +const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic"); + +const SALJIC_ENUMS = { + SALJIC_AD_TYPE: { + [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=", + [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1", + [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2" + }, + SALJIC_AD_CATEGORY: { + [AD_CATEGORY.ALL.id]: "&input_kategorija=", + [AD_CATEGORY.FLAT.id]: "&input_kategorija=15", + [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9", + [AD_CATEGORY.LAND.id]: "&input_kategorija=5", //3 and 4 also gradjevinsko + [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8", + [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1", + [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2" + //[AD_CATEGORY.COTTAGE.id]: "" + } +}; + +class SaljicCrawler { + constructor( + savers = [], + crawlerAdTypes = CRAWLER_AD_TYPE.ALL, + crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], + maxPages = 5000, + maxResultsPerPage = 5000, + ignoredUsernames = [], + delayBetweenPages = 1000 + ) { + this.savers = savers; + this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search"; + this.crawlerAdTypes = crawlerAdTypes; + this.crawlerAdCategories = crawlerAdCategories; + this.maxResultsPerPage = maxResultsPerPage; + this.delayBetweenPages = delayBetweenPages; + } + + async crawl() { + const crawlAdCategories = this.crawlerAdCategories; + + const newRealEstates = []; + + if (crawlAdCategories) { + const indexGenerators = []; + for (const adCategory of crawlAdCategories) { + indexGenerators.push(this.categoryIndexer(adCategory)); + } + // + //console.log(indexGenerators); + // + let done = false; + while (!done) { + const categoryIndexerPromises = []; + const generatorsToRemove = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); + } + + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); + + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; + + newRealEstates.push(...newRecords); + + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !SALJIC_FORCE_CRAWL + ) { + generatorsToRemove[index] = true; + } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); + } + } + + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + await this.sleep(this.delayBetweenPages); + } + } + return newRealEstates; + } + + async *categoryIndexer(adCategory) { + let pageToIndex = 1; + + const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory]; + + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { + while (true) { + const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11; + const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`; + + const singlePageResults = await this.indexSinglePage( + urlPageToCrawl, + this.maxResultsPerPage + ); + + if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { + yield singlePageResults; + } else { + return undefined; + } + + ++pageToIndex; + if (pageToIndex === this.maxPages) { + return undefined; + } + } + } else { + return undefined; + } + } + + async indexSinglePage(url, maxResultsPerPage) { + if (PRINT_CRAWLER_DEBUG) { + console.log("[SALJIC] Index page : ", url); + } + + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + let hrefs = []; + + $("#shop") + .find(".product") + .each((i, elem) => { + const href = $(elem) + .find("a") + .first() + .attr("href"); + if (href) { + hrefs.push(href); + } + }); + + let adTypesTmp = []; + + $("#shop") + .find(".product") + .each((i, elem) => { + const adType = $(elem) + .find(".trakica-search-page") + .text() + .trim(); + if (adType) { + adTypesTmp.push(adType); + } + }); + + //Converting to AD_TYPE + const adTypes = adTypesTmp.map(adTypeText => { + return this.getAdTypeId(adTypeText); + }); + + //Converting to absolute URLs + const hrefsAbs = hrefs.map(link => { + return "https://www.saljicnekretnine.ba" + link; + }); + + let actualNoOfResults = + hrefsAbs.length <= maxResultsPerPage + ? hrefsAbs.length + : maxResultsPerPage; + + const asyncScraping = []; + for (let i = 0; i < actualNoOfResults; i++) { + asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i])); + } + + const scrapedData = await Promise.all(asyncScraping); + const filteredScrapedData = scrapedData.filter(adData => !!adData); + return filteredScrapedData; + } catch (e) { + console.error("[SALJIC] Exception caught:" + e); + return []; + } + } + + async scrapeAd(url, adType) { + // console.log("[SALJIC] Scraping : ", url); + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + + // No information for status ex. PRODAN + const status = AD_STATUS.STATUS_NORMAL; + //Extracting agency ID from url + const agencyObjectId = parseInt(url.substring(46, url.length)); + + //Extracting main properties + const propertySelectors = { + title: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2", + price: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", + streetName: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p", + + descriptions: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)", + latAndLong: + "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe" + }; + const title = $(propertySelectors.title) + .text() + .replace(/(\r\n|\n|\r)/gm, "") + .replace(/ {1,}/g, " ") + .trim(); + + const priceText = $(propertySelectors.price) + .text() + .replace(/(\r\n|\n|\r)/gm, "") + .replace(/ {1,}/g, " ") + .trim(); + const price = + priceText === "CIJENA NA UPIT" + ? null + : parseFloat( + priceText.substring(8, priceText.length - 3).replace(",", "") + ); + + const streetName = $(propertySelectors.streetName) + .text() + .replace(/(\r\n|\n|\r)/gm, "") + .trim(); + + const descriptions = $(propertySelectors.descriptions) + .text() + .trim(); + + const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); + const latText = latAndLongSrc.substring( + latAndLongSrc.indexOf("marker=") + 7, + latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + ); + const longText = latAndLongSrc.substring( + latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + 3, + latAndLongSrc.length + ); + const locationLat = parseFloat(latText) || null; + const locationLong = parseFloat(longText) || null; + + //====== DETAIL INFORMATION FIELDS ========== + let area, + gardenSize, + numberOfRooms = null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = null, + furnishingType = null, + balcony = null, + newBuilding = null, + elevator = null, + water = null, + electricity = null, + drainageSystem = null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = null, + garage = null, + gas = null, + antiTheftDoor = null, + airCondition = null, + phoneConnection = null, + cableTV = null, + internet = null, + basementAttic = null, + storeRoom = null, + videoSurveillance = null, + alarm = null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null; + let publishedDate = null; + let renewedDate = null; + let realEstateType; + let numberOfViewsAgency = null; + + //Extracting data - Glavne karakteristike + let mainFieldIndex = 1; + do { + const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`; + + const mainField = $(mainFieldSelector) + .text() + .replace(/[\n\r\t]/gm, "") + .trim(); + + const mainFieldTitle = mainField.substring(0, mainField.indexOf(" ")); + const mainFieldValue = mainField + .substring(mainField.indexOf(" "), mainField.length) + .trim(); + + switch (mainFieldTitle) { + case "Površina": + area = parseFloat( + mainFieldValue.substring(0, mainFieldValue.indexOf(" ")) + ); + break; + case "Okućnica": + gardenSize = parseFloat( + mainFieldValue.substring(0, mainFieldValue.indexOf(" ")) + ); + break; + case "Broj soba": + numberOfRooms = parseInt(mainFieldValue); + break; + case "Broj spratova": + numberOfFloors = parseInt(mainFieldValue); + break; + case "Sprat": + floor = parseInt(mainFieldValue); + break; + case "Godina renoviranja": + recentlyAdapted = true; + break; + case "Broj parking mjesta": + parking = true; + break; + case "Dostupno od": + const day = mainFieldValue.substring(0, 2); + const month = mainFieldValue.substring(3, 5); + const year = mainFieldValue.substring(6, mainFieldValue.length); + publishedDate = new Date(`${month}/${day}/${year}`); + break; + default: + break; + } + + if (mainFieldTitle === "") { + break; + } + mainFieldIndex++; + } while (true); + + //Extracting data - Sadrzaji + let additionalFieldIndex = 1; + do { + const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`; + + const additionalField = $(additionalFieldSelector) + .text() + .trim(); + + if (additionalFieldIndex === 1) { + //Extracting data of real estate type + const categoryTmp = additionalField + .replace(/[\n\r\t]/gm, "") + .substring( + additionalField.indexOf("Kategorija") + 10, + additionalField.length + ) + .trim(); + realEstateType = this.getAdCategoryId(categoryTmp); + } else { + switch (additionalField) { + case "Internet": + internet = true; + break; + case "Garaža": + garage = true; + break; + case "Klima": + airCondition = true; + break; + case "Balkon": + balcony = true; + break; + case "Ostava": + storeRoom = true; + break; + case "Podrum": + basementAttic = true; + break; + case "Blindirana vrata": + antiTheftDoor = true; + break; + case "Voda": + water = true; + break; + case "Kablovska": + cableTV = true; + break; + case "Uknjiženo": + registeredInZkBooks = true; + break; + case "Grijanje - centralno": + heatingType = HEATING_TYPE.CENTRAL_CITY.id; + break; + case "Grijanje - plin": + heatingType = HEATING_TYPE.GAS.id; + break; + case "Grijanje - struja": + heatingType = HEATING_TYPE.ELECTRICITY.id; + break; + case "Grijanje": + heatingType = HEATING_TYPE.OTHER.id; + break; + case "Plin": + gas = true; + break; + case "Namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "Alarm": + alarm = true; + break; + case "Video nadzor": + videoSurveillance = true; + break; + case "Lift": + elevator = true; + break; + case "Novogradnja": + newBuilding = true; + break; + + default: + break; + } + } + + if (additionalField === "") { + break; + } + additionalFieldIndex++; + } while (true); + + //If no published date it takes current date of crawling + if (publishedDate) { + renewedDate = new Date(); + } else { + publishedDate = new Date(); + renewedDate = new Date(); + } + + const data = { + url, + agencyObjectId, + originAgencyName: AD_AGENCY.SALJIC, + realEstateType, + adType, + title, + price, + area, + gardenSize, + shortDescription: descriptions.substring(0, descriptions.indexOf(".")), + longDescription: descriptions, + streetNumber: 0, + streetName, + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat, + locationLong, + adStatus: status, + publishedDate, + renewedDate, + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency + }; + return data; + } catch (e) { + console.error("Exception caught: " + e.message, "\r\nURL:", url); + } + return null; + } + + //======= HELPER FUNCTIONS ============= + + getAdCategoryId(categoryText) { + switch (categoryText) { + case "Stan": + return AD_CATEGORY.FLAT.id; + case "Građevinsko zemljiste": + return AD_CATEGORY.LAND.id; + case "Industrijsko zemljiste": + return AD_CATEGORY.LAND.id; + case "Poljoprivredno zemljiste": + return AD_CATEGORY.LAND.id; + case "Kuća": + return AD_CATEGORY.HOUSE.id; + case "Poslovni prostor": + return AD_CATEGORY.OFFICE.id; + case "Kancelarije": + return AD_CATEGORY.OFFICE.id; + case "Apartmani": + return AD_CATEGORY.APARTMENT.id; + case "Garaža": + return AD_CATEGORY.GARAGE.id; + case "Vikendica": + return AD_CATEGORY.COTTAGE.id; + default: + return undefined; + } + } + + getAdTypeId(adTypeText) { + switch (adTypeText) { + case "PRODAJA": + return AD_TYPE.AD_TYPE_SALE.stringId; + case "NAJAM": + return AD_TYPE.AD_TYPE_RENT.stringId; + default: + return undefined; + } + } + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async saveCrawledResults(results) { + const savers = this.savers; + + // for (const saver of savers) { + // await saver.save(results); + // } + + //For now, we use only Postgres saver, so ... + return savers[0].save(results); + //so that we can use some sequelize options and information when data is inserted + } +} + +module.exports = SaljicCrawler; diff --git a/app/helpers/emailContentGenerator.js b/app/helpers/emailContentGenerator.js index 5a76e9b..d201e8f 100644 --- a/app/helpers/emailContentGenerator.js +++ b/app/helpers/emailContentGenerator.js @@ -1,8 +1,15 @@ "use strict"; -const { MAX_REAL_ESTATES_IN_EMAIL, APP_URL } = require("../config/appConfig"); +const { + MAX_REAL_ESTATES_IN_EMAIL, + APP_URL, + STAGING +} = require("../config/appConfig"); const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums"); +//Tag to recognize staging from development +const stagingTag = STAGING ? "[STAGING] " : ""; + const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => { return `