diff --git a/app/common/enums.js b/app/common/enums.js
index 33cb41e..47c36c3 100644
--- a/app/common/enums.js
+++ b/app/common/enums.js
@@ -223,7 +223,8 @@ const AD_AGENCY = {
   OLX: "OLX",
   RENTAL: "RENTAL",
   PROSTOR: "PROSTOR",
-  AKTIDO: "AKTIDO"
+  AKTIDO: "AKTIDO",
+  SALJIC: "SALJIC"
 };
 
 const CRAWLER_AD_TYPE = {
diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js
index 82411b6..d4c335e 100644
--- a/app/crawler/crawl.js
+++ b/app/crawler/crawl.js
@@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx");
 const RentalCrawler = require("./specificCrawlers/rental");
 const ProstorCrawler = require("./specificCrawlers/prostor");
 const AktidoCrawler = require("./specificCrawlers/aktido");
+const SaljicCrawler = require("./specificCrawlers/saljic");
 
 const {
   OLX_CONFIG,
   RENTAL_CONFIG,
   PROSTOR_CONFIG,
-  AKTIDO_CONFIG
+  AKTIDO_CONFIG,
+  SALJIC_CONFIG
 } = require("./crawlerConfig");
 
 const PostgresSaver = require("./savers/postgres");
@@ -57,6 +59,15 @@ async function crawlAll() {
       AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
       AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
       AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
+    ),
+    new SaljicCrawler(
+      [postgresSaver],
+      SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
+      SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
+      SALJIC_CONFIG.SALJIC_MAX_PAGES,
+      SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
+      SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
+      SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
     )
   ];
 
diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js
index ee98e44..4853d53 100644
--- a/app/crawler/crawlerConfig.js
+++ b/app/crawler/crawlerConfig.js
@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
 const RENTAL_CONFIG = require("./specificConfigs/rental");
 const PROSTOR_CONFIG = require("./specificConfigs/prostor");
 const AKTIDO_CONFIG = require("./specificConfigs/aktido");
+const SALJIC_CONFIG = require("./specificConfigs/saljic");
 
 module.exports = {
   OLX_CONFIG,
   RENTAL_CONFIG,
   PROSTOR_CONFIG,
-  AKTIDO_CONFIG
+  AKTIDO_CONFIG,
+  SALJIC_CONFIG
 };
diff --git a/app/crawler/specificConfigs/saljic.js b/app/crawler/specificConfigs/saljic.js
new file mode 100644
index 0000000..2e39ffe
--- /dev/null
+++ b/app/crawler/specificConfigs/saljic.js
@@ -0,0 +1,41 @@
+"use strict";
+const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
+
+// Ad type to crawl, looked up by enum name from SALJIC_CRAWLER_AD_TYPE.
+// An unset or unknown name falls back to CRAWLER_AD_TYPE.NONE on export.
+const saljicCrawlerAdType =
+  process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
+    ? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
+    : null;
+
+// Category enum names, comma separated in SALJIC_CRAWLER_AD_CATEGORIES;
+// FLAT and HOUSE are used when the variable is unset.
+const saljicParsedCrawlerAdCategories =
+  process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
+    ? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
+        category.trim()
+      )
+    : ["FLAT", "HOUSE"];
+
+const saljicIgnoredUsernames = [];
+
+// Map category names to AD_CATEGORY ids; names without a truthy id are dropped.
+const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
+  .map(categoryName =>
+    AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
+  )
+  .filter(category => !!category);
+
+module.exports = {
+  // Numeric settings are parsed as base-10; missing, unparsable or zero
+  // values fall back to the defaults given here.
+  SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES, 10) || 100,
+  SALJIC_MAX_RESULTS_PER_PAGE:
+    parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE, 10) || 5000,
+  SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
+  SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
+  SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
+  SALJIC_DELAY_BETWEEN_PAGES:
+    parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES, 10) || 1000,
+  SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL, 10)
+};
diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js
new file mode 100644
index 0000000..407a558
--- /dev/null
+++ b/app/crawler/specificCrawlers/saljic.js
@@ -0,0 +1,98 @@
+"use strict";
+
+const fetch = require("node-fetch");
+const cheerio = require("cheerio");
+const moment = require("moment-timezone");
+
+const {
+  AD_TYPE,
+  AD_CATEGORY,
+  AD_AGENCY,
+  AD_STATUS,
+  CRAWLER_AD_TYPE,
+  FURNISHING_TYPE,
+  HEATING_TYPE
+} = require("../../common/enums");
+
+const {
+  PRINT_CRAWLER_DEBUG,
+  DEFAULT_TIMEZONE
+} = require("../../config/appConfig");
+const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
+
+// URL query fragments for the Saljic search, keyed by our enum values.
+const SALJIC_ENUMS = {
+  SALJIC_AD_TYPE: {
+    [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
+    [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
+    [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
+  },
+  SALJIC_AD_CATEGORY: {
+    [AD_CATEGORY.ALL.id]: "&input_kategorija=",
+    [AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
+    [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
+    [AD_CATEGORY.LAND.id]: "&input_kategorija=5", // 3 and 4 are also building-land categories (gradjevinsko)
+    [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
+    [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
+    [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
+    //[AD_CATEGORY.COTTAGE.id]: ""
+  }
+};
+
+/**
+ * Crawler for saljicnekretnine.ba. Crawling is not implemented yet:
+ * crawl() only logs the search URL.
+ */
+class SaljicCrawler {
+  /**
+   * @param {Array} savers objects with a save(results) method; only the first is used
+   * @param crawlerAdTypes a CRAWLER_AD_TYPE value
+   * @param {Array} crawlerAdCategories AD_CATEGORY ids
+   * @param {number} maxPages
+   * @param {number} maxResultsPerPage
+   * @param {Array} ignoredUsernames
+   * @param {number} delayBetweenPages
+   */
+  constructor(
+    savers = [],
+    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
+    // Ids rather than enum objects: SALJIC_AD_CATEGORY is keyed by id.
+    crawlerAdCategories = [AD_CATEGORY.FLAT.id, AD_CATEGORY.HOUSE.id],
+    maxPages = 5000,
+    maxResultsPerPage = 5000,
+    ignoredUsernames = [],
+    delayBetweenPages = 1000
+  ) {
+    this.savers = savers;
+    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
+    this.crawlerAdTypes = crawlerAdTypes;
+    this.crawlerAdCategories = crawlerAdCategories;
+    this.maxPages = maxPages;
+    this.maxResultsPerPage = maxResultsPerPage;
+    this.ignoredUsernames = ignoredUsernames;
+    this.delayBetweenPages = delayBetweenPages;
+  }
+
+  /** Placeholder: logs the search URL; no pages are fetched yet. */
+  async crawl() {
+    console.log("Saljic URL: ", this.baseUrl);
+  }
+
+  //======= HELPER FUNCTIONS =============
+
+  /** Resolves after `ms` milliseconds. */
+  async sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+  }
+
+  /**
+   * Saves results through the first saver and returns its result.
+   * Only the Postgres saver is used for now, so returning its result
+   * exposes sequelize options and insert information to the caller.
+   */
+  async saveCrawledResults(results) {
+    return this.savers[0].save(results);
+  }
+}
+
+module.exports = SaljicCrawler;
diff --git a/development.env b/development.env
index 89f0a1e..a18b79d 100644
--- a/development.env
+++ b/development.env
@@ -59,3 +59,10 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
 AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
 AKTIDO_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page
 AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
+#==SALJIC NEKRETNINE==
+SALJIC_MAX_PAGES=maximum number of pages to crawl
+SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
+SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
+SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
+SALJIC_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
+SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found