diff --git a/app/config/appConfig.js b/app/config/appConfig.js
index 1ed59a3..0ff991a 100644
--- a/app/config/appConfig.js
+++ b/app/config/appConfig.js
@@ -41,6 +41,13 @@ const PROSTOR_LOGIN = {
   PASSWORD: process.env.PROSTOR_LOGIN_PASS
 };
 
+const USER_AGENT =
+  process.env.USER_AGENT ||
+  "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
+
+const USE_SCRAPER_API = process.env.USE_SCRAPER_API !== "0"; // On unless explicitly set to "0" (env values are strings, so "0" is truthy)
+const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
+
 module.exports = {
   APP_PORT,
   APP_URL,
@@ -54,5 +61,8 @@ module.exports = {
   API_MAP_KEY,
   STAGING,
   CHECK_UP_DAYS,
-  PROSTOR_LOGIN
+  PROSTOR_LOGIN,
+  USER_AGENT,
+  USE_SCRAPER_API,
+  SCRAPER_API_KEY
 };
diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js
index 74bcba7..9755eb7 100644
--- a/app/crawler/specificCrawlers/aktido.js
+++ b/app/crawler/specificCrawlers/aktido.js
@@ -1,6 +1,6 @@
 "use strict";
 
-const fetch = require("node-fetch");
+const fetch = require("../../helpers/fetchWrapper");
 const cheerio = require("cheerio");
 const Promise = require("bluebird");
 const moment = require("moment-timezone");
diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js
index ede9734..1e3d63a 100644
--- a/app/crawler/specificCrawlers/olx.js
+++ b/app/crawler/specificCrawlers/olx.js
@@ -1,6 +1,6 @@
 "use strict";
 
-const fetch = require("node-fetch");
+const fetch = require("../../helpers/fetchWrapper");
 const cheerio = require("cheerio");
 const Promise = require("bluebird");
 const moment = require("moment-timezone");
diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js
index 04be5f3..5690a4b 100644
--- a/app/crawler/specificCrawlers/prostor.js
+++ b/app/crawler/specificCrawlers/prostor.js
@@ -1,6 +1,6 @@
 "use strict";
 
-const fetch = require("node-fetch");
+const fetch = require("../../helpers/fetchWrapper");
 const cheerio = 
require("cheerio"); const moment = require("moment-timezone"); const FormData = require("form-data"); @@ -191,13 +191,7 @@ class ProstorCrawler { const { lat, lng, property_name, price, size, link, status } = realEstate; //Status information is given already in realestate list - //For VIP Ads status ='' canot be used, but no VIP ads are crawled - //We will make "fake" vip ad for RE that have size=55 - //It is weird because yesterday it said 'VIP ponuda' ??? - const adStatus = - size === "55" - ? ProstorCrawler.getStatusId("VIP ponuda") - : ProstorCrawler.getStatusId(status); + const adStatus = ProstorCrawler.getStatusId(status); const url = `https://prostor.ba${link}`; diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index 39eb1c5..ed37e83 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -1,6 +1,6 @@ "use strict"; -const fetch = require("node-fetch"); +const fetch = require("../../helpers/fetchWrapper"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); @@ -399,7 +399,9 @@ class RentalCrawler { ); if (!publishedDateMoment.isValid()) { throw { - message: `Invalid published date : ${extractedData["re_realEstates_inserted"]}` + message: `Invalid published date : ${ + extractedData["re_realEstates_inserted"] + }` }; } @@ -410,7 +412,9 @@ class RentalCrawler { ); if (!renewedDateMoment.isValid()) { throw { - message: `Invalid renewed date : ${extractedData["re_realEstates_edited"]}` + message: `Invalid renewed date : ${ + extractedData["re_realEstates_edited"] + }` }; } diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 2eef7cc..9914f35 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -1,6 +1,6 @@ "use strict"; -const fetch = require("node-fetch"); +const fetch = require("../../helpers/fetchWrapper"); const cheerio 
= require("cheerio"); const moment = require("moment-timezone"); diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index 0f77260..8915695 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -332,10 +332,14 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => { }; } + //When includeIncompleteAds are not defined - null it will consider it true const order = [["updatedAt", "desc"]]; return db.RealEstate.findAll({ - where: includeIncompleteAds ? queryIncludeIncomplete : query, + where: + includeIncompleteAds || includeIncompleteAds == null + ? queryIncludeIncomplete + : query, limit: maxResults, order }); diff --git a/app/helpers/db/searchRequest.js b/app/helpers/db/searchRequest.js index f845981..2532f21 100644 --- a/app/helpers/db/searchRequest.js +++ b/app/helpers/db/searchRequest.js @@ -174,7 +174,7 @@ const findSearchRequestsForRealEstate = async realEstate => { } else { // If real estate dont have defined number of rooms ex. 
null //It returns requests that didn't choose number of rooms - also null - //Or ones that picked some values but also picked to includeIncomplete ads + //Or ones that picked some values but also picked to includeIncomplete ads (or default) numberOfRoomsQuery = { [Op.or]: [ { @@ -193,7 +193,10 @@ const findSearchRequestsForRealEstate = async realEstate => { }, { includeIncompleteAds: { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } } } ] @@ -243,7 +246,10 @@ const findSearchRequestsForRealEstate = async realEstate => { }, { includeIncompleteAds: { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } } } ] @@ -292,7 +298,10 @@ const findSearchRequestsForRealEstate = async realEstate => { }, { includeIncompleteAds: { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } } } ] @@ -330,7 +339,10 @@ const findSearchRequestsForRealEstate = async realEstate => { }, { includeIncompleteAds: { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } } } ] @@ -364,7 +376,10 @@ const findSearchRequestsForRealEstate = async realEstate => { }, { includeIncompleteAds: { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } } } ] @@ -398,7 +413,10 @@ const findSearchRequestsForRealEstate = async realEstate => { }, { includeIncompleteAds: { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } } } ] @@ -440,10 +458,13 @@ const findSearchRequestsForRealEstate = async realEstate => { [Op.eq]: "ANY" }; } - //Tag to check if incomplete ads are accepted in query + //Tag to check if incomplete ads are accepted in query which is default if (checkForIncompleteWanted) { query.includeIncompleteAds = { - [Op.eq]: true + [Op.or]: { + [Op.eq]: true, + [Op.is]: null + } }; } diff --git a/app/helpers/fetchWrapper.js b/app/helpers/fetchWrapper.js new file mode 100644 index 0000000..3af2817 --- /dev/null +++ b/app/helpers/fetchWrapper.js @@ -0,0 +1,21 @@ +const nodeFetch = require("node-fetch"); +const { + USER_AGENT, + 
USE_SCRAPER_API, + SCRAPER_API_KEY +} = require("../config/appConfig"); + +const fetch = async (url, options = {}) => { + const newOptions = Object.assign({}, options); + if (!newOptions["headers"]) { + newOptions["headers"] = {}; + } + newOptions["headers"]["User-Agent"] = USER_AGENT; + const urlAdaptedForScraping = USE_SCRAPER_API + ? `http://api.scraperapi.com/?api_key=${SCRAPER_API_KEY}&url=${url}` + : url; + + return nodeFetch(urlAdaptedForScraping, newOptions); +}; + +module.exports = fetch; diff --git a/app/models/searchRequest.js b/app/models/searchRequest.js index 39bcd2e..f0c04eb 100644 --- a/app/models/searchRequest.js +++ b/app/models/searchRequest.js @@ -15,15 +15,7 @@ module.exports = (sequelize, DataTypes) => { allowNull: false, defaultValue: { type: "Polygon", - coordinates: [ - [ - [0, 0], - [0, 0], - [0, 0], - [0, 0], - [0, 0] - ] - ], + coordinates: [[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]], crs: { type: "name", properties: { name: "EPSG:4326" } } } }, diff --git a/app/views/advancedFilters.ejs b/app/views/advancedFilters.ejs index 23e50f7..03b387f 100644 --- a/app/views/advancedFilters.ejs +++ b/app/views/advancedFilters.ejs @@ -61,9 +61,8 @@

diff --git a/development.env b/development.env index 683fe65..42844a6 100644 --- a/development.env +++ b/development.env @@ -10,6 +10,7 @@ APP_BASE_URL=base url for the app ENVIRONMENT=Variable to denote development, staging and production +USER_AGENT=User agent header to send in fetch requests MAX_REAL_ESTATES_IN_EMAIL=Max number of real estates that will be shown in email, others will be truncated and URL with full list will be shwon MAX_REAL_ESTATES_IN_FIRST_EMAIL=Max number of real estates that will be shown in first (welcome) email @@ -21,6 +22,10 @@ GA_ID=Google Analytics ID #=============== GOOGLE MAPS =============# API_MAP_KEY=(your-key-here) +#=============== SCRAPER API SUPPORT =============# +USE_SCRAPER_API=Set to 1 to turn Scraper API on or 0 to turn it off (on by default) +SCRAPER_API_KEY=API key for Scraper API + #=============== AWS SDK EMAIL SETTINGS =======# AWS_KEY_ID=(your-key-here) AWS_SECRET_ACCESS_KEY=(your-key-here) @@ -69,4 +74,4 @@ AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values -SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found \ No newline at end of file +SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found