diff --git a/app/common/enums.js b/app/common/enums.js index 33cb41e..85ed553 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -216,7 +216,8 @@ const AD_STATUS = { STATUS_DELETED: 4, STATUS_URGENT: 5, STATUS_DISCOUNTED: 6, - STATUS_RENTED: 7 + STATUS_RENTED: 7, + STATUS_VIP: 8 }; const AD_AGENCY = { diff --git a/app/config/appConfig.js b/app/config/appConfig.js index 7a7887b..4403fbd 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -32,6 +32,11 @@ const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0; const API_MAP_KEY = process.env.API_MAP_KEY || ""; +const PROSTOR_LOGIN = { + EMAIL: process.env.PROSTOR_LOGIN_EMAIL, + PASSWORD: process.env.PROSTOR_LOGIN_PASS +}; + module.exports = { APP_PORT, APP_URL, @@ -42,5 +47,6 @@ module.exports = { MAX_REAL_ESTATES_IN_EMAIL, MAX_REAL_ESTATES_IN_FIRST_EMAIL, PRINT_CRAWLER_DEBUG, - API_MAP_KEY + API_MAP_KEY, + PROSTOR_LOGIN }; diff --git a/app/controllers/realEstates.js b/app/controllers/realEstates.js index ce82765..48c1aff 100644 --- a/app/controllers/realEstates.js +++ b/app/controllers/realEstates.js @@ -2,13 +2,14 @@ const { findRealEstatesForSearchRequest } = require("../helpers/db/searchRequestMatch"); +const { AD_STATUS } = require("../common/enums"); const getRealEstates = async (req, res) => { const searchRequestId = req.params["searchRequestId"] || ""; const realEstates = await findRealEstatesForSearchRequest(searchRequestId); const title = "Nekretnine koje odgovaraju Vašim uslovima pretrage"; - res.render("realEstates", { realEstates, title }); + res.render("realEstates", { realEstates, title, AD_STATUS }); }; module.exports = { diff --git a/app/controllers/redirect.js b/app/controllers/redirect.js index 9975ab2..eb4e505 100644 --- a/app/controllers/redirect.js +++ b/app/controllers/redirect.js @@ -1,9 +1,11 @@ const { getRealEstateById } = require("../helpers/db/realEstate"); +const { AD_STATUS } = require("../common/enums"); const getRedirect = async (req, res) => { const id = req.params.id || null; let error = false; let redirectUrl = undefined; + let vipAd = undefined; if (!id) { error = true; } else { @@ -13,6 +15,7 @@ const getRedirect = async (req, res) => { error = true; } else { redirectUrl = realEstate.url; + vipAd = realEstate.adStatus === AD_STATUS.STATUS_VIP; } } catch (e) { error = true; @@ -24,7 +27,7 @@ const getRedirect = async (req, res) => { res.render("notFound", { title }); } else { const title = "Preusmjeravanje"; - res.render("redirect", { title, redirectUrl }); + res.render("redirect", { title, redirectUrl, vipAd }); } }; diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index cb1d3f9..04be5f3 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -3,6 +3,7 @@ const fetch = require("node-fetch"); const cheerio = require("cheerio"); const moment = require("moment-timezone"); +const FormData = require("form-data"); const { AD_TYPE, @@ -16,7 +17,8 @@ const { const { PRINT_CRAWLER_DEBUG, - DEFAULT_TIMEZONE + DEFAULT_TIMEZONE, + PROSTOR_LOGIN } = require("../../config/appConfig"); const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor"); @@ -60,13 +62,16 @@ class ProstorCrawler { async crawl() { const crawlAdCategories = this.crawlerAdCategories; - + //We need session cookie to use login privileges + const prostorCookie = await this.getCookies(); + //New tag to check if crawler loged in + const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie); const newRealEstates = []; - - if (crawlAdCategories) { + //Crawl only if login was successful + if (crawlAdCategories && login) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { - indexGenerators.push(this.categoryIndexer(adCategory)); + indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie)); } let done = false; @@ -119,13 +124,14 @@ class ProstorCrawler { return newRealEstates; } - async *categoryIndexer(adCategory) { + async *categoryIndexer(adCategory, prostorCookie) { const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`; const listOfAllRealEstates = await this.extractRealEstates( - urlPageToCrawl + urlPageToCrawl, + prostorCookie ); let elementToStartIndexFrom = 0; @@ -139,7 +145,8 @@ class ProstorCrawler { elementToStartIndexFrom += realEstatesForSinglePage.length; const singlePageResults = await this.indexSinglePage( - realEstatesForSinglePage + realEstatesForSinglePage, + prostorCookie ); const filteredSinglePageResults = singlePageResults.filter( @@ -163,10 +170,10 @@ class ProstorCrawler { } } - async indexSinglePage(realEstatesList) { + async indexSinglePage(realEstatesList, prostorCookie) { const asyncActions = []; for (const realEstate of realEstatesList) { - asyncActions.push(this.scrapeAd(realEstate)); + asyncActions.push(this.scrapeAd(realEstate, prostorCookie)); } try { @@ -180,12 +187,25 @@ class ProstorCrawler { } } - async scrapeAd(realEstate) { + async scrapeAd(realEstate, prostorCookie) { const { lat, lng, property_name, price, size, link, status } = realEstate; + + //Status information is given already in realestate list + //For VIP Ads status ='' canot be used, but no VIP ads are crawled + //We will make "fake" vip ad for RE that have size=55 + //It is weird because yesterday it said 'VIP ponuda' ??? + const adStatus = + size === "55" + ? ProstorCrawler.getStatusId("VIP ponuda") + : ProstorCrawler.getStatusId(status); + const url = `https://prostor.ba${link}`; + // console.log("[PROSTOR] Scraping : ", url); try { - const adPageSource = await fetch(url); + const adPageSource = await fetch(url, { + headers: { Cookie: prostorCookie } + }); const body = await adPageSource.text(); const $ = cheerio.load(body); @@ -330,7 +350,6 @@ class ProstorCrawler { furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; } - const adStatus = ProstorCrawler.getStatusId(status); const title = property_name; const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; const parsedArea = parseFloat(size); @@ -408,13 +427,15 @@ class ProstorCrawler { } } - async extractRealEstates(url) { + async extractRealEstates(url, prostorCookie) { if (PRINT_CRAWLER_DEBUG) { console.log("[PROSTOR] Index page : ", url); } try { - const res = await fetch(url); + const res = await fetch(url, { + headers: { Cookie: prostorCookie } + }); const body = await res.text(); const $ = cheerio.load(body); @@ -548,6 +569,8 @@ class ProstorCrawler { return AD_STATUS.STATUS_SOLD; case "Iznajmljeno": return AD_STATUS.STATUS_RENTED; + case "VIP ponuda": + return AD_STATUS.STATUS_VIP; default: console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]"); return AD_STATUS.STATUS_NORMAL; @@ -569,6 +592,51 @@ class ProstorCrawler { return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } + async loginForScraping(PROSTOR_LOGIN, prostorCookie) { + let formData = new FormData(); + formData.append("email", PROSTOR_LOGIN.EMAIL); + formData.append("password", PROSTOR_LOGIN.PASSWORD); + + return fetch("https://prostor.ba/moj-prostor/prijava", { + method: "POST", + body: formData, + headers: { Cookie: prostorCookie } + }) + .then(page => { + return page.text(); + }) + .then(resp => { + const $ = cheerio.load(resp); + if ( + $("h1") + .text() + .indexOf("Dobrodošli") !== -1 + ) { + console.log("[PROSTOR]: Crawler loged in!"); + return true; + } else { + console.log("[PROSTOR]: Crawler login failed - wrong credentials!"); + return false; + } + }) + .catch(err => { + console.log("[PROSTOR]: Crawler login error ", err); + }); + } + async getCookies() { + const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", { + headers: { Cookie: "" } + }); + const raw = getResponse.headers.raw()["set-cookie"]; + const cookie = raw + .map(datastring => { + const data = datastring.split(";"); + const cookieData = data[0]; + return cookieData; + }) + .join(";"); + return cookie; + } } module.exports = ProstorCrawler; diff --git a/app/views/realEstates.ejs b/app/views/realEstates.ejs index 3e94a1f..b47b744 100644 --- a/app/views/realEstates.ejs +++ b/app/views/realEstates.ejs @@ -2,6 +2,18 @@ diff --git a/app/views/redirect.ejs b/app/views/redirect.ejs index 52233cb..e36e081 100644 --- a/app/views/redirect.ejs +++ b/app/views/redirect.ejs @@ -1,26 +1,49 @@ -

+

-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+<% if(vipAd) { %>
-
- Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski -
+
+ Ovaj oglas zahtijeva da budete član + Prostor.ba. +
+
+ Ulogujte se + ili napravite + novi račun, a potom otvorite oglas. +
+<% } else { %> +
+
+ Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski +
+
+<% }%> + diff --git a/development.env b/development.env index 89f0a1e..150f8be 100644 --- a/development.env +++ b/development.env @@ -51,6 +51,8 @@ PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories t PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!! PROSTOR_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found +PROSTOR_LOGIN_EMAIL=Email of valid Prostor.ba account for crawling purposes +PROSTOR_LOGIN_PASS=Password of valid Prostor.ba account for crawling purposes #==AKTIDO== AKTIDO_MAX_PAGES=Restrict crawler to this number of pages AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved diff --git a/package-lock.json b/package-lock.json index 9661459..4626180 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1346,13 +1346,23 @@ "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=" }, "form-data": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz", - "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.0.tgz", + "integrity": "sha512-CKMFDglpbMi6PyN+brwB9Q/GOw0eAnsrEZDgcsH5Krhz5Od/haKHAX0NmQfha2zPPz0JpWzA7GJHGSnvCRLWsg==", "requires": { "asynckit": "^0.4.0", - "combined-stream": "^1.0.6", + "combined-stream": "^1.0.8", "mime-types": "^2.1.12" + }, + "dependencies": { + "combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "requires": { + "delayed-stream": "~1.0.0" + } + } } }, "forwarded": { @@ -3430,6 +3440,18 @@ "tough-cookie": "~2.4.3", "tunnel-agent": "^0.6.0", "uuid": "^3.3.2" + }, + "dependencies": { + "form-data": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz", + "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==", + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.6", + "mime-types": "^2.1.12" + } + } } }, "require-directory": { diff --git a/package.json b/package.json index 75a7cc4..511f772 100644 --- a/package.json +++ b/package.json @@ -39,6 +39,7 @@ "express": "^4.16.4", "express-ejs-layouts": "^2.5.0", "express-layout": "^0.1.0", + "form-data": "^3.0.0", "html-to-text": "^5.1.1", "moment": "^2.24.0", "moment-timezone": "^0.5.26",