diff --git a/app/common/enums.js b/app/common/enums.js index 33cb41e..85ed553 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -216,7 +216,8 @@ const AD_STATUS = { STATUS_DELETED: 4, STATUS_URGENT: 5, STATUS_DISCOUNTED: 6, - STATUS_RENTED: 7 + STATUS_RENTED: 7, + STATUS_VIP: 8 }; const AD_AGENCY = { diff --git a/app/config/appConfig.js b/app/config/appConfig.js index 7a7887b..4403fbd 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -32,6 +32,11 @@ const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0; const API_MAP_KEY = process.env.API_MAP_KEY || ""; +const PROSTOR_LOGIN = { + EMAIL: process.env.PROSTOR_LOGIN_EMAIL, + PASSWORD: process.env.PROSTOR_LOGIN_PASS +}; + module.exports = { APP_PORT, APP_URL, @@ -42,5 +47,6 @@ module.exports = { MAX_REAL_ESTATES_IN_EMAIL, MAX_REAL_ESTATES_IN_FIRST_EMAIL, PRINT_CRAWLER_DEBUG, - API_MAP_KEY + API_MAP_KEY, + PROSTOR_LOGIN }; diff --git a/app/controllers/realEstates.js b/app/controllers/realEstates.js index ce82765..48c1aff 100644 --- a/app/controllers/realEstates.js +++ b/app/controllers/realEstates.js @@ -2,13 +2,14 @@ const { findRealEstatesForSearchRequest } = require("../helpers/db/searchRequestMatch"); +const { AD_STATUS } = require("../common/enums"); const getRealEstates = async (req, res) => { const searchRequestId = req.params["searchRequestId"] || ""; const realEstates = await findRealEstatesForSearchRequest(searchRequestId); const title = "Nekretnine koje odgovaraju Vašim uslovima pretrage"; - res.render("realEstates", { realEstates, title }); + res.render("realEstates", { realEstates, title, AD_STATUS }); }; module.exports = { diff --git a/app/controllers/redirect.js b/app/controllers/redirect.js index 9975ab2..eb4e505 100644 --- a/app/controllers/redirect.js +++ b/app/controllers/redirect.js @@ -1,9 +1,11 @@ const { getRealEstateById } = require("../helpers/db/realEstate"); +const { AD_STATUS } = require("../common/enums"); const getRedirect = async (req, res) => { const id = req.params.id || null; let error = false; let redirectUrl = undefined; + let vipAd = undefined; if (!id) { error = true; } else { @@ -13,6 +15,7 @@ const getRedirect = async (req, res) => { error = true; } else { redirectUrl = realEstate.url; + vipAd = realEstate.adStatus === AD_STATUS.STATUS_VIP; } } catch (e) { error = true; @@ -24,7 +27,7 @@ const getRedirect = async (req, res) => { res.render("notFound", { title }); } else { const title = "Preusmjeravanje"; - res.render("redirect", { title, redirectUrl }); + res.render("redirect", { title, redirectUrl, vipAd }); } }; diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index cb1d3f9..04be5f3 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -3,6 +3,7 @@ const fetch = require("node-fetch"); const cheerio = require("cheerio"); const moment = require("moment-timezone"); +const FormData = require("form-data"); const { AD_TYPE, @@ -16,7 +17,8 @@ const { const { PRINT_CRAWLER_DEBUG, - DEFAULT_TIMEZONE + DEFAULT_TIMEZONE, + PROSTOR_LOGIN } = require("../../config/appConfig"); const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor"); @@ -60,13 +62,16 @@ class ProstorCrawler { async crawl() { const crawlAdCategories = this.crawlerAdCategories; - + //We need session cookie to use login privileges + const prostorCookie = await this.getCookies(); + //New tag to check if crawler loged in + const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie); const newRealEstates = []; - - if (crawlAdCategories) { + //Crawl only if login was successful + if (crawlAdCategories && login) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { - indexGenerators.push(this.categoryIndexer(adCategory)); + indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie)); } let done = false; @@ -119,13 +124,14 @@ class ProstorCrawler { return newRealEstates; } - async *categoryIndexer(adCategory) { + async *categoryIndexer(adCategory, prostorCookie) { const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`; const listOfAllRealEstates = await this.extractRealEstates( - urlPageToCrawl + urlPageToCrawl, + prostorCookie ); let elementToStartIndexFrom = 0; @@ -139,7 +145,8 @@ class ProstorCrawler { elementToStartIndexFrom += realEstatesForSinglePage.length; const singlePageResults = await this.indexSinglePage( - realEstatesForSinglePage + realEstatesForSinglePage, + prostorCookie ); const filteredSinglePageResults = singlePageResults.filter( @@ -163,10 +170,10 @@ class ProstorCrawler { } } - async indexSinglePage(realEstatesList) { + async indexSinglePage(realEstatesList, prostorCookie) { const asyncActions = []; for (const realEstate of realEstatesList) { - asyncActions.push(this.scrapeAd(realEstate)); + asyncActions.push(this.scrapeAd(realEstate, prostorCookie)); } try { @@ -180,12 +187,25 @@ class ProstorCrawler { } } - async scrapeAd(realEstate) { + async scrapeAd(realEstate, prostorCookie) { const { lat, lng, property_name, price, size, link, status } = realEstate; + + //Status information is given already in realestate list + //For VIP Ads status ='' canot be used, but no VIP ads are crawled + //We will make "fake" vip ad for RE that have size=55 + //It is weird because yesterday it said 'VIP ponuda' ??? + const adStatus = + size === "55" + ? ProstorCrawler.getStatusId("VIP ponuda") + : ProstorCrawler.getStatusId(status); + const url = `https://prostor.ba${link}`; + // console.log("[PROSTOR] Scraping : ", url); try { - const adPageSource = await fetch(url); + const adPageSource = await fetch(url, { + headers: { Cookie: prostorCookie } + }); const body = await adPageSource.text(); const $ = cheerio.load(body); @@ -330,7 +350,6 @@ class ProstorCrawler { furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; } - const adStatus = ProstorCrawler.getStatusId(status); const title = property_name; const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; const parsedArea = parseFloat(size); @@ -408,13 +427,15 @@ class ProstorCrawler { } } - async extractRealEstates(url) { + async extractRealEstates(url, prostorCookie) { if (PRINT_CRAWLER_DEBUG) { console.log("[PROSTOR] Index page : ", url); } try { - const res = await fetch(url); + const res = await fetch(url, { + headers: { Cookie: prostorCookie } + }); const body = await res.text(); const $ = cheerio.load(body); @@ -548,6 +569,8 @@ class ProstorCrawler { return AD_STATUS.STATUS_SOLD; case "Iznajmljeno": return AD_STATUS.STATUS_RENTED; + case "VIP ponuda": + return AD_STATUS.STATUS_VIP; default: console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]"); return AD_STATUS.STATUS_NORMAL; @@ -569,6 +592,51 @@ class ProstorCrawler { return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } + async loginForScraping(PROSTOR_LOGIN, prostorCookie) { + let formData = new FormData(); + formData.append("email", PROSTOR_LOGIN.EMAIL); + formData.append("password", PROSTOR_LOGIN.PASSWORD); + + return fetch("https://prostor.ba/moj-prostor/prijava", { + method: "POST", + body: formData, + headers: { Cookie: prostorCookie } + }) + .then(page => { + return page.text(); + }) + .then(resp => { + const $ = cheerio.load(resp); + if ( + $("h1") + .text() + .indexOf("Dobrodošli") !== -1 + ) { + console.log("[PROSTOR]: Crawler loged in!"); + return true; + } else { + console.log("[PROSTOR]: Crawler login failed - wrong credentials!"); + return false; + } + }) + .catch(err => { + console.log("[PROSTOR]: Crawler login error ", err); + }); + } + async getCookies() { + const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", { + headers: { Cookie: "" } + }); + const raw = getResponse.headers.raw()["set-cookie"]; + const cookie = raw + .map(datastring => { + const data = datastring.split(";"); + const cookieData = data[0]; + return cookieData; + }) + .join(";"); + return cookie; + } } module.exports = ProstorCrawler; diff --git a/app/views/realEstates.ejs b/app/views/realEstates.ejs index 3e94a1f..b47b744 100644 --- a/app/views/realEstates.ejs +++ b/app/views/realEstates.ejs @@ -2,6 +2,18 @@