From 850528267011f08261f1486483f4de9374df8d09 Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Sun, 12 Jan 2020 01:22:50 +0100 Subject: [PATCH] WiP Login of crawler prostor. --- app/config/appConfig.js | 8 +++- app/crawler/specificCrawlers/prostor.js | 54 ++++++++++++++++++++++--- development.env | 2 + 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/app/config/appConfig.js b/app/config/appConfig.js index 7a7887b..4403fbd 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -32,6 +32,11 @@ const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0; const API_MAP_KEY = process.env.API_MAP_KEY || ""; +const PROSTOR_LOGIN = { + EMAIL: process.env.PROSTOR_LOGIN_EMAIL, + PASSWORD: process.env.PROSTOR_LOGIN_PASS +}; + module.exports = { APP_PORT, APP_URL, @@ -42,5 +47,6 @@ module.exports = { MAX_REAL_ESTATES_IN_EMAIL, MAX_REAL_ESTATES_IN_FIRST_EMAIL, PRINT_CRAWLER_DEBUG, - API_MAP_KEY + API_MAP_KEY, + PROSTOR_LOGIN }; diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index ca4271c..96aab61 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -16,7 +16,8 @@ const { const { PRINT_CRAWLER_DEBUG, - DEFAULT_TIMEZONE + DEFAULT_TIMEZONE, + PROSTOR_LOGIN } = require("../../config/appConfig"); const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor"); @@ -60,10 +61,12 @@ class ProstorCrawler { async crawl() { const crawlAdCategories = this.crawlerAdCategories; - + //New tag to check if crawler loged in + const login = await this.loginForScraping(PROSTOR_LOGIN); const newRealEstates = []; - - if (crawlAdCategories) { + // + console.log("login before crawl:", login); + if (crawlAdCategories && login) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { indexGenerators.push(this.categoryIndexer(adCategory)); @@ -549,8 +552,6 @@ class ProstorCrawler { } static getStatusId(statusText) { - // - console.log("statusText u funkciji", statusText); switch (statusText) { case "": return AD_STATUS.STATUS_NORMAL; @@ -583,6 +584,47 @@ class ProstorCrawler { return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } + async loginForScraping(PROSTOR_LOGIN) { + console.log("PROSTOR_LOGIN", PROSTOR_LOGIN); + let logedin = false; + fetch("https://prostor.ba/moj-prostor/prijava", { + method: "POST", + body: JSON.stringify({ + email: PROSTOR_LOGIN.EMAIL, + password: PROSTOR_LOGIN.PASSWORD + }) + }) + .then(page => { + /* console.log("page", page.text()); + + const $ = cheerio.load(page); + console.log("$ ", $); + if ( + $(".icons .d-none.d-xl-inline-block.mr-2") + .text() + .indexOf("Dobrodošli") != -1 + ) { + console.log("[PROSTOR]: Crawler loged in!"); + logedin = true; + } else { + console.log("[PROSTOR]: Crawler login failed - wrong credentials!"); + } */ + + return page.text(); + }) + .then(resp => { + // console.log(resp); + const $ = cheerio.load(resp); + console.log("$ ", $("h1").text()); + }) + + .catch(err => { + console.log("[PROSTOR]: Crawler login error ", err); + }); + // + console.log("login in function:", logedin); + return logedin; + } } module.exports = ProstorCrawler; diff --git a/development.env b/development.env index 89f0a1e..150f8be 100644 --- a/development.env +++ b/development.env @@ -51,6 +51,8 @@ PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories t PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!! PROSTOR_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found +PROSTOR_LOGIN_EMAIL=Email of valid Prostor.ba account for crawling purposes +PROSTOR_LOGIN_PASS=Password of valid Prostor.ba account for crawling purposes #==AKTIDO== AKTIDO_MAX_PAGES=Restrict crawler to this number of pages AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved