Prostor vip ads #83

Merged
RabbIT09-n merged 7 commits from prostor-vip-ads into master 2020-01-31 22:52:14 +01:00
10 changed files with 181 additions and 41 deletions

View File

@@ -216,7 +216,8 @@ const AD_STATUS = {
STATUS_DELETED: 4,
STATUS_URGENT: 5,
STATUS_DISCOUNTED: 6,
STATUS_RENTED: 7
STATUS_RENTED: 7,
STATUS_VIP: 8
};
const AD_AGENCY = {

View File

@@ -32,6 +32,11 @@ const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0;
const API_MAP_KEY = process.env.API_MAP_KEY || "";
const PROSTOR_LOGIN = {
EMAIL: process.env.PROSTOR_LOGIN_EMAIL,
PASSWORD: process.env.PROSTOR_LOGIN_PASS
};
module.exports = {
APP_PORT,
APP_URL,
@@ -42,5 +47,6 @@ module.exports = {
MAX_REAL_ESTATES_IN_EMAIL,
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
PRINT_CRAWLER_DEBUG,
API_MAP_KEY
API_MAP_KEY,
PROSTOR_LOGIN
};

View File

@@ -2,13 +2,14 @@
const {
findRealEstatesForSearchRequest
} = require("../helpers/db/searchRequestMatch");
const { AD_STATUS } = require("../common/enums");
const getRealEstates = async (req, res) => {
const searchRequestId = req.params["searchRequestId"] || "";
const realEstates = await findRealEstatesForSearchRequest(searchRequestId);
const title = "Nekretnine koje odgovaraju Vašim uslovima pretrage";
res.render("realEstates", { realEstates, title });
res.render("realEstates", { realEstates, title, AD_STATUS });
};
module.exports = {

View File

@@ -1,9 +1,11 @@
const { getRealEstateById } = require("../helpers/db/realEstate");
const { AD_STATUS } = require("../common/enums");
const getRedirect = async (req, res) => {
const id = req.params.id || null;
let error = false;
let redirectUrl = undefined;
let vipAd = undefined;
if (!id) {
error = true;
} else {
@@ -13,6 +15,7 @@ const getRedirect = async (req, res) => {
error = true;
} else {
redirectUrl = realEstate.url;
vipAd = realEstate.adStatus === AD_STATUS.STATUS_VIP;
}
} catch (e) {
error = true;
@@ -24,7 +27,7 @@ const getRedirect = async (req, res) => {
res.render("notFound", { title });
} else {
const title = "Preusmjeravanje";
res.render("redirect", { title, redirectUrl });
res.render("redirect", { title, redirectUrl, vipAd });
}
};

View File

@@ -3,6 +3,7 @@
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const FormData = require("form-data");
const {
AD_TYPE,
@@ -16,7 +17,8 @@ const {
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
DEFAULT_TIMEZONE,
PROSTOR_LOGIN
} = require("../../config/appConfig");
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
@@ -60,13 +62,16 @@ class ProstorCrawler {
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
//We need session cookie to use login privileges
const prostorCookie = await this.getCookies();
//New tag to check if crawler loged in
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
const newRealEstates = [];
if (crawlAdCategories) {
//Crawl only if login was successful
if (crawlAdCategories && login) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory));
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
}
let done = false;
@@ -119,13 +124,14 @@ class ProstorCrawler {
return newRealEstates;
}
async *categoryIndexer(adCategory) {
async *categoryIndexer(adCategory, prostorCookie) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl
urlPageToCrawl,
prostorCookie
);
let elementToStartIndexFrom = 0;
@@ -139,7 +145,8 @@ class ProstorCrawler {
elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage
realEstatesForSinglePage,
prostorCookie
);
const filteredSinglePageResults = singlePageResults.filter(
@@ -163,10 +170,10 @@ class ProstorCrawler {
}
}
async indexSinglePage(realEstatesList) {
async indexSinglePage(realEstatesList, prostorCookie) {
const asyncActions = [];
for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate));
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
}
try {
@@ -180,12 +187,25 @@ class ProstorCrawler {
}
}
async scrapeAd(realEstate) {
async scrapeAd(realEstate, prostorCookie) {
const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list
//For VIP Ads status ='' canot be used, but no VIP ads are crawled
//We will make "fake" vip ad for RE that have size=55
//It is weird because yesterday it said 'VIP ponuda' ???
const adStatus =
size === "55"
? ProstorCrawler.getStatusId("VIP ponuda")
: ProstorCrawler.getStatusId(status);
const url = `https://prostor.ba${link}`;
// console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const adPageSource = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await adPageSource.text();
const $ = cheerio.load(body);
@@ -330,7 +350,6 @@ class ProstorCrawler {
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
}
const adStatus = ProstorCrawler.getStatusId(status);
const title = property_name;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
@@ -408,13 +427,15 @@ class ProstorCrawler {
}
}
async extractRealEstates(url) {
async extractRealEstates(url, prostorCookie) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
}
try {
const res = await fetch(url);
const res = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await res.text();
const $ = cheerio.load(body);
@@ -548,6 +569,8 @@ class ProstorCrawler {
return AD_STATUS.STATUS_SOLD;
case "Iznajmljeno":
return AD_STATUS.STATUS_RENTED;
case "VIP ponuda":
return AD_STATUS.STATUS_VIP;
default:
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
return AD_STATUS.STATUS_NORMAL;
@@ -569,6 +592,51 @@ class ProstorCrawler {
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
let formData = new FormData();
formData.append("email", PROSTOR_LOGIN.EMAIL);
formData.append("password", PROSTOR_LOGIN.PASSWORD);
return fetch("https://prostor.ba/moj-prostor/prijava", {
method: "POST",
body: formData,
headers: { Cookie: prostorCookie }
})
.then(page => {
return page.text();
})
.then(resp => {
const $ = cheerio.load(resp);
if (
$("h1")
.text()
.indexOf("Dobrodošli") !== -1
) {
console.log("[PROSTOR]: Crawler loged in!");
return true;
} else {
console.log("[PROSTOR]: Crawler login failed - wrong credentials!");
return false;
}
})
.catch(err => {
console.log("[PROSTOR]: Crawler login error ", err);
});
}
async getCookies() {
const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", {
headers: { Cookie: "" }
});
const raw = getResponse.headers.raw()["set-cookie"];
const cookie = raw
.map(datastring => {
const data = datastring.split(";");
const cookieData = data[0];
return cookieData;
})
.join(";");
return cookie;
}
}
module.exports = ProstorCrawler;

View File

@@ -2,6 +2,18 @@
<ul class="collection with-header">
<% for(const realEstate of realEstates) { %>
<li class="collection-item">
<% if(realEstate.adStatus === AD_STATUS.STATUS_VIP) {%>
<div>
<% //This needs to do redirecting instead of direct link to realestate
%>
<a href="/redirect/<%= realEstate.id %>" class="estates-link">
<%= realEstate.title %>
<div class="kivi-color secondary-content">
<i class="material-icons">send</i>
</div>
</a>
</div>
<%} else { %>
<div>
<a href="<%= realEstate.url %>" class="estates-link">
<%= realEstate.title %>
@@ -10,6 +22,7 @@
</div>
</a>
</div>
<% }%>
</li>
<% } %>
</ul>

View File

@@ -1,26 +1,49 @@
<br><br>
<br /><br />
<div class="center">
<div class="preloader-wrapper big active center">
<div class="kivi-spinner-color spinner-layer spinner-green-only">
<div class="circle-clipper left">
<div class="circle"></div>
</div><div class="gap-patch">
<div class="circle"></div>
</div><div class="circle-clipper right">
<div class="circle"></div>
</div>
</div>
<div class="preloader-wrapper big active center">
<div class="kivi-spinner-color spinner-layer spinner-green-only">
<div class="circle-clipper left">
<div class="circle"></div>
</div>
<div class="gap-patch">
<div class="circle"></div>
</div>
<div class="circle-clipper right">
<div class="circle"></div>
</div>
</div>
</div>
</div>
<br>
<br />
<% if(vipAd) { %>
<div class="center">
<h6>
<a href="<%= redirectUrl %>" rel="noreferrer" id="realEstateUrl">Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski</a>
</h6>
<h6>
Ovaj oglas zahtijeva da budete član
<a href="https://prostor.ba/" rel="noreferrer">Prostor.ba</a>.
<br />
<br />
<a href="https://prostor.ba/moj-prostor/prijava" rel="noreferrer"
>Ulogujte se</a
>
ili napravite
<a href="https://prostor.ba/moj-prostor/registracija" rel="noreferrer"
>novi račun</a
>, a potom otvorite <a href="<%= redirectUrl %>" rel="noreferrer">oglas</a>.
</h6>
</div>
<% } else { %>
<div class="center">
<h6>
<a href="<%= redirectUrl %>" rel="noreferrer" id="realEstateUrl"
>Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski</a
>
</h6>
</div>
<% }%>
<script>
window.onload = function() {
document.getElementById('realEstateUrl').click();
}
window.onload = function() {
document.getElementById("realEstateUrl").click();
};
</script>

View File

@@ -51,6 +51,8 @@ PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories t
PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!!
PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!!
PROSTOR_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
PROSTOR_LOGIN_EMAIL=Email of valid Prostor.ba account for crawling purposes
PROSTOR_LOGIN_PASS=Password of valid Prostor.ba account for crawling purposes
#==AKTIDO==
AKTIDO_MAX_PAGES=Restrict crawler to this number of pages
AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved

30
package-lock.json generated
View File

@@ -1346,13 +1346,23 @@
"integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
},
"form-data": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
"integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.0.tgz",
"integrity": "sha512-CKMFDglpbMi6PyN+brwB9Q/GOw0eAnsrEZDgcsH5Krhz5Od/haKHAX0NmQfha2zPPz0JpWzA7GJHGSnvCRLWsg==",
"requires": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.6",
"combined-stream": "^1.0.8",
"mime-types": "^2.1.12"
},
"dependencies": {
"combined-stream": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"requires": {
"delayed-stream": "~1.0.0"
}
}
}
},
"forwarded": {
@@ -3430,6 +3440,18 @@
"tough-cookie": "~2.4.3",
"tunnel-agent": "^0.6.0",
"uuid": "^3.3.2"
},
"dependencies": {
"form-data": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
"integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
"requires": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.6",
"mime-types": "^2.1.12"
}
}
}
},
"require-directory": {

View File

@@ -39,6 +39,7 @@
"express": "^4.16.4",
"express-ejs-layouts": "^2.5.0",
"express-layout": "^0.1.0",
"form-data": "^3.0.0",
"html-to-text": "^5.1.1",
"moment": "^2.24.0",
"moment-timezone": "^0.5.26",