Compare commits
10 Commits
crawler-sa
...
prostor-vi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
511b290096 | ||
|
|
ba43fa0713 | ||
|
|
e70901d369 | ||
|
|
8505282670 | ||
|
|
64e4835899 | ||
|
|
1658325c4b | ||
|
|
49161c1b60 | ||
|
|
d23ddf849f | ||
|
|
38bd0343f5 | ||
|
|
fa4e0d64de |
@@ -216,15 +216,15 @@ const AD_STATUS = {
|
|||||||
STATUS_DELETED: 4,
|
STATUS_DELETED: 4,
|
||||||
STATUS_URGENT: 5,
|
STATUS_URGENT: 5,
|
||||||
STATUS_DISCOUNTED: 6,
|
STATUS_DISCOUNTED: 6,
|
||||||
STATUS_RENTED: 7
|
STATUS_RENTED: 7,
|
||||||
|
STATUS_VIP: 8
|
||||||
};
|
};
|
||||||
|
|
||||||
const AD_AGENCY = {
|
const AD_AGENCY = {
|
||||||
OLX: "OLX",
|
OLX: "OLX",
|
||||||
RENTAL: "RENTAL",
|
RENTAL: "RENTAL",
|
||||||
PROSTOR: "PROSTOR",
|
PROSTOR: "PROSTOR",
|
||||||
AKTIDO: "AKTIDO",
|
AKTIDO: "AKTIDO"
|
||||||
SALJIC: "SALJIC"
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const CRAWLER_AD_TYPE = {
|
const CRAWLER_AD_TYPE = {
|
||||||
|
|||||||
@@ -32,6 +32,11 @@ const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0;
|
|||||||
|
|
||||||
const API_MAP_KEY = process.env.API_MAP_KEY || "";
|
const API_MAP_KEY = process.env.API_MAP_KEY || "";
|
||||||
|
|
||||||
|
const PROSTOR_LOGIN = {
|
||||||
|
EMAIL: process.env.PROSTOR_LOGIN_EMAIL,
|
||||||
|
PASSWORD: process.env.PROSTOR_LOGIN_PASS
|
||||||
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
APP_PORT,
|
APP_PORT,
|
||||||
APP_URL,
|
APP_URL,
|
||||||
@@ -42,5 +47,6 @@ module.exports = {
|
|||||||
MAX_REAL_ESTATES_IN_EMAIL,
|
MAX_REAL_ESTATES_IN_EMAIL,
|
||||||
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
|
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
|
||||||
PRINT_CRAWLER_DEBUG,
|
PRINT_CRAWLER_DEBUG,
|
||||||
API_MAP_KEY
|
API_MAP_KEY,
|
||||||
|
PROSTOR_LOGIN
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -2,13 +2,14 @@
|
|||||||
const {
|
const {
|
||||||
findRealEstatesForSearchRequest
|
findRealEstatesForSearchRequest
|
||||||
} = require("../helpers/db/searchRequestMatch");
|
} = require("../helpers/db/searchRequestMatch");
|
||||||
|
const { AD_STATUS } = require("../common/enums");
|
||||||
|
|
||||||
const getRealEstates = async (req, res) => {
|
const getRealEstates = async (req, res) => {
|
||||||
const searchRequestId = req.params["searchRequestId"] || "";
|
const searchRequestId = req.params["searchRequestId"] || "";
|
||||||
const realEstates = await findRealEstatesForSearchRequest(searchRequestId);
|
const realEstates = await findRealEstatesForSearchRequest(searchRequestId);
|
||||||
|
|
||||||
const title = "Nekretnine koje odgovaraju Vašim uslovima pretrage";
|
const title = "Nekretnine koje odgovaraju Vašim uslovima pretrage";
|
||||||
res.render("realEstates", { realEstates, title });
|
res.render("realEstates", { realEstates, title, AD_STATUS });
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
const { getRealEstateById } = require("../helpers/db/realEstate");
|
const { getRealEstateById } = require("../helpers/db/realEstate");
|
||||||
|
const { AD_STATUS } = require("../common/enums");
|
||||||
|
|
||||||
const getRedirect = async (req, res) => {
|
const getRedirect = async (req, res) => {
|
||||||
const id = req.params.id || null;
|
const id = req.params.id || null;
|
||||||
let error = false;
|
let error = false;
|
||||||
let redirectUrl = undefined;
|
let redirectUrl = undefined;
|
||||||
|
let vipAd = undefined;
|
||||||
if (!id) {
|
if (!id) {
|
||||||
error = true;
|
error = true;
|
||||||
} else {
|
} else {
|
||||||
@@ -13,6 +15,7 @@ const getRedirect = async (req, res) => {
|
|||||||
error = true;
|
error = true;
|
||||||
} else {
|
} else {
|
||||||
redirectUrl = realEstate.url;
|
redirectUrl = realEstate.url;
|
||||||
|
vipAd = realEstate.adStatus === AD_STATUS.STATUS_VIP;
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
error = true;
|
error = true;
|
||||||
@@ -24,7 +27,7 @@ const getRedirect = async (req, res) => {
|
|||||||
res.render("notFound", { title });
|
res.render("notFound", { title });
|
||||||
} else {
|
} else {
|
||||||
const title = "Preusmjeravanje";
|
const title = "Preusmjeravanje";
|
||||||
res.render("redirect", { title, redirectUrl });
|
res.render("redirect", { title, redirectUrl, vipAd });
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -9,14 +9,12 @@ const OlxCrawler = require("./specificCrawlers/olx");
|
|||||||
const RentalCrawler = require("./specificCrawlers/rental");
|
const RentalCrawler = require("./specificCrawlers/rental");
|
||||||
const ProstorCrawler = require("./specificCrawlers/prostor");
|
const ProstorCrawler = require("./specificCrawlers/prostor");
|
||||||
const AktidoCrawler = require("./specificCrawlers/aktido");
|
const AktidoCrawler = require("./specificCrawlers/aktido");
|
||||||
const SaljicCrawler = require("./specificCrawlers/saljic");
|
|
||||||
|
|
||||||
const {
|
const {
|
||||||
OLX_CONFIG,
|
OLX_CONFIG,
|
||||||
RENTAL_CONFIG,
|
RENTAL_CONFIG,
|
||||||
PROSTOR_CONFIG,
|
PROSTOR_CONFIG,
|
||||||
AKTIDO_CONFIG,
|
AKTIDO_CONFIG
|
||||||
SALJIC_CONFIG
|
|
||||||
} = require("./crawlerConfig");
|
} = require("./crawlerConfig");
|
||||||
const PostgresSaver = require("./savers/postgres");
|
const PostgresSaver = require("./savers/postgres");
|
||||||
|
|
||||||
@@ -59,15 +57,6 @@ async function crawlAll() {
|
|||||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||||
),
|
|
||||||
new SaljicCrawler(
|
|
||||||
[postgresSaver],
|
|
||||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
|
||||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
|
||||||
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
|
||||||
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
|
||||||
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
|
||||||
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
|
||||||
)
|
)
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -5,12 +5,10 @@ const OLX_CONFIG = require("./specificConfigs/olx");
|
|||||||
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
||||||
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
||||||
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
||||||
const SALJIC_CONFIG = require("./specificConfigs/saljic");
|
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
OLX_CONFIG,
|
OLX_CONFIG,
|
||||||
RENTAL_CONFIG,
|
RENTAL_CONFIG,
|
||||||
PROSTOR_CONFIG,
|
PROSTOR_CONFIG,
|
||||||
AKTIDO_CONFIG,
|
AKTIDO_CONFIG
|
||||||
SALJIC_CONFIG
|
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,34 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
|
|
||||||
|
|
||||||
const saljicCrawlerAdType =
|
|
||||||
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
|
|
||||||
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
|
|
||||||
: null;
|
|
||||||
|
|
||||||
const saljicParsedCrawlerAdCategories =
|
|
||||||
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
|
|
||||||
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
|
||||||
category.trim()
|
|
||||||
)
|
|
||||||
: ["FLAT", "HOUSE"];
|
|
||||||
|
|
||||||
const saljicIgnoredUsernames = [];
|
|
||||||
|
|
||||||
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
|
|
||||||
.map(categoryName =>
|
|
||||||
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
|
|
||||||
)
|
|
||||||
.filter(category => !!category);
|
|
||||||
|
|
||||||
module.exports = {
|
|
||||||
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
|
|
||||||
SALJIC_MAX_RESULTS_PER_PAGE:
|
|
||||||
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
|
|
||||||
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
|
||||||
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
|
|
||||||
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
|
|
||||||
SALJIC_DELAY_BETWEEN_PAGES:
|
|
||||||
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
|
|
||||||
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
|
|
||||||
};
|
|
||||||
@@ -3,6 +3,7 @@
|
|||||||
const fetch = require("node-fetch");
|
const fetch = require("node-fetch");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
|
const FormData = require("form-data");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
@@ -16,7 +17,8 @@ const {
|
|||||||
|
|
||||||
const {
|
const {
|
||||||
PRINT_CRAWLER_DEBUG,
|
PRINT_CRAWLER_DEBUG,
|
||||||
DEFAULT_TIMEZONE
|
DEFAULT_TIMEZONE,
|
||||||
|
PROSTOR_LOGIN
|
||||||
} = require("../../config/appConfig");
|
} = require("../../config/appConfig");
|
||||||
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
|
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
|
||||||
|
|
||||||
@@ -60,13 +62,16 @@ class ProstorCrawler {
|
|||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
//We need a session cookie to use login privileges
|
||||||
|
const prostorCookie = await this.getCookies();
|
||||||
|
//New flag to check whether the crawler logged in
|
||||||
|
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
|
||||||
const newRealEstates = [];
|
const newRealEstates = [];
|
||||||
|
//Crawl only if login was successful
|
||||||
if (crawlAdCategories) {
|
if (crawlAdCategories && login) {
|
||||||
const indexGenerators = [];
|
const indexGenerators = [];
|
||||||
for (const adCategory of crawlAdCategories) {
|
for (const adCategory of crawlAdCategories) {
|
||||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
|
||||||
}
|
}
|
||||||
|
|
||||||
let done = false;
|
let done = false;
|
||||||
@@ -119,13 +124,14 @@ class ProstorCrawler {
|
|||||||
return newRealEstates;
|
return newRealEstates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async *categoryIndexer(adCategory) {
|
async *categoryIndexer(adCategory, prostorCookie) {
|
||||||
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
||||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
||||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
|
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
|
||||||
const listOfAllRealEstates = await this.extractRealEstates(
|
const listOfAllRealEstates = await this.extractRealEstates(
|
||||||
urlPageToCrawl
|
urlPageToCrawl,
|
||||||
|
prostorCookie
|
||||||
);
|
);
|
||||||
|
|
||||||
let elementToStartIndexFrom = 0;
|
let elementToStartIndexFrom = 0;
|
||||||
@@ -139,7 +145,8 @@ class ProstorCrawler {
|
|||||||
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
||||||
|
|
||||||
const singlePageResults = await this.indexSinglePage(
|
const singlePageResults = await this.indexSinglePage(
|
||||||
realEstatesForSinglePage
|
realEstatesForSinglePage,
|
||||||
|
prostorCookie
|
||||||
);
|
);
|
||||||
|
|
||||||
const filteredSinglePageResults = singlePageResults.filter(
|
const filteredSinglePageResults = singlePageResults.filter(
|
||||||
@@ -163,10 +170,10 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async indexSinglePage(realEstatesList) {
|
async indexSinglePage(realEstatesList, prostorCookie) {
|
||||||
const asyncActions = [];
|
const asyncActions = [];
|
||||||
for (const realEstate of realEstatesList) {
|
for (const realEstate of realEstatesList) {
|
||||||
asyncActions.push(this.scrapeAd(realEstate));
|
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -180,12 +187,25 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(realEstate) {
|
async scrapeAd(realEstate, prostorCookie) {
|
||||||
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
||||||
|
|
||||||
|
//Status information is already given in the real estate list
|
||||||
|
//For VIP ads status ='' cannot be used, but no VIP ads are crawled
|
||||||
|
//We will make a "fake" VIP ad for real estates that have size=55
|
||||||
|
//It is weird because yesterday it said 'VIP ponuda' ???
|
||||||
|
const adStatus =
|
||||||
|
size === "55"
|
||||||
|
? ProstorCrawler.getStatusId("VIP ponuda")
|
||||||
|
: ProstorCrawler.getStatusId(status);
|
||||||
|
|
||||||
const url = `https://prostor.ba${link}`;
|
const url = `https://prostor.ba${link}`;
|
||||||
|
|
||||||
// console.log("[PROSTOR] Scraping : ", url);
|
// console.log("[PROSTOR] Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url, {
|
||||||
|
headers: { Cookie: prostorCookie }
|
||||||
|
});
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
@@ -330,7 +350,6 @@ class ProstorCrawler {
|
|||||||
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
|
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
|
||||||
}
|
}
|
||||||
|
|
||||||
const adStatus = ProstorCrawler.getStatusId(status);
|
|
||||||
const title = property_name;
|
const title = property_name;
|
||||||
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
||||||
const parsedArea = parseFloat(size);
|
const parsedArea = parseFloat(size);
|
||||||
@@ -408,13 +427,15 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async extractRealEstates(url) {
|
async extractRealEstates(url, prostorCookie) {
|
||||||
if (PRINT_CRAWLER_DEBUG) {
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
console.log("[PROSTOR] Index page : ", url);
|
console.log("[PROSTOR] Index page : ", url);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
const res = await fetch(url, {
|
||||||
|
headers: { Cookie: prostorCookie }
|
||||||
|
});
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
@@ -548,6 +569,8 @@ class ProstorCrawler {
|
|||||||
return AD_STATUS.STATUS_SOLD;
|
return AD_STATUS.STATUS_SOLD;
|
||||||
case "Iznajmljeno":
|
case "Iznajmljeno":
|
||||||
return AD_STATUS.STATUS_RENTED;
|
return AD_STATUS.STATUS_RENTED;
|
||||||
|
case "VIP ponuda":
|
||||||
|
return AD_STATUS.STATUS_VIP;
|
||||||
default:
|
default:
|
||||||
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
|
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
|
||||||
return AD_STATUS.STATUS_NORMAL;
|
return AD_STATUS.STATUS_NORMAL;
|
||||||
@@ -569,6 +592,51 @@ class ProstorCrawler {
|
|||||||
return savers[0].save(results);
|
return savers[0].save(results);
|
||||||
//so that we can use some sequelize options and information when data is inserted
|
//so that we can use some sequelize options and information when data is inserted
|
||||||
}
|
}
|
||||||
|
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
|
||||||
|
let formData = new FormData();
|
||||||
|
formData.append("email", PROSTOR_LOGIN.EMAIL);
|
||||||
|
formData.append("password", PROSTOR_LOGIN.PASSWORD);
|
||||||
|
|
||||||
|
return fetch("https://prostor.ba/moj-prostor/prijava", {
|
||||||
|
method: "POST",
|
||||||
|
body: formData,
|
||||||
|
headers: { Cookie: prostorCookie }
|
||||||
|
})
|
||||||
|
.then(page => {
|
||||||
|
return page.text();
|
||||||
|
})
|
||||||
|
.then(resp => {
|
||||||
|
const $ = cheerio.load(resp);
|
||||||
|
if (
|
||||||
|
$("h1")
|
||||||
|
.text()
|
||||||
|
.indexOf("Dobrodošli") !== -1
|
||||||
|
) {
|
||||||
|
console.log("[PROSTOR]: Crawler loged in!");
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
console.log("[PROSTOR]: Crawler login failed - wrong credentials!");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch(err => {
|
||||||
|
console.log("[PROSTOR]: Crawler login error ", err);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
async getCookies() {
|
||||||
|
const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", {
|
||||||
|
headers: { Cookie: "" }
|
||||||
|
});
|
||||||
|
const raw = getResponse.headers.raw()["set-cookie"];
|
||||||
|
const cookie = raw
|
||||||
|
.map(datastring => {
|
||||||
|
const data = datastring.split(";");
|
||||||
|
const cookieData = data[0];
|
||||||
|
return cookieData;
|
||||||
|
})
|
||||||
|
.join(";");
|
||||||
|
return cookie;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = ProstorCrawler;
|
module.exports = ProstorCrawler;
|
||||||
|
|||||||
@@ -1,611 +0,0 @@
|
|||||||
"use strict";
|
|
||||||
|
|
||||||
const fetch = require("node-fetch");
|
|
||||||
const cheerio = require("cheerio");
|
|
||||||
const moment = require("moment-timezone");
|
|
||||||
|
|
||||||
const {
|
|
||||||
AD_TYPE,
|
|
||||||
AD_CATEGORY,
|
|
||||||
AD_AGENCY,
|
|
||||||
AD_STATUS,
|
|
||||||
CRAWLER_AD_TYPE,
|
|
||||||
FURNISHING_TYPE,
|
|
||||||
HEATING_TYPE
|
|
||||||
} = require("../../common/enums");
|
|
||||||
|
|
||||||
const {
|
|
||||||
PRINT_CRAWLER_DEBUG,
|
|
||||||
DEFAULT_TIMEZONE
|
|
||||||
} = require("../../config/appConfig");
|
|
||||||
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
|
||||||
|
|
||||||
const SALJIC_ENUMS = {
|
|
||||||
SALJIC_AD_TYPE: {
|
|
||||||
[CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
|
|
||||||
[CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
|
|
||||||
[CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
|
|
||||||
},
|
|
||||||
SALJIC_AD_CATEGORY: {
|
|
||||||
[AD_CATEGORY.ALL.id]: "&input_kategorija=",
|
|
||||||
[AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
|
|
||||||
[AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
|
|
||||||
[AD_CATEGORY.LAND.id]: "&input_kategorija=5", //3 and 4 also gradjevinsko
|
|
||||||
[AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
|
|
||||||
[AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
|
|
||||||
[AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
|
|
||||||
//[AD_CATEGORY.COTTAGE.id]: ""
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class SaljicCrawler {
|
|
||||||
constructor(
|
|
||||||
savers = [],
|
|
||||||
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
|
|
||||||
crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
|
|
||||||
maxPages = 5000,
|
|
||||||
maxResultsPerPage = 5000,
|
|
||||||
ignoredUsernames = [],
|
|
||||||
delayBetweenPages = 1000
|
|
||||||
) {
|
|
||||||
this.savers = savers;
|
|
||||||
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
|
|
||||||
this.crawlerAdTypes = crawlerAdTypes;
|
|
||||||
this.crawlerAdCategories = crawlerAdCategories;
|
|
||||||
this.maxResultsPerPage = maxResultsPerPage;
|
|
||||||
this.delayBetweenPages = delayBetweenPages;
|
|
||||||
}
|
|
||||||
|
|
||||||
async crawl() {
|
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
|
||||||
|
|
||||||
const newRealEstates = [];
|
|
||||||
|
|
||||||
if (crawlAdCategories) {
|
|
||||||
const indexGenerators = [];
|
|
||||||
for (const adCategory of crawlAdCategories) {
|
|
||||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
|
||||||
}
|
|
||||||
//
|
|
||||||
//console.log(indexGenerators);
|
|
||||||
//
|
|
||||||
let done = false;
|
|
||||||
while (!done) {
|
|
||||||
const categoryIndexerPromises = [];
|
|
||||||
const generatorsToRemove = [];
|
|
||||||
for (const indexGenerator of indexGenerators) {
|
|
||||||
categoryIndexerPromises.push(indexGenerator.next());
|
|
||||||
generatorsToRemove.push(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
const singlePageResults = await Promise.all(categoryIndexerPromises);
|
|
||||||
const entries = singlePageResults.entries();
|
|
||||||
|
|
||||||
for (const [index, { value: singlePageResult }] of entries) {
|
|
||||||
if (singlePageResult) {
|
|
||||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
|
||||||
const { newRecords } = saveResults;
|
|
||||||
|
|
||||||
newRealEstates.push(...newRecords);
|
|
||||||
|
|
||||||
if (
|
|
||||||
Array.isArray(newRecords) &&
|
|
||||||
newRecords.length === 0 &&
|
|
||||||
!SALJIC_FORCE_CRAWL
|
|
||||||
) {
|
|
||||||
generatorsToRemove[index] = true;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
//Generator returned undefined, remove this generator from array
|
|
||||||
generatorsToRemove[index] = true;
|
|
||||||
// console.log("Generator ", index + 1, "has no more pages");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// console.log("Generators state : ", generatorsToRemove);
|
|
||||||
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
|
|
||||||
if (generatorsToRemove[i]) {
|
|
||||||
// console.log("\tRemove generator ", i + 1);
|
|
||||||
indexGenerators.splice(i, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (indexGenerators.length === 0) {
|
|
||||||
done = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
await this.sleep(this.delayBetweenPages);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return newRealEstates;
|
|
||||||
}
|
|
||||||
|
|
||||||
async *categoryIndexer(adCategory) {
|
|
||||||
let pageToIndex = 1;
|
|
||||||
|
|
||||||
const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
|
|
||||||
const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory];
|
|
||||||
|
|
||||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
|
||||||
while (true) {
|
|
||||||
const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11;
|
|
||||||
const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;
|
|
||||||
|
|
||||||
const singlePageResults = await this.indexSinglePage(
|
|
||||||
urlPageToCrawl,
|
|
||||||
this.maxResultsPerPage
|
|
||||||
);
|
|
||||||
|
|
||||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
|
||||||
yield singlePageResults;
|
|
||||||
} else {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
++pageToIndex;
|
|
||||||
if (pageToIndex === this.maxPages) {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async indexSinglePage(url, maxResultsPerPage) {
|
|
||||||
if (PRINT_CRAWLER_DEBUG) {
|
|
||||||
console.log("[SALJIC] Index page : ", url);
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
const res = await fetch(url);
|
|
||||||
const body = await res.text();
|
|
||||||
const $ = cheerio.load(body);
|
|
||||||
let hrefs = [];
|
|
||||||
|
|
||||||
$("#shop")
|
|
||||||
.find(".product")
|
|
||||||
.each((i, elem) => {
|
|
||||||
const href = $(elem)
|
|
||||||
.find("a")
|
|
||||||
.first()
|
|
||||||
.attr("href");
|
|
||||||
if (href) {
|
|
||||||
hrefs.push(href);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let adTypesTmp = [];
|
|
||||||
|
|
||||||
$("#shop")
|
|
||||||
.find(".product")
|
|
||||||
.each((i, elem) => {
|
|
||||||
const adType = $(elem)
|
|
||||||
.find(".trakica-search-page")
|
|
||||||
.text()
|
|
||||||
.trim();
|
|
||||||
if (adType) {
|
|
||||||
adTypesTmp.push(adType);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
//Converting to AD_TYPE
|
|
||||||
const adTypes = adTypesTmp.map(adTypeText => {
|
|
||||||
return this.getAdTypeId(adTypeText);
|
|
||||||
});
|
|
||||||
|
|
||||||
//Converting to absolute URLs
|
|
||||||
const hrefsAbs = hrefs.map(link => {
|
|
||||||
return "https://www.saljicnekretnine.ba" + link;
|
|
||||||
});
|
|
||||||
|
|
||||||
let actualNoOfResults =
|
|
||||||
hrefsAbs.length <= maxResultsPerPage
|
|
||||||
? hrefsAbs.length
|
|
||||||
: maxResultsPerPage;
|
|
||||||
|
|
||||||
const asyncScraping = [];
|
|
||||||
for (let i = 0; i < actualNoOfResults; i++) {
|
|
||||||
asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
|
|
||||||
}
|
|
||||||
|
|
||||||
const scrapedData = await Promise.all(asyncScraping);
|
|
||||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
|
||||||
return filteredScrapedData;
|
|
||||||
} catch (e) {
|
|
||||||
console.error("[SALJIC] Exception caught:" + e);
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async scrapeAd(url, adType) {
|
|
||||||
console.log("[SALJIC] Scraping : ", url);
|
|
||||||
try {
|
|
||||||
const adPageSource = await fetch(url);
|
|
||||||
const body = await adPageSource.text();
|
|
||||||
const $ = cheerio.load(body);
|
|
||||||
|
|
||||||
// No information for status ex. PRODAN
|
|
||||||
const status = AD_STATUS.STATUS_NORMAL;
|
|
||||||
//Extracting agency ID from url
|
|
||||||
const agencyObjectId = parseInt(url.substring(46, url.length));
|
|
||||||
|
|
||||||
//Extracting main properties
|
|
||||||
const propertySelectors = {
|
|
||||||
title:
|
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
|
|
||||||
price:
|
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
|
||||||
streetName:
|
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
|
|
||||||
|
|
||||||
descriptions:
|
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
|
|
||||||
latAndLong:
|
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
|
|
||||||
};
|
|
||||||
const title = $(propertySelectors.title)
|
|
||||||
.text()
|
|
||||||
.replace(/(\r\n|\n|\r)/gm, "")
|
|
||||||
.replace(/ {1,}/g, " ")
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
const priceText = $(propertySelectors.price)
|
|
||||||
.text()
|
|
||||||
.replace(/(\r\n|\n|\r)/gm, "")
|
|
||||||
.replace(/ {1,}/g, " ")
|
|
||||||
.trim();
|
|
||||||
const price =
|
|
||||||
priceText === "CIJENA NA UPIT"
|
|
||||||
? null
|
|
||||||
: parseFloat(
|
|
||||||
priceText.substring(8, priceText.length - 3).replace(",", "")
|
|
||||||
);
|
|
||||||
|
|
||||||
const streetName = $(propertySelectors.streetName)
|
|
||||||
.text()
|
|
||||||
.replace(/(\r\n|\n|\r)/gm, "")
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
const descriptions = $(propertySelectors.descriptions)
|
|
||||||
.text()
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
|
|
||||||
const latText = latAndLongSrc.substring(
|
|
||||||
latAndLongSrc.indexOf("marker=") + 7,
|
|
||||||
latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker="))
|
|
||||||
);
|
|
||||||
const longText = latAndLongSrc.substring(
|
|
||||||
latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + 3,
|
|
||||||
latAndLongSrc.length
|
|
||||||
);
|
|
||||||
const locationLat = parseFloat(latText) || null;
|
|
||||||
const locationLong = parseFloat(longText) || null;
|
|
||||||
|
|
||||||
//====== DETAIL INFORMATION FIELDS ==========
|
|
||||||
let area,
|
|
||||||
gardenSize,
|
|
||||||
numberOfRooms = null,
|
|
||||||
numberOfFloors = null,
|
|
||||||
floor = null,
|
|
||||||
accessRoadType = null,
|
|
||||||
heatingType = null,
|
|
||||||
furnishingType = null,
|
|
||||||
balcony = null,
|
|
||||||
newBuilding = null,
|
|
||||||
elevator = null,
|
|
||||||
water = null,
|
|
||||||
electricity = null,
|
|
||||||
drainageSystem = null,
|
|
||||||
registeredInZkBooks = null,
|
|
||||||
recentlyAdapted = null,
|
|
||||||
parking = null,
|
|
||||||
garage = null,
|
|
||||||
gas = null,
|
|
||||||
antiTheftDoor = null,
|
|
||||||
airCondition = null,
|
|
||||||
phoneConnection = null,
|
|
||||||
cableTV = null,
|
|
||||||
internet = null,
|
|
||||||
basementAttic = null,
|
|
||||||
storeRoom = null,
|
|
||||||
videoSurveillance = null,
|
|
||||||
alarm = null,
|
|
||||||
suitableForStudents = null,
|
|
||||||
includingBills = null,
|
|
||||||
animalsAllowed = null,
|
|
||||||
pool = null,
|
|
||||||
urbanPlanPermit = null,
|
|
||||||
buildingPermit = null,
|
|
||||||
utilityConnection = null,
|
|
||||||
distanceToRiver = null;
|
|
||||||
let publishedDate = null;
|
|
||||||
let renewedDate = null;
|
|
||||||
let realEstateType;
|
|
||||||
let numberOfViewsAgency = null;
|
|
||||||
|
|
||||||
//Extracting data - Glavne karakteristike
|
|
||||||
let mainFieldIndex = 1;
|
|
||||||
do {
|
|
||||||
const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`;
|
|
||||||
|
|
||||||
const mainField = $(mainFieldSelector)
|
|
||||||
.text()
|
|
||||||
.replace(/[\n\r\t]/gm, "")
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
|
|
||||||
const mainFieldValue = mainField
|
|
||||||
.substring(mainField.indexOf(" "), mainField.length)
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
switch (mainFieldTitle) {
|
|
||||||
case "Površina":
|
|
||||||
area = parseFloat(
|
|
||||||
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
|
|
||||||
);
|
|
||||||
break;
|
|
||||||
case "Okućnica":
|
|
||||||
gardenSize = parseFloat(
|
|
||||||
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
|
|
||||||
);
|
|
||||||
break;
|
|
||||||
case "Broj soba":
|
|
||||||
numberOfRooms = parseInt(mainFieldValue);
|
|
||||||
break;
|
|
||||||
case "Broj spratova":
|
|
||||||
numberOfFloors = parseInt(mainFieldValue);
|
|
||||||
break;
|
|
||||||
case "Sprat":
|
|
||||||
floor = parseInt(mainFieldValue);
|
|
||||||
break;
|
|
||||||
case "Godina renoviranja":
|
|
||||||
recentlyAdapted = true;
|
|
||||||
break;
|
|
||||||
case "Broj parking mjesta":
|
|
||||||
parking = true;
|
|
||||||
break;
|
|
||||||
case "Dostupno od":
|
|
||||||
const day = mainFieldValue.substring(0, 2);
|
|
||||||
const month = mainFieldValue.substring(3, 5);
|
|
||||||
const year = mainFieldValue.substring(6, mainFieldValue.length);
|
|
||||||
publishedDate = new Date(`${month}/${day}/${year}`);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mainFieldTitle === "") {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
mainFieldIndex++;
|
|
||||||
} while (true);
|
|
||||||
|
|
||||||
//Extracting data - Sadrzaji
|
|
||||||
let additionalFieldIndex = 1;
|
|
||||||
do {
|
|
||||||
const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`;
|
|
||||||
|
|
||||||
const additionalField = $(additionalFieldSelector)
|
|
||||||
.text()
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
if (additionalFieldIndex === 1) {
|
|
||||||
//Extracting data of real estate type
|
|
||||||
const categoryTmp = additionalField
|
|
||||||
.replace(/[\n\r\t]/gm, "")
|
|
||||||
.substring(
|
|
||||||
additionalField.indexOf("Kategorija") + 10,
|
|
||||||
additionalField.length
|
|
||||||
)
|
|
||||||
.trim();
|
|
||||||
realEstateType = this.getAdCategoryId(categoryTmp);
|
|
||||||
} else {
|
|
||||||
switch (additionalField) {
|
|
||||||
case "Internet":
|
|
||||||
internet = true;
|
|
||||||
break;
|
|
||||||
case "Garaža":
|
|
||||||
garage = true;
|
|
||||||
break;
|
|
||||||
case "Klima":
|
|
||||||
airCondition = true;
|
|
||||||
break;
|
|
||||||
case "Balkon":
|
|
||||||
balcony = true;
|
|
||||||
break;
|
|
||||||
case "Ostava":
|
|
||||||
storeRoom = true;
|
|
||||||
break;
|
|
||||||
case "Podrum":
|
|
||||||
basementAttic = true;
|
|
||||||
break;
|
|
||||||
case "Blindirana vrata":
|
|
||||||
antiTheftDoor = true;
|
|
||||||
break;
|
|
||||||
case "Voda":
|
|
||||||
water = true;
|
|
||||||
break;
|
|
||||||
case "Kablovska":
|
|
||||||
cableTV = true;
|
|
||||||
break;
|
|
||||||
case "Uknjiženo":
|
|
||||||
registeredInZkBooks = true;
|
|
||||||
break;
|
|
||||||
case "Grijanje - centralno":
|
|
||||||
heatingType = HEATING_TYPE.CENTRAL_CITY.id;
|
|
||||||
break;
|
|
||||||
case "Grijanje - plin":
|
|
||||||
heatingType = HEATING_TYPE.GAS.id;
|
|
||||||
break;
|
|
||||||
case "Grijanje - struja":
|
|
||||||
heatingType = HEATING_TYPE.ELECTRICITY.id;
|
|
||||||
break;
|
|
||||||
case "Grijanje":
|
|
||||||
heatingType = HEATING_TYPE.OTHER.id;
|
|
||||||
break;
|
|
||||||
case "Plin":
|
|
||||||
gas = true;
|
|
||||||
break;
|
|
||||||
case "Namješten":
|
|
||||||
furnishingType = FURNISHING_TYPE.FURNISHED.id;
|
|
||||||
break;
|
|
||||||
case "Alarm":
|
|
||||||
alarm = true;
|
|
||||||
break;
|
|
||||||
case "Video nadzor":
|
|
||||||
videoSurveillance = true;
|
|
||||||
break;
|
|
||||||
case "Lift":
|
|
||||||
elevator = true;
|
|
||||||
break;
|
|
||||||
case "Novogradnja":
|
|
||||||
newBuilding = true;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (additionalField === "") {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
additionalFieldIndex++;
|
|
||||||
} while (true);
|
|
||||||
|
|
||||||
//If no published date it takes current date of crawling
|
|
||||||
if (publishedDate) {
|
|
||||||
renewedDate = new Date();
|
|
||||||
} else {
|
|
||||||
publishedDate = new Date();
|
|
||||||
renewedDate = new Date();
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = {
|
|
||||||
url,
|
|
||||||
agencyObjectId,
|
|
||||||
originAgencyName: AD_AGENCY.SALJIC,
|
|
||||||
realEstateType,
|
|
||||||
adType,
|
|
||||||
title,
|
|
||||||
price,
|
|
||||||
area,
|
|
||||||
gardenSize,
|
|
||||||
shortDescription: descriptions.substring(0, descriptions.indexOf(".")),
|
|
||||||
longDescription: descriptions,
|
|
||||||
streetNumber: 0,
|
|
||||||
streetName,
|
|
||||||
locality: "",
|
|
||||||
municipality: "",
|
|
||||||
city: "",
|
|
||||||
region: "",
|
|
||||||
entity: "",
|
|
||||||
country: "",
|
|
||||||
locationLat,
|
|
||||||
locationLong,
|
|
||||||
adStatus: status,
|
|
||||||
publishedDate,
|
|
||||||
renewedDate,
|
|
||||||
numberOfRooms,
|
|
||||||
numberOfFloors,
|
|
||||||
floor,
|
|
||||||
accessRoadType,
|
|
||||||
heatingType,
|
|
||||||
furnishingType,
|
|
||||||
balcony,
|
|
||||||
newBuilding,
|
|
||||||
elevator,
|
|
||||||
water,
|
|
||||||
electricity,
|
|
||||||
drainageSystem,
|
|
||||||
registeredInZkBooks,
|
|
||||||
recentlyAdapted,
|
|
||||||
parking,
|
|
||||||
garage,
|
|
||||||
gas,
|
|
||||||
antiTheftDoor,
|
|
||||||
airCondition,
|
|
||||||
phoneConnection,
|
|
||||||
cableTV,
|
|
||||||
internet,
|
|
||||||
basementAttic,
|
|
||||||
storeRoom,
|
|
||||||
videoSurveillance,
|
|
||||||
alarm,
|
|
||||||
suitableForStudents,
|
|
||||||
includingBills,
|
|
||||||
animalsAllowed,
|
|
||||||
pool,
|
|
||||||
urbanPlanPermit,
|
|
||||||
buildingPermit,
|
|
||||||
utilityConnection,
|
|
||||||
distanceToRiver,
|
|
||||||
numberOfViewsAgency
|
|
||||||
};
|
|
||||||
console.log(data);
|
|
||||||
return data;
|
|
||||||
} catch (e) {
|
|
||||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
//======= HELPER FUNCTIONS =============
|
|
||||||
|
|
||||||
getAdCategoryId(categoryText) {
|
|
||||||
switch (categoryText) {
|
|
||||||
case "Stan":
|
|
||||||
return AD_CATEGORY.FLAT.id;
|
|
||||||
case "Građevinsko zemljiste":
|
|
||||||
return AD_CATEGORY.LAND.id;
|
|
||||||
case "Industrijsko zemljiste":
|
|
||||||
return AD_CATEGORY.LAND.id;
|
|
||||||
case "Poljoprivredno zemljiste":
|
|
||||||
return AD_CATEGORY.LAND.id;
|
|
||||||
case "Kuća":
|
|
||||||
return AD_CATEGORY.HOUSE.id;
|
|
||||||
case "Poslovni prostor":
|
|
||||||
return AD_CATEGORY.OFFICE.id;
|
|
||||||
case "Kancelarije":
|
|
||||||
return AD_CATEGORY.OFFICE.id;
|
|
||||||
case "Apartmani":
|
|
||||||
return AD_CATEGORY.APARTMENT.id;
|
|
||||||
case "Garaža":
|
|
||||||
return AD_CATEGORY.GARAGE.id;
|
|
||||||
case "Vikendica":
|
|
||||||
return AD_CATEGORY.COTTAGE.id;
|
|
||||||
default:
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
getAdTypeId(adTypeText) {
|
|
||||||
switch (adTypeText) {
|
|
||||||
case "PRODAJA":
|
|
||||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
|
||||||
case "NAJAM":
|
|
||||||
return AD_TYPE.AD_TYPE_RENT.stringId;
|
|
||||||
default:
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async sleep(ms) {
|
|
||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
|
||||||
}
|
|
||||||
|
|
||||||
async saveCrawledResults(results) {
|
|
||||||
const savers = this.savers;
|
|
||||||
|
|
||||||
// for (const saver of savers) {
|
|
||||||
// await saver.save(results);
|
|
||||||
// }
|
|
||||||
|
|
||||||
//For now, we use only Postgres saver, so ...
|
|
||||||
return savers[0].save(results);
|
|
||||||
//so that we can use some sequelize options and information when data is inserted
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// CommonJS export: the crawler itself is the module's export value.
module.exports = SaljicCrawler;
|
|
||||||
@@ -23,16 +23,19 @@ const generateRealEstateLinks = realEstates => {
|
|||||||
const generateNotificationEmail = (
|
const generateNotificationEmail = (
|
||||||
realEstates,
|
realEstates,
|
||||||
searchRequestId,
|
searchRequestId,
|
||||||
|
noAllRealEstates,
|
||||||
dailyNotification = false
|
dailyNotification = false
|
||||||
) => {
|
) => {
|
||||||
const truncateList = realEstates.length > MAX_REAL_ESTATES_IN_EMAIL;
|
const truncateList = realEstates.length > MAX_REAL_ESTATES_IN_EMAIL;
|
||||||
|
|
||||||
const realEstatesToShow = truncateList
|
const realEstatesToShow = truncateList
|
||||||
? realEstates.slice(0, MAX_REAL_ESTATES_IN_EMAIL)
|
? realEstates.slice(0, MAX_REAL_ESTATES_IN_EMAIL)
|
||||||
: realEstates;
|
: realEstates;
|
||||||
|
|
||||||
const allRealEstatesLink = `${APP_URL}/nekretnine/${searchRequestId}`;
|
const allRealEstatesLink = `${APP_URL}/nekretnine/${searchRequestId}`;
|
||||||
|
|
||||||
const realEstateLinks = generateRealEstateLinks(realEstatesToShow);
|
const realEstateLinks = generateRealEstateLinks(realEstatesToShow);
|
||||||
const moreRealEstates = `<div>Kompletan spisak nekretnina možete pogledati na <a href="${allRealEstatesLink}">listi nekretnina</a><div>`;
|
const moreRealEstates = `<div>Kompletan spisak nekretnina (${noAllRealEstates}) možete pogledati na <a href="${allRealEstatesLink}">listi nekretnina</a><div>`;
|
||||||
const emailFooter = generateEmailFooter(searchRequestId);
|
const emailFooter = generateEmailFooter(searchRequestId);
|
||||||
const asapMessageBody =
|
const asapMessageBody =
|
||||||
realEstates.length > 1
|
realEstates.length > 1
|
||||||
@@ -70,6 +73,7 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
|
|||||||
} = searchRequest;
|
} = searchRequest;
|
||||||
|
|
||||||
const realEstateLinks = generateRealEstateLinks(matchingRealEstates);
|
const realEstateLinks = generateRealEstateLinks(matchingRealEstates);
|
||||||
|
|
||||||
const instantRealEstatesText = `<br/>
|
const instantRealEstatesText = `<br/>
|
||||||
<div>
|
<div>
|
||||||
U međuvremenu pogledajte neke od nedavno objavljenih nekretnina koje odgovaraju Vašim uslovima pretrage :<br/>
|
U međuvremenu pogledajte neke od nedavno objavljenih nekretnina koje odgovaraju Vašim uslovima pretrage :<br/>
|
||||||
|
|||||||
@@ -154,3 +154,7 @@ h3 {
|
|||||||
margin-top: 2rem;
|
margin-top: 2rem;
|
||||||
margin-bottom: 1rem;
|
margin-bottom: 1rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.estates-link {
|
||||||
|
color: rgba(0, 0, 0, 0.87);
|
||||||
|
}
|
||||||
|
|||||||
@@ -8,7 +8,10 @@ const {
|
|||||||
generateNewSearchRequestEmail,
|
generateNewSearchRequestEmail,
|
||||||
generateEmailSubject
|
generateEmailSubject
|
||||||
} = require("../helpers/emailContentGenerator");
|
} = require("../helpers/emailContentGenerator");
|
||||||
const { findNotNotifiedMatches } = require("../helpers/db/searchRequestMatch");
|
const {
|
||||||
|
findNotNotifiedMatches,
|
||||||
|
findRealEstatesForSearchRequest
|
||||||
|
} = require("../helpers/db/searchRequestMatch");
|
||||||
const { sendEmail } = require("../services/emailService");
|
const { sendEmail } = require("../services/emailService");
|
||||||
|
|
||||||
const notifyForNewRealEstates = async newRealEstates => {
|
const notifyForNewRealEstates = async newRealEstates => {
|
||||||
@@ -39,10 +42,18 @@ const notifyMatches = async (matches, dailyNotification = false) => {
|
|||||||
const { email, subscribed } = searchRequest;
|
const { email, subscribed } = searchRequest;
|
||||||
if (notifyNow && subscribed) {
|
if (notifyNow && subscribed) {
|
||||||
const allMatchingRealEstates = matches[id].realEstates || [];
|
const allMatchingRealEstates = matches[id].realEstates || [];
|
||||||
|
|
||||||
|
//Variable allMatchingRealEstates are real estates that are "new" on the market
|
||||||
|
//the ones that we notify user in this moment, not all that already exists in db
|
||||||
|
//New variable allRealEstates are all real estates that exists in db for search req
|
||||||
|
const allRealEstates = await findRealEstatesForSearchRequest(id);
|
||||||
|
const noAllRealEstates = allRealEstates.length;
|
||||||
|
|
||||||
if (allMatchingRealEstates.length > 0) {
|
if (allMatchingRealEstates.length > 0) {
|
||||||
const emailContent = generateNotificationEmail(
|
const emailContent = generateNotificationEmail(
|
||||||
allMatchingRealEstates,
|
allMatchingRealEstates,
|
||||||
id,
|
id,
|
||||||
|
noAllRealEstates,
|
||||||
dailyNotification
|
dailyNotification
|
||||||
);
|
);
|
||||||
const emailSubject = generateEmailSubject(
|
const emailSubject = generateEmailSubject(
|
||||||
|
|||||||
@@ -1,13 +1,29 @@
|
|||||||
<div class="row center-align">
|
<div class="row center-align">
|
||||||
<ul class="collection with-header">
|
<ul class="collection with-header">
|
||||||
<% for(const realEstate of realEstates) { %>
|
<% for(const realEstate of realEstates) { %>
|
||||||
<li class="collection-item">
|
<li class="collection-item">
|
||||||
<div><%= realEstate.title %>
|
<% if(realEstate.adStatus === AD_STATUS.STATUS_VIP) {%>
|
||||||
<a href="<%= realEstate.url %>" class="kivi-color secondary-content">
|
<div>
|
||||||
|
<% //This needs to do redirecting instead of direct link to realestate
|
||||||
|
%>
|
||||||
|
<a href="/redirect/<%= realEstate.id %>" class="estates-link">
|
||||||
|
<%= realEstate.title %>
|
||||||
|
<div class="kivi-color secondary-content">
|
||||||
<i class="material-icons">send</i>
|
<i class="material-icons">send</i>
|
||||||
</a>
|
</div>
|
||||||
</div>
|
</a>
|
||||||
</li>
|
</div>
|
||||||
<% } %>
|
<%} else { %>
|
||||||
</ul>
|
<div>
|
||||||
</div>
|
<a href="<%= realEstate.url %>" class="estates-link">
|
||||||
|
<%= realEstate.title %>
|
||||||
|
<div class="kivi-color secondary-content">
|
||||||
|
<i class="material-icons">send</i>
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<% }%>
|
||||||
|
</li>
|
||||||
|
<% } %>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
|||||||
@@ -1,26 +1,49 @@
|
|||||||
<br><br>
|
<br /><br />
|
||||||
<div class="center">
|
<div class="center">
|
||||||
<div class="preloader-wrapper big active center">
|
<div class="preloader-wrapper big active center">
|
||||||
<div class="kivi-spinner-color spinner-layer spinner-green-only">
|
<div class="kivi-spinner-color spinner-layer spinner-green-only">
|
||||||
<div class="circle-clipper left">
|
<div class="circle-clipper left">
|
||||||
<div class="circle"></div>
|
<div class="circle"></div>
|
||||||
</div><div class="gap-patch">
|
</div>
|
||||||
<div class="circle"></div>
|
<div class="gap-patch">
|
||||||
</div><div class="circle-clipper right">
|
<div class="circle"></div>
|
||||||
<div class="circle"></div>
|
</div>
|
||||||
</div>
|
<div class="circle-clipper right">
|
||||||
</div>
|
<div class="circle"></div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<br>
|
<br />
|
||||||
|
<% if(vipAd) { %>
|
||||||
<div class="center">
|
<div class="center">
|
||||||
<h6>
|
<h6>
|
||||||
<a href="<%= redirectUrl %>" rel="noreferrer" id="realEstateUrl">Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski</a>
|
Ovaj oglas zahtijeva da budete član
|
||||||
</h6>
|
<a href="https://prostor.ba/" rel="noreferrer">Prostor.ba</a>.
|
||||||
|
<br />
|
||||||
|
<br />
|
||||||
|
<a href="https://prostor.ba/moj-prostor/prijava" rel="noreferrer"
|
||||||
|
>Ulogujte se</a
|
||||||
|
>
|
||||||
|
ili napravite
|
||||||
|
<a href="https://prostor.ba/moj-prostor/registracija" rel="noreferrer"
|
||||||
|
>novi račun</a
|
||||||
|
>, a potom otvorite <a href="<%= redirectUrl %>" rel="noreferrer">oglas</a>.
|
||||||
|
</h6>
|
||||||
</div>
|
</div>
|
||||||
|
<% } else { %>
|
||||||
|
<div class="center">
|
||||||
|
<h6>
|
||||||
|
<a href="<%= redirectUrl %>" rel="noreferrer" id="realEstateUrl"
|
||||||
|
>Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski</a
|
||||||
|
>
|
||||||
|
</h6>
|
||||||
|
</div>
|
||||||
|
<% }%>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
window.onload = function() {
|
window.onload = function() {
|
||||||
document.getElementById('realEstateUrl').click();
|
document.getElementById("realEstateUrl").click();
|
||||||
}
|
};
|
||||||
</script>
|
</script>
|
||||||
|
|||||||
@@ -51,6 +51,8 @@ PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories t
|
|||||||
PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!!
|
PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!!
|
||||||
PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!!
|
PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!!
|
||||||
PROSTOR_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
PROSTOR_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
|
PROSTOR_LOGIN_EMAIL=Email of valid Prostor.ba account for crawling purposes
|
||||||
|
PROSTOR_LOGIN_PASS=Password of valid Prostor.ba account for crawling purposes
|
||||||
#==AKTIDO==
|
#==AKTIDO==
|
||||||
AKTIDO_MAX_PAGES=Restrict crawler to this number of pages
|
AKTIDO_MAX_PAGES=Restrict crawler to this number of pages
|
||||||
AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||||
@@ -59,8 +61,3 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
|
|||||||
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
||||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
#==SALJIC NEKRETNINE==
|
|
||||||
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
|
||||||
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
|
||||||
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
|
||||||
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
|
||||||
30
package-lock.json
generated
30
package-lock.json
generated
@@ -1346,13 +1346,23 @@
|
|||||||
"integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
|
"integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
|
||||||
},
|
},
|
||||||
"form-data": {
|
"form-data": {
|
||||||
"version": "2.3.3",
|
"version": "3.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.0.tgz",
|
||||||
"integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
|
"integrity": "sha512-CKMFDglpbMi6PyN+brwB9Q/GOw0eAnsrEZDgcsH5Krhz5Od/haKHAX0NmQfha2zPPz0JpWzA7GJHGSnvCRLWsg==",
|
||||||
"requires": {
|
"requires": {
|
||||||
"asynckit": "^0.4.0",
|
"asynckit": "^0.4.0",
|
||||||
"combined-stream": "^1.0.6",
|
"combined-stream": "^1.0.8",
|
||||||
"mime-types": "^2.1.12"
|
"mime-types": "^2.1.12"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"combined-stream": {
|
||||||
|
"version": "1.0.8",
|
||||||
|
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
|
||||||
|
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
|
||||||
|
"requires": {
|
||||||
|
"delayed-stream": "~1.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"forwarded": {
|
"forwarded": {
|
||||||
@@ -3430,6 +3440,18 @@
|
|||||||
"tough-cookie": "~2.4.3",
|
"tough-cookie": "~2.4.3",
|
||||||
"tunnel-agent": "^0.6.0",
|
"tunnel-agent": "^0.6.0",
|
||||||
"uuid": "^3.3.2"
|
"uuid": "^3.3.2"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"form-data": {
|
||||||
|
"version": "2.3.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
|
||||||
|
"integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
|
||||||
|
"requires": {
|
||||||
|
"asynckit": "^0.4.0",
|
||||||
|
"combined-stream": "^1.0.6",
|
||||||
|
"mime-types": "^2.1.12"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"require-directory": {
|
"require-directory": {
|
||||||
|
|||||||
@@ -39,6 +39,7 @@
|
|||||||
"express": "^4.16.4",
|
"express": "^4.16.4",
|
||||||
"express-ejs-layouts": "^2.5.0",
|
"express-ejs-layouts": "^2.5.0",
|
||||||
"express-layout": "^0.1.0",
|
"express-layout": "^0.1.0",
|
||||||
|
"form-data": "^3.0.0",
|
||||||
"html-to-text": "^5.1.1",
|
"html-to-text": "^5.1.1",
|
||||||
"moment": "^2.24.0",
|
"moment": "^2.24.0",
|
||||||
"moment-timezone": "^0.5.26",
|
"moment-timezone": "^0.5.26",
|
||||||
|
|||||||
Reference in New Issue
Block a user