Compare commits

...

138 Commits

Author SHA1 Message Date
Senad Uka
70bd952ee1 Add ssl to sequelize 2021-03-18 11:38:42 +01:00
Senad Uka
14039975c2 Modify time to live to 6 hours 2020-10-21 10:49:22 +02:00
Senad Uka
92e4f4ed5a Add caching to fetch wrapper 2020-10-21 06:16:35 +02:00
=
88f9d10586 parseInt NODE_FETCH_TIMEOUT_MS 2020-09-16 06:21:03 -07:00
=
2b1cbcaa47 Cleanup after debugging 2020-09-16 06:16:49 -07:00
=
bf8d131025 Increase timeout to 20 secs 2020-09-15 01:34:07 -07:00
=
698acb010a Add timeout to fetch wrapper 2020-09-15 01:27:20 -07:00
=
ade3eb307d Add try/catch to generator 2020-09-14 14:54:39 -07:00
=
8031f6f8a3 Moar attempts 2020-09-14 14:46:21 -07:00
=
d7a680a3ac Modify procfile instead of package.json for --inspect 2020-09-14 07:17:37 -07:00
=
8018caab47 Enable inspector and debugger 2020-09-14 06:53:35 -07:00
Edin Dazdarevic
d871d9ad1f More debugging, disable all crawlers 2020-09-13 23:45:08 +02:00
=
dfbefc20cd Remove PromisePool from olx 2020-09-13 04:57:01 -07:00
=
a481ecfe37 Debug 2020-09-13 04:48:11 -07:00
Senad Uka
8df94da48c Saljic fix fix 2020-09-11 05:03:37 +02:00
Senad Uka
d4fcd1950d Fix saljic 2020-09-11 04:51:54 +02:00
=
e8115a9215 moar fixes 2020-09-10 14:26:48 -07:00
=
160efdf6ab More fixes 2020-09-10 14:06:09 -07:00
=
c9b8c2e1a5 Fiks 2020-09-10 13:31:57 -07:00
Senad Uka
855b93ca41 Scraped data 2020-09-10 19:46:25 +02:00
Senad Uka
70779b24c0 Scraped data 2020-09-10 19:42:50 +02:00
Senad Uka
ba873f9f4e package json 2020-09-10 19:40:51 +02:00
Senad Uka
e4775158fc Promise pool 2020-09-10 19:39:13 +02:00
Senad Uka
26377c485c Reverting 2020-09-10 18:30:14 +02:00
Senad Uka
b30b0f45a6 trying different approach 2020-09-10 18:23:01 +02:00
Senad Uka
9c1a029ff1 Debugging ba 2020-09-10 18:17:16 +02:00
Senad Uka
9b49759485 Pause is not enough 2020-09-10 18:13:47 +02:00
Senad Uka
0c2f8d11ee Add random timeout up to 500ms 2020-09-10 18:10:42 +02:00
Senad Uka
b27d9d3499 Add debugging info 2020-09-10 18:00:13 +02:00
Senad Uka
dd3f30ef0e Comment agent header 2020-09-10 17:57:21 +02:00
Senad Uka
78c6056db4 Change for scraperapi 2020-09-10 17:37:19 +02:00
Bilal Catic
ecf27f2ba1 Merge branch 'improve-olx-scrapper' into 'master'
Improve olx scrapper

See merge request saburly/marketalarm/web!113
2020-06-16 20:37:37 +00:00
Bilal
1229b3fa6c Add more debug logs 2020-06-16 19:51:13 +02:00
Edin
542ff56123 Merge branch 'improve-wom' into 'master'
Improve styling of WOM message

See merge request saburly/marketalarm/web!112
2020-06-05 16:24:31 +00:00
=
0aa851015b Improve styling of WOM message 2020-06-05 09:07:54 -07:00
Senad Uka
c033b2e47c Merge branch 'word-of-mouth-email' into 'master'
Add word of mouth requests

See merge request saburly/marketalarm/web!111
2020-06-05 14:55:44 +00:00
Senad Uka
0895654db2 Update emailContentGenerator.js 2020-06-05 14:55:00 +00:00
Senad Uka
8925eb9f4e Update emailContentGenerator.js 2020-06-05 14:54:22 +00:00
=
52201af3ba Add word of mouth requests 2020-06-05 07:52:52 -07:00
Senad Uka
1505c07363 Merge branch 'improve-crawler' into 'master'
Improve crawler

See merge request saburly/marketalarm/web!110
2020-05-18 09:30:24 +00:00
Bilal
159fedbc2d handle failed page fetch 2020-05-18 03:53:08 +02:00
Bilal
65068932ad handle failed page fetch; detect discounted price 2020-05-18 03:43:49 +02:00
Senad Uka
820227827e Merge branch 'fix-crawler-errors' into 'master'
Fix crawler errors

See merge request saburly/marketalarm/web!109
2020-05-14 21:34:17 +00:00
Bilal
d35a113baa Fix saljic crawler 2020-05-14 19:01:19 +02:00
Bilal
ba60f8749d Fix Prostor crawler - use new JSON location in page body 2020-05-14 15:38:15 +02:00
Senad Uka
f1d45fed26 Merge branch 'move-scraper-url-to-the-env' into 'master'
Move scraper api url to the ENV

See merge request saburly/marketalarm/web!108
2020-05-12 11:55:12 +00:00
Bilal
ff923605ad Move scraper api base url to the ENV; send URL as base64 string 2020-05-12 13:44:09 +02:00
Naida Vatric
692577fb8c Merge branch 'after-scraper-fix' into 'master'
After scraper fix

See merge request saburly/marketalarm/web!104
2020-02-28 16:14:43 +00:00
Naida Vatric
df5e38092d Decomment and saljic smal fix. 2020-02-28 17:12:55 +01:00
Naida Vatric
feb2d04ed6 Olx debugg log. 2020-02-28 15:43:35 +01:00
Naida Vatric
90e171d07d Olx scraper debugging. 2020-02-28 15:41:20 +01:00
Naida Vatric
747f56941a Logged olx price scraping. 2020-02-28 14:21:53 +01:00
Naida Vatric
2a13ab55ed Merge branch 'after-scraper-fix' into 'master'
Olx price parsing changed.

See merge request saburly/marketalarm/web!103
2020-02-27 22:38:00 +00:00
Naida Vatric
441f905b29 Olx price parsing changed. 2020-02-27 23:35:18 +01:00
Naida Vatric
39f9383ae2 Merge branch 'after-scraper-fix' into 'master'
Commented checkup out and Prostor fetch changed.

See merge request saburly/marketalarm/web!102
2020-02-27 20:31:14 +00:00
Naida Vatric
edca7f91af Commented checkup out and Prostor fetch changed. 2020-02-27 19:01:11 +01:00
Naida Vatric
44402a9cc4 Merge branch 'saljic-crawler-fix' into 'master'
Saljic crawler changed substring call.

See merge request saburly/marketalarm/web!95
2020-02-26 15:09:22 +00:00
Naida Vatric
b913daa1f7 Merge branch 'checkup-email-bug-fix' into 'master'
Changed checkup email logic.

See merge request saburly/marketalarm/web!97
2020-02-25 15:13:20 +00:00
Naida Vatric
a508f72d7c Merge branch 'master' into 'checkup-email-bug-fix'
# Conflicts:
#   app/services/notificationService.js
2020-02-25 15:12:47 +00:00
Naida Vatric
08ad9edfe1 Merge branch 'scraper-api-support' into 'master'
Added Scraper API option.

See merge request saburly/marketalarm/web!100
2020-02-25 15:10:39 +00:00
Naida Vatric
ce857ddce9 Renamed var. 2020-02-23 23:11:21 +01:00
Naida Vatric
148b2ea863 Changed default. 2020-02-23 16:38:40 +01:00
Naida Vatric
d436d4a37b Added Scraper API option. 2020-02-22 22:15:27 +01:00
Bilal Catic
6791a509d0 make user agent header configurable through env variable 2020-02-20 21:07:16 +01:00
Bilal Catic
edc6e2bbf7 Merge branch 'create-fetch-wrapper-with-user-agent' into 'master'
Create fetch wrapper with user agent

See merge request saburly/marketalarm/web!98
2020-02-20 19:58:32 +00:00
Bilal Catic
4f230020d7 use fetch wrapper instead of node-fetch 2020-02-20 19:49:29 +01:00
Bilal Catic
f62a7200c7 create fetch wrapper with mandatory user agent header 2020-02-20 19:47:30 +01:00
Bilal Catic
cff7cc2e9c apply prettier 2020-02-20 19:46:39 +01:00
Naida Vatric
f56cd5b549 More elegant scrape of lat and long. 2020-02-17 21:55:24 +01:00
Naida Vatric
bc7ce9d708 Changed checkup query to miliseconds. 2020-02-17 21:22:45 +01:00
Naida Vatric
df2a962d0f Merge branch 'prostor-vip-ads-fix' into 'master'
Prostor VIP ads fixed.

See merge request saburly/marketalarm/web!94
2020-02-17 14:44:58 +00:00
Naida Vatric
be4508ebea Merge branch 'include-incomplete-ads-inverse' into 'master'
Default true for include incomplete ads.

See merge request saburly/marketalarm/web!96
2020-02-17 14:44:35 +00:00
Naida Vatric
22bffc126d For staging - checkup fix. 2020-02-15 02:39:38 +01:00
Naida Vatric
06f80296f3 Changed checkup email logic. 2020-02-15 02:30:02 +01:00
Naida Vatric
81fa3f046d Default true for include incomplete ads. 2020-02-15 00:52:06 +01:00
Naida Vatric
addd8c1344 Saljic crawler changed substring call. 2020-02-14 23:42:19 +01:00
Naida Vatric
5bdc8e149a Prostor VIP ads fixed. 2020-02-14 22:41:51 +01:00
Senad Uka
fc7fe3c0b3 Notificaton service disabled 2020-02-14 15:07:42 +01:00
Naida Vatric
b3007123a5 Merge branch 'rename-settings-var' into 'master'
Rename settings var

See merge request saburly/marketalarm/web!93
2020-02-10 20:17:08 +00:00
Naida Vatric
f7d4a9cd07 Renamed settings var to describe purpose. 2020-02-10 21:15:28 +01:00
Naida Vatric
ab6812889a Merge branch 'fixing-saljic-bugs' into 'master'
Fixing saljic bugs

See merge request saburly/marketalarm/web!92
2020-02-09 18:11:00 +00:00
Naida Vatric
b82134e280 Fixed saljic bug for heroku. 2020-02-09 19:09:00 +01:00
Naida Vatric
be378883c8 Just another fix try. 2020-02-08 00:47:00 +01:00
Naida Vatric
8a87b9e253 Another fix. 2020-02-08 00:27:26 +01:00
Naida Vatric
43bc23b164 Another fix. Defined more var. 2020-02-07 22:27:01 +01:00
Naida Vatric
fc6351af46 Added columns and logs for types. 2020-02-07 22:12:53 +01:00
Naida Vatric
6267b2cab4 Merge branch 'staging-tag-to-checkup-email' into 'master'
Added staging tag to checkup email. Email footer bug fixed.

See merge request saburly/marketalarm/web!91
2020-02-06 21:43:56 +00:00
Naida Vatric
97724a47a1 Removed debugg logs. Smal fixes. 2020-02-06 22:40:56 +01:00
Naida Vatric
91a1c6a91e Added more logs for debugging. 2020-02-06 14:12:35 +01:00
Naida Vatric
eb4ab2e341 Changed misspeling. 2020-02-05 23:02:14 +01:00
Naida Vatric
2d0a00b967 Debugging- noOfRealEstates and staging tag. 2020-02-05 21:58:45 +01:00
Naida Vatric
74def9c059 Added staging tag to checkup email. Email footer bug fixed. 2020-02-05 21:35:18 +01:00
Naida Vatric
d29b3eb1b3 Merge branch 'crawler-saljicnekretnine' into 'master'
Crawler saljicnekretnine

See merge request saburly/marketalarm/web!90
2020-02-04 13:09:10 +00:00
Naida Vatric
41b59e8c7c Merge branch 'tag-staging-email' into 'master'
Tag staging email

See merge request saburly/marketalarm/web!85
2020-02-04 13:07:53 +00:00
Naida Vatric
b933fa96d4 Merge branch 'master' into 'tag-staging-email'
# Conflicts:
#   app/config/appConfig.js
2020-02-04 13:07:43 +00:00
Naida Vatric
824db4fbc3 Merge branch 'checkup-email' into 'master'
Added check up email that everything works.

See merge request saburly/marketalarm/web!89
2020-01-31 21:59:35 +00:00
Naida Vatric
0f91841c43 Merge branch 'master' into 'checkup-email'
# Conflicts:
#   app/config/appConfig.js
#   app/services/notificationService.js
2020-01-31 21:59:26 +00:00
Naida Vatric
12b4a8f6ec Merge branch 'ads-without-price' into 'master'
Ads without price

See merge request saburly/marketalarm/web!88
2020-01-31 21:53:36 +00:00
Naida Vatric
17621ad310 Merge branch 'price-history-log' into 'master'
Price history log

See merge request saburly/marketalarm/web!87
2020-01-31 21:53:18 +00:00
Naida Vatric
c461525959 Merge branch 'sliders-formating' into 'master'
Changed sliders for different real estate types.

See merge request saburly/marketalarm/web!86
2020-01-31 21:53:04 +00:00
Naida Vatric
aec9c1e1d5 Merge branch 'more-details-email' into 'master'
More details email

See merge request saburly/marketalarm/web!84
2020-01-31 21:52:33 +00:00
Naida Vatric
2d672f4660 Merge branch 'prostor-vip-ads' into 'master'
Prostor vip ads

See merge request saburly/marketalarm/web!83
2020-01-31 21:52:13 +00:00
Naida Vatric
bc15cf65a5 Merge branch 'results-link' into 'master'
Results link

See merge request saburly/marketalarm/web!82
2020-01-31 21:51:38 +00:00
Naida Vatric
ce11d57ab2 Merge branch 'no-all-results-email' into 'master'
No all results email

See merge request saburly/marketalarm/web!81
2020-01-31 21:51:20 +00:00
Naida Vatric
4a6bcf262e Merge branch 'add-even-more-filters' into 'master'
Add even more filters

See merge request saburly/marketalarm/web!77
2020-01-31 21:50:54 +00:00
Naida Vatric
712cde1632 Changed not to use NODE_ENV variable. 2020-01-31 22:47:47 +01:00
Naida Vatric
1ba7cf8531 Added crawler for Saljic nekretnine. 2020-01-31 22:03:39 +01:00
Naida Vatric
7a7aecb3ee WIP Scraped no of rooms, floors etc. 2020-01-31 00:55:24 +01:00
Naida Vatric
78c4054cde WIP Scraped title, price and location. 2020-01-30 16:24:34 +01:00
Naida Vatric
94ffc2d6d2 WIP Started saljic crawler. 2020-01-29 23:22:39 +01:00
Naida Vatric
b11f18696f Prepared config files. 2020-01-29 01:09:53 +01:00
Naida Vatric
fa46f75dd3 Changed default to switched on. 2020-01-28 22:28:52 +01:00
Naida Vatric
470f53d29b Changed to send email only to some users. 2020-01-24 16:04:28 +01:00
Naida Vatric
40509d2836 Changed logic for checkup.For testing. 2020-01-24 01:01:10 +01:00
Naida Vatric
b2c102bc1a Added check up email that everything works. 2020-01-23 15:48:48 +01:00
Naida Vatric
98263364c7 Added option to include-exclude ads without price 2020-01-23 11:13:53 +01:00
Naida Vatric
5b3491fdba Added migration and model change for searchReq table. 2020-01-23 10:12:56 +01:00
Naida Vatric
c6f0e039a5 Added price history log. 2020-01-21 23:12:04 +01:00
Naida Vatric
8d3f001678 WIP Added model, migration and bulk upsert fnc. 2020-01-21 16:28:47 +01:00
Naida Vatric
42eddb3aa5 WIP Bulk create not working. 2020-01-21 01:19:35 +01:00
Naida Vatric
0a181f742f WIP Added model and migration for new table priceHistory. 2020-01-20 23:09:02 +01:00
Naida Vatric
d117383802 Tested both ways for realestate and search req filters. 2020-01-17 22:58:22 +01:00
Naida Vatric
870b71a3c7 WIP Changed all logic for searchRequest. 2020-01-17 01:54:06 +01:00
Naida Vatric
4fd4018bf6 Merge branch 'sliders-formating' of gitlab.com:saburly/marketalarm/web into add-even-more-filters 2020-01-14 23:19:51 +01:00
Naida Vatric
b9122f8f00 Added tag to email to denote staging from production. 2020-01-14 15:59:17 +01:00
Naida Vatric
fc33c1210a Add more detail to the email 2020-01-13 14:58:09 +01:00
Naida Vatric
511b290096 Login to prostor.ba befoure crawl. 2020-01-13 12:05:33 +01:00
Naida Vatric
ba43fa0713 WIP Changed cookies. 2020-01-13 11:02:26 +01:00
Naida Vatric
e70901d369 WIP Changed login to crawler. 2020-01-13 09:12:03 +01:00
Naida Vatric
8505282670 WiP Login of crawler prostor. 2020-01-12 01:22:50 +01:00
Naida Vatric
64e4835899 Changed redirecting for VIP ads. 2020-01-10 22:52:50 +01:00
Naida Vatric
1658325c4b WIP Fake vip ads. 2020-01-10 19:20:26 +01:00
Naida Vatric
49161c1b60 WIP Changed redirecting for VIP ads. 2020-01-09 12:19:19 +01:00
Naida Vatric
d23ddf849f Results title text made into link. 2020-01-07 01:06:22 +01:00
Naida Vatric
38bd0343f5 Merge branch 'results-link' of gitlab.com:saburly/marketalarm/web into no-all-results-email 2020-01-07 01:01:57 +01:00
Naida Vatric
fa4e0d64de Changed email content to show number of all matching real estates. 2020-01-06 23:59:56 +01:00
Naida Vatric
d5d3a1f306 Changed accesRoadType logic 2019-12-26 23:30:05 +01:00
Naida Vatric
42ff1f762f Changed to avoid falsy values and not defined realestate parametrs. 2019-12-21 02:20:26 +01:00
44 changed files with 2152 additions and 274 deletions

1
.prettierignore Normal file
View File

@@ -0,0 +1 @@
*.ejs

View File

@@ -295,14 +295,16 @@ const AD_STATUS = {
STATUS_DELETED: 4,
STATUS_URGENT: 5,
STATUS_DISCOUNTED: 6,
STATUS_RENTED: 7
STATUS_RENTED: 7,
STATUS_VIP: 8
};
const AD_AGENCY = {
OLX: "OLX",
RENTAL: "RENTAL",
PROSTOR: "PROSTOR",
AKTIDO: "AKTIDO"
AKTIDO: "AKTIDO",
SALJIC: "SALJIC"
};
const CRAWLER_AD_TYPE = {

View File

@@ -9,11 +9,15 @@ const APP_URL =
? process.env.APP_URL || "http://market-alarm"
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
const STAGING = process.env.ENVIRONMENT !== "production";
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
const CRAWLER_INTERVAL = parseInt(process.env.CRAWLER_INTERVAL) || 60;
const STOP_CRAWLER = !!parseInt(process.env.STOP_CRAWLER);
const CHECK_UP_DAYS = parseInt(process.env.CHECK_UP_DAYS) || 10;
const AWS_EMAIL_CONFIG = {
REGION: process.env.AWS_REGION || "",
CREDENTIALS: {
@@ -32,6 +36,20 @@ const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0;
const API_MAP_KEY = process.env.API_MAP_KEY || "";
const PROSTOR_LOGIN = {
EMAIL: process.env.PROSTOR_LOGIN_EMAIL,
PASSWORD: process.env.PROSTOR_LOGIN_PASS
};
const USER_AGENT =
process.env.USER_AGENT ||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
const USE_SCRAPER_API = process.env.USE_SCRAPER_API === undefined ? 1 : parseInt(process.env.USE_SCRAPER_API);
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
const SCRAPER_API_BASE_URL = process.env.SCRAPER_API_BASE_URL || "";
const NODE_FETCH_TIMEOUT_MS = parseInt(process.env.NODE_FETCH_TIMEOUT_MS) || 60000
module.exports = {
APP_PORT,
APP_URL,
@@ -42,5 +60,13 @@ module.exports = {
MAX_REAL_ESTATES_IN_EMAIL,
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
PRINT_CRAWLER_DEBUG,
API_MAP_KEY
API_MAP_KEY,
STAGING,
CHECK_UP_DAYS,
PROSTOR_LOGIN,
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY,
SCRAPER_API_BASE_URL,
NODE_FETCH_TIMEOUT_MS
};

View File

@@ -35,7 +35,8 @@ const getFilters = async (req, res) => {
balcony,
elevator,
newBuilding,
accessRoadType
accessRoadType,
includeWithoutPrice
} = searchRequest;
const category = AD_CATEGORY[realEstateType] || AD_CATEGORY.FLAT;
@@ -115,7 +116,8 @@ const getFilters = async (req, res) => {
advancedSegmentSelectFilterValues,
advancedRangeFilterObjects,
advancedRangeFilterValues,
includeIncompleteAds
includeIncompleteAds,
includeWithoutPrice
});
};
@@ -191,6 +193,7 @@ const postFilters = async (req, res) => {
});
const includeIncompleteAds = req.body.includeIncompleteAds === "on";
const includeWithoutPrice = req.body.includeWithoutPrice === "on";
const balcony = req.body.balcony === "on";
const elevator = req.body.elevator === "on";
@@ -217,6 +220,7 @@ const postFilters = async (req, res) => {
searchRequest.newBuilding = newBuilding;
searchRequest.includeIncompleteAds = includeIncompleteAds;
searchRequest.includeWithoutPrice = includeWithoutPrice;
searchRequest.accessRoadType = accessRoadType;

View File

@@ -2,13 +2,14 @@
const {
findRealEstatesForSearchRequest
} = require("../helpers/db/searchRequestMatch");
const { AD_STATUS } = require("../common/enums");
const getRealEstates = async (req, res) => {
const searchRequestId = req.params["searchRequestId"] || "";
const realEstates = await findRealEstatesForSearchRequest(searchRequestId);
const title = "Nekretnine koje odgovaraju Vašim uslovima pretrage";
res.render("realEstates", { realEstates, title });
res.render("realEstates", { realEstates, title, AD_STATUS });
};
module.exports = {

View File

@@ -1,9 +1,11 @@
const { getRealEstateById } = require("../helpers/db/realEstate");
const { AD_STATUS } = require("../common/enums");
const getRedirect = async (req, res) => {
const id = req.params.id || null;
let error = false;
let redirectUrl = undefined;
let vipAd = undefined;
if (!id) {
error = true;
} else {
@@ -13,6 +15,7 @@ const getRedirect = async (req, res) => {
error = true;
} else {
redirectUrl = realEstate.url;
vipAd = realEstate.adStatus === AD_STATUS.STATUS_VIP;
}
} catch (e) {
error = true;
@@ -24,7 +27,7 @@ const getRedirect = async (req, res) => {
res.render("notFound", { title });
} else {
const title = "Preusmjeravanje";
res.render("redirect", { title, redirectUrl });
res.render("redirect", { title, redirectUrl, vipAd });
}
};

View File

@@ -9,12 +9,15 @@ const OlxCrawler = require("./specificCrawlers/olx");
const RentalCrawler = require("./specificCrawlers/rental");
const ProstorCrawler = require("./specificCrawlers/prostor");
const AktidoCrawler = require("./specificCrawlers/aktido");
const SaljicCrawler = require("./specificCrawlers/saljic");
const { logDebug } = require("../helpers/log");
const {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
AKTIDO_CONFIG
AKTIDO_CONFIG,
SALJIC_CONFIG
} = require("./crawlerConfig");
const PostgresSaver = require("./savers/postgres");
@@ -57,6 +60,15 @@ async function crawlAll() {
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
),
new SaljicCrawler(
[postgresSaver],
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
SALJIC_CONFIG.SALJIC_MAX_PAGES,
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
)
];
@@ -64,7 +76,9 @@ async function crawlAll() {
for (const crawler of crawlers) {
try {
logDebug('Starting crawler: ', crawler);
const newRealEstatesFromSingleCrawler = await crawler.crawl();
logDebug('Crawler done: ', crawler);
if (Array.isArray(newRealEstatesFromSingleCrawler)) {
newRealEstates.push(...newRealEstatesFromSingleCrawler);
}

View File

@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
const RENTAL_CONFIG = require("./specificConfigs/rental");
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
const SALJIC_CONFIG = require("./specificConfigs/saljic");
module.exports = {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
AKTIDO_CONFIG
AKTIDO_CONFIG,
SALJIC_CONFIG
};

View File

@@ -1,6 +1,7 @@
const moment = require("moment");
const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
const { bulkUpsertPriceHistory } = require("../../helpers/db/priceHistory");
class PostgresSaver {
connect() {
@@ -11,6 +12,21 @@ class PostgresSaver {
async save(results) {
const savedRecords = await bulkUpsertRealEstates(results);
//Extruding data for price history table
const resultPrices = savedRecords.map(realEstate => {
//Null values canot be recognized by ignore duplicates in sequalize
//Value price = 0 indicates 'cijena na upit'
const priceTmp =
realEstate.dataValues.price === null ? 0 : realEstate.dataValues.price;
return {
realEstateId: realEstate.dataValues.id,
price: priceTmp,
createdAt: realEstate.dataValues.createdAt,
updatedAt: realEstate.dataValues.updatedAt
};
});
const savedPrices = await bulkUpsertPriceHistory(resultPrices);
if (Array.isArray(savedRecords)) {
const newRealEstates = [];

View File

@@ -0,0 +1,34 @@
"use strict";
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
const saljicCrawlerAdType =
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
: null;
const saljicParsedCrawlerAdCategories =
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const saljicIgnoredUsernames = [];
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
module.exports = {
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
SALJIC_MAX_RESULTS_PER_PAGE:
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
SALJIC_DELAY_BETWEEN_PAGES:
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
};

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
@@ -159,7 +159,7 @@ class AktidoCrawler {
}
try {
const res = await fetch(url);
const res = await fetch(url, {}, false);
const body = await res.text();
const $ = cheerio.load(body);
let hrefs = [];
@@ -202,6 +202,10 @@ class AktidoCrawler {
const body = await adPageSource.text();
const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
const mapElementParent = $(".box-map").parent();
const scriptElement = $("script", mapElementParent);
if (

View File

@@ -1,6 +1,7 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const { logDebug } = require("../../helpers/log");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
@@ -44,6 +45,16 @@ const OLX_ENUMS = {
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
const chunk = (array, size = 10) => {
let i, j ,temparray;
const result = []
for (i=0,j=array.length; i<j; i+=size) {
temparray = array.slice(i,i+size);
result.push(temparray);
}
return result;
}
class OlxCrawler {
constructor(
savers = [],
@@ -52,7 +63,7 @@ class OlxCrawler {
maxPages = 1000,
maxResultsPerPage = 100,
ignoredUsernames = [],
delayBetweenPages = 1000
delayBetweenPages = 500
) {
this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -65,6 +76,7 @@ class OlxCrawler {
}
async crawl() {
logDebug("Starting OLX crawl");
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
@@ -88,14 +100,32 @@ class OlxCrawler {
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (PRINT_CRAWLER_DEBUG) {
console.log("================================");
console.log("Category Indexer index : ", index);
}
if (singlePageResult) {
console.log("\tTotal entries : ", singlePageResult.length)
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords, existingRecords } = saveResults;
if (PRINT_CRAWLER_DEBUG) {
console.log("--------------------------");
console.log("\tNew record URLs [", newRecords.length, "] :");
for(const newRecord of newRecords) {
console.log("\t\t",newRecord.url);
}
console.log("\t-------------------------");
console.log("\tExisting record URLs [", existingRecords.length, "] :");
}
newRealEstates.push(...newRecords);
for (const existingRecord of existingRecords) {
const { publishedDate, renewedDate } = existingRecord;
const { publishedDate, renewedDate, url } = existingRecord;
const publishedDateMoment = moment.utc(publishedDate);
const renewedDateMoment = moment.utc(renewedDate);
@@ -105,13 +135,25 @@ class OlxCrawler {
"minute"
);
if (PRINT_CRAWLER_DEBUG) {
console.log("\t\t", url);
console.log("\t\t\tPublished date : ", publishedDate);
console.log("\t\t\tRenewed date : ", renewedDate);
console.log("\t\t\tIs same (up to minute) : ", stopCrawlingThisCategory);
}
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
generatorsToRemove[index] = true;
// console.log("\tGenerator ", index + 1, "has no more new ads");
if (PRINT_CRAWLER_DEBUG) {
console.log("\t\t\tStopping this category indexer");
}
break;
}
}
} else {
if (PRINT_CRAWLER_DEBUG) {
console.log("\tNo more entries in this category, stopping!");
}
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
@@ -136,31 +178,36 @@ class OlxCrawler {
}
async *categoryIndexer(adCategory) {
let pageToIndex = 1;
try {
let pageToIndex = 1;
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage(
urlPageToCrawl,
this.maxResultsPerPage
);
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage(
urlPageToCrawl,
this.maxResultsPerPage
);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
await this.sleep(this.delayBetweenPages);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
}
}
} else {
return undefined;
}
} else {
return undefined;
} catch (e) {
console.log('Error inside generator: ', e);
}
}
@@ -170,8 +217,10 @@ class OlxCrawler {
}
try {
const res = await fetch(url);
const res = await fetch(url, {}, false);
logDebug("Got category results for: ", url);
const body = await res.text();
logDebug("Got category results text for: ", url);
const $ = cheerio.load(body);
let hrefs = [];
@@ -192,26 +241,44 @@ class OlxCrawler {
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(this.scrapeAd(hrefs[i]));
asyncScraping.push(hrefs[i]);
}
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
const allChunks = chunk(asyncScraping, 2);
const dataResults = []
for (let i = 0; i < allChunks.length; i++) {
const singleChunk = allChunks[i];
const promises = singleChunk.map(c => this.scrapeAd(c))
const chunkResults = await Promise.all(promises);
await this.sleep(this.delayBetweenPages);
dataResults.push(...chunkResults);
logDebug("Chunk results len:", chunkResults.length);
}
const filteredScrapedData = dataResults.filter(adData => !!adData);
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
return filteredScrapedData;
} catch (e) {
console.error("Exception caught:" + e);
console.error("Exception caught, index single page: " + e);
return [];
}
}
async scrapeAd(url) {
// console.log("Scraping : ", url);
logDebug("Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
let status = AD_STATUS.STATUS_NORMAL;
if (body.indexOf('<html') === -1) {
console.error("This is the body: ", body);
throw { message: 'Failed to fetch page !' }
}
const propertySelectors = {
username:
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
@@ -238,37 +305,25 @@ class OlxCrawler {
//====== PRICE DETECTION AND EXTRACTION =====
let price = null;
const normalPriceValue = $("#pc > p:nth-child(2)").text();
const urgentPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
)
.text()
.trim();
if (normalPriceValue && normalPriceValue.length > 0) {
price = normalPriceValue;
if (
$("#pc > p.n")
.text()
.indexOf("Hitna") !== -1
) {
status = AD_STATUS.STATUS_URGENT;
} else {
status = AD_STATUS.STATUS_NORMAL;
}
} else if (urgentPriceValue && urgentPriceValue.length > 0) {
const priceValues = urgentPriceValue.split("KM");
//priceValues will contain values like ["100000", "90000", ...], second element is urgent price
if (priceValues.length > 1) {
price = priceValues[1].trim();
status = AD_STATUS.STATUS_DISCOUNTED;
} else {
throw { message: "Can't find urgent price" };
}
} else {
throw {
message: "Can't find price (it is not normal nor urgent price ?)"
};
const priceHeader = $("#pc > p.n").text().trim();
const priceValue = $("#pc > p:nth-child(2)").text().trim();
price = priceValue;
if (priceHeader.indexOf('Hitn') !== -1) {
// Urgent price
status = AD_STATUS.STATUS_URGENT;
}
const discountPriceTag = $("#artikal_glavni_div > div.artikal_lijevo > p:nth-child(4)").text().trim();
if (discountPriceTag.indexOf('Akcij') !== -1) {
status = AD_STATUS.STATUS_DISCOUNTED;
const discountPriceValues = $("#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p").text().trim();
// discountPriceValues contain string like "10.000 KM 7.500 KM"
// First price is regular, second is currently active (discounted) price
const bothPrices = discountPriceValues.split('KM');
// Now, currently active price is second element of bothPrices array
price = bothPrices[1] ? bothPrices[1].trim() : null;
}
//====== OTHER AD INFORMATION ===============
@@ -278,7 +333,7 @@ class OlxCrawler {
let otherInformationDivId;
//We need to locate DIV ID where other information are stored
for (let possibleId = 10; possibleId <= 20; possibleId++) {
for (let possibleId = 1; possibleId <= 30; possibleId++) {
const adTypeFieldTitle = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
)
@@ -650,10 +705,12 @@ class OlxCrawler {
distanceToRiver,
numberOfViewsAgency
};
//
//console.log("Scraped data:", data);
return data;
} catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
console.error("Exception caught scrapeAd : " + e.message, "\r\nURL:", url);
}
return null;
}
@@ -768,6 +825,9 @@ class OlxCrawler {
if (!priceText) {
return NaN;
}
if (priceText === "Po dogovoru") {
return null;
}
const formattedPriceText = priceText.replace(".", "").replace(",", ".");
return parseFloat(formattedPriceText);
}

View File

@@ -1,8 +1,10 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const FormData = require("form-data");
const nodeFetch = require("node-fetch");
const {
AD_TYPE,
@@ -16,7 +18,8 @@ const {
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
DEFAULT_TIMEZONE,
PROSTOR_LOGIN
} = require("../../config/appConfig");
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
@@ -60,13 +63,22 @@ class ProstorCrawler {
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
const crawlAdTypes = this.crawlerAdTypes;
if (!crawlAdCategories || !crawlAdTypes) {
return []
}
const newRealEstates = [];
//We need session cookie to use login privileges
const prostorCookie = await this.getCookies();
//New tag to check if crawler logged in
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
if (crawlAdCategories) {
//Crawl only if login was successful
if (login) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory));
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
}
let done = false;
@@ -119,15 +131,21 @@ class ProstorCrawler {
return newRealEstates;
}
async *categoryIndexer(adCategory) {
async *categoryIndexer(adCategory, prostorCookie) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl
urlPageToCrawl,
prostorCookie
);
if (!Array.isArray(listOfAllRealEstates)){
console.log('[PROSTOR] Could not find real estate JSON data, check selector !');
return undefined;
}
let elementToStartIndexFrom = 0;
while (true) {
const realEstatesForSinglePage = listOfAllRealEstates.slice(
@@ -139,7 +157,8 @@ class ProstorCrawler {
elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage
realEstatesForSinglePage,
prostorCookie
);
const filteredSinglePageResults = singlePageResults.filter(
@@ -163,10 +182,10 @@ class ProstorCrawler {
}
}
async indexSinglePage(realEstatesList) {
async indexSinglePage(realEstatesList, prostorCookie) {
const asyncActions = [];
for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate));
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
}
try {
@@ -180,15 +199,26 @@ class ProstorCrawler {
}
}
async scrapeAd(realEstate) {
async scrapeAd(realEstate, prostorCookie) {
const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list
const adStatus = ProstorCrawler.getStatusId(status);
const url = `https://prostor.ba${link}`;
// console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const adPageSource = await nodeFetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await adPageSource.text();
const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
@@ -330,7 +360,6 @@ class ProstorCrawler {
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
}
const adStatus = ProstorCrawler.getStatusId(status);
const title = property_name;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
@@ -408,18 +437,20 @@ class ProstorCrawler {
}
}
async extractRealEstates(url) {
async extractRealEstates(url, prostorCookie) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
}
try {
const res = await fetch(url);
const res = await nodeFetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await res.text();
const $ = cheerio.load(body);
const scriptElement = $(
"body > div > div.container-fluid > script:nth-child(7)"
"body > div.content > div.container-fluid > script:nth-child(6)"
);
if (
@@ -548,6 +579,8 @@ class ProstorCrawler {
return AD_STATUS.STATUS_SOLD;
case "Iznajmljeno":
return AD_STATUS.STATUS_RENTED;
case "VIP ponuda":
return AD_STATUS.STATUS_VIP;
default:
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
return AD_STATUS.STATUS_NORMAL;
@@ -569,6 +602,54 @@ class ProstorCrawler {
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
let formData = new FormData();
formData.append("email", PROSTOR_LOGIN.EMAIL);
formData.append("password", PROSTOR_LOGIN.PASSWORD);
return nodeFetch("https://prostor.ba/moj-prostor/prijava", {
method: "POST",
body: formData,
headers: { Cookie: prostorCookie }
})
.then(page => {
return page.text();
})
.then(resp => {
const $ = cheerio.load(resp);
if (
$("h1")
.text()
.indexOf("Dobrodošli") !== -1
) {
console.log("[PROSTOR]: Crawler loged in!");
return true;
} else {
console.log("[PROSTOR]: Crawler login failed - wrong credentials!");
return false;
}
})
.catch(err => {
console.log("[PROSTOR]: Crawler login error ", err);
});
}
async getCookies() {
const getResponse = await nodeFetch(
"https://prostor.ba/moj-prostor/prijava",
{
headers: { Cookie: "" }
}
);
const raw = getResponse.headers.raw()["set-cookie"];
const cookie = raw
.map(datastring => {
const data = datastring.split(";");
const cookieData = data[0];
return cookieData;
})
.join(";");
return cookie;
}
}
module.exports = ProstorCrawler;

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
@@ -159,7 +159,7 @@ class RentalCrawler {
}
try {
const res = await fetch(url);
const res = await fetch(url, {} , false);
const body = await res.text();
const $ = cheerio.load(body);
let hrefs = [];
@@ -202,6 +202,10 @@ class RentalCrawler {
const body = await adPageSource.text();
const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
const mapElementParent = $(".box-map").parent();
const scriptElement = $("script", mapElementParent);
if (
@@ -399,7 +403,9 @@ class RentalCrawler {
);
if (!publishedDateMoment.isValid()) {
throw {
message: `Invalid published date : ${extractedData["re_realEstates_inserted"]}`
message: `Invalid published date : ${
extractedData["re_realEstates_inserted"]
}`
};
}
@@ -410,7 +416,9 @@ class RentalCrawler {
);
if (!renewedDateMoment.isValid()) {
throw {
message: `Invalid renewed date : ${extractedData["re_realEstates_edited"]}`
message: `Invalid renewed date : ${
extractedData["re_realEstates_edited"]
}`
};
}

View File

@@ -0,0 +1,687 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const { getUrlParams } = require("../../helpers/url");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const PromisePool = require('@supercharge/promise-pool');
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
const SALJIC_ENUMS = {
SALJIC_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
},
SALJIC_AD_CATEGORY: {
[AD_CATEGORY.ALL.id]: "&input_kategorija=",
[AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
[AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
[AD_CATEGORY.LAND.id]: "&input_kategorija=5", //3 and 4 also gradjevinsko
[AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
[AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
[AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
//[AD_CATEGORY.COTTAGE.id]: ""
}
};
class SaljicCrawler {
constructor(
savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
maxPages = 5000,
maxResultsPerPage = 5000,
ignoredUsernames = [],
delayBetweenPages = 500
) {
this.savers = savers;
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages
this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages;
}
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
if (crawlAdCategories) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory));
}
//
//console.log(indexGenerators);
//
let done = false;
while (!done) {
const categoryIndexerPromises = [];
const generatorsToRemove = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
generatorsToRemove.push(false);
}
const singlePageResults = await Promise.all(categoryIndexerPromises);
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
if (
Array.isArray(newRecords) &&
newRecords.length === 0 &&
!SALJIC_FORCE_CRAWL
) {
generatorsToRemove[index] = true;
}
} else {
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
}
}
// console.log("Generators state : ", generatorsToRemove);
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
if (generatorsToRemove[i]) {
// console.log("\tRemove generator ", i + 1);
indexGenerators.splice(i, 1);
}
}
if (indexGenerators.length === 0) {
done = true;
}
await this.sleep(this.delayBetweenPages);
}
}
return newRealEstates;
}
async *categoryIndexer(adCategory) {
let pageToIndex = 1;
const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
while (true) {
const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11;
const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;
const singlePageResults = await this.indexSinglePage(
urlPageToCrawl,
this.maxResultsPerPage
);
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
yield singlePageResults;
} else {
return undefined;
}
++pageToIndex;
if (pageToIndex === this.maxPages) {
return undefined;
}
}
} else {
return undefined;
}
}
async indexSinglePage(url, maxResultsPerPage) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[SALJIC] Index page : ", url);
}
try {
const res = await fetch(url, {}, false);
const body = await res.text();
const $ = cheerio.load(body);
let hrefs = [];
$("#shop")
.find(".product")
.each((i, elem) => {
const href = $(elem)
.find("a")
.first()
.attr("href");
if (href) {
hrefs.push(href);
}
});
let adTypesTmp = [];
$("#shop")
.find(".product")
.each((i, elem) => {
const adType = $(elem)
.find(".trakica-search-page")
.text()
.trim();
if (adType) {
adTypesTmp.push(adType);
}
});
//Converting to AD_TYPE
const adTypes = adTypesTmp.map(adTypeText => {
return this.getAdTypeId(adTypeText);
});
//Converting to absolute URLs
const hrefsAbs = hrefs.map(link => {
return "https://www.saljicnekretnine.ba" + link;
});
let actualNoOfResults =
hrefsAbs.length <= maxResultsPerPage
? hrefsAbs.length
: maxResultsPerPage;
const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push([hrefsAbs[i], adTypes[i]]);
}
const dataResults = []
const { scrapedData, errors } = await PromisePool
.withConcurrency(2)
.for(asyncScraping)
.process(async data => {
const result = await this.scrapeAd(...data)
await this.sleep(this.delayBetweenPages);
dataResults.push(result)
return result; //TODO: this does not work, scrapedData is null, dataResults works
})
const filteredScrapedData = dataResults.filter(adData => !!adData);
return filteredScrapedData;
} catch (e) {
console.error("[SALJIC] Exception caught:" + e);
return [];
}
}
async scrapeAd(url, adType) {
// console.log("[SALJIC] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
// No information for status ex. PRODAN
const status = AD_STATUS.STATUS_NORMAL;
//Extracting agency ID from url
const agencyObjectId = url
? parseInt(url.substring(46, url.length))
: null;
if (!agencyObjectId) {
throw { message : 'No agency object ID - URL changed?'}
}
//Extracting main properties
const propertySelectors = {
title:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
price:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
streetName:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
descriptions:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
latAndLong:
"iframe"
};
const title = $(propertySelectors.title)
.text()
.replace(/(\r\n|\n|\r)/gm, "")
.replace(/ {1,}/g, " ")
.trim();
const priceText = $(propertySelectors.price)
.text()
.replace(/(\r\n|\n|\r)/gm, "")
.replace(/ {1,}/g, " ")
.trim();
const price =
priceText === "CIJENA NA UPIT"
? null
: parseFloat(
priceText.substring(8, priceText.length - 3).replace(",", "")
);
const streetName = $(propertySelectors.streetName)
.text()
.replace(/(\r\n|\n|\r)/gm, "")
.trim();
const descriptions = $(propertySelectors.descriptions)
.text()
.replace(/\"/g, "")
.trim();
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
let latText;
let longText;
if (latAndLongSrc){
const mapParams = getUrlParams(latAndLongSrc);
if (mapParams) {
if (mapParams['marker']){
const marker = mapParams['marker'].split(',');
latText = marker[0] ? marker[0] : undefined;
longText = marker[1] ? marker[1] : undefined;
}else{
if (mapParams['mlat']) {
latText = mapParams['mlat'];
}
if (mapParams['mlon']) {
longText = mapParams['mlon'];
}
}
}
}
const locationLat = parseFloat(latText) || null;
const locationLong = parseFloat(longText) || null;
//====== DETAIL INFORMATION FIELDS ==========
let area = null,
gardenSize = null,
numberOfRooms = null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = null,
furnishingType = null,
balcony = null,
newBuilding = null,
elevator = null,
water = null,
electricity = null,
drainageSystem = null,
registeredInZkBooks = null,
recentlyAdapted = null,
parking = null,
garage = null,
gas = null,
antiTheftDoor = null,
airCondition = null,
phoneConnection = null,
cableTV = null,
internet = null,
basementAttic = null,
storeRoom = null,
videoSurveillance = null,
alarm = null,
suitableForStudents = null,
includingBills = null,
animalsAllowed = null,
pool = null,
exchange = null,
urbanPlanPermit = null,
buildingPermit = null,
utilityConnection = null,
distanceToRiver = null;
let publishedDate = null;
let renewedDate = null;
let realEstateType;
let numberOfViewsAgency = null;
let numberOfViewsKivi = null;
let streetNumber = 0;
let adStatus = status;
let shortDescription = descriptions
? descriptions.substring(0, descriptions.indexOf("."))
: "";
let longDescription = descriptions || "";
//Extracting data - Glavne karakteristike
let mainFieldIndex = 1;
do {
const mainFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`;
const mainField = $(mainFieldSelector)
.text()
.replace(/[\n\r\t]/gm, "")
.trim();
const mainFieldTitle = mainField
? mainField.substring(0, mainField.indexOf(" "))
: "";
const mainFieldValue = mainField
? mainField
.substring(mainField.indexOf(" "), mainField.length)
.trim()
: "";
switch (mainFieldTitle) {
case "Površina":
area = parseFloat(
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
);
break;
case "Okućnica":
gardenSize = parseFloat(
mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
);
break;
case "Broj soba":
numberOfRooms = parseInt(mainFieldValue);
break;
case "Broj spratova":
numberOfFloors = this.parseNumberOfFloors(mainFieldValue);
break;
case "Sprat":
floor = parseInt(mainFieldValue);
break;
case "Godina renoviranja":
recentlyAdapted = true;
break;
case "Broj parking mjesta":
parking = true;
break;
case "Dostupno od":
const day = mainFieldValue.substring(0, 2);
const month = mainFieldValue.substring(3, 5);
const year = mainFieldValue.substring(6, mainFieldValue.length);
publishedDate = new Date(`${month}/${day}/${year}`);
break;
default:
break;
}
if (mainFieldTitle === "") {
break;
}
mainFieldIndex++;
} while (true);
//Extracting data - Sadrzaji
let additionalFieldIndex = 1;
do {
const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`;
const additionalField = $(additionalFieldSelector)
.text()
.trim();
if (additionalFieldIndex === 1) {
//Extracting data of real estate type
const categoryTmp = additionalField
.replace(/[\n\r\t]/gm, "")
.substring(
additionalField.indexOf("Kategorija") + 10,
additionalField.length
)
.trim();
realEstateType = this.getAdCategoryId(categoryTmp);
if (!realEstateType) {
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
}
} else {
switch (additionalField) {
case "Internet":
internet = true;
break;
case "Garaža":
garage = true;
break;
case "Klima":
airCondition = true;
break;
case "Balkon":
balcony = true;
break;
case "Ostava":
storeRoom = true;
break;
case "Podrum":
basementAttic = true;
break;
case "Blindirana vrata":
antiTheftDoor = true;
break;
case "Voda":
water = true;
break;
case "Kablovska":
cableTV = true;
break;
case "Uknjiženo":
registeredInZkBooks = true;
break;
case "Grijanje - centralno":
heatingType = HEATING_TYPE.CENTRAL_CITY.id;
break;
case "Grijanje - plin":
heatingType = HEATING_TYPE.GAS.id;
break;
case "Grijanje - struja":
heatingType = HEATING_TYPE.ELECTRICITY.id;
break;
case "Grijanje":
heatingType = HEATING_TYPE.OTHER.id;
break;
case "Plin":
gas = true;
break;
case "Namješten":
furnishingType = FURNISHING_TYPE.FURNISHED.id;
break;
case "Alarm":
alarm = true;
break;
case "Video nadzor":
videoSurveillance = true;
break;
case "Lift":
elevator = true;
break;
case "Novogradnja":
newBuilding = true;
break;
default:
break;
}
}
if (additionalField === "") {
break;
}
additionalFieldIndex++;
} while (true);
//If no published date it takes current date of crawling
if (publishedDate) {
renewedDate = new Date();
} else {
publishedDate = new Date();
renewedDate = new Date();
}
const originAgencyName = AD_AGENCY.SALJIC;
const locality = "";
const municipality = "";
const city = "";
const region = "";
const entity = "";
const country = "";
const data = {
url,
agencyObjectId,
originAgencyName,
realEstateType,
adType,
title,
price,
area,
gardenSize,
shortDescription,
longDescription,
streetNumber,
streetName,
locality,
municipality,
city,
region,
entity,
country,
locationLat,
locationLong,
adStatus,
publishedDate,
renewedDate,
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
exchange,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency,
numberOfViewsKivi
};
return data;
} catch (e) {
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
}
return null;
}
//======= HELPER FUNCTIONS =============
getAdCategoryId(categoryText) {
switch (categoryText) {
case "Stan":
return AD_CATEGORY.FLAT.id;
case "Građevinsko zemljiste":
return AD_CATEGORY.LAND.id;
case "Industrijsko zemljiste":
return AD_CATEGORY.LAND.id;
case "Poljoprivredno zemljiste":
return AD_CATEGORY.LAND.id;
case "Kuća":
return AD_CATEGORY.HOUSE.id;
case "Poslovni prostor":
return AD_CATEGORY.OFFICE.id;
case "Kancelarije":
return AD_CATEGORY.OFFICE.id;
case "Apartmani":
return AD_CATEGORY.APARTMENT.id;
case "Garaža":
return AD_CATEGORY.GARAGE.id;
case "Vikendica":
return AD_CATEGORY.COTTAGE.id;
default:
return undefined;
}
}
getAdTypeId(adTypeText) {
switch (adTypeText) {
case "PRODAJA":
return AD_TYPE.AD_TYPE_SALE.stringId;
case "NAJAM":
return AD_TYPE.AD_TYPE_RENT.stringId;
default:
return undefined;
}
}
parseNumberOfFloors(numberOfFloorsText) {
const tryNumericalValue = parseInt(numberOfFloorsText);
if (!isNaN(tryNumericalValue)){
return tryNumericalValue;
}
// Guess number of floors based on number of + sign concatenations
// e.g. P+S+Pt -> 3 floors
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
return numberOfFloorsText.split('+').length + 1
}
return null
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results) {
const savers = this.savers;
// for (const saver of savers) {
// await saver.save(results);
// }
//For now, we use only Postgres saver, so ...
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}
module.exports = SaljicCrawler;

View File

@@ -0,0 +1,20 @@
"use strict";
const db = require("../../models/index");
const sequelize = require("sequelize");
const bulkUpsertPriceHistory = async priceHistoryData => {
try {
const order = [["realEstateId", "desc"]];
return await db.PriceHistory.bulkCreate(priceHistoryData, {
order,
ignoreDuplicates: true
});
} catch (e) {
console.log("Error bulk upserting priceHistory : ", e);
}
};
module.exports = {
bulkUpsertPriceHistory
};

View File

@@ -2,6 +2,8 @@
const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const { AD_CATEGORY } = require("../../common/enums");
const bulkUpsertRealEstates = async realEstateData => {
try {
const fieldsToUpdateIfDuplicate = [
@@ -96,12 +98,16 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
floorMin,
floorMax,
includeIncompleteAds,
includeWithoutPrice,
balcony,
elevator,
newBuilding,
accessRoadType
} = searchRequest;
//Needed for defining which attribute should exist or not
const realEstateTypeObject = AD_CATEGORY[realEstateType];
const longitudeColumn = sequelize.col("locationLong");
const latitudeColumn = sequelize.col("locationLat");
@@ -134,15 +140,6 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
const query = {
adType,
realEstateType,
price: {
[Op.or]: {
[Op.and]: {
[Op.lte]: priceMax,
[Op.gte]: priceMin
},
[Op.is]: null
}
},
area: {
[Op.lte]: sizeMax,
[Op.gte]: sizeMin
@@ -154,15 +151,6 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
const queryIncludeIncomplete = {
adType,
realEstateType,
price: {
[Op.or]: {
[Op.and]: {
[Op.lte]: priceMax,
[Op.gte]: priceMin
},
[Op.is]: null
}
},
area: {
[Op.or]: {
[Op.and]: {
@@ -175,8 +163,49 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
[Op.and]: geoSearchQueryPart
};
//Every other attribute is checked separately and included in query only if it is defined
if (gardenSizeMax && gardenSizeMin) {
//Is user unchecked includeWithoutPrice FALSE then it shouldn't return null values of price
//If not then null values are accepted (this is DEFAULT)
//includeIncpompleteAds does not have effect on price query
if (includeWithoutPrice) {
query.price = {
[Op.or]: {
[Op.and]: {
[Op.lte]: priceMax,
[Op.gte]: priceMin
},
[Op.is]: null
}
};
queryIncludeIncomplete.price = {
[Op.or]: {
[Op.and]: {
[Op.lte]: priceMax,
[Op.gte]: priceMin
},
[Op.is]: null
}
};
} else {
query.price = {
[Op.and]: {
[Op.lte]: priceMax,
[Op.gte]: priceMin
}
};
queryIncludeIncomplete.price = {
[Op.and]: {
[Op.lte]: priceMax,
[Op.gte]: priceMin
}
};
}
//Every other attribute is checked separately and included in query only if it is defined for real estate type
if (
realEstateTypeObject.hasGardenSize &&
gardenSizeMax != null &&
gardenSizeMin != null
) {
query.gardenSize = {
[Op.lte]: gardenSizeMax,
[Op.gte]: gardenSizeMin
@@ -192,7 +221,11 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
if (numberOfRoomsMin && numberOfRoomsMax) {
if (
realEstateTypeObject.hasNumberOfRoom &&
numberOfRoomsMin != null &&
numberOfRoomsMax != null
) {
query.numberOfRooms = {
[Op.lte]: numberOfRoomsMax,
[Op.gte]: numberOfRoomsMin
@@ -208,7 +241,11 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
if (numberOfFloorsMin && numberOfFloorsMax) {
if (
realEstateTypeObject.hasNumberOfFloors &&
numberOfFloorsMin != null &&
numberOfFloorsMax != null
) {
query.numberOfFloors = {
[Op.lte]: numberOfFloorsMax,
[Op.gte]: numberOfFloorsMin
@@ -224,7 +261,11 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
if (floorMin && floorMax) {
if (
realEstateTypeObject.hasFloorProp &&
floorMin != null &&
floorMax != null
) {
query.floor = {
[Op.lte]: floorMax,
[Op.gte]: floorMin
@@ -239,8 +280,10 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
}
};
}
if (balcony) {
//Logic for balcony, newBuilding and elevator from users side
//If true is checked, then I want characteristic to be true but,
//if it is not checked, then I dont care - it can be null or false or true
if (realEstateTypeObject.hasBalconyProp && balcony === true) {
query.balcony = {
[Op.eq]: balcony
};
@@ -252,7 +295,7 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
if (newBuilding) {
if (realEstateTypeObject.hasNewBuildingProp && newBuilding === true) {
query.newBuilding = {
[Op.eq]: newBuilding
};
@@ -264,7 +307,7 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
if (elevator) {
if (realEstateTypeObject.hasElevatorProp && elevator === true) {
query.elevator = {
[Op.eq]: elevator
};
@@ -275,7 +318,8 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
}
};
}
//If user wants 'ANY' road type acces then it is not included in query -
//returns every road type and null values
if (accessRoadType !== "ANY") {
query.accessRoadType = {
[Op.eq]: accessRoadType
@@ -288,10 +332,14 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
//When includeIncompleteAds are not defined - null it will consider it true
const order = [["updatedAt", "desc"]];
return db.RealEstate.findAll({
where: includeIncompleteAds ? queryIncludeIncomplete : query,
where:
includeIncompleteAds || includeIncompleteAds == null
? queryIncludeIncomplete
: query,
limit: maxResults,
order
});

View File

@@ -3,6 +3,7 @@ const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const { AD_CATEGORY } = require("../../common/enums");
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const getSearchRequest = async searchRequestId => {
try {
@@ -16,6 +17,22 @@ const getSearchRequest = async searchRequestId => {
const createSearchRequest = async (searchRequestFields = {}) => {
return await db.SearchRequest.create(searchRequestFields);
};
const findAllRequestsForCheckUp = async () => {
const checkUpOffset = 24 * 60 * 60 * 1000 * CHECK_UP_DAYS; //in miliseconds
const checkupDate = new Date();
checkupDate.setTime(checkupDate.getTime() - checkUpOffset);
const dateQuery = {
notifiedAt: {
[Op.lte]: checkupDate
}
};
const allRequestsForCheckUp = await db.SearchRequest.findAll({
where: dateQuery
});
return allRequestsForCheckUp;
};
const findSearchRequestsForRealEstate = async realEstate => {
const {
@@ -49,132 +66,416 @@ const findSearchRequestsForRealEstate = async realEstate => {
const geoSearchQueryPart = sequelize.where(contains, true);
//General query contains only attributes that are defined for every RealEstate - not null
const query = {
adType,
realEstateType,
subscribed: true,
[Op.and]: geoSearchQueryPart
};
//Needed for defining which attribute should exist or not
const realEstateTypeObject = AD_CATEGORY[realEstateType];
//Needed to decide on including incomplete RealEstates data
// ?? Needed to decide on including incomplete RealEstates data
let checkForIncompleteWanted = false;
//Attributes are checked separately and included in query only if defined
//Price and area should be defined for every property
//Attributes are checked separately to make different query parts
if (price) {
query.priceMin = {
[Op.lte]: price
//If real estate price is number then it searches for req that have priceMin and priceMax
//If real estate price is null it searches for req that accept ads without price
//User always defines price and area (sliders) - not null in search req
let priceQuery = {};
if (price != null) {
priceQuery = {
[Op.and]: [
{
priceMin: {
[Op.lte]: price
}
},
{
priceMax: {
[Op.gte]: price
}
}
]
};
query.priceMax = {
[Op.gte]: price
} else {
priceQuery = {
includeWithoutPrice: {
[Op.eq]: true
}
};
}
if (area) {
query.sizeMin = {
[Op.lte]: area
};
query.sizeMax = {
[Op.gte]: area
let areaQuery = {};
if (area != null) {
areaQuery = {
[Op.and]: [
{
sizeMin: {
[Op.lte]: area
}
},
{
sizeMax: {
[Op.gte]: area
}
}
]
};
} else {
checkForIncompleteWanted = true;
}
//Other attributes can be defined or not depending on RealEstate type
if (gardenSize) {
query.gardenSizeMin = {
[Op.lte]: gardenSize
};
query.gardenSizeMax = {
[Op.gte]: gardenSize
};
} else if (realEstateTypeObject.hasGardenSize) {
checkForIncompleteWanted = true;
//we check what to include in query based on real estate type object
let gardenSizeQuery = {};
if (realEstateTypeObject.hasGardenSize) {
if (gardenSize != null) {
gardenSizeQuery = {
[Op.and]: [
{
gardenSizeMin: {
[Op.lte]: gardenSize
}
},
{
gardenSizeMax: {
[Op.gte]: gardenSize
}
}
]
};
} else {
checkForIncompleteWanted = true;
}
}
if (numberOfRooms) {
query.numberOfRoomsMin = {
[Op.lte]: numberOfRooms
};
query.numberOfRoomsMax = {
[Op.gte]: numberOfRooms
};
} else if (realEstateTypeObject.hasNumberOfRoom) {
checkForIncompleteWanted = true;
let numberOfRoomsQuery = {};
if (realEstateTypeObject.hasNumberOfRoom) {
if (numberOfRooms != null) {
//If real estate has defined number of rooms ex. 3 it returns req
// that accepts 3 rooms or ones that don't have defined number - null
//Ex. they didnt choose advanced filters at all
numberOfRoomsQuery = {
[Op.and]: [
{
numberOfRoomsMin: {
[Op.or]: {
[Op.lte]: numberOfRooms,
[Op.is]: null
}
}
},
{
numberOfRoomsMax: {
[Op.or]: {
[Op.gte]: numberOfRooms,
[Op.is]: null
}
}
}
]
};
} else {
// If real estate dont have defined number of rooms ex. null
//It returns requests that didn't choose number of rooms - also null
//Or ones that picked some values but also picked to includeIncomplete ads (or default)
numberOfRoomsQuery = {
[Op.or]: [
{
[Op.and]: [
{
numberOfRoomsMin: {
[Op.is]: null
}
},
{
numberOfRoomsMax: {
[Op.is]: null
}
}
]
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
};
}
}
//Same logic for number of Floors and floors
let numberOfFloorsQuery = {};
if (realEstateTypeObject.hasNumberOfFloors) {
if (numberOfFloors != null) {
numberOfFloorsQuery = {
[Op.and]: [
{
numberOfFloorsMin: {
[Op.or]: {
[Op.lte]: numberOfFloors,
[Op.is]: null
}
}
},
{
numberOfFloorsMax: {
[Op.or]: {
[Op.gte]: numberOfFloors,
[Op.is]: null
}
}
}
]
};
} else {
numberOfFloorsQuery = {
[Op.or]: [
{
[Op.and]: [
{
numberOfFloorsMin: {
[Op.is]: null
}
},
{
numberOfFloorsMax: {
[Op.is]: null
}
}
]
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
};
}
}
let floorQuery = {};
if (realEstateTypeObject.hasFloorProp) {
if (floor != null) {
floorQuery = {
[Op.and]: [
{
floorMin: {
[Op.or]: {
[Op.lte]: floor,
[Op.is]: null
}
}
},
{
floorMax: {
[Op.or]: {
[Op.gte]: floor,
[Op.is]: null
}
}
}
]
};
} else {
floorQuery = {
[Op.or]: [
{
[Op.and]: [
{
floorMin: {
[Op.is]: null
}
},
{
floorMax: {
[Op.is]: null
}
}
]
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
};
}
}
if (numberOfFloors) {
query.numberOfFloorsMin = {
[Op.lte]: numberOfFloors
};
query.numberOfFloorsMax = {
[Op.gte]: numberOfFloors
};
} else if (realEstateTypeObject.hasNumberOfFloors) {
checkForIncompleteWanted = true;
//Logic for balcony, newBuilding and elevator
//If user dont check checkbox for ex. elevator it does not mean he only wants no elevator
//If real estate characteristic =true find all req, one that wants charachertistic or dont care - dont need query
//If real estate characteristic = false, find all req exept for ones that wants characteristic to be true
//If real estate characteristic = null, dont know if true or false, find req that dont care or want char and want incomplete ads
let balconyQuery = {};
if (realEstateTypeObject.hasBalconyProp && balcony !== true) {
if (balcony === false) {
balconyQuery = {
balcony: {
[Op.ne]: true
}
};
} else if (balcony === null) {
balconyQuery = {
[Op.or]: [
{
balcony: {
[Op.ne]: true
}
},
{
[Op.and]: [
{
balcony: {
[Op.eq]: true
}
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}
]
};
}
}
if (floor) {
query.floorMin = {
[Op.lte]: floor
};
query.floorMax = {
[Op.gte]: floor
};
} else if (realEstateTypeObject.hasFloorProp) {
checkForIncompleteWanted = true;
let newBuildingQuery = {};
if (realEstateTypeObject.hasNewBuildingProp && newBuilding !== true) {
if (newBuilding === false) {
newBuildingQuery = {
newBuilding: {
[Op.ne]: true
}
};
} else if (newBuilding === null) {
newBuildingQuery = {
[Op.or]: [
{
newBuilding: {
[Op.ne]: true
}
},
{
[Op.and]: [
{
newBuilding: {
[Op.eq]: true
}
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}
]
};
}
}
let elevatorQuery = {};
if (realEstateTypeObject.hasElevatorProp && elevator !== true) {
if (elevator === false) {
elevatorQuery = {
elevator: {
[Op.ne]: true
}
};
} else if (elevator === null) {
elevatorQuery = {
[Op.or]: [
{
elevator: {
[Op.ne]: true
}
},
{
[Op.and]: [
{
elevator: {
[Op.eq]: true
}
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}
]
};
}
}
//General query consists of each individual query
const query = {
adType,
realEstateType,
subscribed: true,
[Op.and]: [
geoSearchQueryPart,
priceQuery,
areaQuery,
gardenSizeQuery,
numberOfRoomsQuery,
numberOfFloorsQuery,
floorQuery,
balconyQuery,
newBuildingQuery,
elevatorQuery
]
};
if (accessRoadType) {
//AccessRoadType is defined - should exists for each ad and estate type
if (accessRoadType != null) {
query.accessRoadType = {
[Op.or]: {
[Op.eq]: "ANY",
[Op.like]: "ANY",
[Op.eq]: accessRoadType
}
};
} else if (realEstateTypeObject.hasAccesRoadType) {
checkForIncompleteWanted = true;
}
if (balcony) {
query.balcony = {
[Op.eq]: balcony
} else {
//Null values are returned for user request that wanted ANY acces road type
query.accessRoadType = {
[Op.eq]: "ANY"
};
} else if (realEstateTypeObject.hasBalconyProp) {
checkForIncompleteWanted = true;
}
if (newBuilding) {
query.newBuilding = {
[Op.eq]: newBuilding
};
} else if (realEstateTypeObject.hasNewBuildingProp) {
checkForIncompleteWanted = true;
}
if (elevator) {
query.elevator = {
[Op.eq]: elevator
};
} else if (realEstateTypeObject.hasElevatorProp) {
checkForIncompleteWanted = true;
}
//If one of the attributes that exists for property type is null
//we include in query to check if incomplete real estates are accepted
//Tag to check if incomplete ads are accepted in query which is default
if (checkForIncompleteWanted) {
query.includeIncompleteAds = {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
};
}
return await db.SearchRequest.findAll({ where: query });
return await db.SearchRequest.findAll({
where: query
});
};
module.exports = {
getSearchRequest,
createSearchRequest,
findSearchRequestsForRealEstate
findSearchRequestsForRealEstate,
findAllRequestsForCheckUp
};

View File

@@ -1,5 +1,7 @@
"use strict";
const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const findRealEstatesForSearchRequest = async searchRequestId => {
const query = {

View File

@@ -1,10 +1,19 @@
"use strict";
const { MAX_REAL_ESTATES_IN_EMAIL, APP_URL } = require("../config/appConfig");
const { AD_CATEGORY } = require("../common/enums");
const {
MAX_REAL_ESTATES_IN_EMAIL,
APP_URL,
STAGING
} = require("../config/appConfig");
const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums");
const generateEmailFooter = searchRequestId => {
return `<div>Ako želite prestati dobijati obavještenja za ovu pretragu, <a href="${APP_URL}/odjava/${searchRequestId}">odjavite ovdje</a></div>
//Tag to recognize staging from development
const stagingTag = STAGING ? "[STAGING] " : "";
const wordOfMouthRequest = `Molimo vas <strong>recite svojim prijateljima</strong> za Kivi - što više korisnika budemo imali, moći ćemo više agencija uključiti i više nekretnina imati u bazi. Hvala!`
const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => {
return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div>
<div>Ako želite prestati dobijati obavještenja za ovu pretragu, <a href="${APP_URL}/odjava/${searchRequestId}">odjavite ovdje</a></div>
<div>Ako želite pogledati ili promijeniti uslove za ovu pretragu, <a href="${APP_URL}/pregled/${searchRequestId}">pogledajte ovdje</a></div>
<br/>
<strong>Vaš,<br/>Kivi tim</strong>`;
@@ -23,17 +32,24 @@ const generateRealEstateLinks = realEstates => {
const generateNotificationEmail = (
realEstates,
searchRequestId,
noAllRealEstates,
dailyNotification = false
) => {
const truncateList = realEstates.length > MAX_REAL_ESTATES_IN_EMAIL;
const realEstatesToShow = truncateList
? realEstates.slice(0, MAX_REAL_ESTATES_IN_EMAIL)
: realEstates;
const allRealEstatesLink = `${APP_URL}/nekretnine/${searchRequestId}`;
const emailFrequencyTitle = dailyNotification
? EMAIL_FREQUENCY.DAILY.title
: EMAIL_FREQUENCY.ASAP.title;
const realEstateLinks = generateRealEstateLinks(realEstatesToShow);
const moreRealEstates = `<div>Kompletan spisak nekretnina možete pogledati na <a href="${allRealEstatesLink}">listi nekretnina</a><div>`;
const emailFooter = generateEmailFooter(searchRequestId);
const moreRealEstates = `<div>Kompletan spisak nekretnina (${noAllRealEstates}) možete pogledati na <a href="${allRealEstatesLink}">listi nekretnina</a><div>`;
const emailFooter = generateEmailFooter(searchRequestId, emailFrequencyTitle);
const asapMessageBody =
realEstates.length > 1
? "Pronašli smo nekretnine koje odgovaraju Vašoj pretrazi"
@@ -46,7 +62,7 @@ const generateNotificationEmail = (
const messageBody = dailyNotification ? dailyMessageBody : asapMessageBody;
return `<h3>Zdravo</h3>
return `<h3>${stagingTag}Zdravo</h3>
<h4>${messageBody}</h4>
<div>
${realEstateLinks}
@@ -54,11 +70,36 @@ const generateNotificationEmail = (
${moreRealEstates}
</div>
<br/>
${wordOfMouthRequest}
<br/>
<br/>
${emailFooter}`;
};
const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
const realEstateType = AD_CATEGORY[searchRequest.realEstateType];
let adTypeTitle = "";
switch (searchRequest.adType) {
case AD_TYPE.AD_TYPE_SALE.stringId:
adTypeTitle = AD_TYPE.AD_TYPE_SALE.title;
break;
case AD_TYPE.AD_TYPE_RENT.stringId:
adTypeTitle = AD_TYPE.AD_TYPE_RENT.title;
break;
default:
adTypeTitle = "-";
break;
}
let emailFrequencyTitle;
switch (searchRequest.emailFrequency) {
case EMAIL_FREQUENCY.ASAP.stringId:
emailFrequencyTitle = EMAIL_FREQUENCY.ASAP.title;
break;
case EMAIL_FREQUENCY.DAILY.stringId:
emailFrequencyTitle = EMAIL_FREQUENCY.DAILY.title;
break;
}
const {
id,
gardenSizeMin,
@@ -70,6 +111,7 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
} = searchRequest;
const realEstateLinks = generateRealEstateLinks(matchingRealEstates);
const instantRealEstatesText = `<br/>
<div>
U međuvremenu pogledajte neke od nedavno objavljenih nekretnina koje odgovaraju Vašim uslovima pretrage :<br/>
@@ -80,25 +122,30 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
? `<div><strong>Kvadratura okućnice: Od ${gardenSizeMin} do ${gardenSizeMax} m2</strong></div>`
: ``;
const emailFooter = generateEmailFooter(id);
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
return `<h3>Zdravo</h3>
return `<h3>${stagingTag}Zdravo</h3>
<div>Naručili ste da Vam javimo ako se nekretnina sa navedenim uslovima pojavi u oglasima:</div>
<br/>
<div>
<div><strong>Tip nekretnine: </strong>${realEstateType.title}</div>
<div><strong>Vrsta oglasa: </strong>${adTypeTitle}</div>
<div><strong>Kvadratura nekretnine:</strong> Od ${sizeMin} do ${sizeMax} m2</div>
${gardenSize}
<div><strong>Cijena:</strong> ${priceMin} do ${priceMax} KM</div>
</div>
${matchingRealEstates.length > 0 ? instantRealEstatesText : ""}
<br/>
<br/>
${wordOfMouthRequest}
<br/>
<br/>
${emailFooter}`;
};
const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
if (numberOfRealEstates === 1) {
return `Kivi: ${singleRealEstateTitle}`;
return `${stagingTag}Kivi: ${singleRealEstateTitle}`;
}
const leastSignificantDigit = numberOfRealEstates % 10;
@@ -106,7 +153,7 @@ const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
const secondLeastSignificantDigit = numberWithoutLastDigit % 10;
if (leastSignificantDigit === 1 && secondLeastSignificantDigit !== 1) {
return `Kivi : ${numberOfRealEstates} nova nekretnina`;
return `${stagingTag}Kivi : ${numberOfRealEstates} nova nekretnina`;
}
if (
@@ -114,14 +161,58 @@ const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
leastSignificantDigit <= 4 &&
secondLeastSignificantDigit !== 1
) {
return `Kivi: ${numberOfRealEstates} nove nekretnine`;
return `${stagingTag}Kivi: ${numberOfRealEstates} nove nekretnine`;
}
return `Kivi: ${numberOfRealEstates} novih nekretnina`;
return `${stagingTag}Kivi: ${numberOfRealEstates} novih nekretnina`;
};
const generateCheckUpEmail = searchRequest => {
const realEstateType = AD_CATEGORY[searchRequest.realEstateType];
const {
id,
gardenSizeMin,
gardenSizeMax,
sizeMin,
sizeMax,
priceMin,
priceMax
} = searchRequest;
let emailFrequencyTitle;
switch (searchRequest.emailFrequency) {
case EMAIL_FREQUENCY.ASAP.stringId:
emailFrequencyTitle = EMAIL_FREQUENCY.ASAP.title;
break;
case EMAIL_FREQUENCY.DAILY.stringId:
emailFrequencyTitle = EMAIL_FREQUENCY.DAILY.title;
break;
}
const gardenSize = realEstateType.hasGardenSize
? `<div><strong>Kvadratura okućnice: Od ${gardenSizeMin} do ${gardenSizeMax} m2</strong></div>`
: ``;
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
return `<h3>${stagingTag}Zdravo</h3>
<div><strong>Kivi tim traži nekretnine za Vas i kada to ne vidite.</strong></div>
<br />
<div>Vaša trenutno aktivna pretraga je:</div>
<br/>
<div>
<div><strong>Tip nekretnine: </strong>${realEstateType.title}</div>
<div><strong>Kvadratura nekretnine:</strong> Od ${sizeMin} do ${sizeMax} m2</div>
${gardenSize}
<div><strong>Cijena:</strong> ${priceMin} do ${priceMax} KM</div>
</div>
<br/>
${emailFooter}`;
};
module.exports = {
generateNotificationEmail,
generateNewSearchRequestEmail,
generateEmailSubject
generateEmailSubject,
generateCheckUpEmail
};

View File

@@ -0,0 +1,58 @@
const nodeFetch = require("node-fetch");
const AbortController = require('abort-controller');
const FetchCache = require('@sozialhelden/fetch-cache').default;
console.log("Fc ", FetchCache)
const {
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY,
SCRAPER_API_BASE_URL,
NODE_FETCH_TIMEOUT_MS
} = require("../config/appConfig");
const timeout = (ms) => {
return new Promise(resolve => setTimeout(resolve, ms));
}
const fetchCache = new FetchCache({
fetch: nodeFetch,
cacheOptions: {
// Don't save more than 100 responses in the cache. Allows infinite responses by default
maximalItemCount: 10000,
// When should the cache evict responses when its full?
evictExceedingItemsBy: 'age', // Valid values: 'lru' or 'age'
defaultTTL: 6 * 60 * 60 * 1000 // 6 hours
// ...see https://github.com/sozialhelden/hamster-cache for all possible options
},
});
const fetch = async (url, options = {}, useCache = true) => {
const controller = new AbortController();
const newOptions = Object.assign({}, options);
if (!newOptions["headers"]) {
newOptions["headers"] = {};
}
newOptions.signal = controller.signal;
// newOptions["headers"]["User-Agent"] = USER_AGENT;
let urlToFetchThroughAPI = Buffer.from(url).toString('base64');
if (SCRAPER_API_BASE_URL.includes('scraperapi')) {
urlToFetchThroughAPI = url;
}
const urlAdaptedForScraping = USE_SCRAPER_API
? `${SCRAPER_API_BASE_URL}?api_key=${SCRAPER_API_KEY}&url=${urlToFetchThroughAPI}`
: url;
const result = useCache ? fetchCache.fetch(urlAdaptedForScraping, newOptions) : nodeFetch(urlAdaptedForScraping, newOptions);
const timeoutId = setTimeout(() => controller.abort(), NODE_FETCH_TIMEOUT_MS);
return result;
};
module.exports = fetch;

13
app/helpers/log.js Normal file
View File

@@ -0,0 +1,13 @@
const {
PRINT_CRAWLER_DEBUG
} = require("../config/appConfig");
const logDebug = (...args) => {
if (PRINT_CRAWLER_DEBUG) {
console.log(...args);
}
}
module.exports = {
logDebug
};

View File

@@ -7,6 +7,26 @@ const currentSearchRequest = async req => {
return await getSearchRequest(searchRequestId);
};
module.exports = {
currentSearchRequest
const getUrlParams = function (url) {
if (typeof url === 'string' && url.length > 0){
const params = {};
const questionMarkIndex = url.indexOf('?');
if (questionMarkIndex === -1) {
return undefined;
}
const query = url.substring(questionMarkIndex+1);
const vars = query.split('&');
for (let i = 0; i < vars.length; i++) {
const pair = vars[i].split('=');
params[pair[0]] = decodeURIComponent(pair[1]);
}
return params;
}
return undefined;
};
module.exports = {
currentSearchRequest,
getUrlParams
};

View File

@@ -0,0 +1,42 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
const tableFields = {
id: {
type: Sequelize.BIGINT,
autoIncrement: true,
allowNull: false,
primaryKey: true
},
realEstateId: {
type: Sequelize.BIGINT,
allowNull: false,
unique: "uniquePriceRealEstate",
references: {
model: "RealEstates",
key: "id"
},
onUpdate: "CASCADE",
onDelete: "SET NULL"
},
price: {
type: Sequelize.REAL,
unique: "uniquePriceRealEstate"
},
createdAt: {
type: Sequelize.DATE,
defaultValue: Sequelize.literal("NOW()")
},
updatedAt: {
type: Sequelize.DATE,
defaultValue: Sequelize.literal("NOW()")
}
};
return queryInterface.createTable("PriceHistory", tableFields);
},
down: queryInterface => {
return queryInterface.dropTable("PriceHistory", {});
}
};

View File

@@ -0,0 +1,10 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) =>
queryInterface.addConstraint("PriceHistory", ["realEstateId", "price"], {
type: "unique",
name: "uniquePriceRealEstate"
}),
down: queryInterface =>
queryInterface.removeConstraint("PriceHistory", "uniquePriceRealEstate")
};

View File

@@ -0,0 +1,14 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return queryInterface.addColumn("SearchRequests", "includeWithoutPrice", {
type: Sequelize.BOOLEAN,
defaultValue: true
});
},
down: (queryInterface, Sequelize) => {
return queryInterface.removeColumn("SearchRequests", "includeWithoutPrice");
}
};

View File

@@ -0,0 +1,14 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return queryInterface.addColumn("SearchRequests", "notifiedAt", {
type: Sequelize.DATE,
defaultValue: new Date()
});
},
down: (queryInterface, Sequelize) => {
return queryInterface.removeColumn("SearchRequests", "notifiedAt");
}
};

View File

@@ -16,7 +16,7 @@ config.logging = parseInt(process.env.SEQUELIZE_LOGGING) ? console.log : false;
let sequelize;
if (config.use_env_variable) {
sequelize = new Sequelize(process.env[config.use_env_variable], config);
sequelize = new Sequelize(process.env[config.use_env_variable] + "?ssl=true", config);
} else {
sequelize = new Sequelize(
config.database,

View File

@@ -0,0 +1,44 @@
"use strict";
module.exports = (sequalize, DataTypes) => {
const PriceHistory = sequalize.define(
"PriceHistory",
{
id: {
type: DataTypes.BIGINT,
autoIncrement: true,
primaryKey: true,
allowNull: false
},
realEstateId: {
type: DataTypes.BIGINT,
allowNull: false,
unique: "uniquePriceRealEstate",
references: {
model: "RealEstates",
key: "id"
},
onUpdate: "CASCADE",
onDelete: "SET NULL"
},
price: {
type: DataTypes.REAL,
unique: "uniquePriceRealEstate"
}
},
{
freezeTableName: true
}
);
PriceHistory.associate = models => {
PriceHistory.hasMany(models.RealEstate, {
foreignKey: "id",
sourceKey: "realEstateId",
targetKey: "id",
as: "realEstates"
});
};
return PriceHistory;
};

View File

@@ -71,6 +71,7 @@ module.exports = (sequelize, DataTypes) => {
type: DataTypes.TEXT
},
includeIncompleteAds: DataTypes.BOOLEAN,
includeWithoutPrice: DataTypes.BOOLEAN,
balcony: DataTypes.BOOLEAN,
elevator: DataTypes.BOOLEAN,
newBuilding: DataTypes.BOOLEAN,
@@ -81,7 +82,11 @@ module.exports = (sequelize, DataTypes) => {
floorMin: DataTypes.INTEGER,
floorMax: DataTypes.INTEGER,
accessRoadType: DataTypes.TEXT,
heatingType: DataTypes.TEXT
heatingType: DataTypes.TEXT,
notifiedAt: {
type: DataTypes.DATE,
defaultValue: new Date()
}
});
return SearchRequest;

View File

@@ -0,0 +1,6 @@
"use strict";
const { checkUpNotify } = require("../services/notificationService");
//For testing pursposes
(async () => {
await checkUpNotify();
})();

View File

@@ -154,3 +154,7 @@ h3 {
margin-top: 2rem;
margin-bottom: 1rem;
}
.estates-link {
color: rgba(0, 0, 0, 0.87);
}

View File

@@ -0,0 +1,34 @@
"use strict";
const { RealEstate } = require("../models");
module.exports = {
async up(queryInterface, Sequelize) {
//Reading initial data from RealEstate table in db
const realEstateInitialData = await RealEstate.findAll();
//Extruding data for table PriceHistory
const priceHistoryInitialData = realEstateInitialData.map(realEstate => {
//Null values canot be recognized by ignore duplicates in sequalize
//Value price = 0 indicates 'cijena na upit'
const priceTmp =
realEstate.dataValues.price === null ? 0 : realEstate.dataValues.price;
return {
realEstateId: realEstate.dataValues.id,
price: priceTmp,
createdAt: realEstate.dataValues.createdAt,
updatedAt: realEstate.dataValues.updatedAt
};
});
return queryInterface.bulkInsert(
"PriceHistory",
priceHistoryInitialData,
{}
);
},
async down(queryInterface, Sequelize) {
return queryInterface.bulkDelete("PriceHistory", null, {});
}
};

View File

@@ -1,4 +1,8 @@
"use strict";
const { STAGING } = require("../config/appConfig");
const stagingTag = STAGING ? "[STAGING] " : "";
const {
matchRealEstates,
matchSearchRequest
@@ -6,9 +10,15 @@ const {
const {
generateNotificationEmail,
generateNewSearchRequestEmail,
generateEmailSubject
generateEmailSubject,
generateCheckUpEmail
} = require("../helpers/emailContentGenerator");
const { findNotNotifiedMatches } = require("../helpers/db/searchRequestMatch");
const {
findNotNotifiedMatches,
findRealEstatesForSearchRequest
} = require("../helpers/db/searchRequestMatch");
const { findAllRequestsForCheckUp } = require("../helpers/db/searchRequest");
const { sendEmail } = require("../services/emailService");
const notifyForNewRealEstates = async newRealEstates => {
@@ -21,13 +31,17 @@ const notifyForNewSearchRequest = async searchRequest => {
const searchRequestId = searchRequest.id;
const matchingRealEstates = matches[searchRequestId].realEstates;
const emailContent = generateNewSearchRequestEmail(
searchRequest,
matchingRealEstates
);
const { email } = searchRequest;
await sendEmail(email, "Kivi - novi zahtjev za pretragu", emailContent);
//In case of the new search req, notifiedAt column is populated with default value - now (moment of creation)
await sendEmail(
email,
`${stagingTag} Kivi - novi zahtjev za pretragu`,
emailContent
);
};
const notifyMatches = async (matches, dailyNotification = false) => {
@@ -39,10 +53,18 @@ const notifyMatches = async (matches, dailyNotification = false) => {
const { email, subscribed } = searchRequest;
if (notifyNow && subscribed) {
const allMatchingRealEstates = matches[id].realEstates || [];
//Variable allMatchingRealEstates are real estates that are "new" on the market
//the ones that we notify user in this moment, not all that already exists in db
//New variable allRealEstates are all real estates that exists in db for search req
const allRealEstates = await findRealEstatesForSearchRequest(id);
const noAllRealEstates = allRealEstates.length;
if (allMatchingRealEstates.length > 0) {
const emailContent = generateNotificationEmail(
allMatchingRealEstates,
id,
noAllRealEstates,
dailyNotification
);
const emailSubject = generateEmailSubject(
@@ -55,6 +77,10 @@ const notifyMatches = async (matches, dailyNotification = false) => {
sendEmailPromise.catch(err =>
console.log("[Email Sending Failed]", err)
);
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
}
}
}
@@ -109,8 +135,30 @@ const notifyRequestsWithDailyOption = async () => {
await notifyMatches(matches, true);
};
const checkUpNotify = async () => {
/* const searchRequestsForCheckUp = await findAllRequestsForCheckUp();
const asyncSendEmailActions = [];
for (const searchRequest of searchRequestsForCheckUp) {
const { email } = searchRequest.dataValues;
const emailSubject = `${stagingTag}Kivi: Mi tražimo nekretnine za vas!`;
const emailContent = generateCheckUpEmail(searchRequest.dataValues);
const sendEmailPromise = sendEmail(email, emailSubject, emailContent);
asyncSendEmailActions.push(sendEmailPromise);
sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err));
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
}
await Promise.all(asyncSendEmailActions);*/
};
module.exports = {
notifyForNewRealEstates,
notifyForNewSearchRequest,
notifyRequestsWithDailyOption
notifyRequestsWithDailyOption,
checkUpNotify
};

View File

@@ -61,9 +61,8 @@
<p class="distinguished">
<label class="checkbox-label">
<input type="checkbox" class="filled-in" name="includeIncompleteAds"
<% if (includeIncompleteAds) { %>
checked
<% } %>>
>
<span>Uključi i oglase bez potpunih informacija</span>
</label>
</p>

View File

@@ -1,13 +1,29 @@
<div class="row center-align">
<ul class="collection with-header">
<% for(const realEstate of realEstates) { %>
<li class="collection-item">
<div><%= realEstate.title %>
<a href="<%= realEstate.url %>" class="kivi-color secondary-content">
<ul class="collection with-header">
<% for(const realEstate of realEstates) { %>
<li class="collection-item">
<% if(realEstate.adStatus === AD_STATUS.STATUS_VIP) {%>
<div>
<% //This needs to do redirecting instead of direct link to realestate
%>
<a href="/redirect/<%= realEstate.id %>" class="estates-link">
<%= realEstate.title %>
<div class="kivi-color secondary-content">
<i class="material-icons">send</i>
</a>
</div>
</li>
<% } %>
</ul>
</div>
</div>
</a>
</div>
<%} else { %>
<div>
<a href="<%= realEstate.url %>" class="estates-link">
<%= realEstate.title %>
<div class="kivi-color secondary-content">
<i class="material-icons">send</i>
</div>
</a>
</div>
<% }%>
</li>
<% } %>
</ul>
</div>

View File

@@ -1,26 +1,49 @@
<br><br>
<br /><br />
<div class="center">
<div class="preloader-wrapper big active center">
<div class="kivi-spinner-color spinner-layer spinner-green-only">
<div class="circle-clipper left">
<div class="circle"></div>
</div><div class="gap-patch">
<div class="circle"></div>
</div><div class="circle-clipper right">
<div class="circle"></div>
</div>
</div>
<div class="preloader-wrapper big active center">
<div class="kivi-spinner-color spinner-layer spinner-green-only">
<div class="circle-clipper left">
<div class="circle"></div>
</div>
<div class="gap-patch">
<div class="circle"></div>
</div>
<div class="circle-clipper right">
<div class="circle"></div>
</div>
</div>
</div>
</div>
<br>
<br />
<% if(vipAd) { %>
<div class="center">
<h6>
<a href="<%= redirectUrl %>" rel="noreferrer" id="realEstateUrl">Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski</a>
</h6>
<h6>
Ovaj oglas zahtijeva da budete član
<a href="https://prostor.ba/" rel="noreferrer">Prostor.ba</a>.
<br />
<br />
<a href="https://prostor.ba/moj-prostor/prijava" rel="noreferrer"
>Ulogujte se</a
>
ili napravite
<a href="https://prostor.ba/moj-prostor/registracija" rel="noreferrer"
>novi račun</a
>, a potom otvorite <a href="<%= redirectUrl %>" rel="noreferrer">oglas</a>.
</h6>
</div>
<% } else { %>
<div class="center">
<h6>
<a href="<%= redirectUrl %>" rel="noreferrer" id="realEstateUrl"
>Kliknite ovdje ako Vas web preglednik ne preusmjeri automatski</a
>
</h6>
</div>
<% }%>
<script>
window.onload = function() {
document.getElementById('realEstateUrl').click();
}
window.onload = function() {
document.getElementById("realEstateUrl").click();
};
</script>

View File

@@ -18,6 +18,15 @@
</div>
</div>
<br />
<p class="distinguished">
<label class="checkbox-label">
<input type="checkbox" class="filled-in" name="includeWithoutPrice"
checked
>
<span>Uključi i oglase bez cijene</span>
</label>
</p>
<br />
<div class="row center-align">

View File

@@ -8,15 +8,25 @@ SEQUELIZE_LOGGING=0- no sequelize logging, 1- log to the console
PORT=Port for the app, defaults to 5000
APP_BASE_URL=base url for the app
ENVIRONMENT=Variable to denote development, staging and production
USER_AGENT=User agent header to send in fetch requests
MAX_REAL_ESTATES_IN_EMAIL=Max number of real estates that will be shown in email, others will be truncated and URL with full list will be shwon
MAX_REAL_ESTATES_IN_FIRST_EMAIL=Max number of real estates that will be shown in first (welcome) email
CHECK_UP_DAYS=Check up email is sent after this number of days without notification
#=============== GOOGLE ANALYTICS =============#
GA_ID=Google Analytics ID
#=============== GOOGLE MAPS =============#
API_MAP_KEY=(your-key-here)
#=============== SCRAPER API SUPORT =============#
USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api
SCRAPER_API_BASE_URL= Base url without question mark (example: http://sabur.kivi.ba:1337)
#=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here)
AWS_SECRET_ACCESS_KEY=(your-key-here)
@@ -51,6 +61,8 @@ PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories t
PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!!
PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!!
PROSTOR_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
PROSTOR_LOGIN_EMAIL=Email of valid Prostor.ba account for crawling purposes
PROSTOR_LOGIN_PASS=Password of valid Prostor.ba account for crawling purposes
#==AKTIDO==
AKTIDO_MAX_PAGES=Restrict crawler to this number of pages
AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
@@ -59,3 +71,9 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
AKTIDO_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==SALJIC NEKRETNINE==
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found

View File

@@ -4,6 +4,7 @@ const bodyParser = require("body-parser");
const layout = require("express-layout");
const compression = require("compression");
const forceSSL = require("./app/helpers/forceSSL");
const { logDebug } = require("./app/helpers/log");
const {
APP_PORT,
@@ -12,6 +13,7 @@ const {
} = require("./app/config/appConfig");
const routes = require("./app/routes");
const { crawlAll } = require("./app/crawler/crawl");
const { checkUpNotify } = require("./app/services/notificationService");
const {
notifyForNewRealEstates
} = require("./app/services/notificationService");
@@ -37,12 +39,21 @@ app.listen(APP_PORT, () =>
let crawlerRunning = STOP_CRAWLER;
const crawl = () => {
logDebug("Crawl start. crawlerRunning: ", crawlerRunning);
if (!crawlerRunning) {
crawlerRunning = true;
crawlAll().then(newRealEstates => {
crawlerRunning = false;
logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
notifyForNewRealEstates(newRealEstates);
}).catch(e => {
console.error('Error happened: ', e);
}).finally(()=> {
crawlerRunning = false;
logDebug('Finally done!');
});
}
};
setInterval(crawl, CRAWLER_INTERVAL * 1000);
setInterval(checkUpNotify, 1000 * 60 * 60 * 24);

69
package-lock.json generated
View File

@@ -40,6 +40,32 @@
"@sendgrid/helpers": "^6.3.0"
}
},
"@sozialhelden/fetch-cache": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/@sozialhelden/fetch-cache/-/fetch-cache-2.0.1.tgz",
"integrity": "sha512-vMlsdT5JQCGjx1fcFxmMNh7ZKppjjsfUAeZEhhNwhEL7GaqbZXsD1OXEyx2IcRa25ZuZtvJSV6Q3rE77VRdLvg==",
"requires": {
"@sozialhelden/hamster-cache": "^1.0.0"
}
},
"@sozialhelden/hamster-cache": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@sozialhelden/hamster-cache/-/hamster-cache-1.0.0.tgz",
"integrity": "sha512-/TEGA8mdMawZp4Yq/GrkL+72YL5EGuSeVXC3pKW12YY1t3C+zCN/HZ0HRp4zWF/e67svXcxuz/B0AEQxEdvi7A=="
},
"@supercharge/goodies": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/@supercharge/goodies/-/goodies-1.4.0.tgz",
"integrity": "sha512-Np6u2qjRwiA3wTgzz4n2yduydIjSXqtJWP5cOnNqjdlCR/EUAK86LAOhEcU+YW211D1ksugns3GqpARJDoXQ7g=="
},
"@supercharge/promise-pool": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@supercharge/promise-pool/-/promise-pool-1.3.0.tgz",
"integrity": "sha512-9/EVrJevSPEqI4i/gRH8Dt7C+FQT65wRRYuu0MDaGmSLZ2aTel0jOGu8Ae84fPiQ+Ah0B80RPFUxk+K+Cz48DA==",
"requires": {
"@supercharge/goodies": "~1.4.0"
}
},
"@types/caseless": {
"version": "0.12.2",
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz",
@@ -79,6 +105,14 @@
"resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz",
"integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q=="
},
"abort-controller": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
"requires": {
"event-target-shim": "^5.0.0"
}
},
"accepts": {
"version": "1.3.5",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
@@ -1074,6 +1108,11 @@
"es5-ext": "~0.10.14"
}
},
"event-target-shim": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="
},
"events": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz",
@@ -1346,13 +1385,23 @@
"integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
},
"form-data": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
"integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-3.0.0.tgz",
"integrity": "sha512-CKMFDglpbMi6PyN+brwB9Q/GOw0eAnsrEZDgcsH5Krhz5Od/haKHAX0NmQfha2zPPz0JpWzA7GJHGSnvCRLWsg==",
"requires": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.6",
"combined-stream": "^1.0.8",
"mime-types": "^2.1.12"
},
"dependencies": {
"combined-stream": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"requires": {
"delayed-stream": "~1.0.0"
}
}
}
},
"forwarded": {
@@ -3430,6 +3479,18 @@
"tough-cookie": "~2.4.3",
"tunnel-agent": "^0.6.0",
"uuid": "^3.3.2"
},
"dependencies": {
"form-data": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
"integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
"requires": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.6",
"mime-types": "^2.1.12"
}
}
}
},
"require-directory": {

View File

@@ -8,14 +8,17 @@
"start": "node ./index.js",
"start-mon": "nodemon ./index.js",
"migrate": "cd app && npx sequelize db:migrate",
"seed": "cd app && npx sequelize db:seed:all",
"setup": "docker build -t marketalerts . && docker run -e POSTGRES_USER=docker -e POSTGRES_PASSWORD=docker -e POSTGRES_DB=marketalerts --name pg_marketalerts -d -p 5432:5432 marketalerts && sleep 10 && npm run migrate",
"docker-start": "docker start pg_marketalerts",
"docker-stop": "docker stop pg_marketalerts",
"crawl": "cd app/crawler && node npmCrawl.js",
"daily-notify": "cd app/npmScripts && node npmDailyNotify.js",
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
"test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
"test-rental-scraper": "cd test && node rentalScrapeTest.js",
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
},
"repository": {
"type": "git",
@@ -29,6 +32,9 @@
"dependencies": {
"2checkout-node": "0.0.1",
"@sendgrid/mail": "^6.3.1",
"@sozialhelden/fetch-cache": "^2.0.1",
"@supercharge/promise-pool": "^1.3.0",
"abort-controller": "^3.0.0",
"aws-sdk": "^2.422.0",
"bluebird": "^3.5.5",
"cheerio": "^1.0.0-rc.2",
@@ -39,6 +45,7 @@
"express": "^4.16.4",
"express-ejs-layouts": "^2.5.0",
"express-layout": "^0.1.0",
"form-data": "^3.0.0",
"html-to-text": "^5.1.1",
"moment": "^2.24.0",
"moment-timezone": "^0.5.26",

View File

@@ -9,7 +9,7 @@ if (urlToScrape) {
(async () => {
const data = await crawler.scrapeAd(urlToScrape);
console.log(data);
console.log("Scraped data:", data);
})();
} else {
console.log("No URL to scrape. Use like this : ");

17
test/saljicScrapeTest.js Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");
const urlToScrape = process.argv[2] || undefined;
if (urlToScrape) {
const crawler = new saljicCrawler();
(async () => {
const data = await crawler.scrapeAd(urlToScrape);
console.log("Scraped data:", data);
})();
} else {
console.log("No URL to scrape. Use like this : ");
console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}