Compare commits

...

43 Commits

Author SHA1 Message Date
Naida Vatric
a508f72d7c Merge branch 'master' into 'checkup-email-bug-fix'
# Conflicts:
#   app/services/notificationService.js
2020-02-25 15:12:47 +00:00
Naida Vatric
08ad9edfe1 Merge branch 'scraper-api-support' into 'master'
Added Scraper API option.

See merge request saburly/marketalarm/web!100
2020-02-25 15:10:39 +00:00
Naida Vatric
ce857ddce9 Renamed var. 2020-02-23 23:11:21 +01:00
Naida Vatric
148b2ea863 Changed default. 2020-02-23 16:38:40 +01:00
Naida Vatric
d436d4a37b Added Scraper API option. 2020-02-22 22:15:27 +01:00
Bilal Catic
6791a509d0 make user agent header configurable through env variable 2020-02-20 21:07:16 +01:00
Bilal Catic
edc6e2bbf7 Merge branch 'create-fetch-wrapper-with-user-agent' into 'master'
Create fetch wrapper with user agent

See merge request saburly/marketalarm/web!98
2020-02-20 19:58:32 +00:00
Bilal Catic
4f230020d7 use fetch wrapper instead of node-fetch 2020-02-20 19:49:29 +01:00
Bilal Catic
f62a7200c7 create fetch wrapper with mandatory user agent header 2020-02-20 19:47:30 +01:00
Bilal Catic
cff7cc2e9c apply prettier 2020-02-20 19:46:39 +01:00
Naida Vatric
bc7ce9d708 Changed checkup query to miliseconds. 2020-02-17 21:22:45 +01:00
Naida Vatric
df2a962d0f Merge branch 'prostor-vip-ads-fix' into 'master'
Prostor VIP ads fixed.

See merge request saburly/marketalarm/web!94
2020-02-17 14:44:58 +00:00
Naida Vatric
be4508ebea Merge branch 'include-incomplete-ads-inverse' into 'master'
Default true for include incomplete ads.

See merge request saburly/marketalarm/web!96
2020-02-17 14:44:35 +00:00
Naida Vatric
22bffc126d For staging - checkup fix. 2020-02-15 02:39:38 +01:00
Naida Vatric
06f80296f3 Changed checkup email logic. 2020-02-15 02:30:02 +01:00
Naida Vatric
81fa3f046d Default true for include incomplete ads. 2020-02-15 00:52:06 +01:00
Naida Vatric
5bdc8e149a Prostor VIP ads fixed. 2020-02-14 22:41:51 +01:00
Senad Uka
fc7fe3c0b3 Notificaton service disabled 2020-02-14 15:07:42 +01:00
Naida Vatric
b3007123a5 Merge branch 'rename-settings-var' into 'master'
Rename settings var

See merge request saburly/marketalarm/web!93
2020-02-10 20:17:08 +00:00
Naida Vatric
f7d4a9cd07 Renamed settings var to describe purpose. 2020-02-10 21:15:28 +01:00
Naida Vatric
ab6812889a Merge branch 'fixing-saljic-bugs' into 'master'
Fixing saljic bugs

See merge request saburly/marketalarm/web!92
2020-02-09 18:11:00 +00:00
Naida Vatric
b82134e280 Fixed saljic bug for heroku. 2020-02-09 19:09:00 +01:00
Naida Vatric
be378883c8 Just another fix try. 2020-02-08 00:47:00 +01:00
Naida Vatric
8a87b9e253 Another fix. 2020-02-08 00:27:26 +01:00
Naida Vatric
43bc23b164 Another fix. Defined more var. 2020-02-07 22:27:01 +01:00
Naida Vatric
fc6351af46 Added columns and logs for types. 2020-02-07 22:12:53 +01:00
Naida Vatric
6267b2cab4 Merge branch 'staging-tag-to-checkup-email' into 'master'
Added staging tag to checkup email. Email footer bug fixed.

See merge request saburly/marketalarm/web!91
2020-02-06 21:43:56 +00:00
Naida Vatric
97724a47a1 Removed debugg logs. Smal fixes. 2020-02-06 22:40:56 +01:00
Naida Vatric
91a1c6a91e Added more logs for debugging. 2020-02-06 14:12:35 +01:00
Naida Vatric
eb4ab2e341 Changed misspeling. 2020-02-05 23:02:14 +01:00
Naida Vatric
2d0a00b967 Debugging- noOfRealEstates and staging tag. 2020-02-05 21:58:45 +01:00
Naida Vatric
74def9c059 Added staging tag to checkup email. Email footer bug fixed. 2020-02-05 21:35:18 +01:00
Naida Vatric
d29b3eb1b3 Merge branch 'crawler-saljicnekretnine' into 'master'
Crawler saljicnekretnine

See merge request saburly/marketalarm/web!90
2020-02-04 13:09:10 +00:00
Naida Vatric
41b59e8c7c Merge branch 'tag-staging-email' into 'master'
Tag staging email

See merge request saburly/marketalarm/web!85
2020-02-04 13:07:53 +00:00
Naida Vatric
b933fa96d4 Merge branch 'master' into 'tag-staging-email'
# Conflicts:
#   app/config/appConfig.js
2020-02-04 13:07:43 +00:00
Naida Vatric
824db4fbc3 Merge branch 'checkup-email' into 'master'
Added check up email that everything works.

See merge request saburly/marketalarm/web!89
2020-01-31 21:59:35 +00:00
Naida Vatric
712cde1632 Changed not to use NODE_ENV variable. 2020-01-31 22:47:47 +01:00
Naida Vatric
1ba7cf8531 Added crawler for Saljic nekretnine. 2020-01-31 22:03:39 +01:00
Naida Vatric
7a7aecb3ee WIP Scraped no of rooms, floors etc. 2020-01-31 00:55:24 +01:00
Naida Vatric
78c4054cde WIP Scraped title, price and location. 2020-01-30 16:24:34 +01:00
Naida Vatric
94ffc2d6d2 WIP Started saljic crawler. 2020-01-29 23:22:39 +01:00
Naida Vatric
b11f18696f Prepared config files. 2020-01-29 01:09:53 +01:00
Naida Vatric
b9122f8f00 Added tag to email to denote staging from production. 2020-01-14 15:59:17 +01:00
20 changed files with 863 additions and 92 deletions

View File

@@ -303,7 +303,8 @@ const AD_AGENCY = {
OLX: "OLX",
RENTAL: "RENTAL",
PROSTOR: "PROSTOR",
AKTIDO: "AKTIDO"
AKTIDO: "AKTIDO",
SALJIC: "SALJIC"
};
const CRAWLER_AD_TYPE = {

View File

@@ -9,6 +9,8 @@ const APP_URL =
? process.env.APP_URL || "http://market-alarm"
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
const STAGING = process.env.ENVIRONMENT !== "production";
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
const CRAWLER_INTERVAL = parseInt(process.env.CRAWLER_INTERVAL) || 60;
@@ -39,6 +41,13 @@ const PROSTOR_LOGIN = {
PASSWORD: process.env.PROSTOR_LOGIN_PASS
};
const USER_AGENT =
process.env.USER_AGENT ||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
// Scraper API is used by default (variable unset or empty).
// Environment variables are always strings, so "0" and "false" have to be
// checked explicitly; a plain `|| 1` treats them as truthy and the option
// could never be switched off.
const USE_SCRAPER_API = !["0", "false"].includes(
  (process.env.USE_SCRAPER_API || "1").trim().toLowerCase()
);
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
module.exports = {
APP_PORT,
APP_URL,
@@ -50,6 +59,10 @@ module.exports = {
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
PRINT_CRAWLER_DEBUG,
API_MAP_KEY,
STAGING,
CHECK_UP_DAYS,
PROSTOR_LOGIN
PROSTOR_LOGIN,
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY
};

View File

@@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx");
const RentalCrawler = require("./specificCrawlers/rental");
const ProstorCrawler = require("./specificCrawlers/prostor");
const AktidoCrawler = require("./specificCrawlers/aktido");
const SaljicCrawler = require("./specificCrawlers/saljic");
const {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
AKTIDO_CONFIG
AKTIDO_CONFIG,
SALJIC_CONFIG
} = require("./crawlerConfig");
const PostgresSaver = require("./savers/postgres");
@@ -57,6 +59,15 @@ async function crawlAll() {
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
),
new SaljicCrawler(
[postgresSaver],
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
SALJIC_CONFIG.SALJIC_MAX_PAGES,
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
)
];

View File

@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
const RENTAL_CONFIG = require("./specificConfigs/rental");
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
const SALJIC_CONFIG = require("./specificConfigs/saljic");
module.exports = {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
AKTIDO_CONFIG
AKTIDO_CONFIG,
SALJIC_CONFIG
};

View File

@@ -0,0 +1,34 @@
"use strict";
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
const saljicCrawlerAdType =
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
: null;
const saljicParsedCrawlerAdCategories =
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const saljicIgnoredUsernames = [];
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
module.exports = {
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
SALJIC_MAX_RESULTS_PER_PAGE:
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
SALJIC_DELAY_BETWEEN_PAGES:
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
};

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const FormData = require("form-data");
@@ -191,13 +191,7 @@ class ProstorCrawler {
const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list
//For VIP Ads status ='' canot be used, but no VIP ads are crawled
//We will make "fake" vip ad for RE that have size=55
//It is weird because yesterday it said 'VIP ponuda' ???
const adStatus =
size === "55"
? ProstorCrawler.getStatusId("VIP ponuda")
: ProstorCrawler.getStatusId(status);
const adStatus = ProstorCrawler.getStatusId(status);
const url = `https://prostor.ba${link}`;

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("node-fetch");
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
@@ -399,7 +399,9 @@ class RentalCrawler {
);
if (!publishedDateMoment.isValid()) {
throw {
message: `Invalid published date : ${extractedData["re_realEstates_inserted"]}`
message: `Invalid published date : ${
extractedData["re_realEstates_inserted"]
}`
};
}
@@ -410,7 +412,9 @@ class RentalCrawler {
);
if (!renewedDateMoment.isValid()) {
throw {
message: `Invalid renewed date : ${extractedData["re_realEstates_edited"]}`
message: `Invalid renewed date : ${
extractedData["re_realEstates_edited"]
}`
};
}

View File

@@ -0,0 +1,630 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
// Query-string fragments that are concatenated into the saljicnekretnine.ba
// search URL. Keys are the application's own enum values, values are the
// matching site parameters.
const SALJIC_ENUMS = {
// Keyed by CRAWLER_AD_TYPE value; an empty "input_vrsta" is used for ALL
SALJIC_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
},
// Keyed by AD_CATEGORY id (not by the category object itself)
SALJIC_AD_CATEGORY: {
[AD_CATEGORY.ALL.id]: "&input_kategorija=",
[AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
[AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
[AD_CATEGORY.LAND.id]: "&input_kategorija=5", // author's note: site categories 3 and 4 are also (building) land
[AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
[AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
[AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
// No fragment is defined for AD_CATEGORY.COTTAGE
}
};
/**
 * Crawler for real-estate ads published on saljicnekretnine.ba.
 *
 * Search-result pages are walked category by category. Every ad listed on a
 * page is opened and scraped into a plain object, and each page of scraped
 * ads is handed to the first configured saver.
 */
class SaljicCrawler {
  /**
   * @param {Array} savers - Objects exposing `save(results)`; only the first one is used.
   * @param {*} crawlerAdTypes - CRAWLER_AD_TYPE value, used as key into SALJIC_ENUMS.SALJIC_AD_TYPE.
   * @param {Array} crawlerAdCategories - Keys into SALJIC_ENUMS.SALJIC_AD_CATEGORY.
   * @param {number} maxPages - Maximum number of result pages crawled per category.
   * @param {number} maxResultsPerPage - Maximum number of ads scraped from one result page.
   * @param {Array} ignoredUsernames - Stored on the instance; not read anywhere in this class.
   * @param {number} delayBetweenPages - Pause between two crawl rounds, in milliseconds.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    // NOTE(review): SALJIC_ENUMS.SALJIC_AD_CATEGORY is keyed by category id,
    // while this default holds whole category objects, so the default looks
    // like it cannot match any entry. Confirm before relying on it.
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    // categoryIndexer() reads this.maxPages; without storing it the page
    // limit could never be reached.
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Crawls all configured categories, one result page per category per round.
   *
   * A category is dropped from further rounds when its indexer is exhausted,
   * or when saving a page produced no new records and SALJIC_FORCE_CRAWL is off.
   *
   * @returns {Promise<Array>} All records the saver reported as new.
   */
  async crawl() {
    const crawlAdCategories = this.crawlerAdCategories;
    const newRealEstates = [];
    if (!crawlAdCategories) {
      return newRealEstates;
    }

    let indexGenerators = [];
    for (const adCategory of crawlAdCategories) {
      indexGenerators.push(this.categoryIndexer(adCategory));
    }

    while (indexGenerators.length > 0) {
      // Advance every remaining category by one page, in parallel
      const singlePageResults = await Promise.all(
        indexGenerators.map(indexGenerator => indexGenerator.next())
      );

      const finishedGenerators = new Set();
      for (const [
        index,
        { value: singlePageResult }
      ] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // Generator returned undefined: this category has no more pages
          finishedGenerators.add(index);
          continue;
        }
        const { newRecords } = await this.saveCrawledResults(singlePageResult);
        if (!Array.isArray(newRecords)) {
          // Nothing usable was reported by the saver; keep crawling this category
          continue;
        }
        newRealEstates.push(...newRecords);
        if (newRecords.length === 0 && !SALJIC_FORCE_CRAWL) {
          finishedGenerators.add(index);
        }
      }

      indexGenerators = indexGenerators.filter(
        (indexGenerator, index) => !finishedGenerators.has(index)
      );

      // Pause only when another round of requests will follow
      if (indexGenerators.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }
    return newRealEstates;
  }

  /**
   * Async generator yielding the scraped ads of one result page at a time for
   * a single category. Ends when a page yields no ads, when maxPages pages
   * were produced, or immediately when the ad type or category has no URL
   * fragment in SALJIC_ENUMS.
   *
   * @param {*} adCategory - Key into SALJIC_ENUMS.SALJIC_AD_CATEGORY.
   */
  async *categoryIndexer(adCategory) {
    const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
    const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory];
    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return undefined;
    }

    let pageToIndex = 1;
    while (true) {
      // "per_page" is sent as an offset: empty for the first page, then 22, 44, ...
      // (presumably 22 ads per result page; confirm against the site)
      const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11;
      const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;
      const singlePageResults = await this.indexSinglePage(
        urlPageToCrawl,
        this.maxResultsPerPage
      );
      if (!Array.isArray(singlePageResults) || singlePageResults.length === 0) {
        return undefined;
      }
      yield singlePageResults;

      ++pageToIndex;
      // Stop once exactly maxPages pages have been produced
      if (Number.isFinite(this.maxPages) && pageToIndex > this.maxPages) {
        return undefined;
      }
    }
  }

  /**
   * Loads one search-result page and scrapes the ads listed on it.
   *
   * @param {string} url - Result page URL.
   * @param {number} maxResultsPerPage - Upper bound on the number of ads scraped.
   * @returns {Promise<Array>} Scraped ads (failed ones are left out); [] on any error.
   */
  async indexSinglePage(url, maxResultsPerPage) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[SALJIC] Index page : ", url);
    }
    try {
      const res = await fetch(url);
      const body = await res.text();
      const $ = cheerio.load(body);

      // Link and ad type are read from the same product element, so they
      // cannot get out of step when one product lacks a link or a ribbon.
      const listedAds = [];
      $("#shop")
        .find(".product")
        .each((i, elem) => {
          const product = $(elem);
          const href = product
            .find("a")
            .first()
            .attr("href");
          if (!href) {
            return;
          }
          const adTypeText = product
            .find(".trakica-search-page")
            .text()
            .trim();
          listedAds.push({
            //Converting to absolute URL
            url: "https://www.saljicnekretnine.ba" + href,
            //Converting to AD_TYPE
            adType: this.getAdTypeId(adTypeText)
          });
        });

      const actualNoOfResults = Math.min(listedAds.length, maxResultsPerPage);
      const asyncScraping = [];
      for (let i = 0; i < actualNoOfResults; i++) {
        asyncScraping.push(
          this.scrapeAd(listedAds[i].url, listedAds[i].adType)
        );
      }
      const scrapedData = await Promise.all(asyncScraping);
      return scrapedData.filter(adData => !!adData);
    } catch (e) {
      console.error("[SALJIC] Exception caught:" + e);
      return [];
    }
  }

  /**
   * Loads a single ad page and extracts its data.
   *
   * @param {string} url - Absolute URL of the ad.
   * @param {*} adType - Ad type determined on the result page; passed through unchanged.
   * @returns {Promise<Object|null>} Scraped ad data, or null when scraping threw.
   */
  async scrapeAd(url, adType) {
    try {
      const adPageSource = await fetch(url);
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

      // No information for status ex. PRODAN
      const status = AD_STATUS.STATUS_NORMAL;

      //Extracting agency ID from url: everything after the first 46 characters
      //(presumably the length of the site's ad URL prefix; confirm)
      const agencyObjectId = parseInt(url.substring(46, url.length));

      //Extracting main properties
      const entrySelector =
        "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry";
      const contentSelector = `${entrySelector} > div.entry-content.topmargin`;
      const propertySelectors = {
        title: `${entrySelector} > div.entry-title > h2`,
        price: `${entrySelector} > div.topmargin-sm.single-product > div.product > div.product-price > ins`,
        streetName: `${contentSelector} > p`,
        descriptions: `${contentSelector} > div.toggle.toggle-bg > div.togglec >p:nth-child(1)`,
        latAndLong: `${contentSelector} > div.gmap.bottommargin > iframe`
      };

      const title = $(propertySelectors.title)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .replace(/ {1,}/g, " ")
        .trim();

      const priceText = $(propertySelectors.price)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .replace(/ {1,}/g, " ")
        .trim();
      // "CIJENA NA UPIT" means the price is available on request only.
      // Otherwise the first 8 and last 3 characters are cut off and every
      // comma is removed ("1,250,000" has more than one).
      const price =
        priceText === "CIJENA NA UPIT"
          ? null
          : parseFloat(
              priceText.substring(8, priceText.length - 3).replace(/,/g, "")
            );

      const streetName = $(propertySelectors.streetName)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .trim();

      const descriptions = $(propertySelectors.descriptions)
        .text()
        .replace(/\"/g, "")
        .trim();

      // Coordinates come from the "marker=<lat>%2C<long>" part of the map
      // iframe URL. An ad without a map keeps null coordinates instead of
      // being dropped because of a thrown TypeError.
      const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
      let locationLat = null;
      let locationLong = null;
      if (latAndLongSrc) {
        const markerIndex = latAndLongSrc.indexOf("marker=");
        const separatorIndex = latAndLongSrc.indexOf("%2C", markerIndex);
        const latText = latAndLongSrc.substring(
          markerIndex + 7,
          separatorIndex
        );
        const longText = latAndLongSrc.substring(
          separatorIndex + 3,
          latAndLongSrc.length
        );
        locationLat = parseFloat(latText) || null;
        locationLong = parseFloat(longText) || null;
      }

      //====== DETAIL INFORMATION FIELDS ==========
      let area = null,
        gardenSize = null,
        numberOfRooms = null,
        numberOfFloors = null,
        floor = null,
        accessRoadType = null,
        heatingType = null,
        furnishingType = null,
        balcony = null,
        newBuilding = null,
        elevator = null,
        water = null,
        electricity = null,
        drainageSystem = null,
        registeredInZkBooks = null,
        recentlyAdapted = null,
        parking = null,
        garage = null,
        gas = null,
        antiTheftDoor = null,
        airCondition = null,
        phoneConnection = null,
        cableTV = null,
        internet = null,
        basementAttic = null,
        storeRoom = null,
        videoSurveillance = null,
        alarm = null,
        suitableForStudents = null,
        includingBills = null,
        animalsAllowed = null,
        pool = null,
        exchange = null,
        urbanPlanPermit = null,
        buildingPermit = null,
        utilityConnection = null,
        distanceToRiver = null;

      let publishedDate = null;
      let renewedDate = null;
      let realEstateType;
      const numberOfViewsAgency = null;
      const numberOfViewsKivi = null;
      const streetNumber = 0;
      const adStatus = status;
      // Text up to the first "."; empty when the description contains none
      const shortDescription = descriptions.substring(
        0,
        descriptions.indexOf(".")
      );
      const longDescription = descriptions;

      //Extracting data - Glavne karakteristike (main characteristics)
      //List items are read one by one until an item yields no title.
      let mainFieldIndex = 1;
      do {
        const mainFieldSelector = `${contentSelector} > div.col-md-12.bottommargin > ul > li.list-group-item:nth-child(${mainFieldIndex})`;
        const mainField = $(mainFieldSelector)
          .text()
          .replace(/[\n\r\t]/gm, "")
          .trim();
        // NOTE(review): the title is the text before the FIRST space, so the
        // multi-word labels below ("Broj soba", "Dostupno od", ...) can only
        // match if the page separates their words with something other than
        // a plain space. Verify against the live markup.
        const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
        const mainFieldValue = mainField
          .substring(mainField.indexOf(" "), mainField.length)
          .trim();

        switch (mainFieldTitle) {
          case "Površina":
            area = parseFloat(
              mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
            );
            break;
          case "Okućnica":
            gardenSize = parseFloat(
              mainFieldValue.substring(0, mainFieldValue.indexOf(" "))
            );
            break;
          case "Broj soba":
            numberOfRooms = parseInt(mainFieldValue);
            break;
          case "Broj spratova":
            numberOfFloors = parseInt(mainFieldValue);
            break;
          case "Sprat":
            floor = parseInt(mainFieldValue);
            break;
          case "Godina renoviranja":
            recentlyAdapted = true;
            break;
          case "Broj parking mjesta":
            parking = true;
            break;
          case "Dostupno od": {
            // Value is read as DD?MM?YYYY and rebuilt as MM/DD/YYYY for Date
            const day = mainFieldValue.substring(0, 2);
            const month = mainFieldValue.substring(3, 5);
            const year = mainFieldValue.substring(6, mainFieldValue.length);
            publishedDate = new Date(`${month}/${day}/${year}`);
            break;
          }
          default:
            break;
        }

        if (mainFieldTitle === "") {
          break;
        }
        mainFieldIndex++;
      } while (true);

      //Extracting data - Sadrzaji (amenities)
      //The first item carries the category; the rest are amenity flags.
      let additionalFieldIndex = 1;
      do {
        const additionalFieldSelector = `${contentSelector} > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`;
        const additionalField = $(additionalFieldSelector)
          .text()
          .trim();

        if (additionalFieldIndex === 1) {
          //Extracting data of real estate type
          const categoryTmp = additionalField
            .replace(/[\n\r\t]/gm, "")
            .substring(
              additionalField.indexOf("Kategorija") + 10,
              additionalField.length
            )
            .trim();
          realEstateType = this.getAdCategoryId(categoryTmp);
        } else {
          switch (additionalField) {
            case "Internet":
              internet = true;
              break;
            case "Garaža":
              garage = true;
              break;
            case "Klima":
              airCondition = true;
              break;
            case "Balkon":
              balcony = true;
              break;
            case "Ostava":
              storeRoom = true;
              break;
            case "Podrum":
              basementAttic = true;
              break;
            case "Blindirana vrata":
              antiTheftDoor = true;
              break;
            case "Voda":
              water = true;
              break;
            case "Kablovska":
              cableTV = true;
              break;
            case "Uknjiženo":
              registeredInZkBooks = true;
              break;
            case "Grijanje - centralno":
              heatingType = HEATING_TYPE.CENTRAL_CITY.id;
              break;
            case "Grijanje - plin":
              heatingType = HEATING_TYPE.GAS.id;
              break;
            case "Grijanje - struja":
              heatingType = HEATING_TYPE.ELECTRICITY.id;
              break;
            case "Grijanje":
              heatingType = HEATING_TYPE.OTHER.id;
              break;
            case "Plin":
              gas = true;
              break;
            case "Namješten":
              furnishingType = FURNISHING_TYPE.FURNISHED.id;
              break;
            case "Alarm":
              alarm = true;
              break;
            case "Video nadzor":
              videoSurveillance = true;
              break;
            case "Lift":
              elevator = true;
              break;
            case "Novogradnja":
              newBuilding = true;
              break;
            default:
              break;
          }
        }

        if (additionalField === "") {
          break;
        }
        additionalFieldIndex++;
      } while (true);

      //If no published date it takes current date of crawling
      if (publishedDate) {
        renewedDate = new Date();
      } else {
        publishedDate = new Date();
        renewedDate = new Date();
      }

      const originAgencyName = AD_AGENCY.SALJIC;
      // Location breakdown is not scraped; these fields are always empty
      const locality = "";
      const municipality = "";
      const city = "";
      const region = "";
      const entity = "";
      const country = "";

      const data = {
        url,
        agencyObjectId,
        originAgencyName,
        realEstateType,
        adType,
        title,
        price,
        area,
        gardenSize,
        shortDescription,
        longDescription,
        streetNumber,
        streetName,
        locality,
        municipality,
        city,
        region,
        entity,
        country,
        locationLat,
        locationLong,
        adStatus,
        publishedDate,
        renewedDate,
        numberOfRooms,
        numberOfFloors,
        floor,
        accessRoadType,
        heatingType,
        furnishingType,
        balcony,
        newBuilding,
        elevator,
        water,
        electricity,
        drainageSystem,
        registeredInZkBooks,
        recentlyAdapted,
        parking,
        garage,
        gas,
        antiTheftDoor,
        airCondition,
        phoneConnection,
        cableTV,
        internet,
        basementAttic,
        storeRoom,
        videoSurveillance,
        alarm,
        suitableForStudents,
        includingBills,
        animalsAllowed,
        pool,
        exchange,
        urbanPlanPermit,
        buildingPermit,
        utilityConnection,
        distanceToRiver,
        numberOfViewsAgency,
        numberOfViewsKivi
      };
      return data;
    } catch (e) {
      console.error("Exception caught: " + e.message, "\r\nURL:", url);
    }
    return null;
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Maps the category label shown on an ad page to an AD_CATEGORY id.
   *
   * @param {string} categoryText - Category label as scraped.
   * @returns {*} AD_CATEGORY id, or undefined for an unknown label.
   */
  getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "Stan":
        return AD_CATEGORY.FLAT.id;
      case "Građevinsko zemljiste":
      case "Industrijsko zemljiste":
      case "Poljoprivredno zemljiste":
        return AD_CATEGORY.LAND.id;
      case "Kuća":
        return AD_CATEGORY.HOUSE.id;
      case "Poslovni prostor":
      case "Kancelarije":
        return AD_CATEGORY.OFFICE.id;
      case "Apartmani":
        return AD_CATEGORY.APARTMENT.id;
      case "Garaža":
        return AD_CATEGORY.GARAGE.id;
      case "Vikendica":
        return AD_CATEGORY.COTTAGE.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the ribbon text of a listed ad to an AD_TYPE string id.
   *
   * @param {string} adTypeText - "PRODAJA" (sale) or "NAJAM" (rent).
   * @returns {*} AD_TYPE string id, or undefined for any other text.
   */
  getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "PRODAJA":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "NAJAM":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return undefined;
    }
  }

  /**
   * Resolves after the given number of milliseconds.
   *
   * @param {number} ms - Delay in milliseconds.
   */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Passes one page of scraped ads to the first saver and returns its result.
   * Only savers[0] is used so that the saver's return value (which crawl()
   * reads `newRecords` from) is available to the caller.
   *
   * @param {Array} results - Scraped ads of one result page.
   */
  async saveCrawledResults(results) {
    const savers = this.savers;
    return savers[0].save(results);
  }
}
module.exports = SaljicCrawler;

View File

@@ -332,10 +332,14 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
//When includeIncompleteAds is not defined (null or undefined), it is treated as true
const order = [["updatedAt", "desc"]];
return db.RealEstate.findAll({
where: includeIncompleteAds ? queryIncludeIncomplete : query,
where:
includeIncompleteAds || includeIncompleteAds == null
? queryIncludeIncomplete
: query,
limit: maxResults,
order
});

View File

@@ -3,6 +3,7 @@ const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const { AD_CATEGORY } = require("../../common/enums");
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const getSearchRequest = async searchRequestId => {
try {
@@ -16,6 +17,22 @@ const getSearchRequest = async searchRequestId => {
const createSearchRequest = async (searchRequestFields = {}) => {
return await db.SearchRequest.create(searchRequestFields);
};
// Returns every search request whose notifiedAt is at least CHECK_UP_DAYS days
// in the past (compared against the current time).
const findAllRequestsForCheckUp = async () => {
  const millisecondsPerDay = 24 * 60 * 60 * 1000;
  // Latest notifiedAt value that still qualifies for a check-up
  const threshold = new Date(Date.now() - millisecondsPerDay * CHECK_UP_DAYS);
  const requests = await db.SearchRequest.findAll({
    where: {
      notifiedAt: { [Op.lte]: threshold }
    }
  });
  return requests;
};
const findSearchRequestsForRealEstate = async realEstate => {
const {
@@ -157,7 +174,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
} else {
// If real estate dont have defined number of rooms ex. null
//It returns requests that didn't choose number of rooms - also null
//Or ones that picked some values but also picked to includeIncomplete ads
//Or ones that picked some values but also picked to includeIncomplete ads (or default)
numberOfRoomsQuery = {
[Op.or]: [
{
@@ -176,7 +193,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
@@ -226,7 +246,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
@@ -275,7 +298,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
@@ -313,7 +339,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
@@ -347,7 +376,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
@@ -381,7 +413,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
@@ -423,10 +458,13 @@ const findSearchRequestsForRealEstate = async realEstate => {
[Op.eq]: "ANY"
};
}
//Tag to check if incomplete ads are accepted in query
//Tag to check if incomplete ads are accepted in query which is default
if (checkForIncompleteWanted) {
query.includeIncompleteAds = {
[Op.eq]: true
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
};
}
@@ -438,5 +476,6 @@ const findSearchRequestsForRealEstate = async realEstate => {
module.exports = {
getSearchRequest,
createSearchRequest,
findSearchRequestsForRealEstate
findSearchRequestsForRealEstate,
findAllRequestsForCheckUp
};

View File

@@ -2,7 +2,6 @@
const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const findRealEstatesForSearchRequest = async searchRequestId => {
const query = {
@@ -43,42 +42,6 @@ const findNotNotifiedMatches = async () => {
return matchingRecords;
};
const findAllRequestsForCheckUp = async () => {
//First we find IDs of search request that don't need to be emailed for check up - to EXCLUDE
//The ones that received notification for real estate CHECK_UP_DAYS days from now
const date = new Date();
const checkUpDate = date.getDate() - CHECK_UP_DAYS;
date.setDate(checkUpDate);
const dateQuery = {
createdAt: {
[Op.gte]: date
}
};
const excludedMatches = await db.SearchRequestMatch.findAll({
attributes: ["searchRequestId"],
where: dateQuery,
order: [["searchRequestId", "ASC"]]
});
const excludedRequestsAll = excludedMatches.map(match => {
return match.dataValues.searchRequestId;
});
//Removing duplicate search request id-s for optimization
const excludedRequests = [...new Set(excludedRequestsAll)];
const query = {
subscribed: true,
id: {
[Op.notIn]: excludedRequests
}
};
const allRequestsForCheckUp = await db.SearchRequest.findAll({
where: query
});
return allRequestsForCheckUp;
};
const addMatches = async matchingRecords => {
return await db.SearchRequestMatch.bulkCreate(matchingRecords, {
@@ -89,6 +52,5 @@ const addMatches = async matchingRecords => {
module.exports = {
findRealEstatesForSearchRequest,
addMatches,
findNotNotifiedMatches,
findAllRequestsForCheckUp
findNotNotifiedMatches
};

View File

@@ -1,8 +1,15 @@
"use strict";
const { MAX_REAL_ESTATES_IN_EMAIL, APP_URL } = require("../config/appConfig");
const {
MAX_REAL_ESTATES_IN_EMAIL,
APP_URL,
STAGING
} = require("../config/appConfig");
const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums");
//Tag to distinguish staging (any non-production environment) emails from production ones
const stagingTag = STAGING ? "[STAGING] " : "";
const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => {
return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div>
<div>Ako želite prestati dobijati obavještenja za ovu pretragu, <a href="${APP_URL}/odjava/${searchRequestId}">odjavite ovdje</a></div>
@@ -54,7 +61,7 @@ const generateNotificationEmail = (
const messageBody = dailyNotification ? dailyMessageBody : asapMessageBody;
return `<h3>Zdravo</h3>
return `<h3>${stagingTag}Zdravo</h3>
<h4>${messageBody}</h4>
<div>
${realEstateLinks}
@@ -113,7 +120,7 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
return `<h3>Zdravo</h3>
return `<h3>${stagingTag}Zdravo</h3>
<div>Naručili ste da Vam javimo ako se nekretnina sa navedenim uslovima pojavi u oglasima:</div>
<br/>
<div>
@@ -130,7 +137,7 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
if (numberOfRealEstates === 1) {
return `Kivi: ${singleRealEstateTitle}`;
return `${stagingTag}Kivi: ${singleRealEstateTitle}`;
}
const leastSignificantDigit = numberOfRealEstates % 10;
@@ -138,7 +145,7 @@ const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
const secondLeastSignificantDigit = numberWithoutLastDigit % 10;
if (leastSignificantDigit === 1 && secondLeastSignificantDigit !== 1) {
return `Kivi : ${numberOfRealEstates} nova nekretnina`;
return `${stagingTag}Kivi : ${numberOfRealEstates} nova nekretnina`;
}
if (
@@ -146,10 +153,10 @@ const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
leastSignificantDigit <= 4 &&
secondLeastSignificantDigit !== 1
) {
return `Kivi: ${numberOfRealEstates} nove nekretnine`;
return `${stagingTag}Kivi: ${numberOfRealEstates} nove nekretnine`;
}
return `Kivi: ${numberOfRealEstates} novih nekretnina`;
return `${stagingTag}Kivi: ${numberOfRealEstates} novih nekretnina`;
};
const generateCheckUpEmail = searchRequest => {
@@ -164,13 +171,23 @@ const generateCheckUpEmail = searchRequest => {
priceMax
} = searchRequest;
let emailFrequencyTitle;
switch (searchRequest.emailFrequency) {
case EMAIL_FREQUENCY.ASAP.stringId:
emailFrequencyTitle = EMAIL_FREQUENCY.ASAP.title;
break;
case EMAIL_FREQUENCY.DAILY.stringId:
emailFrequencyTitle = EMAIL_FREQUENCY.DAILY.title;
break;
}
const gardenSize = realEstateType.hasGardenSize
? `<div><strong>Kvadratura okućnice: Od ${gardenSizeMin} do ${gardenSizeMax} m2</strong></div>`
: ``;
const emailFooter = generateEmailFooter(id);
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
return `<h3>Zdravo</h3>
return `<h3>${stagingTag}Zdravo</h3>
<div><strong>Kivi tim traži nekretnine za Vas i kada to ne vidite.</strong></div>
<br />
<div>Vaša trenutno aktivna pretraga je:</div>

View File

@@ -0,0 +1,21 @@
const nodeFetch = require("node-fetch");
const {
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY
} = require("../config/appConfig");
/**
 * Wrapper around node-fetch that always sends the configured User-Agent
 * header and, when USE_SCRAPER_API is truthy, routes the request through the
 * Scraper API endpoint instead of hitting the target URL directly.
 *
 * @param {string} url - Address of the resource to fetch.
 * @param {Object} [options={}] - Options forwarded to node-fetch. Neither the
 *   object nor its `headers` are mutated.
 * @returns {Promise} The promise returned by node-fetch for the adapted URL.
 */
const fetch = async (url, options = {}) => {
  // Copy both the options and the nested headers: a shallow copy alone would
  // share the caller's headers object and leak the User-Agent back into it.
  // NOTE(review): assumes `headers` is a plain object (not a Headers instance).
  const newOptions = Object.assign({}, options);
  newOptions.headers = Object.assign({}, newOptions.headers, {
    "User-Agent": USER_AGENT
  });

  if (!USE_SCRAPER_API) {
    return nodeFetch(url, newOptions);
  }

  // The target URL travels as a query-string value, so it must be encoded;
  // otherwise its own "?", "&" or "#" would be parsed as part of the proxy URL.
  const apiKeyParam = encodeURIComponent(SCRAPER_API_KEY);
  const targetUrlParam = encodeURIComponent(url);
  const urlAdaptedForScraping = `http://api.scraperapi.com/?api_key=${apiKeyParam}&url=${targetUrlParam}`;

  return nodeFetch(urlAdaptedForScraping, newOptions);
};
module.exports = fetch;

View File

@@ -0,0 +1,14 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return queryInterface.addColumn("SearchRequests", "notifiedAt", {
type: Sequelize.DATE,
defaultValue: new Date()
});
},
down: (queryInterface, Sequelize) => {
return queryInterface.removeColumn("SearchRequests", "notifiedAt");
}
};

View File

@@ -15,15 +15,7 @@ module.exports = (sequelize, DataTypes) => {
allowNull: false,
defaultValue: {
type: "Polygon",
coordinates: [
[
[0, 0],
[0, 0],
[0, 0],
[0, 0],
[0, 0]
]
],
coordinates: [[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
crs: { type: "name", properties: { name: "EPSG:4326" } }
}
},
@@ -90,7 +82,11 @@ module.exports = (sequelize, DataTypes) => {
floorMin: DataTypes.INTEGER,
floorMax: DataTypes.INTEGER,
accessRoadType: DataTypes.TEXT,
heatingType: DataTypes.TEXT
heatingType: DataTypes.TEXT,
notifiedAt: {
type: DataTypes.DATE,
defaultValue: new Date()
}
});
return SearchRequest;

View File

@@ -1,4 +1,8 @@
"use strict";
const { STAGING } = require("../config/appConfig");
const stagingTag = STAGING ? "[STAGING] " : "";
const {
matchRealEstates,
matchSearchRequest
@@ -11,9 +15,10 @@ const {
} = require("../helpers/emailContentGenerator");
const {
findNotNotifiedMatches,
findAllRequestsForCheckUp,
findRealEstatesForSearchRequest
} = require("../helpers/db/searchRequestMatch");
const { findAllRequestsForCheckUp } = require("../helpers/db/searchRequest");
const { sendEmail } = require("../services/emailService");
const notifyForNewRealEstates = async newRealEstates => {
@@ -26,13 +31,17 @@ const notifyForNewSearchRequest = async searchRequest => {
const searchRequestId = searchRequest.id;
const matchingRealEstates = matches[searchRequestId].realEstates;
const emailContent = generateNewSearchRequestEmail(
searchRequest,
matchingRealEstates
);
const { email } = searchRequest;
await sendEmail(email, "Kivi - novi zahtjev za pretragu", emailContent);
//In case of the new search req, notifiedAt column is populated with default value - now (moment of creation)
await sendEmail(
email,
`${stagingTag} Kivi - novi zahtjev za pretragu`,
emailContent
);
};
const notifyMatches = async (matches, dailyNotification = false) => {
@@ -68,6 +77,10 @@ const notifyMatches = async (matches, dailyNotification = false) => {
sendEmailPromise.catch(err =>
console.log("[Email Sending Failed]", err)
);
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
}
}
}
@@ -129,12 +142,16 @@ const checkUpNotify = async () => {
for (const searchRequest of searchRequestsForCheckUp) {
const { email } = searchRequest.dataValues;
const emailSubject = `Kivi: Mi tražimo nekretnine za vas!`;
const emailSubject = `${stagingTag}Kivi: Mi tražimo nekretnine za vas!`;
const emailContent = generateCheckUpEmail(searchRequest.dataValues);
const sendEmailPromise = sendEmail(email, emailSubject, emailContent);
asyncSendEmailActions.push(sendEmailPromise);
sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err));
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
}
await Promise.all(asyncSendEmailActions);
};

View File

@@ -61,9 +61,8 @@
<p class="distinguished">
<label class="checkbox-label">
<input type="checkbox" class="filled-in" name="includeIncompleteAds"
<% if (includeIncompleteAds) { %>
checked
<% } %>>
>
<span>Uključi i oglase bez potpunih informacija</span>
</label>
</p>

View File

@@ -8,6 +8,10 @@ SEQUELIZE_LOGGING=0- no sequelize logging, 1- log to the console
PORT=Port for the app, defaults to 5000
APP_BASE_URL=base url for the app
ENVIRONMENT=Variable to denote development, staging and production
USER_AGENT=User agent header to send in fetch requests
MAX_REAL_ESTATES_IN_EMAIL=Max number of real estates that will be shown in email, others will be truncated and URL with full list will be shown
MAX_REAL_ESTATES_IN_FIRST_EMAIL=Max number of real estates that will be shown in first (welcome) email
@@ -18,6 +22,10 @@ GA_ID=Google Analytics ID
#=============== GOOGLE MAPS =============#
API_MAP_KEY=(your-key-here)
#=============== SCRAPER API SUPPORT =============#
USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api
#=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here)
AWS_SECRET_ACCESS_KEY=(your-key-here)
@@ -62,3 +70,8 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==SALJIC NEKRETNINE==
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found