Compare commits

..

2 Commits

Author SHA1 Message Date
Naida Vatric
f56cd5b549 More elegant scrape of lat and long. 2020-02-17 21:55:24 +01:00
Naida Vatric
addd8c1344 Saljic crawler changed substring call. 2020-02-14 23:42:19 +01:00
15 changed files with 101 additions and 157 deletions

View File

@@ -41,13 +41,6 @@ const PROSTOR_LOGIN = {
PASSWORD: process.env.PROSTOR_LOGIN_PASS
};
const USER_AGENT =
process.env.USER_AGENT ||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
module.exports = {
APP_PORT,
APP_URL,
@@ -61,8 +54,5 @@ module.exports = {
API_MAP_KEY,
STAGING,
CHECK_UP_DAYS,
PROSTOR_LOGIN,
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY
PROSTOR_LOGIN
};

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const FormData = require("form-data");
@@ -191,7 +191,13 @@ class ProstorCrawler {
const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list
const adStatus = ProstorCrawler.getStatusId(status);
//For VIP ads status = '' cannot be used, but no VIP ads are crawled
//We will make "fake" vip ad for RE that have size=55
//It is weird because yesterday it said 'VIP ponuda' ???
const adStatus =
size === "55"
? ProstorCrawler.getStatusId("VIP ponuda")
: ProstorCrawler.getStatusId(status);
const url = `https://prostor.ba${link}`;

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const Promise = require("bluebird");
const moment = require("moment-timezone");
@@ -399,9 +399,7 @@ class RentalCrawler {
);
if (!publishedDateMoment.isValid()) {
throw {
message: `Invalid published date : ${
extractedData["re_realEstates_inserted"]
}`
message: `Invalid published date : ${extractedData["re_realEstates_inserted"]}`
};
}
@@ -412,9 +410,7 @@ class RentalCrawler {
);
if (!renewedDateMoment.isValid()) {
throw {
message: `Invalid renewed date : ${
extractedData["re_realEstates_edited"]
}`
message: `Invalid renewed date : ${extractedData["re_realEstates_edited"]}`
};
}

View File

@@ -1,6 +1,6 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
@@ -218,7 +218,7 @@ class SaljicCrawler {
}
async scrapeAd(url, adType) {
// console.log("[SALJIC] Scraping : ", url);
console.log("[SALJIC] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
@@ -227,7 +227,9 @@ class SaljicCrawler {
// No information for status ex. PRODAN
const status = AD_STATUS.STATUS_NORMAL;
//Extracting agency ID from url
const agencyObjectId = parseInt(url.substring(46, url.length));
const agencyObjectId = url
? parseInt(url.substring(46, url.length))
: null;
//Extracting main properties
const propertySelectors = {
@@ -272,14 +274,10 @@ class SaljicCrawler {
.trim();
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
const latText = latAndLongSrc.substring(
latAndLongSrc.indexOf("marker=") + 7,
latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker="))
);
const longText = latAndLongSrc.substring(
latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + 3,
latAndLongSrc.length
);
const tmpLatLong = latAndLongSrc.split("marker=")[1];
const latText = tmpLatLong.split("%2C")[0];
const longText = tmpLatLong.split("%2C")[1];
const locationLat = parseFloat(latText) || null;
const locationLong = parseFloat(longText) || null;
@@ -328,11 +326,10 @@ class SaljicCrawler {
let numberOfViewsKivi = null;
let streetNumber = 0;
let adStatus = status;
let shortDescription = descriptions.substring(
0,
descriptions.indexOf(".")
);
let longDescription = descriptions;
let shortDescription = descriptions
? descriptions.substring(0, descriptions.indexOf("."))
: "";
let longDescription = descriptions || "";
//Extracting data - Glavne karakteristike
let mainFieldIndex = 1;
do {
@@ -343,10 +340,14 @@ class SaljicCrawler {
.replace(/[\n\r\t]/gm, "")
.trim();
const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
const mainFieldTitle = mainField
? mainField.substring(0, mainField.indexOf(" "))
: "";
const mainFieldValue = mainField
.substring(mainField.indexOf(" "), mainField.length)
.trim();
? mainField
.substring(mainField.indexOf(" "), mainField.length)
.trim()
: "";
switch (mainFieldTitle) {
case "Površina":

View File

@@ -332,14 +332,10 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
};
}
//When includeIncompleteAds are not defined - null it will consider it true
const order = [["updatedAt", "desc"]];
return db.RealEstate.findAll({
where:
includeIncompleteAds || includeIncompleteAds == null
? queryIncludeIncomplete
: query,
where: includeIncompleteAds ? queryIncludeIncomplete : query,
limit: maxResults,
order
});

View File

@@ -3,7 +3,6 @@ const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const { AD_CATEGORY } = require("../../common/enums");
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const getSearchRequest = async searchRequestId => {
try {
@@ -17,22 +16,6 @@ const getSearchRequest = async searchRequestId => {
const createSearchRequest = async (searchRequestFields = {}) => {
return await db.SearchRequest.create(searchRequestFields);
};
/**
 * Loads every search request whose `notifiedAt` timestamp is at least
 * CHECK_UP_DAYS days in the past.
 *
 * @returns {Promise<Array>} result of `db.SearchRequest.findAll` for that filter
 */
const findAllRequestsForCheckUp = async () => {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;
  // Anything notified at or before this instant is due for a check-up.
  const cutoff = new Date(Date.now() - MS_PER_DAY * CHECK_UP_DAYS);
  return db.SearchRequest.findAll({
    where: {
      notifiedAt: {
        [Op.lte]: cutoff
      }
    }
  });
};
const findSearchRequestsForRealEstate = async realEstate => {
const {
@@ -174,7 +157,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
} else {
// If real estate dont have defined number of rooms ex. null
//It returns requests that didn't choose number of rooms - also null
//Or ones that picked some values but also picked to includeIncomplete ads (or default)
//Or ones that picked some values but also picked to includeIncomplete ads
numberOfRoomsQuery = {
[Op.or]: [
{
@@ -193,10 +176,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
}
}
]
@@ -246,10 +226,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
}
}
]
@@ -298,10 +275,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
}
}
]
@@ -339,10 +313,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
}
}
]
@@ -376,10 +347,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
}
}
]
@@ -413,10 +381,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
}
}
]
@@ -458,13 +423,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
[Op.eq]: "ANY"
};
}
//Tag to check if incomplete ads are accepted in query which is default
//Tag to check if incomplete ads are accepted in query
if (checkForIncompleteWanted) {
query.includeIncompleteAds = {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
[Op.eq]: true
};
}
@@ -476,6 +438,5 @@ const findSearchRequestsForRealEstate = async realEstate => {
module.exports = {
getSearchRequest,
createSearchRequest,
findSearchRequestsForRealEstate,
findAllRequestsForCheckUp
findSearchRequestsForRealEstate
};

View File

@@ -2,6 +2,7 @@
const db = require("../../models/index");
const sequelize = require("sequelize");
const Op = sequelize.Op;
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const findRealEstatesForSearchRequest = async searchRequestId => {
const query = {
@@ -42,6 +43,42 @@ const findNotNotifiedMatches = async () => {
return matchingRecords;
};
/**
 * Finds subscribed search requests that are due for a check-up email.
 *
 * A request is excluded when it has at least one SearchRequestMatch row
 * created within the last CHECK_UP_DAYS days.
 *
 * @returns {Promise<Array>} result of `db.SearchRequest.findAll` for the remaining requests
 */
const findAllRequestsForCheckUp = async () => {
  // Start of the "recent" window: now minus CHECK_UP_DAYS calendar days.
  const windowStart = new Date();
  windowStart.setDate(windowStart.getDate() - CHECK_UP_DAYS);

  const recentMatches = await db.SearchRequestMatch.findAll({
    attributes: ["searchRequestId"],
    where: {
      createdAt: {
        [Op.gte]: windowStart
      }
    },
    order: [["searchRequestId", "ASC"]]
  });

  // Collapse to unique ids so the NOT IN list stays as short as possible.
  const recentIds = new Set();
  for (const match of recentMatches) {
    recentIds.add(match.dataValues.searchRequestId);
  }

  return db.SearchRequest.findAll({
    where: {
      subscribed: true,
      id: {
        [Op.notIn]: Array.from(recentIds)
      }
    }
  });
};
const addMatches = async matchingRecords => {
return await db.SearchRequestMatch.bulkCreate(matchingRecords, {
@@ -52,5 +89,6 @@ const addMatches = async matchingRecords => {
module.exports = {
findRealEstatesForSearchRequest,
addMatches,
findNotNotifiedMatches
findNotNotifiedMatches,
findAllRequestsForCheckUp
};

View File

@@ -1,21 +0,0 @@
const nodeFetch = require("node-fetch");
const {
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY
} = require("../config/appConfig");
/**
 * Wrapper around node-fetch that always sends the configured User-Agent and,
 * when enabled, routes the request through the Scraper API proxy.
 *
 * @param {string} url - target URL to fetch
 * @param {Object} [options={}] - node-fetch options; never mutated
 * @returns {Promise} whatever `nodeFetch` resolves to
 */
const fetch = async (url, options = {}) => {
  const callerOptions = options || {};
  // Build a fresh headers object: copying only `options` is shallow and would
  // write the User-Agent into the caller's own headers object.
  // NOTE(review): assumes `headers` is a plain object, as the original
  // bracket assignment did - confirm no caller passes a Headers instance.
  const headers = Object.assign({}, callerOptions["headers"], {
    "User-Agent": USER_AGENT
  });
  const newOptions = Object.assign({}, callerOptions, { headers });

  // Environment values arrive as strings, so "0" is truthy and must be
  // treated as "off" explicitly.
  const useScraperApi =
    Boolean(USE_SCRAPER_API) && String(USE_SCRAPER_API).trim() !== "0";

  // The target URL is a query-string value here; without encoding, any "&"
  // or "?" in it would be parsed as part of the proxy request itself.
  const urlAdaptedForScraping = useScraperApi
    ? `http://api.scraperapi.com/?api_key=${encodeURIComponent(
        SCRAPER_API_KEY
      )}&url=${encodeURIComponent(url)}`
    : url;

  return nodeFetch(urlAdaptedForScraping, newOptions);
};
module.exports = fetch;

View File

@@ -1,14 +0,0 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return queryInterface.addColumn("SearchRequests", "notifiedAt", {
type: Sequelize.DATE,
defaultValue: new Date()
});
},
down: (queryInterface, Sequelize) => {
return queryInterface.removeColumn("SearchRequests", "notifiedAt");
}
};

View File

@@ -15,7 +15,15 @@ module.exports = (sequelize, DataTypes) => {
allowNull: false,
defaultValue: {
type: "Polygon",
coordinates: [[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
coordinates: [
[
[0, 0],
[0, 0],
[0, 0],
[0, 0],
[0, 0]
]
],
crs: { type: "name", properties: { name: "EPSG:4326" } }
}
},
@@ -82,11 +90,7 @@ module.exports = (sequelize, DataTypes) => {
floorMin: DataTypes.INTEGER,
floorMax: DataTypes.INTEGER,
accessRoadType: DataTypes.TEXT,
heatingType: DataTypes.TEXT,
notifiedAt: {
type: DataTypes.DATE,
defaultValue: new Date()
}
heatingType: DataTypes.TEXT
});
return SearchRequest;

View File

@@ -15,10 +15,9 @@ const {
} = require("../helpers/emailContentGenerator");
const {
findNotNotifiedMatches,
findAllRequestsForCheckUp,
findRealEstatesForSearchRequest
} = require("../helpers/db/searchRequestMatch");
const { findAllRequestsForCheckUp } = require("../helpers/db/searchRequest");
const { sendEmail } = require("../services/emailService");
const notifyForNewRealEstates = async newRealEstates => {
@@ -36,7 +35,7 @@ const notifyForNewSearchRequest = async searchRequest => {
matchingRealEstates
);
const { email } = searchRequest;
//In case of the new search req, notifiedAt column is populated with default value - now (moment of creation)
await sendEmail(
email,
`${stagingTag} Kivi - novi zahtjev za pretragu`,
@@ -77,10 +76,6 @@ const notifyMatches = async (matches, dailyNotification = false) => {
sendEmailPromise.catch(err =>
console.log("[Email Sending Failed]", err)
);
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
}
}
}
@@ -136,7 +131,7 @@ const notifyRequestsWithDailyOption = async () => {
};
const checkUpNotify = async () => {
const searchRequestsForCheckUp = await findAllRequestsForCheckUp();
/* const searchRequestsForCheckUp = await findAllRequestsForCheckUp();
const asyncSendEmailActions = [];
@@ -148,12 +143,8 @@ const checkUpNotify = async () => {
const sendEmailPromise = sendEmail(email, emailSubject, emailContent);
asyncSendEmailActions.push(sendEmailPromise);
sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err));
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
}
await Promise.all(asyncSendEmailActions);
await Promise.all(asyncSendEmailActions); */
};
module.exports = {

View File

@@ -61,8 +61,9 @@
<p class="distinguished">
<label class="checkbox-label">
<input type="checkbox" class="filled-in" name="includeIncompleteAds"
<% if (includeIncompleteAds) { %>
checked
>
<% } %>>
<span>Uključi i oglase bez potpunih informacija</span>
</label>
</p>

View File

@@ -10,7 +10,6 @@ APP_BASE_URL=base url for the app
ENVIRONMENT=Variable to denote development, staging and production
USER_AGENT=User agent header to send in fetch requests
MAX_REAL_ESTATES_IN_EMAIL=Max number of real estates that will be shown in email, others will be truncated and URL with full list will be shown
MAX_REAL_ESTATES_IN_FIRST_EMAIL=Max number of real estates that will be shown in first (welcome) email
@@ -22,10 +21,6 @@ GA_ID=Google Analytics ID
#=============== GOOGLE MAPS =============#
API_MAP_KEY=(your-key-here)
#=============== SCRAPER API SUPPORT =============#
USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api
#=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here)
AWS_SECRET_ACCESS_KEY=(your-key-here)
@@ -74,4 +69,4 @@ AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found