Compare commits

..

4 Commits

Author SHA1 Message Date
Naida Vatric
d7fcb2a278 Merge branch 'master' after user-agent change into email-density 2020-02-21 14:26:33 +01:00
Naida Vatric
6bad24d735 New query for search req search. 2020-02-21 14:25:10 +01:00
Naida Vatric
7302edceec Changed queries logic again. 2020-02-18 15:04:26 +01:00
Naida Vatric
bd33a6b80e Logs for query check. 2020-02-17 23:24:55 +01:00
23 changed files with 175 additions and 623 deletions

View File

@@ -45,11 +45,6 @@ const USER_AGENT =
process.env.USER_AGENT || process.env.USER_AGENT ||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"; "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
const USE_SCRAPER_API = process.env.USE_SCRAPER_API === undefined ? 1 : parseInt(process.env.USE_SCRAPER_API);
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
const SCRAPER_API_BASE_URL = process.env.SCRAPER_API_BASE_URL || "";
const NODE_FETCH_TIMEOUT_MS = parseInt(process.env.NODE_FETCH_TIMEOUT_MS) || 60000
module.exports = { module.exports = {
APP_PORT, APP_PORT,
APP_URL, APP_URL,
@@ -64,9 +59,5 @@ module.exports = {
STAGING, STAGING,
CHECK_UP_DAYS, CHECK_UP_DAYS,
PROSTOR_LOGIN, PROSTOR_LOGIN,
USER_AGENT, USER_AGENT
USE_SCRAPER_API,
SCRAPER_API_KEY,
SCRAPER_API_BASE_URL,
NODE_FETCH_TIMEOUT_MS
}; };

View File

@@ -10,7 +10,6 @@ const RentalCrawler = require("./specificCrawlers/rental");
const ProstorCrawler = require("./specificCrawlers/prostor"); const ProstorCrawler = require("./specificCrawlers/prostor");
const AktidoCrawler = require("./specificCrawlers/aktido"); const AktidoCrawler = require("./specificCrawlers/aktido");
const SaljicCrawler = require("./specificCrawlers/saljic"); const SaljicCrawler = require("./specificCrawlers/saljic");
const { logDebug } = require("../helpers/log");
const { const {
OLX_CONFIG, OLX_CONFIG,
@@ -76,9 +75,7 @@ async function crawlAll() {
for (const crawler of crawlers) { for (const crawler of crawlers) {
try { try {
logDebug('Starting crawler: ', crawler);
const newRealEstatesFromSingleCrawler = await crawler.crawl(); const newRealEstatesFromSingleCrawler = await crawler.crawl();
logDebug('Crawler done: ', crawler);
if (Array.isArray(newRealEstatesFromSingleCrawler)) { if (Array.isArray(newRealEstatesFromSingleCrawler)) {
newRealEstates.push(...newRealEstatesFromSingleCrawler); newRealEstates.push(...newRealEstatesFromSingleCrawler);
} }

View File

@@ -159,7 +159,7 @@ class AktidoCrawler {
} }
try { try {
const res = await fetch(url, {}, false); const res = await fetch(url);
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let hrefs = []; let hrefs = [];
@@ -202,10 +202,6 @@ class AktidoCrawler {
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
const mapElementParent = $(".box-map").parent(); const mapElementParent = $(".box-map").parent();
const scriptElement = $("script", mapElementParent); const scriptElement = $("script", mapElementParent);
if ( if (

View File

@@ -1,7 +1,6 @@
"use strict"; "use strict";
const fetch = require("../../helpers/fetchWrapper"); const fetch = require("../../helpers/fetchWrapper");
const { logDebug } = require("../../helpers/log");
const cheerio = require("cheerio"); const cheerio = require("cheerio");
const Promise = require("bluebird"); const Promise = require("bluebird");
const moment = require("moment-timezone"); const moment = require("moment-timezone");
@@ -45,16 +44,6 @@ const OLX_ENUMS = {
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
/**
 * Split an array into consecutive slices of at most `size` elements.
 *
 * @param {Array} array - Source array; it is not modified.
 * @param {number} [size=10] - Maximum length of each slice; must be a positive integer.
 * @returns {Array<Array>} Slices in original order; the last one may be shorter.
 *   An empty input yields an empty array.
 * @throws {RangeError} If `size` is not a positive integer.
 */
const chunk = (array, size = 10) => {
  // A zero or negative step never advances the cursor (endless loop), and a
  // NaN / fractional / non-numeric step yields malformed slices, so reject
  // anything that is not a positive integer up front.
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const result = [];
  for (let start = 0; start < array.length; start += size) {
    result.push(array.slice(start, start + size));
  }
  return result;
};
class OlxCrawler { class OlxCrawler {
constructor( constructor(
savers = [], savers = [],
@@ -63,7 +52,7 @@ class OlxCrawler {
maxPages = 1000, maxPages = 1000,
maxResultsPerPage = 100, maxResultsPerPage = 100,
ignoredUsernames = [], ignoredUsernames = [],
delayBetweenPages = 500 delayBetweenPages = 1000
) { ) {
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
@@ -76,7 +65,6 @@ class OlxCrawler {
} }
async crawl() { async crawl() {
logDebug("Starting OLX crawl");
const crawlAdCategories = this.crawlerAdCategories; const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = []; const newRealEstates = [];
@@ -100,32 +88,14 @@ class OlxCrawler {
const entries = singlePageResults.entries(); const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) { for (const [index, { value: singlePageResult }] of entries) {
if (PRINT_CRAWLER_DEBUG) {
console.log("================================");
console.log("Category Indexer index : ", index);
}
if (singlePageResult) { if (singlePageResult) {
console.log("\tTotal entries : ", singlePageResult.length)
const saveResults = await this.saveCrawledResults(singlePageResult); const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords, existingRecords } = saveResults; const { newRecords, existingRecords } = saveResults;
if (PRINT_CRAWLER_DEBUG) {
console.log("--------------------------");
console.log("\tNew record URLs [", newRecords.length, "] :");
for(const newRecord of newRecords) {
console.log("\t\t",newRecord.url);
}
console.log("\t-------------------------");
console.log("\tExisting record URLs [", existingRecords.length, "] :");
}
newRealEstates.push(...newRecords); newRealEstates.push(...newRecords);
for (const existingRecord of existingRecords) { for (const existingRecord of existingRecords) {
const { publishedDate, renewedDate, url } = existingRecord; const { publishedDate, renewedDate } = existingRecord;
const publishedDateMoment = moment.utc(publishedDate); const publishedDateMoment = moment.utc(publishedDate);
const renewedDateMoment = moment.utc(renewedDate); const renewedDateMoment = moment.utc(renewedDate);
@@ -135,25 +105,13 @@ class OlxCrawler {
"minute" "minute"
); );
if (PRINT_CRAWLER_DEBUG) {
console.log("\t\t", url);
console.log("\t\t\tPublished date : ", publishedDate);
console.log("\t\t\tRenewed date : ", renewedDate);
console.log("\t\t\tIs same (up to minute) : ", stopCrawlingThisCategory);
}
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) { if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
generatorsToRemove[index] = true; generatorsToRemove[index] = true;
if (PRINT_CRAWLER_DEBUG) { // console.log("\tGenerator ", index + 1, "has no more new ads");
console.log("\t\t\tStopping this category indexer");
}
break; break;
} }
} }
} else { } else {
if (PRINT_CRAWLER_DEBUG) {
console.log("\tNo more entries in this category, stopping!");
}
//Generator returned undefined, remove this generator from array //Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true; generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages"); // console.log("Generator ", index + 1, "has no more pages");
@@ -178,36 +136,31 @@ class OlxCrawler {
} }
async *categoryIndexer(adCategory) { async *categoryIndexer(adCategory) {
try { let pageToIndex = 1;
let pageToIndex = 1;
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
while (true) { while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage( const singlePageResults = await this.indexSinglePage(
urlPageToCrawl, urlPageToCrawl,
this.maxResultsPerPage this.maxResultsPerPage
); );
await this.sleep(this.delayBetweenPages); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults;
yield singlePageResults; } else {
} else { return undefined;
return undefined; }
}
++pageToIndex;
++pageToIndex; if (pageToIndex === this.maxPages) {
if (pageToIndex === this.maxPages) { return undefined;
return undefined;
}
} }
} else {
return undefined;
} }
} catch (e) { } else {
console.log('Error inside generator: ', e); return undefined;
} }
} }
@@ -217,10 +170,8 @@ class OlxCrawler {
} }
try { try {
const res = await fetch(url, {}, false); const res = await fetch(url);
logDebug("Got category results for: ", url);
const body = await res.text(); const body = await res.text();
logDebug("Got category results text for: ", url);
const $ = cheerio.load(body); const $ = cheerio.load(body);
let hrefs = []; let hrefs = [];
@@ -241,44 +192,26 @@ class OlxCrawler {
const asyncScraping = []; const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) { for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push(hrefs[i]); asyncScraping.push(this.scrapeAd(hrefs[i]));
} }
const allChunks = chunk(asyncScraping, 2); const scrapedData = await Promise.all(asyncScraping);
const dataResults = [] const filteredScrapedData = scrapedData.filter(adData => !!adData);
for (let i = 0; i < allChunks.length; i++) {
const singleChunk = allChunks[i];
const promises = singleChunk.map(c => this.scrapeAd(c))
const chunkResults = await Promise.all(promises);
await this.sleep(this.delayBetweenPages);
dataResults.push(...chunkResults);
logDebug("Chunk results len:", chunkResults.length);
}
const filteredScrapedData = dataResults.filter(adData => !!adData);
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
return filteredScrapedData; return filteredScrapedData;
} catch (e) { } catch (e) {
console.error("Exception caught, index single page: " + e); console.error("Exception caught:" + e);
return []; return [];
} }
} }
async scrapeAd(url) { async scrapeAd(url) {
logDebug("Scraping : ", url); // console.log("Scraping : ", url);
try { try {
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let status = AD_STATUS.STATUS_NORMAL; let status = AD_STATUS.STATUS_NORMAL;
if (body.indexOf('<html') === -1) {
console.error("This is the body: ", body);
throw { message: 'Failed to fetch page !' }
}
const propertySelectors = { const propertySelectors = {
username: username:
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
@@ -305,25 +238,37 @@ class OlxCrawler {
//====== PRICE DETECTION AND EXTRACTION ===== //====== PRICE DETECTION AND EXTRACTION =====
let price = null; let price = null;
const normalPriceValue = $("#pc > p:nth-child(2)").text();
const urgentPriceValue = $(
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
)
.text()
.trim();
const priceHeader = $("#pc > p.n").text().trim(); if (normalPriceValue && normalPriceValue.length > 0) {
const priceValue = $("#pc > p:nth-child(2)").text().trim(); price = normalPriceValue;
price = priceValue; if (
$("#pc > p.n")
if (priceHeader.indexOf('Hitn') !== -1) { .text()
// Urgent price .indexOf("Hitna") !== -1
status = AD_STATUS.STATUS_URGENT; ) {
} status = AD_STATUS.STATUS_URGENT;
} else {
const discountPriceTag = $("#artikal_glavni_div > div.artikal_lijevo > p:nth-child(4)").text().trim(); status = AD_STATUS.STATUS_NORMAL;
if (discountPriceTag.indexOf('Akcij') !== -1) { }
status = AD_STATUS.STATUS_DISCOUNTED; } else if (urgentPriceValue && urgentPriceValue.length > 0) {
const discountPriceValues = $("#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p").text().trim(); const priceValues = urgentPriceValue.split("KM");
// discountPriceValues contain string like "10.000 KM 7.500 KM" //priceValues will contain values like ["100000", "90000", ...], second element is urgent price
// First price is regular, second is currently active (discounted) price if (priceValues.length > 1) {
const bothPrices = discountPriceValues.split('KM'); price = priceValues[1].trim();
// Now, currently active price is second element of bothPrices array status = AD_STATUS.STATUS_DISCOUNTED;
price = bothPrices[1] ? bothPrices[1].trim() : null; } else {
throw { message: "Can't find urgent price" };
}
} else {
throw {
message: "Can't find price (it is not normal nor urgent price ?)"
};
} }
//====== OTHER AD INFORMATION =============== //====== OTHER AD INFORMATION ===============
@@ -333,7 +278,7 @@ class OlxCrawler {
let otherInformationDivId; let otherInformationDivId;
//We need to locate DIV ID where other information are stored //We need to locate DIV ID where other information are stored
for (let possibleId = 1; possibleId <= 30; possibleId++) { for (let possibleId = 10; possibleId <= 20; possibleId++) {
const adTypeFieldTitle = $( const adTypeFieldTitle = $(
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
) )
@@ -705,12 +650,10 @@ class OlxCrawler {
distanceToRiver, distanceToRiver,
numberOfViewsAgency numberOfViewsAgency
}; };
//
//console.log("Scraped data:", data);
return data; return data;
} catch (e) { } catch (e) {
console.error("Exception caught scrapeAd : " + e.message, "\r\nURL:", url); console.error("Exception caught: " + e.message, "\r\nURL:", url);
} }
return null; return null;
} }
@@ -825,9 +768,6 @@ class OlxCrawler {
if (!priceText) { if (!priceText) {
return NaN; return NaN;
} }
if (priceText === "Po dogovoru") {
return null;
}
const formattedPriceText = priceText.replace(".", "").replace(",", "."); const formattedPriceText = priceText.replace(".", "").replace(",", ".");
return parseFloat(formattedPriceText); return parseFloat(formattedPriceText);
} }

View File

@@ -4,7 +4,6 @@ const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio"); const cheerio = require("cheerio");
const moment = require("moment-timezone"); const moment = require("moment-timezone");
const FormData = require("form-data"); const FormData = require("form-data");
const nodeFetch = require("node-fetch");
const { const {
AD_TYPE, AD_TYPE,
@@ -63,19 +62,13 @@ class ProstorCrawler {
async crawl() { async crawl() {
const crawlAdCategories = this.crawlerAdCategories; const crawlAdCategories = this.crawlerAdCategories;
const crawlAdTypes = this.crawlerAdTypes;
if (!crawlAdCategories || !crawlAdTypes) {
return []
}
const newRealEstates = [];
//We need session cookie to use login privileges //We need session cookie to use login privileges
const prostorCookie = await this.getCookies(); const prostorCookie = await this.getCookies();
//New tag to check if crawler logged in //New tag to check if crawler loged in
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie); const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
const newRealEstates = [];
//Crawl only if login was successful //Crawl only if login was successful
if (login) { if (crawlAdCategories && login) {
const indexGenerators = []; const indexGenerators = [];
for (const adCategory of crawlAdCategories) { for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie)); indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
@@ -141,11 +134,6 @@ class ProstorCrawler {
prostorCookie prostorCookie
); );
if (!Array.isArray(listOfAllRealEstates)){
console.log('[PROSTOR] Could not find real estate JSON data, check selector !');
return undefined;
}
let elementToStartIndexFrom = 0; let elementToStartIndexFrom = 0;
while (true) { while (true) {
const realEstatesForSinglePage = listOfAllRealEstates.slice( const realEstatesForSinglePage = listOfAllRealEstates.slice(
@@ -209,16 +197,12 @@ class ProstorCrawler {
// console.log("[PROSTOR] Scraping : ", url); // console.log("[PROSTOR] Scraping : ", url);
try { try {
const adPageSource = await nodeFetch(url, { const adPageSource = await fetch(url, {
headers: { Cookie: prostorCookie } headers: { Cookie: prostorCookie }
}); });
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
// link contains part of the URL in the format of : /prodaja/stan/stup/9556 // link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID // general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
@@ -443,14 +427,14 @@ class ProstorCrawler {
} }
try { try {
const res = await nodeFetch(url, { const res = await fetch(url, {
headers: { Cookie: prostorCookie } headers: { Cookie: prostorCookie }
}); });
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
const scriptElement = $( const scriptElement = $(
"body > div.content > div.container-fluid > script:nth-child(6)" "body > div > div.container-fluid > script:nth-child(7)"
); );
if ( if (
@@ -607,7 +591,7 @@ class ProstorCrawler {
formData.append("email", PROSTOR_LOGIN.EMAIL); formData.append("email", PROSTOR_LOGIN.EMAIL);
formData.append("password", PROSTOR_LOGIN.PASSWORD); formData.append("password", PROSTOR_LOGIN.PASSWORD);
return nodeFetch("https://prostor.ba/moj-prostor/prijava", { return fetch("https://prostor.ba/moj-prostor/prijava", {
method: "POST", method: "POST",
body: formData, body: formData,
headers: { Cookie: prostorCookie } headers: { Cookie: prostorCookie }
@@ -634,12 +618,9 @@ class ProstorCrawler {
}); });
} }
async getCookies() { async getCookies() {
const getResponse = await nodeFetch( const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", {
"https://prostor.ba/moj-prostor/prijava", headers: { Cookie: "" }
{ });
headers: { Cookie: "" }
}
);
const raw = getResponse.headers.raw()["set-cookie"]; const raw = getResponse.headers.raw()["set-cookie"];
const cookie = raw const cookie = raw
.map(datastring => { .map(datastring => {

View File

@@ -159,7 +159,7 @@ class RentalCrawler {
} }
try { try {
const res = await fetch(url, {} , false); const res = await fetch(url);
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let hrefs = []; let hrefs = [];
@@ -202,10 +202,6 @@ class RentalCrawler {
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
const mapElementParent = $(".box-map").parent(); const mapElementParent = $(".box-map").parent();
const scriptElement = $("script", mapElementParent); const scriptElement = $("script", mapElementParent);
if ( if (

View File

@@ -1,10 +1,8 @@
"use strict"; "use strict";
const fetch = require("../../helpers/fetchWrapper"); const fetch = require("../../helpers/fetchWrapper");
const { getUrlParams } = require("../../helpers/url");
const cheerio = require("cheerio"); const cheerio = require("cheerio");
const moment = require("moment-timezone"); const moment = require("moment-timezone");
const PromisePool = require('@supercharge/promise-pool');
const { const {
AD_TYPE, AD_TYPE,
@@ -48,13 +46,12 @@ class SaljicCrawler {
maxPages = 5000, maxPages = 5000,
maxResultsPerPage = 5000, maxResultsPerPage = 5000,
ignoredUsernames = [], ignoredUsernames = [],
delayBetweenPages = 500 delayBetweenPages = 1000
) { ) {
this.savers = savers; this.savers = savers;
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search"; this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories; this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages
this.maxResultsPerPage = maxResultsPerPage; this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages; this.delayBetweenPages = delayBetweenPages;
} }
@@ -160,7 +157,7 @@ class SaljicCrawler {
} }
try { try {
const res = await fetch(url, {}, false); const res = await fetch(url);
const body = await res.text(); const body = await res.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
let hrefs = []; let hrefs = [];
@@ -208,23 +205,11 @@ class SaljicCrawler {
const asyncScraping = []; const asyncScraping = [];
for (let i = 0; i < actualNoOfResults; i++) { for (let i = 0; i < actualNoOfResults; i++) {
asyncScraping.push([hrefsAbs[i], adTypes[i]]); asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i]));
} }
const scrapedData = await Promise.all(asyncScraping);
const filteredScrapedData = scrapedData.filter(adData => !!adData);
const dataResults = []
const { scrapedData, errors } = await PromisePool
.withConcurrency(2)
.for(asyncScraping)
.process(async data => {
const result = await this.scrapeAd(...data)
await this.sleep(this.delayBetweenPages);
dataResults.push(result)
return result; //TODO: this does not work, scrapedData is null, dataResults works
})
const filteredScrapedData = dataResults.filter(adData => !!adData);
return filteredScrapedData; return filteredScrapedData;
} catch (e) { } catch (e) {
console.error("[SALJIC] Exception caught:" + e); console.error("[SALJIC] Exception caught:" + e);
@@ -239,20 +224,10 @@ class SaljicCrawler {
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
if (body.indexOf('<html') === -1) {
throw { message: 'Failed to fetch page !' }
}
// No information for status ex. PRODAN // No information for status ex. PRODAN
const status = AD_STATUS.STATUS_NORMAL; const status = AD_STATUS.STATUS_NORMAL;
//Extracting agency ID from url //Extracting agency ID from url
const agencyObjectId = url const agencyObjectId = parseInt(url.substring(46, url.length));
? parseInt(url.substring(46, url.length))
: null;
if (!agencyObjectId) {
throw { message : 'No agency object ID - URL changed?'}
}
//Extracting main properties //Extracting main properties
const propertySelectors = { const propertySelectors = {
@@ -262,10 +237,11 @@ class SaljicCrawler {
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
streetName: streetName:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
descriptions: descriptions:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
latAndLong: latAndLong:
"iframe" "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
}; };
const title = $(propertySelectors.title) const title = $(propertySelectors.title)
.text() .text()
@@ -296,26 +272,14 @@ class SaljicCrawler {
.trim(); .trim();
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
let latText; const latText = latAndLongSrc.substring(
let longText; latAndLongSrc.indexOf("marker=") + 7,
if (latAndLongSrc){ latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker="))
const mapParams = getUrlParams(latAndLongSrc); );
if (mapParams) { const longText = latAndLongSrc.substring(
if (mapParams['marker']){ latAndLongSrc.indexOf("%2C", latAndLongSrc.indexOf("marker=")) + 3,
const marker = mapParams['marker'].split(','); latAndLongSrc.length
latText = marker[0] ? marker[0] : undefined; );
longText = marker[1] ? marker[1] : undefined;
}else{
if (mapParams['mlat']) {
latText = mapParams['mlat'];
}
if (mapParams['mlon']) {
longText = mapParams['mlon'];
}
}
}
}
const locationLat = parseFloat(latText) || null; const locationLat = parseFloat(latText) || null;
const locationLong = parseFloat(longText) || null; const locationLong = parseFloat(longText) || null;
@@ -364,10 +328,11 @@ class SaljicCrawler {
let numberOfViewsKivi = null; let numberOfViewsKivi = null;
let streetNumber = 0; let streetNumber = 0;
let adStatus = status; let adStatus = status;
let shortDescription = descriptions let shortDescription = descriptions.substring(
? descriptions.substring(0, descriptions.indexOf(".")) 0,
: ""; descriptions.indexOf(".")
let longDescription = descriptions || ""; );
let longDescription = descriptions;
//Extracting data - Glavne karakteristike //Extracting data - Glavne karakteristike
let mainFieldIndex = 1; let mainFieldIndex = 1;
do { do {
@@ -378,14 +343,10 @@ class SaljicCrawler {
.replace(/[\n\r\t]/gm, "") .replace(/[\n\r\t]/gm, "")
.trim(); .trim();
const mainFieldTitle = mainField const mainFieldTitle = mainField.substring(0, mainField.indexOf(" "));
? mainField.substring(0, mainField.indexOf(" "))
: "";
const mainFieldValue = mainField const mainFieldValue = mainField
? mainField .substring(mainField.indexOf(" "), mainField.length)
.substring(mainField.indexOf(" "), mainField.length) .trim();
.trim()
: "";
switch (mainFieldTitle) { switch (mainFieldTitle) {
case "Površina": case "Površina":
@@ -402,7 +363,7 @@ class SaljicCrawler {
numberOfRooms = parseInt(mainFieldValue); numberOfRooms = parseInt(mainFieldValue);
break; break;
case "Broj spratova": case "Broj spratova":
numberOfFloors = this.parseNumberOfFloors(mainFieldValue); numberOfFloors = parseInt(mainFieldValue);
break; break;
case "Sprat": case "Sprat":
floor = parseInt(mainFieldValue); floor = parseInt(mainFieldValue);
@@ -448,9 +409,6 @@ class SaljicCrawler {
) )
.trim(); .trim();
realEstateType = this.getAdCategoryId(categoryTmp); realEstateType = this.getAdCategoryId(categoryTmp);
if (!realEstateType) {
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
}
} else { } else {
switch (additionalField) { switch (additionalField) {
case "Internet": case "Internet":
@@ -607,7 +565,7 @@ class SaljicCrawler {
return data; return data;
} catch (e) { } catch (e) {
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url); console.error("Exception caught: " + e.message, "\r\nURL:", url);
} }
return null; return null;
} }
@@ -652,21 +610,6 @@ class SaljicCrawler {
} }
} }
parseNumberOfFloors(numberOfFloorsText) {
const tryNumericalValue = parseInt(numberOfFloorsText);
if (!isNaN(tryNumericalValue)){
return tryNumericalValue;
}
// Guess number of floors based on number of + sign concatenations
// e.g. P+S+Pt -> 3 floors
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
return numberOfFloorsText.split('+').length + 1
}
return null
}
async sleep(ms) { async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }

View File

@@ -3,7 +3,6 @@ const db = require("../../models/index");
const sequelize = require("sequelize"); const sequelize = require("sequelize");
const Op = sequelize.Op; const Op = sequelize.Op;
const { AD_CATEGORY } = require("../../common/enums"); const { AD_CATEGORY } = require("../../common/enums");
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const getSearchRequest = async searchRequestId => { const getSearchRequest = async searchRequestId => {
try { try {
@@ -17,22 +16,6 @@ const getSearchRequest = async searchRequestId => {
const createSearchRequest = async (searchRequestFields = {}) => { const createSearchRequest = async (searchRequestFields = {}) => {
return await db.SearchRequest.create(searchRequestFields); return await db.SearchRequest.create(searchRequestFields);
}; };
/**
 * Find all search requests whose `notifiedAt` timestamp is at or before
 * "now minus CHECK_UP_DAYS days".
 *
 * @returns {Promise<*>} Whatever `db.SearchRequest.findAll` resolves with for
 *   that date filter.
 */
const findAllRequestsForCheckUp = async () => {
  const millisecondsPerDay = 24 * 60 * 60 * 1000;
  // Cut-off moment: requests last notified at or before this date qualify.
  const cutoffDate = new Date(Date.now() - millisecondsPerDay * CHECK_UP_DAYS);

  const requestsDueForCheckUp = await db.SearchRequest.findAll({
    where: {
      notifiedAt: { [Op.lte]: cutoffDate }
    }
  });
  return requestsDueForCheckUp;
};
const findSearchRequestsForRealEstate = async realEstate => { const findSearchRequestsForRealEstate = async realEstate => {
const { const {
@@ -173,33 +156,11 @@ const findSearchRequestsForRealEstate = async realEstate => {
}; };
} else { } else {
// If real estate dont have defined number of rooms ex. null // If real estate dont have defined number of rooms ex. null
//It returns requests that didn't choose number of rooms - also null //It returns all search requests except for ones that dont want incpomlete ads
//Or ones that picked some values but also picked to includeIncomplete ads (or default)
numberOfRoomsQuery = { numberOfRoomsQuery = {
[Op.or]: [ includeIncompleteAds: {
{ [Op.ne]: false
[Op.and]: [ }
{
numberOfRoomsMin: {
[Op.is]: null
}
},
{
numberOfRoomsMax: {
[Op.is]: null
}
}
]
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}; };
} }
} }
@@ -229,30 +190,9 @@ const findSearchRequestsForRealEstate = async realEstate => {
}; };
} else { } else {
numberOfFloorsQuery = { numberOfFloorsQuery = {
[Op.or]: [ includeIncompleteAds: {
{ [Op.ne]: false
[Op.and]: [ }
{
numberOfFloorsMin: {
[Op.is]: null
}
},
{
numberOfFloorsMax: {
[Op.is]: null
}
}
]
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}; };
} }
} }
@@ -281,30 +221,9 @@ const findSearchRequestsForRealEstate = async realEstate => {
}; };
} else { } else {
floorQuery = { floorQuery = {
[Op.or]: [ includeIncompleteAds: {
{ [Op.ne]: false
[Op.and]: [ }
{
floorMin: {
[Op.is]: null
}
},
{
floorMax: {
[Op.is]: null
}
}
]
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}; };
} }
} }
@@ -313,7 +232,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
//If the user doesn't check a checkbox (e.g. elevator), it doesn't mean they only want listings without an elevator //If the user doesn't check a checkbox (e.g. elevator), it doesn't mean they only want listings without an elevator
//If real estate characteristic = true, find all requests (ones that want the characteristic or don't care) - no query needed //If real estate characteristic = true, find all requests (ones that want the characteristic or don't care) - no query needed
//If real estate characteristic = false, find all requests except the ones that want the characteristic to be true //If real estate characteristic = false, find all requests except the ones that want the characteristic to be true
//If real estate characteristic = null, we don't know if it is true or false: find requests that don't care, or that want the characteristic and also want incomplete ads //If real estate characteristic = null, we don't know if it is true or false: find all requests except the ones that don't want incomplete ads
let balconyQuery = {}; let balconyQuery = {};
if (realEstateTypeObject.hasBalconyProp && balcony !== true) { if (realEstateTypeObject.hasBalconyProp && balcony !== true) {
if (balcony === false) { if (balcony === false) {
@@ -324,30 +243,9 @@ const findSearchRequestsForRealEstate = async realEstate => {
}; };
} else if (balcony === null) { } else if (balcony === null) {
balconyQuery = { balconyQuery = {
[Op.or]: [ includeIncompleteAds: {
{ [Op.ne]: false
balcony: { }
[Op.ne]: true
}
},
{
[Op.and]: [
{
balcony: {
[Op.eq]: true
}
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}
]
}; };
} }
} }
@@ -361,30 +259,9 @@ const findSearchRequestsForRealEstate = async realEstate => {
}; };
} else if (newBuilding === null) { } else if (newBuilding === null) {
newBuildingQuery = { newBuildingQuery = {
[Op.or]: [ includeIncompleteAds: {
{ [Op.ne]: false
newBuilding: { }
[Op.ne]: true
}
},
{
[Op.and]: [
{
newBuilding: {
[Op.eq]: true
}
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}
]
}; };
} }
} }
@@ -398,33 +275,13 @@ const findSearchRequestsForRealEstate = async realEstate => {
}; };
} else if (elevator === null) { } else if (elevator === null) {
elevatorQuery = { elevatorQuery = {
[Op.or]: [ includeIncompleteAds: {
{ [Op.ne]: false
elevator: { }
[Op.ne]: true
}
},
{
[Op.and]: [
{
elevator: {
[Op.eq]: true
}
},
{
includeIncompleteAds: {
[Op.or]: {
[Op.eq]: true,
[Op.is]: null
}
}
}
]
}
]
}; };
} }
} }
//General query consists of each individual query //General query consists of each individual query
const query = { const query = {
adType, adType,
@@ -476,6 +333,5 @@ const findSearchRequestsForRealEstate = async realEstate => {
module.exports = { module.exports = {
getSearchRequest, getSearchRequest,
createSearchRequest, createSearchRequest,
findSearchRequestsForRealEstate, findSearchRequestsForRealEstate
findAllRequestsForCheckUp
}; };

View File

@@ -2,6 +2,7 @@
const db = require("../../models/index"); const db = require("../../models/index");
const sequelize = require("sequelize"); const sequelize = require("sequelize");
const Op = sequelize.Op; const Op = sequelize.Op;
const { CHECK_UP_DAYS } = require("../../config/appConfig");
const findRealEstatesForSearchRequest = async searchRequestId => { const findRealEstatesForSearchRequest = async searchRequestId => {
const query = { const query = {
@@ -42,6 +43,42 @@ const findNotNotifiedMatches = async () => {
return matchingRecords; return matchingRecords;
}; };
/**
 * Collects the subscribed search requests that are due for a check-up email.
 *
 * A request is left out when a SearchRequestMatch row was created for it on or
 * after the cutoff date (now minus CHECK_UP_DAYS days).
 *
 * @returns {Promise<Array>} Result of SearchRequest.findAll for rows with
 *   `subscribed: true` whose id is not among the recently matched ones.
 */
const findAllRequestsForCheckUp = async () => {
  // Cutoff = now minus CHECK_UP_DAYS days; setDate() rolls over month/year boundaries.
  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - CHECK_UP_DAYS);

  // Matches created since the cutoff identify the requests to EXCLUDE.
  const recentMatches = await db.SearchRequestMatch.findAll({
    attributes: ["searchRequestId"],
    where: { createdAt: { [Op.gte]: cutoff } },
    order: [["searchRequestId", "ASC"]]
  });

  // A Set keeps a single entry per search request id (first-seen order).
  const recentlyMatchedIds = new Set();
  for (const { dataValues } of recentMatches) {
    recentlyMatchedIds.add(dataValues.searchRequestId);
  }

  return db.SearchRequest.findAll({
    where: {
      subscribed: true,
      id: { [Op.notIn]: Array.from(recentlyMatchedIds) }
    }
  });
};
const addMatches = async matchingRecords => { const addMatches = async matchingRecords => {
return await db.SearchRequestMatch.bulkCreate(matchingRecords, { return await db.SearchRequestMatch.bulkCreate(matchingRecords, {
@@ -52,5 +89,6 @@ const addMatches = async matchingRecords => {
module.exports = { module.exports = {
findRealEstatesForSearchRequest, findRealEstatesForSearchRequest,
addMatches, addMatches,
findNotNotifiedMatches findNotNotifiedMatches,
findAllRequestsForCheckUp
}; };

View File

@@ -9,7 +9,6 @@ const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums");
//Tag to recognize staging from development //Tag to recognize staging from development
const stagingTag = STAGING ? "[STAGING] " : ""; const stagingTag = STAGING ? "[STAGING] " : "";
const wordOfMouthRequest = `Molimo vas <strong>recite svojim prijateljima</strong> za Kivi - što više korisnika budemo imali, moći ćemo više agencija uključiti i više nekretnina imati u bazi. Hvala!`
const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => { const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => {
return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div> return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div>
@@ -70,9 +69,6 @@ const generateNotificationEmail = (
${moreRealEstates} ${moreRealEstates}
</div> </div>
<br/> <br/>
${wordOfMouthRequest}
<br/>
<br/>
${emailFooter}`; ${emailFooter}`;
}; };
@@ -136,10 +132,6 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
</div> </div>
${matchingRealEstates.length > 0 ? instantRealEstatesText : ""} ${matchingRealEstates.length > 0 ? instantRealEstatesText : ""}
<br/> <br/>
<br/>
${wordOfMouthRequest}
<br/>
<br/>
${emailFooter}`; ${emailFooter}`;
}; };

View File

@@ -1,58 +1,13 @@
const nodeFetch = require("node-fetch"); const nodeFetch = require("node-fetch");
const AbortController = require('abort-controller'); const { USER_AGENT } = require("../config/appConfig");
const FetchCache = require('@sozialhelden/fetch-cache').default;
console.log("Fc ", FetchCache)
const {
USER_AGENT,
USE_SCRAPER_API,
SCRAPER_API_KEY,
SCRAPER_API_BASE_URL,
NODE_FETCH_TIMEOUT_MS
} = require("../config/appConfig");
/**
 * Returns a promise that is fulfilled once a setTimeout of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay handed to setTimeout.
 * @returns {Promise<undefined>} Fulfilled with no value when the timer fires.
 */
const timeout = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
// Module-level FetchCache instance; it performs its requests through nodeFetch.
const fetchCache = new FetchCache({
fetch: nodeFetch,
cacheOptions: {
// Don't save more than 10000 responses in the cache. Allows infinite responses by default
maximalItemCount: 10000,
// How should the cache pick responses to evict when it's full?
evictExceedingItemsBy: 'age', // Valid values: 'lru' or 'age'
defaultTTL: 6 * 60 * 60 * 1000 // 6 hours, expressed in milliseconds
// ...see https://github.com/sozialhelden/hamster-cache for all possible options
},
});
const fetch = async (url, options = {}, useCache = true) => {
const controller = new AbortController();
const fetch = async (url, options = {}) => {
const newOptions = Object.assign({}, options); const newOptions = Object.assign({}, options);
if (!newOptions["headers"]) { if (!newOptions["headers"]) {
newOptions["headers"] = {}; newOptions["headers"] = {};
} }
newOptions["headers"]["User-Agent"] = USER_AGENT;
newOptions.signal = controller.signal; return nodeFetch(url, newOptions);
// newOptions["headers"]["User-Agent"] = USER_AGENT;
let urlToFetchThroughAPI = Buffer.from(url).toString('base64');
if (SCRAPER_API_BASE_URL.includes('scraperapi')) {
urlToFetchThroughAPI = url;
}
const urlAdaptedForScraping = USE_SCRAPER_API
? `${SCRAPER_API_BASE_URL}?api_key=${SCRAPER_API_KEY}&url=${urlToFetchThroughAPI}`
: url;
const result = useCache ? fetchCache.fetch(urlAdaptedForScraping, newOptions) : nodeFetch(urlAdaptedForScraping, newOptions);
const timeoutId = setTimeout(() => controller.abort(), NODE_FETCH_TIMEOUT_MS);
return result;
}; };
module.exports = fetch; module.exports = fetch;

View File

@@ -1,13 +0,0 @@
const {
PRINT_CRAWLER_DEBUG
} = require("../config/appConfig");
/**
 * Forwards its arguments to console.log, but only while PRINT_CRAWLER_DEBUG is truthy.
 *
 * @param {...*} args - Values to print.
 * @returns {undefined}
 */
const logDebug = (...args) => {
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }
  console.log(...args);
};
module.exports = {
logDebug
};

View File

@@ -7,26 +7,6 @@ const currentSearchRequest = async req => {
return await getSearchRequest(searchRequestId); return await getSearchRequest(searchRequestId);
}; };
/**
 * Parses the query string of a URL into a plain object of decoded key/value pairs.
 *
 * - Everything after the first "?" up to a following "#" (if any) is the query.
 * - Pairs are separated by "&"; key and value are split on the FIRST "=" so
 *   values may themselves contain "=".
 * - A key without "=" maps to the empty string; empty segments are ignored.
 * - Keys and values are percent-decoded; text with a malformed escape sequence
 *   is kept as-is instead of throwing.
 * - When a key repeats, the last occurrence wins.
 *
 * @param {string} url - URL (absolute or relative) to inspect.
 * @returns {Object<string, string>|undefined} Parsed parameters, or `undefined`
 *   when `url` is not a non-empty string or contains no "?".
 */
const getUrlParams = function (url) {
  if (typeof url !== "string" || url.length === 0) {
    return undefined;
  }

  const questionMarkIndex = url.indexOf("?");
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // decodeURIComponent throws URIError on malformed escapes (e.g. "%E0%A4%A");
  // a bad parameter should not make the whole parse fail.
  const safeDecode = text => {
    try {
      return decodeURIComponent(text);
    } catch (err) {
      return text;
    }
  };

  // Only a "#" AFTER the "?" ends the query, so hash-routed URLs such as
  // "/#/page?a=1" keep working.
  const hashIndex = url.indexOf("#", questionMarkIndex);
  const queryEnd = hashIndex === -1 ? url.length : hashIndex;
  const query = url.substring(questionMarkIndex + 1, queryEnd);

  const params = {};
  for (const segment of query.split("&")) {
    if (segment === "") {
      continue;
    }
    const equalsIndex = segment.indexOf("=");
    const rawKey = equalsIndex === -1 ? segment : segment.substring(0, equalsIndex);
    const rawValue = equalsIndex === -1 ? "" : segment.substring(equalsIndex + 1);
    params[safeDecode(rawKey)] = safeDecode(rawValue);
  }
  return params;
};
module.exports = { module.exports = {
currentSearchRequest, currentSearchRequest
getUrlParams
}; };

View File

@@ -1,14 +0,0 @@
"use strict";
module.exports = {
up: (queryInterface, Sequelize) => {
return queryInterface.addColumn("SearchRequests", "notifiedAt", {
type: Sequelize.DATE,
defaultValue: new Date()
});
},
down: (queryInterface, Sequelize) => {
return queryInterface.removeColumn("SearchRequests", "notifiedAt");
}
};

View File

@@ -16,7 +16,7 @@ config.logging = parseInt(process.env.SEQUELIZE_LOGGING) ? console.log : false;
let sequelize; let sequelize;
if (config.use_env_variable) { if (config.use_env_variable) {
sequelize = new Sequelize(process.env[config.use_env_variable] + "?ssl=true", config); sequelize = new Sequelize(process.env[config.use_env_variable], config);
} else { } else {
sequelize = new Sequelize( sequelize = new Sequelize(
config.database, config.database,

View File

@@ -82,11 +82,7 @@ module.exports = (sequelize, DataTypes) => {
floorMin: DataTypes.INTEGER, floorMin: DataTypes.INTEGER,
floorMax: DataTypes.INTEGER, floorMax: DataTypes.INTEGER,
accessRoadType: DataTypes.TEXT, accessRoadType: DataTypes.TEXT,
heatingType: DataTypes.TEXT, heatingType: DataTypes.TEXT
notifiedAt: {
type: DataTypes.DATE,
defaultValue: new Date()
}
}); });
return SearchRequest; return SearchRequest;

View File

@@ -15,10 +15,9 @@ const {
} = require("../helpers/emailContentGenerator"); } = require("../helpers/emailContentGenerator");
const { const {
findNotNotifiedMatches, findNotNotifiedMatches,
findAllRequestsForCheckUp,
findRealEstatesForSearchRequest findRealEstatesForSearchRequest
} = require("../helpers/db/searchRequestMatch"); } = require("../helpers/db/searchRequestMatch");
const { findAllRequestsForCheckUp } = require("../helpers/db/searchRequest");
const { sendEmail } = require("../services/emailService"); const { sendEmail } = require("../services/emailService");
const notifyForNewRealEstates = async newRealEstates => { const notifyForNewRealEstates = async newRealEstates => {
@@ -36,7 +35,7 @@ const notifyForNewSearchRequest = async searchRequest => {
matchingRealEstates matchingRealEstates
); );
const { email } = searchRequest; const { email } = searchRequest;
//In case of the new search req, notifiedAt column is populated with default value - now (moment of creation)
await sendEmail( await sendEmail(
email, email,
`${stagingTag} Kivi - novi zahtjev za pretragu`, `${stagingTag} Kivi - novi zahtjev za pretragu`,
@@ -77,10 +76,6 @@ const notifyMatches = async (matches, dailyNotification = false) => {
sendEmailPromise.catch(err => sendEmailPromise.catch(err =>
console.log("[Email Sending Failed]", err) console.log("[Email Sending Failed]", err)
); );
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
} }
} }
} }
@@ -148,12 +143,8 @@ const checkUpNotify = async () => {
const sendEmailPromise = sendEmail(email, emailSubject, emailContent); const sendEmailPromise = sendEmail(email, emailSubject, emailContent);
asyncSendEmailActions.push(sendEmailPromise); asyncSendEmailActions.push(sendEmailPromise);
sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err)); sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err));
//Change time of notified At for searchReq
searchRequest.notifiedAt = new Date();
searchRequest.save();
} }
await Promise.all(asyncSendEmailActions);*/ await Promise.all(asyncSendEmailActions); */
}; };
module.exports = { module.exports = {

View File

@@ -22,11 +22,6 @@ GA_ID=Google Analytics ID
#=============== GOOGLE MAPS =============# #=============== GOOGLE MAPS =============#
API_MAP_KEY=(your-key-here) API_MAP_KEY=(your-key-here)
#=============== SCRAPER API SUPPORT =============#
USE_SCRAPER_API= To turn it on (1) or off (0)
SCRAPER_API_KEY= Key for Scraper api
SCRAPER_API_BASE_URL= Base url without question mark (example: http://sabur.kivi.ba:1337)
#=============== AWS SDK EMAIL SETTINGS =======# #=============== AWS SDK EMAIL SETTINGS =======#
AWS_KEY_ID=(your-key-here) AWS_KEY_ID=(your-key-here)
AWS_SECRET_ACCESS_KEY=(your-key-here) AWS_SECRET_ACCESS_KEY=(your-key-here)
@@ -72,7 +67,6 @@ AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==SALJIC NEKRETNINE== #==SALJIC NEKRETNINE==
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values

View File

@@ -4,7 +4,6 @@ const bodyParser = require("body-parser");
const layout = require("express-layout"); const layout = require("express-layout");
const compression = require("compression"); const compression = require("compression");
const forceSSL = require("./app/helpers/forceSSL"); const forceSSL = require("./app/helpers/forceSSL");
const { logDebug } = require("./app/helpers/log");
const { const {
APP_PORT, APP_PORT,
@@ -39,17 +38,11 @@ app.listen(APP_PORT, () =>
let crawlerRunning = STOP_CRAWLER; let crawlerRunning = STOP_CRAWLER;
const crawl = () => { const crawl = () => {
logDebug("Crawl start. crawlerRunning: ", crawlerRunning);
if (!crawlerRunning) { if (!crawlerRunning) {
crawlerRunning = true; crawlerRunning = true;
crawlAll().then(newRealEstates => { crawlAll().then(newRealEstates => {
logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
notifyForNewRealEstates(newRealEstates);
}).catch(e => {
console.error('Error happened: ', e);
}).finally(()=> {
crawlerRunning = false; crawlerRunning = false;
logDebug('Finally done!'); notifyForNewRealEstates(newRealEstates);
}); });
} }
}; };

39
package-lock.json generated
View File

@@ -40,32 +40,6 @@
"@sendgrid/helpers": "^6.3.0" "@sendgrid/helpers": "^6.3.0"
} }
}, },
"@sozialhelden/fetch-cache": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/@sozialhelden/fetch-cache/-/fetch-cache-2.0.1.tgz",
"integrity": "sha512-vMlsdT5JQCGjx1fcFxmMNh7ZKppjjsfUAeZEhhNwhEL7GaqbZXsD1OXEyx2IcRa25ZuZtvJSV6Q3rE77VRdLvg==",
"requires": {
"@sozialhelden/hamster-cache": "^1.0.0"
}
},
"@sozialhelden/hamster-cache": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/@sozialhelden/hamster-cache/-/hamster-cache-1.0.0.tgz",
"integrity": "sha512-/TEGA8mdMawZp4Yq/GrkL+72YL5EGuSeVXC3pKW12YY1t3C+zCN/HZ0HRp4zWF/e67svXcxuz/B0AEQxEdvi7A=="
},
"@supercharge/goodies": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/@supercharge/goodies/-/goodies-1.4.0.tgz",
"integrity": "sha512-Np6u2qjRwiA3wTgzz4n2yduydIjSXqtJWP5cOnNqjdlCR/EUAK86LAOhEcU+YW211D1ksugns3GqpARJDoXQ7g=="
},
"@supercharge/promise-pool": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/@supercharge/promise-pool/-/promise-pool-1.3.0.tgz",
"integrity": "sha512-9/EVrJevSPEqI4i/gRH8Dt7C+FQT65wRRYuu0MDaGmSLZ2aTel0jOGu8Ae84fPiQ+Ah0B80RPFUxk+K+Cz48DA==",
"requires": {
"@supercharge/goodies": "~1.4.0"
}
},
"@types/caseless": { "@types/caseless": {
"version": "0.12.2", "version": "0.12.2",
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz", "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz",
@@ -105,14 +79,6 @@
"resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz",
"integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==" "integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q=="
}, },
"abort-controller": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
"requires": {
"event-target-shim": "^5.0.0"
}
},
"accepts": { "accepts": {
"version": "1.3.5", "version": "1.3.5",
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
@@ -1108,11 +1074,6 @@
"es5-ext": "~0.10.14" "es5-ext": "~0.10.14"
} }
}, },
"event-target-shim": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="
},
"events": { "events": {
"version": "1.1.1", "version": "1.1.1",
"resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz", "resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz",

View File

@@ -17,8 +17,7 @@
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js", "checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
"test-search": "cd test && node searchTest.js", "test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js", "test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js", "test-rental-scraper": "cd test && node rentalScrapeTest.js"
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
}, },
"repository": { "repository": {
"type": "git", "type": "git",
@@ -32,9 +31,6 @@
"dependencies": { "dependencies": {
"2checkout-node": "0.0.1", "2checkout-node": "0.0.1",
"@sendgrid/mail": "^6.3.1", "@sendgrid/mail": "^6.3.1",
"@sozialhelden/fetch-cache": "^2.0.1",
"@supercharge/promise-pool": "^1.3.0",
"abort-controller": "^3.0.0",
"aws-sdk": "^2.422.0", "aws-sdk": "^2.422.0",
"bluebird": "^3.5.5", "bluebird": "^3.5.5",
"cheerio": "^1.0.0-rc.2", "cheerio": "^1.0.0-rc.2",

View File

@@ -9,7 +9,7 @@ if (urlToScrape) {
(async () => { (async () => {
const data = await crawler.scrapeAd(urlToScrape); const data = await crawler.scrapeAd(urlToScrape);
console.log("Scraped data:", data); console.log(data);
})(); })();
} else { } else {
console.log("No URL to scrape. Use like this : "); console.log("No URL to scrape. Use like this : ");

View File

@@ -1,17 +0,0 @@
"use strict";
const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");
// Manual smoke test: scrape the single ad whose URL is given as the first CLI
// argument and print the result, or print usage help when no URL is given.
const urlToScrape = process.argv[2] || undefined;

if (urlToScrape) {
  (async () => {
    try {
      const crawler = new saljicCrawler();
      const data = await crawler.scrapeAd(urlToScrape);
      console.log("Scraped data:", data);
    } catch (err) {
      // Without this handler a failed scrape surfaces as an unhandled promise
      // rejection; report it and signal failure through the exit code instead.
      console.error("Scraping failed:", err);
      process.exitCode = 1;
    }
  })();
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}