Compare commits
103 Commits
checkup-em
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
70bd952ee1 | ||
|
|
14039975c2 | ||
|
|
92e4f4ed5a | ||
|
|
88f9d10586 | ||
|
|
2b1cbcaa47 | ||
|
|
bf8d131025 | ||
|
|
698acb010a | ||
|
|
ade3eb307d | ||
|
|
8031f6f8a3 | ||
|
|
d7a680a3ac | ||
|
|
8018caab47 | ||
|
|
d871d9ad1f | ||
|
|
dfbefc20cd | ||
|
|
a481ecfe37 | ||
|
|
8df94da48c | ||
|
|
d4fcd1950d | ||
|
|
e8115a9215 | ||
|
|
160efdf6ab | ||
|
|
c9b8c2e1a5 | ||
|
|
855b93ca41 | ||
|
|
70779b24c0 | ||
|
|
ba873f9f4e | ||
|
|
e4775158fc | ||
|
|
26377c485c | ||
|
|
b30b0f45a6 | ||
|
|
9c1a029ff1 | ||
|
|
9b49759485 | ||
|
|
0c2f8d11ee | ||
|
|
b27d9d3499 | ||
|
|
dd3f30ef0e | ||
|
|
78c6056db4 | ||
|
|
ecf27f2ba1 | ||
|
|
1229b3fa6c | ||
|
|
542ff56123 | ||
|
|
0aa851015b | ||
|
|
c033b2e47c | ||
|
|
0895654db2 | ||
|
|
8925eb9f4e | ||
|
|
52201af3ba | ||
|
|
1505c07363 | ||
|
|
159fedbc2d | ||
|
|
65068932ad | ||
|
|
820227827e | ||
|
|
d35a113baa | ||
|
|
ba60f8749d | ||
|
|
f1d45fed26 | ||
|
|
ff923605ad | ||
|
|
692577fb8c | ||
|
|
df5e38092d | ||
|
|
feb2d04ed6 | ||
|
|
90e171d07d | ||
|
|
747f56941a | ||
|
|
2a13ab55ed | ||
|
|
441f905b29 | ||
|
|
39f9383ae2 | ||
|
|
edca7f91af | ||
|
|
44402a9cc4 | ||
|
|
b913daa1f7 | ||
|
|
a508f72d7c | ||
|
|
08ad9edfe1 | ||
|
|
ce857ddce9 | ||
|
|
148b2ea863 | ||
|
|
d436d4a37b | ||
|
|
6791a509d0 | ||
|
|
edc6e2bbf7 | ||
|
|
4f230020d7 | ||
|
|
f62a7200c7 | ||
|
|
cff7cc2e9c | ||
|
|
f56cd5b549 | ||
|
|
bc7ce9d708 | ||
|
|
df2a962d0f | ||
|
|
be4508ebea | ||
|
|
22bffc126d | ||
|
|
06f80296f3 | ||
|
|
81fa3f046d | ||
|
|
addd8c1344 | ||
|
|
5bdc8e149a | ||
|
|
fc7fe3c0b3 | ||
|
|
b3007123a5 | ||
|
|
f7d4a9cd07 | ||
|
|
ab6812889a | ||
|
|
b82134e280 | ||
|
|
be378883c8 | ||
|
|
8a87b9e253 | ||
|
|
43bc23b164 | ||
|
|
fc6351af46 | ||
|
|
6267b2cab4 | ||
|
|
97724a47a1 | ||
|
|
91a1c6a91e | ||
|
|
eb4ab2e341 | ||
|
|
2d0a00b967 | ||
|
|
74def9c059 | ||
|
|
d29b3eb1b3 | ||
|
|
41b59e8c7c | ||
|
|
b933fa96d4 | ||
|
|
824db4fbc3 | ||
|
|
712cde1632 | ||
|
|
1ba7cf8531 | ||
|
|
7a7aecb3ee | ||
|
|
78c4054cde | ||
|
|
94ffc2d6d2 | ||
|
|
b11f18696f | ||
|
|
b9122f8f00 |
@@ -303,7 +303,8 @@ const AD_AGENCY = {
|
|||||||
OLX: "OLX",
|
OLX: "OLX",
|
||||||
RENTAL: "RENTAL",
|
RENTAL: "RENTAL",
|
||||||
PROSTOR: "PROSTOR",
|
PROSTOR: "PROSTOR",
|
||||||
AKTIDO: "AKTIDO"
|
AKTIDO: "AKTIDO",
|
||||||
|
SALJIC: "SALJIC"
|
||||||
};
|
};
|
||||||
|
|
||||||
const CRAWLER_AD_TYPE = {
|
const CRAWLER_AD_TYPE = {
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ const APP_URL =
|
|||||||
? process.env.APP_URL || "http://market-alarm"
|
? process.env.APP_URL || "http://market-alarm"
|
||||||
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
: process.env.APP_URL || `${APP_BASE_URL}:${APP_PORT}`;
|
||||||
|
|
||||||
|
const STAGING = process.env.ENVIRONMENT !== "production";
|
||||||
|
|
||||||
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
|
const DEFAULT_TIMEZONE = "Europe/Sarajevo";
|
||||||
|
|
||||||
const CRAWLER_INTERVAL = parseInt(process.env.CRAWLER_INTERVAL) || 60;
|
const CRAWLER_INTERVAL = parseInt(process.env.CRAWLER_INTERVAL) || 60;
|
||||||
@@ -39,6 +41,15 @@ const PROSTOR_LOGIN = {
|
|||||||
PASSWORD: process.env.PROSTOR_LOGIN_PASS
|
PASSWORD: process.env.PROSTOR_LOGIN_PASS
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const USER_AGENT =
|
||||||
|
process.env.USER_AGENT ||
|
||||||
|
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
|
||||||
|
|
||||||
|
const USE_SCRAPER_API = process.env.USE_SCRAPER_API === undefined ? 1 : parseInt(process.env.USE_SCRAPER_API);
|
||||||
|
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
|
||||||
|
const SCRAPER_API_BASE_URL = process.env.SCRAPER_API_BASE_URL || "";
|
||||||
|
const NODE_FETCH_TIMEOUT_MS = parseInt(process.env.NODE_FETCH_TIMEOUT_MS) || 60000
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
APP_PORT,
|
APP_PORT,
|
||||||
APP_URL,
|
APP_URL,
|
||||||
@@ -50,6 +61,12 @@ module.exports = {
|
|||||||
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
|
MAX_REAL_ESTATES_IN_FIRST_EMAIL,
|
||||||
PRINT_CRAWLER_DEBUG,
|
PRINT_CRAWLER_DEBUG,
|
||||||
API_MAP_KEY,
|
API_MAP_KEY,
|
||||||
|
STAGING,
|
||||||
CHECK_UP_DAYS,
|
CHECK_UP_DAYS,
|
||||||
PROSTOR_LOGIN
|
PROSTOR_LOGIN,
|
||||||
|
USER_AGENT,
|
||||||
|
USE_SCRAPER_API,
|
||||||
|
SCRAPER_API_KEY,
|
||||||
|
SCRAPER_API_BASE_URL,
|
||||||
|
NODE_FETCH_TIMEOUT_MS
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -9,12 +9,15 @@ const OlxCrawler = require("./specificCrawlers/olx");
|
|||||||
const RentalCrawler = require("./specificCrawlers/rental");
|
const RentalCrawler = require("./specificCrawlers/rental");
|
||||||
const ProstorCrawler = require("./specificCrawlers/prostor");
|
const ProstorCrawler = require("./specificCrawlers/prostor");
|
||||||
const AktidoCrawler = require("./specificCrawlers/aktido");
|
const AktidoCrawler = require("./specificCrawlers/aktido");
|
||||||
|
const SaljicCrawler = require("./specificCrawlers/saljic");
|
||||||
|
const { logDebug } = require("../helpers/log");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
OLX_CONFIG,
|
OLX_CONFIG,
|
||||||
RENTAL_CONFIG,
|
RENTAL_CONFIG,
|
||||||
PROSTOR_CONFIG,
|
PROSTOR_CONFIG,
|
||||||
AKTIDO_CONFIG
|
AKTIDO_CONFIG,
|
||||||
|
SALJIC_CONFIG
|
||||||
} = require("./crawlerConfig");
|
} = require("./crawlerConfig");
|
||||||
const PostgresSaver = require("./savers/postgres");
|
const PostgresSaver = require("./savers/postgres");
|
||||||
|
|
||||||
@@ -57,6 +60,15 @@ async function crawlAll() {
|
|||||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||||
|
),
|
||||||
|
new SaljicCrawler(
|
||||||
|
[postgresSaver],
|
||||||
|
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||||
|
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||||
|
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||||
|
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||||
|
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||||
|
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||||
)
|
)
|
||||||
];
|
];
|
||||||
|
|
||||||
@@ -64,7 +76,9 @@ async function crawlAll() {
|
|||||||
|
|
||||||
for (const crawler of crawlers) {
|
for (const crawler of crawlers) {
|
||||||
try {
|
try {
|
||||||
|
logDebug('Starting crawler: ', crawler);
|
||||||
const newRealEstatesFromSingleCrawler = await crawler.crawl();
|
const newRealEstatesFromSingleCrawler = await crawler.crawl();
|
||||||
|
logDebug('Crawler done: ', crawler);
|
||||||
if (Array.isArray(newRealEstatesFromSingleCrawler)) {
|
if (Array.isArray(newRealEstatesFromSingleCrawler)) {
|
||||||
newRealEstates.push(...newRealEstatesFromSingleCrawler);
|
newRealEstates.push(...newRealEstatesFromSingleCrawler);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
|
|||||||
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
||||||
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
||||||
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
||||||
|
const SALJIC_CONFIG = require("./specificConfigs/saljic");
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
OLX_CONFIG,
|
OLX_CONFIG,
|
||||||
RENTAL_CONFIG,
|
RENTAL_CONFIG,
|
||||||
PROSTOR_CONFIG,
|
PROSTOR_CONFIG,
|
||||||
AKTIDO_CONFIG
|
AKTIDO_CONFIG,
|
||||||
|
SALJIC_CONFIG
|
||||||
};
|
};
|
||||||
|
|||||||
34
app/crawler/specificConfigs/saljic.js
Normal file
34
app/crawler/specificConfigs/saljic.js
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
"use strict";
|
||||||
|
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
|
||||||
|
|
||||||
|
const saljicCrawlerAdType =
|
||||||
|
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
|
||||||
|
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
|
||||||
|
: null;
|
||||||
|
|
||||||
|
const saljicParsedCrawlerAdCategories =
|
||||||
|
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
|
||||||
|
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||||
|
category.trim()
|
||||||
|
)
|
||||||
|
: ["FLAT", "HOUSE"];
|
||||||
|
|
||||||
|
const saljicIgnoredUsernames = [];
|
||||||
|
|
||||||
|
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
|
||||||
|
.map(categoryName =>
|
||||||
|
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
|
||||||
|
)
|
||||||
|
.filter(category => !!category);
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
|
||||||
|
SALJIC_MAX_RESULTS_PER_PAGE:
|
||||||
|
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
|
||||||
|
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||||
|
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
|
||||||
|
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
|
||||||
|
SALJIC_DELAY_BETWEEN_PAGES:
|
||||||
|
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
|
||||||
|
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
|
||||||
|
};
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const fetch = require("node-fetch");
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const Promise = require("bluebird");
|
const Promise = require("bluebird");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
@@ -159,7 +159,7 @@ class AktidoCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
const res = await fetch(url, {}, false);
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let hrefs = [];
|
let hrefs = [];
|
||||||
@@ -202,6 +202,10 @@ class AktidoCrawler {
|
|||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
|
if (body.indexOf('<html') === -1) {
|
||||||
|
throw { message: 'Failed to fetch page !' }
|
||||||
|
}
|
||||||
|
|
||||||
const mapElementParent = $(".box-map").parent();
|
const mapElementParent = $(".box-map").parent();
|
||||||
const scriptElement = $("script", mapElementParent);
|
const scriptElement = $("script", mapElementParent);
|
||||||
if (
|
if (
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const fetch = require("node-fetch");
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
|
const { logDebug } = require("../../helpers/log");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const Promise = require("bluebird");
|
const Promise = require("bluebird");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
@@ -44,6 +45,16 @@ const OLX_ENUMS = {
|
|||||||
|
|
||||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||||
|
|
||||||
|
const chunk = (array, size = 10) => {
|
||||||
|
let i, j ,temparray;
|
||||||
|
const result = []
|
||||||
|
for (i=0,j=array.length; i<j; i+=size) {
|
||||||
|
temparray = array.slice(i,i+size);
|
||||||
|
result.push(temparray);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
savers = [],
|
savers = [],
|
||||||
@@ -52,7 +63,7 @@ class OlxCrawler {
|
|||||||
maxPages = 1000,
|
maxPages = 1000,
|
||||||
maxResultsPerPage = 100,
|
maxResultsPerPage = 100,
|
||||||
ignoredUsernames = [],
|
ignoredUsernames = [],
|
||||||
delayBetweenPages = 1000
|
delayBetweenPages = 500
|
||||||
) {
|
) {
|
||||||
this.savers = savers;
|
this.savers = savers;
|
||||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||||
@@ -65,6 +76,7 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
|
logDebug("Starting OLX crawl");
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
|
||||||
const newRealEstates = [];
|
const newRealEstates = [];
|
||||||
@@ -88,14 +100,32 @@ class OlxCrawler {
|
|||||||
const entries = singlePageResults.entries();
|
const entries = singlePageResults.entries();
|
||||||
|
|
||||||
for (const [index, { value: singlePageResult }] of entries) {
|
for (const [index, { value: singlePageResult }] of entries) {
|
||||||
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
|
console.log("================================");
|
||||||
|
console.log("Category Indexer index : ", index);
|
||||||
|
}
|
||||||
|
|
||||||
if (singlePageResult) {
|
if (singlePageResult) {
|
||||||
|
console.log("\tTotal entries : ", singlePageResult.length)
|
||||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||||
const { newRecords, existingRecords } = saveResults;
|
const { newRecords, existingRecords } = saveResults;
|
||||||
|
|
||||||
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
|
console.log("--------------------------");
|
||||||
|
console.log("\tNew record URLs [", newRecords.length, "] :");
|
||||||
|
|
||||||
|
for(const newRecord of newRecords) {
|
||||||
|
console.log("\t\t",newRecord.url);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log("\t-------------------------");
|
||||||
|
console.log("\tExisting record URLs [", existingRecords.length, "] :");
|
||||||
|
}
|
||||||
|
|
||||||
newRealEstates.push(...newRecords);
|
newRealEstates.push(...newRecords);
|
||||||
|
|
||||||
for (const existingRecord of existingRecords) {
|
for (const existingRecord of existingRecords) {
|
||||||
const { publishedDate, renewedDate } = existingRecord;
|
const { publishedDate, renewedDate, url } = existingRecord;
|
||||||
|
|
||||||
const publishedDateMoment = moment.utc(publishedDate);
|
const publishedDateMoment = moment.utc(publishedDate);
|
||||||
const renewedDateMoment = moment.utc(renewedDate);
|
const renewedDateMoment = moment.utc(renewedDate);
|
||||||
@@ -105,13 +135,25 @@ class OlxCrawler {
|
|||||||
"minute"
|
"minute"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
|
console.log("\t\t", url);
|
||||||
|
console.log("\t\t\tPublished date : ", publishedDate);
|
||||||
|
console.log("\t\t\tRenewed date : ", renewedDate);
|
||||||
|
console.log("\t\t\tIs same (up to minute) : ", stopCrawlingThisCategory);
|
||||||
|
}
|
||||||
|
|
||||||
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
|
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
|
console.log("\t\t\tStopping this category indexer");
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if (PRINT_CRAWLER_DEBUG) {
|
||||||
|
console.log("\tNo more entries in this category, stopping!");
|
||||||
|
}
|
||||||
//Generator returned undefined, remove this generator from array
|
//Generator returned undefined, remove this generator from array
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
// console.log("Generator ", index + 1, "has no more pages");
|
// console.log("Generator ", index + 1, "has no more pages");
|
||||||
@@ -136,31 +178,36 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async *categoryIndexer(adCategory) {
|
async *categoryIndexer(adCategory) {
|
||||||
let pageToIndex = 1;
|
try {
|
||||||
|
let pageToIndex = 1;
|
||||||
|
|
||||||
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||||
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||||
while (true) {
|
while (true) {
|
||||||
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||||
const singlePageResults = await this.indexSinglePage(
|
const singlePageResults = await this.indexSinglePage(
|
||||||
urlPageToCrawl,
|
urlPageToCrawl,
|
||||||
this.maxResultsPerPage
|
this.maxResultsPerPage
|
||||||
);
|
);
|
||||||
|
|
||||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
await this.sleep(this.delayBetweenPages);
|
||||||
yield singlePageResults;
|
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||||
} else {
|
yield singlePageResults;
|
||||||
return undefined;
|
} else {
|
||||||
}
|
return undefined;
|
||||||
|
}
|
||||||
++pageToIndex;
|
|
||||||
if (pageToIndex === this.maxPages) {
|
++pageToIndex;
|
||||||
return undefined;
|
if (pageToIndex === this.maxPages) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
return undefined;
|
||||||
}
|
}
|
||||||
} else {
|
} catch (e) {
|
||||||
return undefined;
|
console.log('Error inside generator: ', e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -170,8 +217,10 @@ class OlxCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
const res = await fetch(url, {}, false);
|
||||||
|
logDebug("Got category results for: ", url);
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
|
logDebug("Got category results text for: ", url);
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let hrefs = [];
|
let hrefs = [];
|
||||||
|
|
||||||
@@ -192,26 +241,44 @@ class OlxCrawler {
|
|||||||
|
|
||||||
const asyncScraping = [];
|
const asyncScraping = [];
|
||||||
for (let i = 0; i < actualNoOfResults; i++) {
|
for (let i = 0; i < actualNoOfResults; i++) {
|
||||||
asyncScraping.push(this.scrapeAd(hrefs[i]));
|
asyncScraping.push(hrefs[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
const scrapedData = await Promise.all(asyncScraping);
|
const allChunks = chunk(asyncScraping, 2);
|
||||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
const dataResults = []
|
||||||
|
for (let i = 0; i < allChunks.length; i++) {
|
||||||
|
const singleChunk = allChunks[i];
|
||||||
|
const promises = singleChunk.map(c => this.scrapeAd(c))
|
||||||
|
const chunkResults = await Promise.all(promises);
|
||||||
|
await this.sleep(this.delayBetweenPages);
|
||||||
|
dataResults.push(...chunkResults);
|
||||||
|
logDebug("Chunk results len:", chunkResults.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
||||||
|
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
|
||||||
|
|
||||||
return filteredScrapedData;
|
return filteredScrapedData;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught:" + e);
|
console.error("Exception caught, index single page: " + e);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(url) {
|
async scrapeAd(url) {
|
||||||
// console.log("Scraping : ", url);
|
logDebug("Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let status = AD_STATUS.STATUS_NORMAL;
|
let status = AD_STATUS.STATUS_NORMAL;
|
||||||
|
|
||||||
|
if (body.indexOf('<html') === -1) {
|
||||||
|
console.error("This is the body: ", body);
|
||||||
|
throw { message: 'Failed to fetch page !' }
|
||||||
|
}
|
||||||
|
|
||||||
const propertySelectors = {
|
const propertySelectors = {
|
||||||
username:
|
username:
|
||||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||||
@@ -238,37 +305,25 @@ class OlxCrawler {
|
|||||||
|
|
||||||
//====== PRICE DETECTION AND EXTRACTION =====
|
//====== PRICE DETECTION AND EXTRACTION =====
|
||||||
let price = null;
|
let price = null;
|
||||||
const normalPriceValue = $("#pc > p:nth-child(2)").text();
|
|
||||||
const urgentPriceValue = $(
|
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
|
||||||
)
|
|
||||||
.text()
|
|
||||||
.trim();
|
|
||||||
|
|
||||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
const priceHeader = $("#pc > p.n").text().trim();
|
||||||
price = normalPriceValue;
|
const priceValue = $("#pc > p:nth-child(2)").text().trim();
|
||||||
if (
|
price = priceValue;
|
||||||
$("#pc > p.n")
|
|
||||||
.text()
|
if (priceHeader.indexOf('Hitn') !== -1) {
|
||||||
.indexOf("Hitna") !== -1
|
// Urgent price
|
||||||
) {
|
status = AD_STATUS.STATUS_URGENT;
|
||||||
status = AD_STATUS.STATUS_URGENT;
|
}
|
||||||
} else {
|
|
||||||
status = AD_STATUS.STATUS_NORMAL;
|
const discountPriceTag = $("#artikal_glavni_div > div.artikal_lijevo > p:nth-child(4)").text().trim();
|
||||||
}
|
if (discountPriceTag.indexOf('Akcij') !== -1) {
|
||||||
} else if (urgentPriceValue && urgentPriceValue.length > 0) {
|
status = AD_STATUS.STATUS_DISCOUNTED;
|
||||||
const priceValues = urgentPriceValue.split("KM");
|
const discountPriceValues = $("#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p").text().trim();
|
||||||
//priceValues will contain values like ["100000", "90000", ...], second element is urgent price
|
// discountPriceValues contain string like "10.000 KM 7.500 KM"
|
||||||
if (priceValues.length > 1) {
|
// First price is regular, second is currently active (discounted) price
|
||||||
price = priceValues[1].trim();
|
const bothPrices = discountPriceValues.split('KM');
|
||||||
status = AD_STATUS.STATUS_DISCOUNTED;
|
// Now, currently active price is second element of bothPrices array
|
||||||
} else {
|
price = bothPrices[1] ? bothPrices[1].trim() : null;
|
||||||
throw { message: "Can't find urgent price" };
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
throw {
|
|
||||||
message: "Can't find price (it is not normal nor urgent price ?)"
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//====== OTHER AD INFORMATION ===============
|
//====== OTHER AD INFORMATION ===============
|
||||||
@@ -278,7 +333,7 @@ class OlxCrawler {
|
|||||||
|
|
||||||
let otherInformationDivId;
|
let otherInformationDivId;
|
||||||
//We need to locate DIV ID where other information are stored
|
//We need to locate DIV ID where other information are stored
|
||||||
for (let possibleId = 10; possibleId <= 20; possibleId++) {
|
for (let possibleId = 1; possibleId <= 30; possibleId++) {
|
||||||
const adTypeFieldTitle = $(
|
const adTypeFieldTitle = $(
|
||||||
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
|
`#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
|
||||||
)
|
)
|
||||||
@@ -650,10 +705,12 @@ class OlxCrawler {
|
|||||||
distanceToRiver,
|
distanceToRiver,
|
||||||
numberOfViewsAgency
|
numberOfViewsAgency
|
||||||
};
|
};
|
||||||
|
//
|
||||||
|
//console.log("Scraped data:", data);
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
console.error("Exception caught scrapeAd : " + e.message, "\r\nURL:", url);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -768,6 +825,9 @@ class OlxCrawler {
|
|||||||
if (!priceText) {
|
if (!priceText) {
|
||||||
return NaN;
|
return NaN;
|
||||||
}
|
}
|
||||||
|
if (priceText === "Po dogovoru") {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
const formattedPriceText = priceText.replace(".", "").replace(",", ".");
|
const formattedPriceText = priceText.replace(".", "").replace(",", ".");
|
||||||
return parseFloat(formattedPriceText);
|
return parseFloat(formattedPriceText);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const fetch = require("node-fetch");
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
const FormData = require("form-data");
|
const FormData = require("form-data");
|
||||||
|
const nodeFetch = require("node-fetch");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
AD_TYPE,
|
AD_TYPE,
|
||||||
@@ -62,13 +63,19 @@ class ProstorCrawler {
|
|||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
const crawlAdTypes = this.crawlerAdTypes;
|
||||||
|
if (!crawlAdCategories || !crawlAdTypes) {
|
||||||
|
return []
|
||||||
|
}
|
||||||
|
|
||||||
|
const newRealEstates = [];
|
||||||
//We need session cookie to use login privileges
|
//We need session cookie to use login privileges
|
||||||
const prostorCookie = await this.getCookies();
|
const prostorCookie = await this.getCookies();
|
||||||
//New tag to check if crawler loged in
|
//New tag to check if crawler logged in
|
||||||
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
|
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
|
||||||
const newRealEstates = [];
|
|
||||||
//Crawl only if login was successful
|
//Crawl only if login was successful
|
||||||
if (crawlAdCategories && login) {
|
if (login) {
|
||||||
const indexGenerators = [];
|
const indexGenerators = [];
|
||||||
for (const adCategory of crawlAdCategories) {
|
for (const adCategory of crawlAdCategories) {
|
||||||
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
|
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
|
||||||
@@ -134,6 +141,11 @@ class ProstorCrawler {
|
|||||||
prostorCookie
|
prostorCookie
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (!Array.isArray(listOfAllRealEstates)){
|
||||||
|
console.log('[PROSTOR] Could not find real estate JSON data, check selector !');
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
let elementToStartIndexFrom = 0;
|
let elementToStartIndexFrom = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
const realEstatesForSinglePage = listOfAllRealEstates.slice(
|
const realEstatesForSinglePage = listOfAllRealEstates.slice(
|
||||||
@@ -191,24 +203,22 @@ class ProstorCrawler {
|
|||||||
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
||||||
|
|
||||||
//Status information is given already in realestate list
|
//Status information is given already in realestate list
|
||||||
//For VIP Ads status ='' canot be used, but no VIP ads are crawled
|
const adStatus = ProstorCrawler.getStatusId(status);
|
||||||
//We will make "fake" vip ad for RE that have size=55
|
|
||||||
//It is weird because yesterday it said 'VIP ponuda' ???
|
|
||||||
const adStatus =
|
|
||||||
size === "55"
|
|
||||||
? ProstorCrawler.getStatusId("VIP ponuda")
|
|
||||||
: ProstorCrawler.getStatusId(status);
|
|
||||||
|
|
||||||
const url = `https://prostor.ba${link}`;
|
const url = `https://prostor.ba${link}`;
|
||||||
|
|
||||||
// console.log("[PROSTOR] Scraping : ", url);
|
// console.log("[PROSTOR] Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url, {
|
const adPageSource = await nodeFetch(url, {
|
||||||
headers: { Cookie: prostorCookie }
|
headers: { Cookie: prostorCookie }
|
||||||
});
|
});
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
|
if (body.indexOf('<html') === -1) {
|
||||||
|
throw { message: 'Failed to fetch page !' }
|
||||||
|
}
|
||||||
|
|
||||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||||
// general form is : /actionType/realEstateType/location/realEstateID
|
// general form is : /actionType/realEstateType/location/realEstateID
|
||||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||||
@@ -433,14 +443,14 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url, {
|
const res = await nodeFetch(url, {
|
||||||
headers: { Cookie: prostorCookie }
|
headers: { Cookie: prostorCookie }
|
||||||
});
|
});
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
const scriptElement = $(
|
const scriptElement = $(
|
||||||
"body > div > div.container-fluid > script:nth-child(7)"
|
"body > div.content > div.container-fluid > script:nth-child(6)"
|
||||||
);
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -597,7 +607,7 @@ class ProstorCrawler {
|
|||||||
formData.append("email", PROSTOR_LOGIN.EMAIL);
|
formData.append("email", PROSTOR_LOGIN.EMAIL);
|
||||||
formData.append("password", PROSTOR_LOGIN.PASSWORD);
|
formData.append("password", PROSTOR_LOGIN.PASSWORD);
|
||||||
|
|
||||||
return fetch("https://prostor.ba/moj-prostor/prijava", {
|
return nodeFetch("https://prostor.ba/moj-prostor/prijava", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
body: formData,
|
body: formData,
|
||||||
headers: { Cookie: prostorCookie }
|
headers: { Cookie: prostorCookie }
|
||||||
@@ -624,9 +634,12 @@ class ProstorCrawler {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
async getCookies() {
|
async getCookies() {
|
||||||
const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", {
|
const getResponse = await nodeFetch(
|
||||||
headers: { Cookie: "" }
|
"https://prostor.ba/moj-prostor/prijava",
|
||||||
});
|
{
|
||||||
|
headers: { Cookie: "" }
|
||||||
|
}
|
||||||
|
);
|
||||||
const raw = getResponse.headers.raw()["set-cookie"];
|
const raw = getResponse.headers.raw()["set-cookie"];
|
||||||
const cookie = raw
|
const cookie = raw
|
||||||
.map(datastring => {
|
.map(datastring => {
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const fetch = require("node-fetch");
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const Promise = require("bluebird");
|
const Promise = require("bluebird");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
@@ -159,7 +159,7 @@ class RentalCrawler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
const res = await fetch(url, {} , false);
|
||||||
const body = await res.text();
|
const body = await res.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
let hrefs = [];
|
let hrefs = [];
|
||||||
@@ -202,6 +202,10 @@ class RentalCrawler {
|
|||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
|
if (body.indexOf('<html') === -1) {
|
||||||
|
throw { message: 'Failed to fetch page !' }
|
||||||
|
}
|
||||||
|
|
||||||
const mapElementParent = $(".box-map").parent();
|
const mapElementParent = $(".box-map").parent();
|
||||||
const scriptElement = $("script", mapElementParent);
|
const scriptElement = $("script", mapElementParent);
|
||||||
if (
|
if (
|
||||||
@@ -399,7 +403,9 @@ class RentalCrawler {
|
|||||||
);
|
);
|
||||||
if (!publishedDateMoment.isValid()) {
|
if (!publishedDateMoment.isValid()) {
|
||||||
throw {
|
throw {
|
||||||
message: `Invalid published date : ${extractedData["re_realEstates_inserted"]}`
|
message: `Invalid published date : ${
|
||||||
|
extractedData["re_realEstates_inserted"]
|
||||||
|
}`
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -410,7 +416,9 @@ class RentalCrawler {
|
|||||||
);
|
);
|
||||||
if (!renewedDateMoment.isValid()) {
|
if (!renewedDateMoment.isValid()) {
|
||||||
throw {
|
throw {
|
||||||
message: `Invalid renewed date : ${extractedData["re_realEstates_edited"]}`
|
message: `Invalid renewed date : ${
|
||||||
|
extractedData["re_realEstates_edited"]
|
||||||
|
}`
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
687
app/crawler/specificCrawlers/saljic.js
Normal file
687
app/crawler/specificCrawlers/saljic.js
Normal file
@@ -0,0 +1,687 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
|
const { getUrlParams } = require("../../helpers/url");
|
||||||
|
const cheerio = require("cheerio");
|
||||||
|
const moment = require("moment-timezone");
|
||||||
|
const PromisePool = require('@supercharge/promise-pool');
|
||||||
|
|
||||||
|
const {
|
||||||
|
AD_TYPE,
|
||||||
|
AD_CATEGORY,
|
||||||
|
AD_AGENCY,
|
||||||
|
AD_STATUS,
|
||||||
|
CRAWLER_AD_TYPE,
|
||||||
|
FURNISHING_TYPE,
|
||||||
|
HEATING_TYPE
|
||||||
|
} = require("../../common/enums");
|
||||||
|
|
||||||
|
const {
|
||||||
|
PRINT_CRAWLER_DEBUG,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
} = require("../../config/appConfig");
|
||||||
|
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
||||||
|
|
||||||
|
// Query-string fragments understood by the Saljic search endpoint, keyed by
// the application's own ad-type / ad-category identifiers.
const SALJIC_ENUMS = (() => {
  // Builds the "ad type" (input_vrsta) fragment; an empty value means no filter.
  const typeParam = value => `&input_vrsta=${value}`;
  // Builds the "category" (input_kategorija) fragment; an empty value means no filter.
  const categoryParam = value => `&input_kategorija=${value}`;

  return {
    SALJIC_AD_TYPE: {
      [CRAWLER_AD_TYPE.ALL]: typeParam(""),
      [CRAWLER_AD_TYPE.ONLY_SELL]: typeParam(1),
      [CRAWLER_AD_TYPE.ONLY_RENT]: typeParam(2)
    },
    SALJIC_AD_CATEGORY: {
      [AD_CATEGORY.ALL.id]: categoryParam(""),
      [AD_CATEGORY.FLAT.id]: categoryParam(15),
      [AD_CATEGORY.HOUSE.id]: categoryParam(9),
      // Site categories 3 and 4 are land categories as well; only 5 is mapped here.
      [AD_CATEGORY.LAND.id]: categoryParam(5),
      [AD_CATEGORY.OFFICE.id]: categoryParam(8),
      [AD_CATEGORY.APARTMENT.id]: categoryParam(1),
      [AD_CATEGORY.GARAGE.id]: categoryParam(2)
      // AD_CATEGORY.COTTAGE has no search category mapped.
    }
  };
})();
|
||||||
|
|
||||||
|
/**
 * Crawler for real-estate ads listed on saljicnekretnine.ba.
 *
 * It walks the paginated search results for every configured category,
 * scrapes each ad page into a flat record and hands the records to the
 * first configured saver.
 */
class SaljicCrawler {
  /**
   * @param {Array} savers - Savers exposing `save(results)`; only the first one is used.
   * @param {*} crawlerAdTypes - One of the CRAWLER_AD_TYPE values (key into SALJIC_ENUMS.SALJIC_AD_TYPE).
   * @param {Array} crawlerAdCategories - Categories to crawl; entries may be AD_CATEGORY objects or their ids.
   * @param {number} maxPages - Maximum number of result pages to index per category.
   * @param {number} maxResultsPerPage - Maximum number of ads scraped from one result page.
   * @param {Array} ignoredUsernames - Stored as-is; this crawler does not read it.
   * @param {number} delayBetweenPages - Pause in milliseconds between requests.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 500
  ) {
    this.savers = savers;
    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Crawls all configured categories page by page (one page of every
   * category per round) and saves each page as soon as it is scraped.
   *
   * A category is dropped when its generator is exhausted, or when a saved
   * page produced no new records and SALJIC_FORCE_CRAWL is not set.
   *
   * @returns {Promise<Array>} All records the saver reported as new.
   */
  async crawl() {
    const newRealEstates = [];
    const crawlAdCategories = this.crawlerAdCategories;
    if (!crawlAdCategories) {
      return newRealEstates;
    }

    let indexGenerators = [];
    for (const adCategory of crawlAdCategories) {
      indexGenerators.push(this.categoryIndexer(adCategory));
    }

    while (indexGenerators.length > 0) {
      const singlePageResults = await Promise.all(
        indexGenerators.map(indexGenerator => indexGenerator.next())
      );

      const remainingGenerators = [];
      for (const [index, { value: singlePageResult }] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // Generator returned undefined: this category has no more pages.
          continue;
        }

        const saveResults = await this.saveCrawledResults(singlePageResult);
        const newRecords = saveResults ? saveResults.newRecords : undefined;
        const hasRecordList = Array.isArray(newRecords);

        // Guarded: spreading a non-array would throw and abort the whole crawl.
        if (hasRecordList) {
          newRealEstates.push(...newRecords);
        }

        const nothingNew = hasRecordList && newRecords.length === 0;
        if (nothingNew && !SALJIC_FORCE_CRAWL) {
          // Everything on this page was already known; stop this category.
          continue;
        }
        remainingGenerators.push(indexGenerators[index]);
      }

      indexGenerators = remainingGenerators;
      if (indexGenerators.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }

    return newRealEstates;
  }

  /**
   * Async generator yielding the scraped ads of one category, one result
   * page per iteration, for at most `this.maxPages` pages.
   *
   * @param {*} adCategory - An AD_CATEGORY object (its `id` is used) or a category id.
   */
  async *categoryIndexer(adCategory) {
    // SALJIC_ENUMS.SALJIC_AD_CATEGORY is keyed by category id, while the
    // constructor default passes whole category objects; accept both.
    const categoryId =
      adCategory !== null && typeof adCategory === "object"
        ? adCategory.id
        : adCategory;

    const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
    const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[categoryId];
    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return undefined;
    }

    for (let pageToIndex = 1; pageToIndex <= this.maxPages; pageToIndex++) {
      // The site paginates by offset: page 1 has no offset, then 22 per page.
      const urlPagePart = pageToIndex === 1 ? "" : (pageToIndex - 1) * 2 * 11;
      const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}&per_page=${urlPagePart}`;

      const singlePageResults = await this.indexSinglePage(
        urlPageToCrawl,
        this.maxResultsPerPage
      );

      if (!Array.isArray(singlePageResults) || singlePageResults.length === 0) {
        return undefined;
      }
      yield singlePageResults;
    }

    return undefined;
  }

  /**
   * Fetches one search-result page, collects the ad links with their ad
   * type ribbon and scrapes every ad (two at a time).
   *
   * @param {string} url - Search-result page URL.
   * @param {number} maxResultsPerPage - Upper bound of ads scraped from the page.
   * @returns {Promise<Array>} Scraped ad records; empty on any failure.
   */
  async indexSinglePage(url, maxResultsPerPage) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[SALJIC] Index page : ", url);
    }

    try {
      const res = await fetch(url, {}, false);
      const body = await res.text();
      const $ = cheerio.load(body);

      // Link and ad type are read from the same product node so they cannot
      // drift apart when one product lacks a link or a ribbon.
      const adsOnPage = [];
      $("#shop")
        .find(".product")
        .each((i, elem) => {
          const product = $(elem);
          const href = product
            .find("a")
            .first()
            .attr("href");
          if (!href) {
            return;
          }
          const adTypeText = product
            .find(".trakica-search-page")
            .text()
            .trim();
          adsOnPage.push([
            "https://www.saljicnekretnine.ba" + href,
            this.getAdTypeId(adTypeText)
          ]);
        });

      const actualNoOfResults = Math.max(
        0,
        Math.min(adsOnPage.length, maxResultsPerPage)
      );
      const adsToScrape = adsOnPage.slice(0, actualNoOfResults);

      // PromisePool resolves with `{ results, errors }`.
      const { results, errors } = await PromisePool
        .withConcurrency(2)
        .for(adsToScrape)
        .process(async ([adUrl, adType]) => {
          const adData = await this.scrapeAd(adUrl, adType);
          await this.sleep(this.delayBetweenPages);
          return adData;
        });

      if (Array.isArray(errors) && errors.length > 0) {
        console.error("[SALJIC] Failed ad tasks on page:", errors.length);
      }

      return (results || []).filter(adData => !!adData);
    } catch (e) {
      console.error("[SALJIC] Exception caught:" + e);
      return [];
    }
  }

  /**
   * Downloads one ad page and extracts it into a flat record.
   *
   * @param {string} url - Absolute ad URL.
   * @param {*} adType - Ad type id resolved from the result page (may be undefined).
   * @returns {Promise<Object|null>} The record, or null when the page could not be scraped.
   */
  async scrapeAd(url, adType) {
    try {
      const adPageSource = await fetch(url);
      const body = await adPageSource.text();

      if (body.indexOf("<html") === -1) {
        throw new Error("Failed to fetch page !");
      }
      const $ = cheerio.load(body);

      // The ad id is whatever number starts at character 46 of the URL.
      // NOTE(review): position-based; breaks if the URL prefix length changes.
      const agencyObjectId = url
        ? parseInt(url.substring(46, url.length), 10)
        : null;
      if (!agencyObjectId) {
        throw new Error("No agency object ID - URL changed?");
      }

      const entry =
        "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry";
      const entryContent = `${entry} > div.entry-content.topmargin`;
      const detailList = `${entryContent} > div.col-md-12.bottommargin > ul`;

      // Text of a selector with line breaks removed and space runs collapsed.
      const collapsedText = selector =>
        $(selector)
          .text()
          .replace(/(\r\n|\n|\r)/gm, "")
          .replace(/ {1,}/g, " ")
          .trim();

      const title = collapsedText(`${entry} > div.entry-title > h2`);

      const priceText = collapsedText(
        `${entry} > div.topmargin-sm.single-product > div.product > div.product-price > ins`
      );
      // Drops an 8-character prefix and a 3-character suffix around the
      // amount, then removes every comma before parsing.
      const price =
        priceText === "CIJENA NA UPIT"
          ? null
          : this.numberOrNull(
              parseFloat(
                priceText.substring(8, priceText.length - 3).replace(/,/g, "")
              )
            );

      const streetName = $(`${entryContent} > p`)
        .text()
        .replace(/(\r\n|\n|\r)/gm, "")
        .trim();

      const descriptions = $(
        `${entryContent} > div.toggle.toggle-bg > div.togglec >p:nth-child(1)`
      )
        .text()
        .replace(/\"/g, "")
        .trim();

      // Short description is the first sentence; without a period the whole
      // text is used instead of an empty string.
      const firstPeriod = descriptions.indexOf(".");
      const shortDescription =
        firstPeriod === -1 ? descriptions : descriptions.substring(0, firstPeriod);

      const { latText, longText } = this.extractMapCoordinates(
        $("iframe").attr("src")
      );

      const data = {
        url,
        agencyObjectId,
        originAgencyName: AD_AGENCY.SALJIC,
        realEstateType: undefined,
        adType,
        title,
        price,
        area: null,
        gardenSize: null,
        shortDescription,
        longDescription: descriptions || "",
        streetNumber: 0,
        streetName,
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat: parseFloat(latText) || null,
        locationLong: parseFloat(longText) || null,
        // The page carries no status information (e.g. sold).
        adStatus: AD_STATUS.STATUS_NORMAL,
        publishedDate: null,
        renewedDate: null,
        numberOfRooms: null,
        numberOfFloors: null,
        floor: null,
        accessRoadType: null,
        heatingType: null,
        furnishingType: null,
        balcony: null,
        newBuilding: null,
        elevator: null,
        water: null,
        electricity: null,
        drainageSystem: null,
        registeredInZkBooks: null,
        recentlyAdapted: null,
        parking: null,
        garage: null,
        gas: null,
        antiTheftDoor: null,
        airCondition: null,
        phoneConnection: null,
        cableTV: null,
        internet: null,
        basementAttic: null,
        storeRoom: null,
        videoSurveillance: null,
        alarm: null,
        suitableForStudents: null,
        includingBills: null,
        animalsAllowed: null,
        pool: null,
        exchange: null,
        urbanPlanPermit: null,
        buildingPermit: null,
        utilityConnection: null,
        distanceToRiver: null,
        numberOfViewsAgency: null,
        numberOfViewsKivi: null
      };

      // Main characteristics: walk the list items until one is empty/missing.
      for (let mainFieldIndex = 1; ; mainFieldIndex++) {
        const mainField = $(
          `${detailList} > li.list-group-item:nth-child(${mainFieldIndex})`
        )
          .text()
          .replace(/[\n\r\t]/gm, "")
          .trim();
        if (mainField === "") {
          break;
        }
        this.applyMainField(data, mainField);
      }

      // Amenities: the first item carries the category, the rest are flags.
      const amenityItem =
        "li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom";
      for (let additionalFieldIndex = 1; ; additionalFieldIndex++) {
        const additionalField = $(
          `${detailList} > ${amenityItem}:nth-child(${additionalFieldIndex})`
        )
          .text()
          .trim();

        if (additionalFieldIndex === 1) {
          // The offset is computed on the same (stripped) string it is applied to.
          const categoryField = additionalField.replace(/[\n\r\t]/gm, "");
          const labelAt = categoryField.indexOf("Kategorija");
          const categoryTmp =
            labelAt === -1 ? "" : categoryField.substring(labelAt + 10).trim();

          data.realEstateType = this.getAdCategoryId(categoryTmp);
          if (!data.realEstateType) {
            throw new Error(
              "No real estate type - page body not loaded correctly or page changed?"
            );
          }
        } else {
          this.applyAdditionalField(data, additionalField);
        }

        if (additionalField === "") {
          break;
        }
      }

      // Without a usable date on the page the crawl time is used for both.
      // NOTE(review): the only date read is "Dostupno od" ("available from"),
      // which is stored as the published date - confirm this is intended.
      if (!data.publishedDate) {
        data.publishedDate = new Date();
      }
      data.renewedDate = new Date();

      return data;
    } catch (e) {
      console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
    }
    return null;
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Reads latitude/longitude text from the embedded map URL: either a
   * `marker=lat,long` parameter or separate `mlat` / `mlon` parameters.
   *
   * @param {string|undefined} mapSrc - `src` of the map iframe.
   * @returns {{latText: (string|undefined), longText: (string|undefined)}}
   */
  extractMapCoordinates(mapSrc) {
    const coordinates = { latText: undefined, longText: undefined };
    if (!mapSrc) {
      return coordinates;
    }
    const mapParams = getUrlParams(mapSrc);
    if (!mapParams) {
      return coordinates;
    }

    if (mapParams["marker"]) {
      const marker = mapParams["marker"].split(",");
      coordinates.latText = marker[0] ? marker[0] : undefined;
      coordinates.longText = marker[1] ? marker[1] : undefined;
    } else {
      if (mapParams["mlat"]) {
        coordinates.latText = mapParams["mlat"];
      }
      if (mapParams["mlon"]) {
        coordinates.longText = mapParams["mlon"];
      }
    }
    return coordinates;
  }

  /**
   * Applies one "label value" line of the main characteristics to `data`.
   * Labels are matched as whole prefixes, so multi-word labels such as
   * "Broj soba" are recognised; unknown labels are ignored.
   *
   * @param {Object} data - Record being built; mutated in place.
   * @param {string} fieldText - Trimmed text of one list item.
   */
  applyMainField(data, fieldText) {
    const knownLabels = [
      "Površina",
      "Okućnica",
      "Broj soba",
      "Broj spratova",
      "Sprat",
      "Godina renoviranja",
      "Broj parking mjesta",
      "Dostupno od"
    ];
    // The label must be followed by whitespace (or be the whole text) so
    // that "Sprat" does not match a longer word starting with it.
    const label = knownLabels.find(
      candidate =>
        fieldText === candidate ||
        (fieldText.startsWith(candidate) &&
          /\s/.test(fieldText.charAt(candidate.length)))
    );
    if (label === undefined) {
      return;
    }
    const value = fieldText.slice(label.length).trim();

    switch (label) {
      case "Površina":
        data.area = this.numberOrNull(parseFloat(value));
        break;
      case "Okućnica":
        data.gardenSize = this.numberOrNull(parseFloat(value));
        break;
      case "Broj soba":
        data.numberOfRooms = this.numberOrNull(parseInt(value, 10));
        break;
      case "Broj spratova":
        data.numberOfFloors = this.parseNumberOfFloors(value);
        break;
      case "Sprat":
        data.floor = this.numberOrNull(parseInt(value, 10));
        break;
      case "Godina renoviranja":
        data.recentlyAdapted = true;
        break;
      case "Broj parking mjesta":
        data.parking = true;
        break;
      case "Dostupno od":
        data.publishedDate = this.parseAvailableFromDate(value);
        break;
      default:
        break;
    }
  }

  /**
   * Applies one amenity label (e.g. "Balkon") to `data`; unknown labels
   * are ignored.
   *
   * @param {Object} data - Record being built; mutated in place.
   * @param {string} fieldText - Trimmed amenity text.
   */
  applyAdditionalField(data, fieldText) {
    const booleanFeatures = {
      "Internet": "internet",
      "Garaža": "garage",
      "Klima": "airCondition",
      "Balkon": "balcony",
      "Ostava": "storeRoom",
      "Podrum": "basementAttic",
      "Blindirana vrata": "antiTheftDoor",
      "Voda": "water",
      "Kablovska": "cableTV",
      "Uknjiženo": "registeredInZkBooks",
      "Plin": "gas",
      "Alarm": "alarm",
      "Video nadzor": "videoSurveillance",
      "Lift": "elevator",
      "Novogradnja": "newBuilding"
    };
    if (Object.prototype.hasOwnProperty.call(booleanFeatures, fieldText)) {
      data[booleanFeatures[fieldText]] = true;
      return;
    }

    switch (fieldText) {
      case "Grijanje - centralno":
        data.heatingType = HEATING_TYPE.CENTRAL_CITY.id;
        break;
      case "Grijanje - plin":
        data.heatingType = HEATING_TYPE.GAS.id;
        break;
      case "Grijanje - struja":
        data.heatingType = HEATING_TYPE.ELECTRICITY.id;
        break;
      case "Grijanje":
        data.heatingType = HEATING_TYPE.OTHER.id;
        break;
      case "Namješten":
        data.furnishingType = FURNISHING_TYPE.FURNISHED.id;
        break;
      default:
        break;
    }
  }

  /**
   * Parses a day-month-year date such as "01.05.2020" into a local-time Date.
   *
   * @param {string} text - Date text starting with day, month and 4-digit year.
   * @returns {Date|null} The date, or null when the text is not a real calendar date.
   */
  parseAvailableFromDate(text) {
    const match = /^(\d{1,2})\D(\d{1,2})\D(\d{4})/.exec(text);
    if (!match) {
      return null;
    }
    const day = Number(match[1]);
    const month = Number(match[2]);
    const year = Number(match[3]);
    const date = new Date(year, month - 1, day);
    // Date silently rolls over impossible dates (31.02 -> March); reject those.
    const isRealDate =
      date.getFullYear() === year &&
      date.getMonth() === month - 1 &&
      date.getDate() === day;
    return isRealDate ? date : null;
  }

  /**
   * @param {number} value - Result of parseInt/parseFloat.
   * @returns {number|null} `value`, or null when it is NaN.
   */
  numberOrNull(value) {
    return Number.isNaN(value) ? null : value;
  }

  /**
   * Maps the category text shown on an ad page to an AD_CATEGORY id.
   *
   * @param {string} categoryText
   * @returns {*} The category id, or undefined for unknown text.
   */
  getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "Stan":
        return AD_CATEGORY.FLAT.id;
      case "Građevinsko zemljiste":
      case "Industrijsko zemljiste":
      case "Poljoprivredno zemljiste":
        return AD_CATEGORY.LAND.id;
      case "Kuća":
        return AD_CATEGORY.HOUSE.id;
      case "Poslovni prostor":
      case "Kancelarije":
        return AD_CATEGORY.OFFICE.id;
      case "Apartmani":
        return AD_CATEGORY.APARTMENT.id;
      case "Garaža":
        return AD_CATEGORY.GARAGE.id;
      case "Vikendica":
        return AD_CATEGORY.COTTAGE.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the ribbon text of a search result to an AD_TYPE string id.
   *
   * @param {string} adTypeText
   * @returns {*} The ad type string id, or undefined for unknown text.
   */
  getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "PRODAJA":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "NAJAM":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return undefined;
    }
  }

  /**
   * Parses the "number of floors" value.
   *
   * @param {string} numberOfFloorsText - A number, or a "+"-joined floor list.
   * @returns {number|null} The floor count, or null when it cannot be derived.
   */
  parseNumberOfFloors(numberOfFloorsText) {
    const tryNumericalValue = parseInt(numberOfFloorsText, 10);
    if (!Number.isNaN(tryNumericalValue)) {
      return tryNumericalValue;
    }

    // Guess the number of floors from the "+"-separated parts,
    // e.g. P+S+Pt -> 3 floors.
    if (
      typeof numberOfFloorsText === "string" &&
      numberOfFloorsText.indexOf("+") > 0
    ) {
      return numberOfFloorsText.split("+").length;
    }

    return null;
  }

  /**
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>} Resolves after the delay.
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Saves one page of scraped records with the first configured saver and
   * returns the saver's result (`crawl` reads `newRecords` from it).
   *
   * @param {Array} results - Scraped ad records.
   * @throws {Error} When no saver is configured.
   */
  async saveCrawledResults(results) {
    const [primarySaver] = this.savers;
    if (!primarySaver) {
      throw new Error("[SALJIC] No saver configured");
    }
    // Only the first saver is used so that its insert information
    // (e.g. which records are new) can be returned to the caller.
    return primarySaver.save(results);
  }
}
|
||||||
|
|
||||||
|
module.exports = SaljicCrawler;
|
||||||
@@ -332,10 +332,14 @@ const findRealEstatesForSearchRequest = async (searchRequest, maxResults) => {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//When includeIncompleteAds are not defined - null it will consider it true
|
||||||
const order = [["updatedAt", "desc"]];
|
const order = [["updatedAt", "desc"]];
|
||||||
|
|
||||||
return db.RealEstate.findAll({
|
return db.RealEstate.findAll({
|
||||||
where: includeIncompleteAds ? queryIncludeIncomplete : query,
|
where:
|
||||||
|
includeIncompleteAds || includeIncompleteAds == null
|
||||||
|
? queryIncludeIncomplete
|
||||||
|
: query,
|
||||||
limit: maxResults,
|
limit: maxResults,
|
||||||
order
|
order
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ const db = require("../../models/index");
|
|||||||
const sequelize = require("sequelize");
|
const sequelize = require("sequelize");
|
||||||
const Op = sequelize.Op;
|
const Op = sequelize.Op;
|
||||||
const { AD_CATEGORY } = require("../../common/enums");
|
const { AD_CATEGORY } = require("../../common/enums");
|
||||||
|
const { CHECK_UP_DAYS } = require("../../config/appConfig");
|
||||||
|
|
||||||
const getSearchRequest = async searchRequestId => {
|
const getSearchRequest = async searchRequestId => {
|
||||||
try {
|
try {
|
||||||
@@ -16,6 +17,22 @@ const getSearchRequest = async searchRequestId => {
|
|||||||
const createSearchRequest = async (searchRequestFields = {}) => {
|
const createSearchRequest = async (searchRequestFields = {}) => {
|
||||||
return await db.SearchRequest.create(searchRequestFields);
|
return await db.SearchRequest.create(searchRequestFields);
|
||||||
};
|
};
|
||||||
|
/**
 * Finds the search requests due for a check-up: those whose `notifiedAt`
 * is at least CHECK_UP_DAYS days in the past.
 *
 * @returns {Promise<Array>} Result of `db.SearchRequest.findAll`.
 */
const findAllRequestsForCheckUp = async () => {
  const millisecondsPerDay = 24 * 60 * 60 * 1000;
  const cutoff = new Date(Date.now() - millisecondsPerDay * CHECK_UP_DAYS);

  return db.SearchRequest.findAll({
    where: {
      notifiedAt: {
        [Op.lte]: cutoff
      }
    }
  });
};
|
||||||
|
|
||||||
const findSearchRequestsForRealEstate = async realEstate => {
|
const findSearchRequestsForRealEstate = async realEstate => {
|
||||||
const {
|
const {
|
||||||
@@ -157,7 +174,7 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
} else {
|
} else {
|
||||||
// If real estate dont have defined number of rooms ex. null
|
// If real estate dont have defined number of rooms ex. null
|
||||||
//It returns requests that didn't choose number of rooms - also null
|
//It returns requests that didn't choose number of rooms - also null
|
||||||
//Or ones that picked some values but also picked to includeIncomplete ads
|
//Or ones that picked some values but also picked to includeIncomplete ads (or default)
|
||||||
numberOfRoomsQuery = {
|
numberOfRoomsQuery = {
|
||||||
[Op.or]: [
|
[Op.or]: [
|
||||||
{
|
{
|
||||||
@@ -176,7 +193,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
includeIncompleteAds: {
|
includeIncompleteAds: {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -226,7 +246,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
includeIncompleteAds: {
|
includeIncompleteAds: {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -275,7 +298,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
includeIncompleteAds: {
|
includeIncompleteAds: {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -313,7 +339,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
includeIncompleteAds: {
|
includeIncompleteAds: {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -347,7 +376,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
includeIncompleteAds: {
|
includeIncompleteAds: {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -381,7 +413,10 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
includeIncompleteAds: {
|
includeIncompleteAds: {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -423,10 +458,13 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
[Op.eq]: "ANY"
|
[Op.eq]: "ANY"
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
//Tag to check if incomplete ads are accepted in query
|
//Tag to check if incomplete ads are accepted in query which is default
|
||||||
if (checkForIncompleteWanted) {
|
if (checkForIncompleteWanted) {
|
||||||
query.includeIncompleteAds = {
|
query.includeIncompleteAds = {
|
||||||
[Op.eq]: true
|
[Op.or]: {
|
||||||
|
[Op.eq]: true,
|
||||||
|
[Op.is]: null
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -438,5 +476,6 @@ const findSearchRequestsForRealEstate = async realEstate => {
|
|||||||
module.exports = {
|
module.exports = {
|
||||||
getSearchRequest,
|
getSearchRequest,
|
||||||
createSearchRequest,
|
createSearchRequest,
|
||||||
findSearchRequestsForRealEstate
|
findSearchRequestsForRealEstate,
|
||||||
|
findAllRequestsForCheckUp
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
const db = require("../../models/index");
|
const db = require("../../models/index");
|
||||||
const sequelize = require("sequelize");
|
const sequelize = require("sequelize");
|
||||||
const Op = sequelize.Op;
|
const Op = sequelize.Op;
|
||||||
const { CHECK_UP_DAYS } = require("../../config/appConfig");
|
|
||||||
|
|
||||||
const findRealEstatesForSearchRequest = async searchRequestId => {
|
const findRealEstatesForSearchRequest = async searchRequestId => {
|
||||||
const query = {
|
const query = {
|
||||||
@@ -43,42 +42,6 @@ const findNotNotifiedMatches = async () => {
|
|||||||
|
|
||||||
return matchingRecords;
|
return matchingRecords;
|
||||||
};
|
};
|
||||||
/**
 * Finds subscribed search requests that have no SearchRequestMatch created
 * within the last CHECK_UP_DAYS days.
 *
 * @returns {Promise<Array>} Result of `db.SearchRequest.findAll`.
 */
const findAllRequestsForCheckUp = async () => {
  // Calendar-day arithmetic: move the day-of-month back by CHECK_UP_DAYS.
  const recentThreshold = new Date();
  recentThreshold.setDate(recentThreshold.getDate() - CHECK_UP_DAYS);

  // Requests with a match created since the threshold are excluded below.
  const recentMatches = await db.SearchRequestMatch.findAll({
    attributes: ["searchRequestId"],
    where: {
      createdAt: {
        [Op.gte]: recentThreshold
      }
    },
    order: [["searchRequestId", "ASC"]]
  });

  // A Set drops duplicate request ids while keeping first-seen order.
  const excludedRequestIds = new Set();
  for (const { dataValues } of recentMatches) {
    excludedRequestIds.add(dataValues.searchRequestId);
  }

  return db.SearchRequest.findAll({
    where: {
      subscribed: true,
      id: {
        [Op.notIn]: [...excludedRequestIds]
      }
    }
  });
};
|
|
||||||
|
|
||||||
const addMatches = async matchingRecords => {
|
const addMatches = async matchingRecords => {
|
||||||
return await db.SearchRequestMatch.bulkCreate(matchingRecords, {
|
return await db.SearchRequestMatch.bulkCreate(matchingRecords, {
|
||||||
@@ -89,6 +52,5 @@ const addMatches = async matchingRecords => {
|
|||||||
module.exports = {
|
module.exports = {
|
||||||
findRealEstatesForSearchRequest,
|
findRealEstatesForSearchRequest,
|
||||||
addMatches,
|
addMatches,
|
||||||
findNotNotifiedMatches,
|
findNotNotifiedMatches
|
||||||
findAllRequestsForCheckUp
|
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,8 +1,16 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const { MAX_REAL_ESTATES_IN_EMAIL, APP_URL } = require("../config/appConfig");
|
const {
|
||||||
|
MAX_REAL_ESTATES_IN_EMAIL,
|
||||||
|
APP_URL,
|
||||||
|
STAGING
|
||||||
|
} = require("../config/appConfig");
|
||||||
const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums");
|
const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums");
|
||||||
|
|
||||||
|
//Tag to recognize staging from development
|
||||||
|
const stagingTag = STAGING ? "[STAGING] " : "";
|
||||||
|
const wordOfMouthRequest = `Molimo vas <strong>recite svojim prijateljima</strong> za Kivi - što više korisnika budemo imali, moći ćemo više agencija uključiti i više nekretnina imati u bazi. Hvala!`
|
||||||
|
|
||||||
const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => {
|
const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => {
|
||||||
return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div>
|
return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div>
|
||||||
<div>Ako želite prestati dobijati obavještenja za ovu pretragu, <a href="${APP_URL}/odjava/${searchRequestId}">odjavite ovdje</a></div>
|
<div>Ako želite prestati dobijati obavještenja za ovu pretragu, <a href="${APP_URL}/odjava/${searchRequestId}">odjavite ovdje</a></div>
|
||||||
@@ -54,7 +62,7 @@ const generateNotificationEmail = (
|
|||||||
|
|
||||||
const messageBody = dailyNotification ? dailyMessageBody : asapMessageBody;
|
const messageBody = dailyNotification ? dailyMessageBody : asapMessageBody;
|
||||||
|
|
||||||
return `<h3>Zdravo</h3>
|
return `<h3>${stagingTag}Zdravo</h3>
|
||||||
<h4>${messageBody}</h4>
|
<h4>${messageBody}</h4>
|
||||||
<div>
|
<div>
|
||||||
${realEstateLinks}
|
${realEstateLinks}
|
||||||
@@ -62,6 +70,9 @@ const generateNotificationEmail = (
|
|||||||
${moreRealEstates}
|
${moreRealEstates}
|
||||||
</div>
|
</div>
|
||||||
<br/>
|
<br/>
|
||||||
|
${wordOfMouthRequest}
|
||||||
|
<br/>
|
||||||
|
<br/>
|
||||||
${emailFooter}`;
|
${emailFooter}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -113,7 +124,7 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
|
|||||||
|
|
||||||
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
|
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
|
||||||
|
|
||||||
return `<h3>Zdravo</h3>
|
return `<h3>${stagingTag}Zdravo</h3>
|
||||||
<div>Naručili ste da Vam javimo ako se nekretnina sa navedenim uslovima pojavi u oglasima:</div>
|
<div>Naručili ste da Vam javimo ako se nekretnina sa navedenim uslovima pojavi u oglasima:</div>
|
||||||
<br/>
|
<br/>
|
||||||
<div>
|
<div>
|
||||||
@@ -125,12 +136,16 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
|
|||||||
</div>
|
</div>
|
||||||
${matchingRealEstates.length > 0 ? instantRealEstatesText : ""}
|
${matchingRealEstates.length > 0 ? instantRealEstatesText : ""}
|
||||||
<br/>
|
<br/>
|
||||||
|
<br/>
|
||||||
|
${wordOfMouthRequest}
|
||||||
|
<br/>
|
||||||
|
<br/>
|
||||||
${emailFooter}`;
|
${emailFooter}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
|
const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
|
||||||
if (numberOfRealEstates === 1) {
|
if (numberOfRealEstates === 1) {
|
||||||
return `Kivi: ${singleRealEstateTitle}`;
|
return `${stagingTag}Kivi: ${singleRealEstateTitle}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
const leastSignificantDigit = numberOfRealEstates % 10;
|
const leastSignificantDigit = numberOfRealEstates % 10;
|
||||||
@@ -138,7 +153,7 @@ const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
|
|||||||
const secondLeastSignificantDigit = numberWithoutLastDigit % 10;
|
const secondLeastSignificantDigit = numberWithoutLastDigit % 10;
|
||||||
|
|
||||||
if (leastSignificantDigit === 1 && secondLeastSignificantDigit !== 1) {
|
if (leastSignificantDigit === 1 && secondLeastSignificantDigit !== 1) {
|
||||||
return `Kivi : ${numberOfRealEstates} nova nekretnina`;
|
return `${stagingTag}Kivi : ${numberOfRealEstates} nova nekretnina`;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -146,10 +161,10 @@ const generateEmailSubject = (numberOfRealEstates, singleRealEstateTitle) => {
|
|||||||
leastSignificantDigit <= 4 &&
|
leastSignificantDigit <= 4 &&
|
||||||
secondLeastSignificantDigit !== 1
|
secondLeastSignificantDigit !== 1
|
||||||
) {
|
) {
|
||||||
return `Kivi: ${numberOfRealEstates} nove nekretnine`;
|
return `${stagingTag}Kivi: ${numberOfRealEstates} nove nekretnine`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return `Kivi: ${numberOfRealEstates} novih nekretnina`;
|
return `${stagingTag}Kivi: ${numberOfRealEstates} novih nekretnina`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const generateCheckUpEmail = searchRequest => {
|
const generateCheckUpEmail = searchRequest => {
|
||||||
@@ -164,13 +179,23 @@ const generateCheckUpEmail = searchRequest => {
|
|||||||
priceMax
|
priceMax
|
||||||
} = searchRequest;
|
} = searchRequest;
|
||||||
|
|
||||||
|
let emailFrequencyTitle;
|
||||||
|
switch (searchRequest.emailFrequency) {
|
||||||
|
case EMAIL_FREQUENCY.ASAP.stringId:
|
||||||
|
emailFrequencyTitle = EMAIL_FREQUENCY.ASAP.title;
|
||||||
|
break;
|
||||||
|
case EMAIL_FREQUENCY.DAILY.stringId:
|
||||||
|
emailFrequencyTitle = EMAIL_FREQUENCY.DAILY.title;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
const gardenSize = realEstateType.hasGardenSize
|
const gardenSize = realEstateType.hasGardenSize
|
||||||
? `<div><strong>Kvadratura okućnice: Od ${gardenSizeMin} do ${gardenSizeMax} m2</strong></div>`
|
? `<div><strong>Kvadratura okućnice: Od ${gardenSizeMin} do ${gardenSizeMax} m2</strong></div>`
|
||||||
: ``;
|
: ``;
|
||||||
|
|
||||||
const emailFooter = generateEmailFooter(id);
|
const emailFooter = generateEmailFooter(id, emailFrequencyTitle);
|
||||||
|
|
||||||
return `<h3>Zdravo</h3>
|
return `<h3>${stagingTag}Zdravo</h3>
|
||||||
<div><strong>Kivi tim traži nekretnine za Vas i kada to ne vidite.</strong></div>
|
<div><strong>Kivi tim traži nekretnine za Vas i kada to ne vidite.</strong></div>
|
||||||
<br />
|
<br />
|
||||||
<div>Vaša trenutno aktivna pretraga je:</div>
|
<div>Vaša trenutno aktivna pretraga je:</div>
|
||||||
|
|||||||
58
app/helpers/fetchWrapper.js
Normal file
58
app/helpers/fetchWrapper.js
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
const nodeFetch = require("node-fetch");
|
||||||
|
const AbortController = require('abort-controller');
|
||||||
|
const FetchCache = require('@sozialhelden/fetch-cache').default;
|
||||||
|
|
||||||
|
console.log("Fc ", FetchCache)
|
||||||
|
|
||||||
|
const {
|
||||||
|
USER_AGENT,
|
||||||
|
USE_SCRAPER_API,
|
||||||
|
SCRAPER_API_KEY,
|
||||||
|
SCRAPER_API_BASE_URL,
|
||||||
|
NODE_FETCH_TIMEOUT_MS
|
||||||
|
} = require("../config/appConfig");
|
||||||
|
|
||||||
|
/**
 * Sleep helper.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves with no value once the timer fires.
 */
const timeout = (ms) =>
  new Promise(done => {
    setTimeout(done, ms);
  });
|
||||||
|
|
||||||
|
const fetchCache = new FetchCache({
|
||||||
|
fetch: nodeFetch,
|
||||||
|
cacheOptions: {
|
||||||
|
// Don't save more than 10000 responses in the cache. Allows infinite responses by default
|
||||||
|
maximalItemCount: 10000,
|
||||||
|
// When should the cache evict responses when its full?
|
||||||
|
evictExceedingItemsBy: 'age', // Valid values: 'lru' or 'age'
|
||||||
|
defaultTTL: 6 * 60 * 60 * 1000 // 6 hours
|
||||||
|
// ...see https://github.com/sozialhelden/hamster-cache for all possible options
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Project-wide fetch wrapper: copies the caller's options, attaches an abort
 * signal that fires after NODE_FETCH_TIMEOUT_MS, optionally rewrites the URL so
 * the request goes through the configured scraper API, and sends it through
 * either `fetchCache` or plain `nodeFetch`.
 *
 * @param {string} url - Address of the resource to fetch.
 * @param {object} [options={}] - Options forwarded to the underlying fetch. The
 *   object is shallow-copied; `signal` is always replaced by the timeout signal.
 * @param {boolean} [useCache=true] - true: request via `fetchCache.fetch`,
 *   false: request via `nodeFetch` directly.
 * @returns {Promise} Whatever the underlying fetch call resolves or rejects with.
 */
const fetch = async (url, options = {}, useCache = true) => {
  const controller = new AbortController();

  const newOptions = Object.assign({}, options);
  if (!newOptions["headers"]) {
    newOptions["headers"] = {};
  }

  newOptions.signal = controller.signal;

  // newOptions["headers"]["User-Agent"] = USER_AGENT;

  let urlAdaptedForScraping = url;
  if (USE_SCRAPER_API) {
    // Only touch SCRAPER_API_BASE_URL when the proxy is enabled, so a missing
    // base URL cannot break direct (non-proxied) requests.
    // Hosts containing "scraperapi" receive the plain URL, any other proxy
    // receives it base64 encoded.
    const urlToFetchThroughAPI = SCRAPER_API_BASE_URL.includes("scraperapi")
      ? url
      : Buffer.from(url).toString("base64");

    // The payload travels as a query parameter, so it must be percent-encoded:
    // a raw URL may contain "&"/"=" and base64 may contain "+", "/" and "=".
    urlAdaptedForScraping = `${SCRAPER_API_BASE_URL}?api_key=${SCRAPER_API_KEY}&url=${encodeURIComponent(
      urlToFetchThroughAPI
    )}`;
  }

  const result = useCache
    ? fetchCache.fetch(urlAdaptedForScraping, newOptions)
    : nodeFetch(urlAdaptedForScraping, newOptions);

  const timeoutId = setTimeout(() => controller.abort(), NODE_FETCH_TIMEOUT_MS);

  // A request that already failed has nothing left to abort, so drop its timer.
  // On success the timer is deliberately left armed: the caller may still be
  // reading the response body and the abort is what bounds that read.
  // The derived promise always fulfils, so it cannot cause an unhandled rejection.
  Promise.resolve(result).then(undefined, () => clearTimeout(timeoutId));

  return result;
};
|
||||||
|
|
||||||
|
module.exports = fetch;
|
||||||
13
app/helpers/log.js
Normal file
13
app/helpers/log.js
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
const {
|
||||||
|
PRINT_CRAWLER_DEBUG
|
||||||
|
} = require("../config/appConfig");
|
||||||
|
|
||||||
|
/**
 * Debug logger: forwards every argument to console.log, but only while the
 * PRINT_CRAWLER_DEBUG config flag is truthy. Otherwise it does nothing.
 *
 * @param {...*} parts - Values to print.
 */
const logDebug = (...parts) => {
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }

  console.log(...parts);
};
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
logDebug
|
||||||
|
};
|
||||||
@@ -7,6 +7,26 @@ const currentSearchRequest = async req => {
|
|||||||
|
|
||||||
return await getSearchRequest(searchRequestId);
|
return await getSearchRequest(searchRequestId);
|
||||||
};
|
};
|
||||||
module.exports = {
|
|
||||||
currentSearchRequest
|
/**
 * Extracts the query-string parameters of a URL into a plain object.
 *
 * Everything after the first "?" (up to an optional "#fragment") is split on
 * "&"; each segment is split on its FIRST "=" so values may themselves contain
 * "=". Values are percent-decoded; keys are kept exactly as written.
 *
 * @param {string} url - URL to inspect.
 * @returns {Object<string, string>|undefined} Map of parameter name to decoded
 *   value, or undefined when `url` is not a non-empty string or has no "?".
 *   A parameter without a value (e.g. "?flag") maps to "".
 */
const getUrlParams = function (url) {
  if (typeof url !== "string" || url.length === 0) {
    return undefined;
  }

  const questionMarkIndex = url.indexOf("?");
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // decodeURIComponent throws URIError on malformed escapes (e.g. "%E0%A4%A");
  // fall back to the raw text instead of failing the whole parse.
  const decodeValue = rawValue => {
    try {
      return decodeURIComponent(rawValue);
    } catch (err) {
      return rawValue;
    }
  };

  // A fragment after the query is not part of any parameter value.
  const query = url.substring(questionMarkIndex + 1).split("#")[0];

  const params = {};
  for (const segment of query.split("&")) {
    // Skip empty segments produced by "?", "a=1&" or "a=1&&b=2".
    if (segment.length === 0) {
      continue;
    }

    const separatorIndex = segment.indexOf("=");
    const key =
      separatorIndex === -1 ? segment : segment.substring(0, separatorIndex);
    const rawValue =
      separatorIndex === -1 ? "" : segment.substring(separatorIndex + 1);

    params[key] = decodeValue(rawValue);
  }

  return params;
};
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
currentSearchRequest,
|
||||||
|
getUrlParams
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
up: (queryInterface, Sequelize) => {
|
||||||
|
return queryInterface.addColumn("SearchRequests", "notifiedAt", {
|
||||||
|
type: Sequelize.DATE,
|
||||||
|
defaultValue: new Date()
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
|
down: (queryInterface, Sequelize) => {
|
||||||
|
return queryInterface.removeColumn("SearchRequests", "notifiedAt");
|
||||||
|
}
|
||||||
|
};
|
||||||
@@ -16,7 +16,7 @@ config.logging = parseInt(process.env.SEQUELIZE_LOGGING) ? console.log : false;
|
|||||||
|
|
||||||
let sequelize;
|
let sequelize;
|
||||||
if (config.use_env_variable) {
|
if (config.use_env_variable) {
|
||||||
sequelize = new Sequelize(process.env[config.use_env_variable], config);
|
sequelize = new Sequelize(process.env[config.use_env_variable] + "?ssl=true", config);
|
||||||
} else {
|
} else {
|
||||||
sequelize = new Sequelize(
|
sequelize = new Sequelize(
|
||||||
config.database,
|
config.database,
|
||||||
|
|||||||
@@ -15,15 +15,7 @@ module.exports = (sequelize, DataTypes) => {
|
|||||||
allowNull: false,
|
allowNull: false,
|
||||||
defaultValue: {
|
defaultValue: {
|
||||||
type: "Polygon",
|
type: "Polygon",
|
||||||
coordinates: [
|
coordinates: [[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]],
|
||||||
[
|
|
||||||
[0, 0],
|
|
||||||
[0, 0],
|
|
||||||
[0, 0],
|
|
||||||
[0, 0],
|
|
||||||
[0, 0]
|
|
||||||
]
|
|
||||||
],
|
|
||||||
crs: { type: "name", properties: { name: "EPSG:4326" } }
|
crs: { type: "name", properties: { name: "EPSG:4326" } }
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -90,7 +82,11 @@ module.exports = (sequelize, DataTypes) => {
|
|||||||
floorMin: DataTypes.INTEGER,
|
floorMin: DataTypes.INTEGER,
|
||||||
floorMax: DataTypes.INTEGER,
|
floorMax: DataTypes.INTEGER,
|
||||||
accessRoadType: DataTypes.TEXT,
|
accessRoadType: DataTypes.TEXT,
|
||||||
heatingType: DataTypes.TEXT
|
heatingType: DataTypes.TEXT,
|
||||||
|
notifiedAt: {
|
||||||
|
type: DataTypes.DATE,
|
||||||
|
defaultValue: new Date()
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return SearchRequest;
|
return SearchRequest;
|
||||||
|
|||||||
@@ -1,4 +1,8 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
const { STAGING } = require("../config/appConfig");
|
||||||
|
|
||||||
|
const stagingTag = STAGING ? "[STAGING] " : "";
|
||||||
|
|
||||||
const {
|
const {
|
||||||
matchRealEstates,
|
matchRealEstates,
|
||||||
matchSearchRequest
|
matchSearchRequest
|
||||||
@@ -11,9 +15,10 @@ const {
|
|||||||
} = require("../helpers/emailContentGenerator");
|
} = require("../helpers/emailContentGenerator");
|
||||||
const {
|
const {
|
||||||
findNotNotifiedMatches,
|
findNotNotifiedMatches,
|
||||||
findAllRequestsForCheckUp,
|
|
||||||
findRealEstatesForSearchRequest
|
findRealEstatesForSearchRequest
|
||||||
} = require("../helpers/db/searchRequestMatch");
|
} = require("../helpers/db/searchRequestMatch");
|
||||||
|
const { findAllRequestsForCheckUp } = require("../helpers/db/searchRequest");
|
||||||
|
|
||||||
const { sendEmail } = require("../services/emailService");
|
const { sendEmail } = require("../services/emailService");
|
||||||
|
|
||||||
const notifyForNewRealEstates = async newRealEstates => {
|
const notifyForNewRealEstates = async newRealEstates => {
|
||||||
@@ -26,13 +31,17 @@ const notifyForNewSearchRequest = async searchRequest => {
|
|||||||
|
|
||||||
const searchRequestId = searchRequest.id;
|
const searchRequestId = searchRequest.id;
|
||||||
const matchingRealEstates = matches[searchRequestId].realEstates;
|
const matchingRealEstates = matches[searchRequestId].realEstates;
|
||||||
|
|
||||||
const emailContent = generateNewSearchRequestEmail(
|
const emailContent = generateNewSearchRequestEmail(
|
||||||
searchRequest,
|
searchRequest,
|
||||||
matchingRealEstates
|
matchingRealEstates
|
||||||
);
|
);
|
||||||
const { email } = searchRequest;
|
const { email } = searchRequest;
|
||||||
await sendEmail(email, "Kivi - novi zahtjev za pretragu", emailContent);
|
//In case of the new search req, notifiedAt column is populated with default value - now (moment of creation)
|
||||||
|
await sendEmail(
|
||||||
|
email,
|
||||||
|
`${stagingTag} Kivi - novi zahtjev za pretragu`,
|
||||||
|
emailContent
|
||||||
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
const notifyMatches = async (matches, dailyNotification = false) => {
|
const notifyMatches = async (matches, dailyNotification = false) => {
|
||||||
@@ -68,6 +77,10 @@ const notifyMatches = async (matches, dailyNotification = false) => {
|
|||||||
sendEmailPromise.catch(err =>
|
sendEmailPromise.catch(err =>
|
||||||
console.log("[Email Sending Failed]", err)
|
console.log("[Email Sending Failed]", err)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
//Change time of notified At for searchReq
|
||||||
|
searchRequest.notifiedAt = new Date();
|
||||||
|
searchRequest.save();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -123,20 +136,24 @@ const notifyRequestsWithDailyOption = async () => {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const checkUpNotify = async () => {
|
const checkUpNotify = async () => {
|
||||||
const searchRequestsForCheckUp = await findAllRequestsForCheckUp();
|
/* const searchRequestsForCheckUp = await findAllRequestsForCheckUp();
|
||||||
|
|
||||||
const asyncSendEmailActions = [];
|
const asyncSendEmailActions = [];
|
||||||
|
|
||||||
for (const searchRequest of searchRequestsForCheckUp) {
|
for (const searchRequest of searchRequestsForCheckUp) {
|
||||||
const { email } = searchRequest.dataValues;
|
const { email } = searchRequest.dataValues;
|
||||||
const emailSubject = `Kivi: Mi tražimo nekretnine za vas!`;
|
const emailSubject = `${stagingTag}Kivi: Mi tražimo nekretnine za vas!`;
|
||||||
const emailContent = generateCheckUpEmail(searchRequest.dataValues);
|
const emailContent = generateCheckUpEmail(searchRequest.dataValues);
|
||||||
|
|
||||||
const sendEmailPromise = sendEmail(email, emailSubject, emailContent);
|
const sendEmailPromise = sendEmail(email, emailSubject, emailContent);
|
||||||
asyncSendEmailActions.push(sendEmailPromise);
|
asyncSendEmailActions.push(sendEmailPromise);
|
||||||
sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err));
|
sendEmailPromise.catch(err => console.log("[Email Sending Failed]", err));
|
||||||
|
|
||||||
|
//Change time of notified At for searchReq
|
||||||
|
searchRequest.notifiedAt = new Date();
|
||||||
|
searchRequest.save();
|
||||||
}
|
}
|
||||||
await Promise.all(asyncSendEmailActions);
|
await Promise.all(asyncSendEmailActions);*/
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@@ -61,9 +61,8 @@
|
|||||||
<p class="distinguished">
|
<p class="distinguished">
|
||||||
<label class="checkbox-label">
|
<label class="checkbox-label">
|
||||||
<input type="checkbox" class="filled-in" name="includeIncompleteAds"
|
<input type="checkbox" class="filled-in" name="includeIncompleteAds"
|
||||||
<% if (includeIncompleteAds) { %>
|
|
||||||
checked
|
checked
|
||||||
<% } %>>
|
>
|
||||||
<span>Uključi i oglase bez potpunih informacija</span>
|
<span>Uključi i oglase bez potpunih informacija</span>
|
||||||
</label>
|
</label>
|
||||||
</p>
|
</p>
|
||||||
|
|||||||
@@ -8,6 +8,10 @@ SEQUELIZE_LOGGING=0- no sequelize logging, 1- log to the console
|
|||||||
PORT=Port for the app, defaults to 5000
|
PORT=Port for the app, defaults to 5000
|
||||||
APP_BASE_URL=base url for the app
|
APP_BASE_URL=base url for the app
|
||||||
|
|
||||||
|
ENVIRONMENT=Variable to denote development, staging and production
|
||||||
|
|
||||||
|
USER_AGENT=User agent header to send in fetch requests
|
||||||
|
|
||||||
MAX_REAL_ESTATES_IN_EMAIL=Max number of real estates that will be shown in email, others will be truncated and URL with full list will be shown
|
MAX_REAL_ESTATES_IN_EMAIL=Max number of real estates that will be shown in email, others will be truncated and URL with full list will be shown
|
||||||
MAX_REAL_ESTATES_IN_FIRST_EMAIL=Max number of real estates that will be shown in first (welcome) email
|
MAX_REAL_ESTATES_IN_FIRST_EMAIL=Max number of real estates that will be shown in first (welcome) email
|
||||||
|
|
||||||
@@ -18,6 +22,11 @@ GA_ID=Google Analytics ID
|
|||||||
#=============== GOOGLE MAPS =============#
|
#=============== GOOGLE MAPS =============#
|
||||||
API_MAP_KEY=(your-key-here)
|
API_MAP_KEY=(your-key-here)
|
||||||
|
|
||||||
|
#=============== SCRAPER API SUPPORT =============#
|
||||||
|
USE_SCRAPER_API= To turn it on (1) or off (0)
|
||||||
|
SCRAPER_API_KEY= Key for Scraper api
|
||||||
|
SCRAPER_API_BASE_URL= Base url without question mark (example: http://sabur.kivi.ba:1337)
|
||||||
|
|
||||||
#=============== AWS SDK EMAIL SETTINGS =======#
|
#=============== AWS SDK EMAIL SETTINGS =======#
|
||||||
AWS_KEY_ID=(your-key-here)
|
AWS_KEY_ID=(your-key-here)
|
||||||
AWS_SECRET_ACCESS_KEY=(your-key-here)
|
AWS_SECRET_ACCESS_KEY=(your-key-here)
|
||||||
@@ -62,3 +71,9 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
|
|||||||
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
||||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
|
#==SALJIC NEKRETNINE==
|
||||||
|
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
|
||||||
|
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
||||||
|
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||||
|
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||||
|
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
|
|||||||
9
index.js
9
index.js
@@ -4,6 +4,7 @@ const bodyParser = require("body-parser");
|
|||||||
const layout = require("express-layout");
|
const layout = require("express-layout");
|
||||||
const compression = require("compression");
|
const compression = require("compression");
|
||||||
const forceSSL = require("./app/helpers/forceSSL");
|
const forceSSL = require("./app/helpers/forceSSL");
|
||||||
|
const { logDebug } = require("./app/helpers/log");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
APP_PORT,
|
APP_PORT,
|
||||||
@@ -38,11 +39,17 @@ app.listen(APP_PORT, () =>
|
|||||||
|
|
||||||
let crawlerRunning = STOP_CRAWLER;
|
let crawlerRunning = STOP_CRAWLER;
|
||||||
const crawl = () => {
|
const crawl = () => {
|
||||||
|
logDebug("Crawl start. crawlerRunning: ", crawlerRunning);
|
||||||
if (!crawlerRunning) {
|
if (!crawlerRunning) {
|
||||||
crawlerRunning = true;
|
crawlerRunning = true;
|
||||||
crawlAll().then(newRealEstates => {
|
crawlAll().then(newRealEstates => {
|
||||||
crawlerRunning = false;
|
logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
|
||||||
notifyForNewRealEstates(newRealEstates);
|
notifyForNewRealEstates(newRealEstates);
|
||||||
|
}).catch(e => {
|
||||||
|
console.error('Error happened: ', e);
|
||||||
|
}).finally(()=> {
|
||||||
|
crawlerRunning = false;
|
||||||
|
logDebug('Finally done!');
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
39
package-lock.json
generated
39
package-lock.json
generated
@@ -40,6 +40,32 @@
|
|||||||
"@sendgrid/helpers": "^6.3.0"
|
"@sendgrid/helpers": "^6.3.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"@sozialhelden/fetch-cache": {
|
||||||
|
"version": "2.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/@sozialhelden/fetch-cache/-/fetch-cache-2.0.1.tgz",
|
||||||
|
"integrity": "sha512-vMlsdT5JQCGjx1fcFxmMNh7ZKppjjsfUAeZEhhNwhEL7GaqbZXsD1OXEyx2IcRa25ZuZtvJSV6Q3rE77VRdLvg==",
|
||||||
|
"requires": {
|
||||||
|
"@sozialhelden/hamster-cache": "^1.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"@sozialhelden/hamster-cache": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@sozialhelden/hamster-cache/-/hamster-cache-1.0.0.tgz",
|
||||||
|
"integrity": "sha512-/TEGA8mdMawZp4Yq/GrkL+72YL5EGuSeVXC3pKW12YY1t3C+zCN/HZ0HRp4zWF/e67svXcxuz/B0AEQxEdvi7A=="
|
||||||
|
},
|
||||||
|
"@supercharge/goodies": {
|
||||||
|
"version": "1.4.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@supercharge/goodies/-/goodies-1.4.0.tgz",
|
||||||
|
"integrity": "sha512-Np6u2qjRwiA3wTgzz4n2yduydIjSXqtJWP5cOnNqjdlCR/EUAK86LAOhEcU+YW211D1ksugns3GqpARJDoXQ7g=="
|
||||||
|
},
|
||||||
|
"@supercharge/promise-pool": {
|
||||||
|
"version": "1.3.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@supercharge/promise-pool/-/promise-pool-1.3.0.tgz",
|
||||||
|
"integrity": "sha512-9/EVrJevSPEqI4i/gRH8Dt7C+FQT65wRRYuu0MDaGmSLZ2aTel0jOGu8Ae84fPiQ+Ah0B80RPFUxk+K+Cz48DA==",
|
||||||
|
"requires": {
|
||||||
|
"@supercharge/goodies": "~1.4.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"@types/caseless": {
|
"@types/caseless": {
|
||||||
"version": "0.12.2",
|
"version": "0.12.2",
|
||||||
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz",
|
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz",
|
||||||
@@ -79,6 +105,14 @@
|
|||||||
"resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz",
|
||||||
"integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q=="
|
"integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q=="
|
||||||
},
|
},
|
||||||
|
"abort-controller": {
|
||||||
|
"version": "3.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
|
||||||
|
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
|
||||||
|
"requires": {
|
||||||
|
"event-target-shim": "^5.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"accepts": {
|
"accepts": {
|
||||||
"version": "1.3.5",
|
"version": "1.3.5",
|
||||||
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
|
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
|
||||||
@@ -1074,6 +1108,11 @@
|
|||||||
"es5-ext": "~0.10.14"
|
"es5-ext": "~0.10.14"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"event-target-shim": {
|
||||||
|
"version": "5.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
|
||||||
|
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="
|
||||||
|
},
|
||||||
"events": {
|
"events": {
|
||||||
"version": "1.1.1",
|
"version": "1.1.1",
|
||||||
"resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz",
|
||||||
|
|||||||
@@ -17,7 +17,8 @@
|
|||||||
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
||||||
"test-search": "cd test && node searchTest.js",
|
"test-search": "cd test && node searchTest.js",
|
||||||
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
||||||
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
|
"test-rental-scraper": "cd test && node rentalScrapeTest.js",
|
||||||
|
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
|
||||||
},
|
},
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
@@ -31,6 +32,9 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"2checkout-node": "0.0.1",
|
"2checkout-node": "0.0.1",
|
||||||
"@sendgrid/mail": "^6.3.1",
|
"@sendgrid/mail": "^6.3.1",
|
||||||
|
"@sozialhelden/fetch-cache": "^2.0.1",
|
||||||
|
"@supercharge/promise-pool": "^1.3.0",
|
||||||
|
"abort-controller": "^3.0.0",
|
||||||
"aws-sdk": "^2.422.0",
|
"aws-sdk": "^2.422.0",
|
||||||
"bluebird": "^3.5.5",
|
"bluebird": "^3.5.5",
|
||||||
"cheerio": "^1.0.0-rc.2",
|
"cheerio": "^1.0.0-rc.2",
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ if (urlToScrape) {
|
|||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
const data = await crawler.scrapeAd(urlToScrape);
|
const data = await crawler.scrapeAd(urlToScrape);
|
||||||
console.log(data);
|
console.log("Scraped data:", data);
|
||||||
})();
|
})();
|
||||||
} else {
|
} else {
|
||||||
console.log("No URL to scrape. Use like this : ");
|
console.log("No URL to scrape. Use like this : ");
|
||||||
|
|||||||
17
test/saljicScrapeTest.js
Normal file
17
test/saljicScrapeTest.js
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");
|
||||||
|
|
||||||
|
// Manual smoke test for the Saljic crawler: scrapes the single ad whose URL is
// given as the first CLI argument and prints the result.
const urlToScrape = process.argv[2] || undefined;

if (urlToScrape) {
  const crawler = new saljicCrawler();

  (async () => {
    try {
      const data = await crawler.scrapeAd(urlToScrape);
      console.log("Scraped data:", data);
    } catch (err) {
      // Without this handler a failing scrape surfaces only as an unhandled
      // promise rejection; report it and signal failure through the exit code.
      console.error("Scraping failed:", err);
      process.exitCode = 1;
    }
  })();
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}
|
||||||
Reference in New Issue
Block a user