Fix saljic crawler
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("../../helpers/fetchWrapper");
|
||||
const { getUrlParams } = require("../../helpers/url");
|
||||
const cheerio = require("cheerio");
|
||||
const moment = require("moment-timezone");
|
||||
|
||||
@@ -52,6 +53,7 @@ class SaljicCrawler {
|
||||
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
|
||||
this.crawlerAdTypes = crawlerAdTypes;
|
||||
this.crawlerAdCategories = crawlerAdCategories;
|
||||
this.maxPages = maxPages
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
}
|
||||
@@ -231,6 +233,10 @@ class SaljicCrawler {
|
||||
? parseInt(url.substring(46, url.length))
|
||||
: null;
|
||||
|
||||
if (!agencyObjectId) {
|
||||
throw { message : 'No agency object ID - URL changed?'}
|
||||
}
|
||||
|
||||
//Extracting main properties
|
||||
const propertySelectors = {
|
||||
title:
|
||||
@@ -239,11 +245,10 @@ class SaljicCrawler {
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
||||
streetName:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
|
||||
|
||||
descriptions:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
|
||||
latAndLong:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
|
||||
"iframe"
|
||||
};
|
||||
const title = $(propertySelectors.title)
|
||||
.text()
|
||||
@@ -274,14 +279,26 @@ class SaljicCrawler {
|
||||
.trim();
|
||||
|
||||
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
|
||||
let tmpLatLong;
|
||||
let latText;
|
||||
let longText;
|
||||
if (latAndLongSrc) {
|
||||
tmpLatLong = latAndLongSrc.split("marker=")[1];
|
||||
latText = tmpLatLong.split("%2C")[0];
|
||||
longText = tmpLatLong.split("%2C")[1];
|
||||
if (latAndLongSrc){
|
||||
const mapParams = getUrlParams(latAndLongSrc);
|
||||
if (mapParams) {
|
||||
if (mapParams['marker']){
|
||||
const marker = mapParams['marker'].split(',');
|
||||
latText = marker[0] ? marker[0] : undefined;
|
||||
longText = marker[1] ? marker[1] : undefined;
|
||||
}else{
|
||||
if (mapParams['mlat']) {
|
||||
latText = mapParams['mlat'];
|
||||
}
|
||||
if (mapParams['mlon']) {
|
||||
longText = mapParams['mlon'];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const locationLat = parseFloat(latText) || null;
|
||||
const locationLong = parseFloat(longText) || null;
|
||||
|
||||
@@ -368,7 +385,7 @@ class SaljicCrawler {
|
||||
numberOfRooms = parseInt(mainFieldValue);
|
||||
break;
|
||||
case "Broj spratova":
|
||||
numberOfFloors = parseInt(mainFieldValue);
|
||||
numberOfFloors = this.parseNumberOfFloors(mainFieldValue);
|
||||
break;
|
||||
case "Sprat":
|
||||
floor = parseInt(mainFieldValue);
|
||||
@@ -414,6 +431,9 @@ class SaljicCrawler {
|
||||
)
|
||||
.trim();
|
||||
realEstateType = this.getAdCategoryId(categoryTmp);
|
||||
if (!realEstateType) {
|
||||
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
|
||||
}
|
||||
} else {
|
||||
switch (additionalField) {
|
||||
case "Internet":
|
||||
@@ -570,7 +590,7 @@ class SaljicCrawler {
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
@@ -615,6 +635,21 @@ class SaljicCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
parseNumberOfFloors(numberOfFloorsText) {
|
||||
const tryNumericalValue = parseInt(numberOfFloorsText);
|
||||
if (!isNaN(tryNumericalValue)){
|
||||
return tryNumericalValue;
|
||||
}
|
||||
|
||||
// Guess number of floors based on number of + sign concatenations
|
||||
// e.g. P+S+Pt -> 3 floors
|
||||
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
|
||||
return numberOfFloorsText.split('+').length + 1
|
||||
}
|
||||
|
||||
return null
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
@@ -7,6 +7,26 @@ const currentSearchRequest = async req => {
|
||||
|
||||
return await getSearchRequest(searchRequestId);
|
||||
};
|
||||
module.exports = {
|
||||
currentSearchRequest
|
||||
|
||||
/**
 * Parses the query string of a URL into a plain key/value object.
 *
 * Works on absolute, relative and protocol-relative URLs because it only
 * looks at the text after the first '?'. Keys and values are percent-decoded;
 * if a component has a malformed escape sequence its raw text is kept instead
 * of throwing. When a key occurs more than once, the last occurrence wins.
 *
 * @param {*} url - URL text to inspect.
 * @returns {Object<string, string>|undefined} Map of parameter names to
 *   decoded values, or undefined when `url` is not a non-empty string or
 *   contains no '?'.
 */
const getUrlParams = function (url) {
  if (typeof url !== 'string' || url.length === 0) {
    return undefined;
  }

  const questionMarkIndex = url.indexOf('?');
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // decodeURIComponent throws URIError on malformed escapes (e.g. "%E0%A4%A");
  // a single bad parameter should not abort parsing of the whole URL.
  const safeDecode = (text) => {
    try {
      return decodeURIComponent(text);
    } catch (err) {
      return text;
    }
  };

  // A '#' after the '?' starts the fragment, which is not part of the query.
  const hashIndex = url.indexOf('#', questionMarkIndex);
  const query = hashIndex === -1
    ? url.substring(questionMarkIndex + 1)
    : url.substring(questionMarkIndex + 1, hashIndex);

  const params = {};
  for (const segment of query.split('&')) {
    // Skip empty segments produced by "?" alone, "&&" or a trailing "&".
    if (segment.length === 0) {
      continue;
    }

    // Split on the FIRST '=' only so values that contain '=' stay intact;
    // a key without '=' gets an empty-string value.
    const equalsIndex = segment.indexOf('=');
    const rawKey = equalsIndex === -1 ? segment : segment.substring(0, equalsIndex);
    const rawValue = equalsIndex === -1 ? '' : segment.substring(equalsIndex + 1);

    params[safeDecode(rawKey)] = safeDecode(rawValue);
  }

  return params;
};
|
||||
|
||||
module.exports = {
|
||||
currentSearchRequest,
|
||||
getUrlParams
|
||||
};
|
||||
|
||||
@@ -72,6 +72,7 @@ AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||
#==SALJIC NEKRETNINE==
|
||||
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
|
||||
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
||||
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||
|
||||
@@ -17,7 +17,8 @@
|
||||
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
||||
"test-search": "cd test && node searchTest.js",
|
||||
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
||||
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
|
||||
"test-rental-scraper": "cd test && node rentalScrapeTest.js",
|
||||
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
|
||||
17
test/saljicScrapeTest.js
Normal file
17
test/saljicScrapeTest.js
Normal file
@@ -0,0 +1,17 @@
|
||||
"use strict";
|
||||
|
||||
const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");
|
||||
|
||||
// Manual smoke test: scrapes the single ad URL given as the first CLI
// argument and prints whatever the crawler returns.
const urlToScrape = process.argv[2];

if (urlToScrape) {
  const crawler = new saljicCrawler();

  (async () => {
    try {
      const data = await crawler.scrapeAd(urlToScrape);
      console.log("Scraped data:", data);
    } catch (err) {
      // Report a rejected scrape explicitly instead of leaving the promise
      // floating, and signal failure through the exit code.
      console.error("Scrape failed:", err);
      process.exitCode = 1;
    }
  })();
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}
|
||||
Reference in New Issue
Block a user