Fix saljic crawler
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
"use strict";
|
"use strict";
|
||||||
|
|
||||||
const fetch = require("../../helpers/fetchWrapper");
|
const fetch = require("../../helpers/fetchWrapper");
|
||||||
|
const { getUrlParams } = require("../../helpers/url");
|
||||||
const cheerio = require("cheerio");
|
const cheerio = require("cheerio");
|
||||||
const moment = require("moment-timezone");
|
const moment = require("moment-timezone");
|
||||||
|
|
||||||
@@ -52,6 +53,7 @@ class SaljicCrawler {
|
|||||||
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
|
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
|
||||||
this.crawlerAdTypes = crawlerAdTypes;
|
this.crawlerAdTypes = crawlerAdTypes;
|
||||||
this.crawlerAdCategories = crawlerAdCategories;
|
this.crawlerAdCategories = crawlerAdCategories;
|
||||||
|
this.maxPages = maxPages
|
||||||
this.maxResultsPerPage = maxResultsPerPage;
|
this.maxResultsPerPage = maxResultsPerPage;
|
||||||
this.delayBetweenPages = delayBetweenPages;
|
this.delayBetweenPages = delayBetweenPages;
|
||||||
}
|
}
|
||||||
@@ -231,6 +233,10 @@ class SaljicCrawler {
|
|||||||
? parseInt(url.substring(46, url.length))
|
? parseInt(url.substring(46, url.length))
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
|
if (!agencyObjectId) {
|
||||||
|
throw { message : 'No agency object ID - URL changed?'}
|
||||||
|
}
|
||||||
|
|
||||||
//Extracting main properties
|
//Extracting main properties
|
||||||
const propertySelectors = {
|
const propertySelectors = {
|
||||||
title:
|
title:
|
||||||
@@ -239,11 +245,10 @@ class SaljicCrawler {
|
|||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
||||||
streetName:
|
streetName:
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
|
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
|
||||||
|
|
||||||
descriptions:
|
descriptions:
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
|
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
|
||||||
latAndLong:
|
latAndLong:
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
|
"iframe"
|
||||||
};
|
};
|
||||||
const title = $(propertySelectors.title)
|
const title = $(propertySelectors.title)
|
||||||
.text()
|
.text()
|
||||||
@@ -274,14 +279,26 @@ class SaljicCrawler {
|
|||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
|
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
|
||||||
let tmpLatLong;
|
|
||||||
let latText;
|
let latText;
|
||||||
let longText;
|
let longText;
|
||||||
if (latAndLongSrc) {
|
if (latAndLongSrc){
|
||||||
tmpLatLong = latAndLongSrc.split("marker=")[1];
|
const mapParams = getUrlParams(latAndLongSrc);
|
||||||
latText = tmpLatLong.split("%2C")[0];
|
if (mapParams) {
|
||||||
longText = tmpLatLong.split("%2C")[1];
|
if (mapParams['marker']){
|
||||||
|
const marker = mapParams['marker'].split(',');
|
||||||
|
latText = marker[0] ? marker[0] : undefined;
|
||||||
|
longText = marker[1] ? marker[1] : undefined;
|
||||||
|
}else{
|
||||||
|
if (mapParams['mlat']) {
|
||||||
|
latText = mapParams['mlat'];
|
||||||
|
}
|
||||||
|
if (mapParams['mlon']) {
|
||||||
|
longText = mapParams['mlon'];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const locationLat = parseFloat(latText) || null;
|
const locationLat = parseFloat(latText) || null;
|
||||||
const locationLong = parseFloat(longText) || null;
|
const locationLong = parseFloat(longText) || null;
|
||||||
|
|
||||||
@@ -368,7 +385,7 @@ class SaljicCrawler {
|
|||||||
numberOfRooms = parseInt(mainFieldValue);
|
numberOfRooms = parseInt(mainFieldValue);
|
||||||
break;
|
break;
|
||||||
case "Broj spratova":
|
case "Broj spratova":
|
||||||
numberOfFloors = parseInt(mainFieldValue);
|
numberOfFloors = this.parseNumberOfFloors(mainFieldValue);
|
||||||
break;
|
break;
|
||||||
case "Sprat":
|
case "Sprat":
|
||||||
floor = parseInt(mainFieldValue);
|
floor = parseInt(mainFieldValue);
|
||||||
@@ -414,6 +431,9 @@ class SaljicCrawler {
|
|||||||
)
|
)
|
||||||
.trim();
|
.trim();
|
||||||
realEstateType = this.getAdCategoryId(categoryTmp);
|
realEstateType = this.getAdCategoryId(categoryTmp);
|
||||||
|
if (!realEstateType) {
|
||||||
|
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
switch (additionalField) {
|
switch (additionalField) {
|
||||||
case "Internet":
|
case "Internet":
|
||||||
@@ -570,7 +590,7 @@ class SaljicCrawler {
|
|||||||
|
|
||||||
return data;
|
return data;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -615,6 +635,21 @@ class SaljicCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
parseNumberOfFloors(numberOfFloorsText) {
|
||||||
|
const tryNumericalValue = parseInt(numberOfFloorsText);
|
||||||
|
if (!isNaN(tryNumericalValue)){
|
||||||
|
return tryNumericalValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Guess number of floors based on number of + sign concatenations
|
||||||
|
// e.g. P+S+Pt -> 3 floors
|
||||||
|
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
|
||||||
|
return numberOfFloorsText.split('+').length + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
|
||||||
async sleep(ms) {
|
async sleep(ms) {
|
||||||
return new Promise(resolve => setTimeout(resolve, ms));
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,6 +7,26 @@ const currentSearchRequest = async req => {
|
|||||||
|
|
||||||
return await getSearchRequest(searchRequestId);
|
return await getSearchRequest(searchRequestId);
|
||||||
};
|
};
|
||||||
module.exports = {
|
|
||||||
currentSearchRequest
|
/**
 * Extracts the query-string parameters of a URL into a plain object.
 *
 * Values are percent-decoded; keys are returned exactly as written in the URL.
 * When a key occurs more than once, the last occurrence wins.
 *
 * @param {*} url URL (absolute or relative) to read the query string from.
 * @returns {Object<string, string>|undefined} Map of parameter name to decoded
 *   value, or undefined when `url` is not a non-empty string or has no "?".
 */
const getUrlParams = function (url) {
  if (typeof url !== 'string' || url.length === 0) {
    return undefined;
  }

  const questionMarkIndex = url.indexOf('?');
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // Cut the query at a following "#" so the fragment never leaks into the last value.
  const hashIndex = url.indexOf('#', questionMarkIndex);
  const query = hashIndex === -1
    ? url.substring(questionMarkIndex + 1)
    : url.substring(questionMarkIndex + 1, hashIndex);

  // A malformed percent-escape makes decodeURIComponent throw URIError;
  // keep the raw text instead of failing the whole parse.
  const decodeSafely = function (text) {
    try {
      return decodeURIComponent(text);
    } catch (e) {
      return text;
    }
  };

  const params = {};
  for (const part of query.split('&')) {
    if (part.length === 0) {
      // Produced by a bare "?" or by "&&"; there is no parameter here.
      continue;
    }
    // Split on the first "=" only, so values may themselves contain "=".
    const equalsIndex = part.indexOf('=');
    if (equalsIndex === -1) {
      // Value-less flag ("?debug"): report an empty string, not "undefined".
      params[part] = '';
    } else {
      params[part.substring(0, equalsIndex)] = decodeSafely(part.substring(equalsIndex + 1));
    }
  }
  return params;
};
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
currentSearchRequest,
|
||||||
|
getUrlParams
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -72,6 +72,7 @@ AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
|||||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
#==SALJIC NEKRETNINE==
|
#==SALJIC NEKRETNINE==
|
||||||
|
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
|
||||||
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
||||||
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||||
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||||
|
|||||||
@@ -17,7 +17,8 @@
|
|||||||
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
||||||
"test-search": "cd test && node searchTest.js",
|
"test-search": "cd test && node searchTest.js",
|
||||||
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
||||||
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
|
"test-rental-scraper": "cd test && node rentalScrapeTest.js",
|
||||||
|
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
|
||||||
},
|
},
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
|
|||||||
17
test/saljicScrapeTest.js
Normal file
17
test/saljicScrapeTest.js
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"use strict";

// Manual smoke test for the Saljic crawler: scrapes the single ad URL given as
// the first command-line argument and prints whatever scrapeAd resolves to.
const SaljicCrawler = require("../app/crawler/specificCrawlers/saljic");

// First user-supplied argument; undefined when the script is run without one.
const urlToScrape = process.argv[2];

if (urlToScrape) {
  const crawler = new SaljicCrawler();

  (async () => {
    const data = await crawler.scrapeAd(urlToScrape);
    console.log("Scraped data:", data);
  })().catch(err => {
    // Report the failure and exit non-zero instead of leaving the rejection unhandled.
    console.error("Scrape failed:", err);
    process.exitCode = 1;
  });
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}
|
||||||
Reference in New Issue
Block a user