Fix saljic crawler

This commit is contained in:
Bilal
2020-05-14 19:01:19 +02:00
parent ba60f8749d
commit d35a113baa
5 changed files with 86 additions and 12 deletions

View File

@@ -1,6 +1,7 @@
"use strict"; "use strict";
const fetch = require("../../helpers/fetchWrapper"); const fetch = require("../../helpers/fetchWrapper");
const { getUrlParams } = require("../../helpers/url");
const cheerio = require("cheerio"); const cheerio = require("cheerio");
const moment = require("moment-timezone"); const moment = require("moment-timezone");
@@ -52,6 +53,7 @@ class SaljicCrawler {
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search"; this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories; this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages
this.maxResultsPerPage = maxResultsPerPage; this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages; this.delayBetweenPages = delayBetweenPages;
} }
@@ -231,6 +233,10 @@ class SaljicCrawler {
? parseInt(url.substring(46, url.length)) ? parseInt(url.substring(46, url.length))
: null; : null;
if (!agencyObjectId) {
throw { message : 'No agency object ID - URL changed?'}
}
//Extracting main properties //Extracting main properties
const propertySelectors = { const propertySelectors = {
title: title:
@@ -239,11 +245,10 @@ class SaljicCrawler {
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
streetName: streetName:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
descriptions: descriptions:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
latAndLong: latAndLong:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe" "iframe"
}; };
const title = $(propertySelectors.title) const title = $(propertySelectors.title)
.text() .text()
@@ -274,14 +279,26 @@ class SaljicCrawler {
.trim(); .trim();
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
let tmpLatLong;
let latText; let latText;
let longText; let longText;
if (latAndLongSrc) { if (latAndLongSrc){
tmpLatLong = latAndLongSrc.split("marker=")[1]; const mapParams = getUrlParams(latAndLongSrc);
latText = tmpLatLong.split("%2C")[0]; if (mapParams) {
longText = tmpLatLong.split("%2C")[1]; if (mapParams['marker']){
const marker = mapParams['marker'].split(',');
latText = marker[0] ? marker[0] : undefined;
longText = marker[1] ? marker[1] : undefined;
}else{
if (mapParams['mlat']) {
latText = mapParams['mlat'];
}
if (mapParams['mlon']) {
longText = mapParams['mlon'];
}
}
}
} }
const locationLat = parseFloat(latText) || null; const locationLat = parseFloat(latText) || null;
const locationLong = parseFloat(longText) || null; const locationLong = parseFloat(longText) || null;
@@ -368,7 +385,7 @@ class SaljicCrawler {
numberOfRooms = parseInt(mainFieldValue); numberOfRooms = parseInt(mainFieldValue);
break; break;
case "Broj spratova": case "Broj spratova":
numberOfFloors = parseInt(mainFieldValue); numberOfFloors = this.parseNumberOfFloors(mainFieldValue);
break; break;
case "Sprat": case "Sprat":
floor = parseInt(mainFieldValue); floor = parseInt(mainFieldValue);
@@ -414,6 +431,9 @@ class SaljicCrawler {
) )
.trim(); .trim();
realEstateType = this.getAdCategoryId(categoryTmp); realEstateType = this.getAdCategoryId(categoryTmp);
if (!realEstateType) {
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
}
} else { } else {
switch (additionalField) { switch (additionalField) {
case "Internet": case "Internet":
@@ -570,7 +590,7 @@ class SaljicCrawler {
return data; return data;
} catch (e) { } catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url); console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
} }
return null; return null;
} }
@@ -615,6 +635,21 @@ class SaljicCrawler {
} }
} }
parseNumberOfFloors(numberOfFloorsText) {
const tryNumericalValue = parseInt(numberOfFloorsText);
if (!isNaN(tryNumericalValue)){
return tryNumericalValue;
}
// Guess number of floors based on number of + sign concatenations
// e.g. P+S+Pt -> 3 floors
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
return numberOfFloorsText.split('+').length + 1
}
return null
}
async sleep(ms) { async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms)); return new Promise(resolve => setTimeout(resolve, ms));
} }

View File

@@ -7,6 +7,26 @@ const currentSearchRequest = async req => {
return await getSearchRequest(searchRequestId); return await getSearchRequest(searchRequestId);
}; };
module.exports = {
/**
 * Parses the query string of a URL into a plain key/value object.
 *
 * Only the part between the first "?" and an optional "#fragment" is read.
 * Keys and values are percent-decoded; a parameter without "=" maps to an
 * empty string, and only the first "=" separates key from value. When the
 * same key occurs more than once, the last occurrence wins.
 *
 * @param {string} url - URL (absolute or relative) to read parameters from.
 * @returns {Object<string, string>|undefined} Decoded parameters, or
 *   undefined when `url` is not a non-empty string or has no query string.
 */
const getUrlParams = function (url) {
  if (typeof url !== 'string' || url.length === 0) {
    return undefined;
  }

  // The fragment is not part of the query; strip it first so it cannot leak
  // into the last parameter value (e.g. "?mlat=1&mlon=2#map=15/1/2").
  const hashIndex = url.indexOf('#');
  const withoutFragment = hashIndex === -1 ? url : url.substring(0, hashIndex);

  const questionMarkIndex = withoutFragment.indexOf('?');
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // Malformed percent-encoding makes decodeURIComponent throw; fall back to
  // the raw text so one bad parameter does not abort the whole parse.
  const safeDecode = (text) => {
    try {
      return decodeURIComponent(text);
    } catch (e) {
      return text;
    }
  };

  const params = {};
  const query = withoutFragment.substring(questionMarkIndex + 1);
  for (const part of query.split('&')) {
    if (part.length === 0) {
      // Skip empty pieces produced by "?", "&&" or a trailing "&".
      continue;
    }
    const equalsIndex = part.indexOf('=');
    const rawKey = equalsIndex === -1 ? part : part.substring(0, equalsIndex);
    const rawValue = equalsIndex === -1 ? '' : part.substring(equalsIndex + 1);
    params[safeDecode(rawKey)] = safeDecode(rawValue);
  }
  return params;
};
module.exports = {
currentSearchRequest,
getUrlParams
}; };

View File

@@ -72,6 +72,7 @@ AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==SALJIC NEKRETNINE== #==SALJIC NEKRETNINE==
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values

View File

@@ -17,7 +17,8 @@
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js", "checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
"test-search": "cd test && node searchTest.js", "test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js", "test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js" "test-rental-scraper": "cd test && node rentalScrapeTest.js",
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
}, },
"repository": { "repository": {
"type": "git", "type": "git",

17
test/saljicScrapeTest.js Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");
// Manual smoke test: scrapes the single ad URL given as the first CLI
// argument with the Saljic crawler and prints whatever scrapeAd returns.
const urlToScrape = process.argv[2] || undefined;

if (urlToScrape) {
  const crawler = new saljicCrawler();
  (async () => {
    const data = await crawler.scrapeAd(urlToScrape);
    console.log("Scraped data:", data);
  })().catch((err) => {
    // Without this handler a rejection would surface only as an unhandled
    // promise rejection; report it and signal failure through the exit code.
    console.error("Scrape failed:", err);
    process.exitCode = 1;
  });
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}