Fix saljic crawler

This commit is contained in:
Bilal
2020-05-14 19:01:19 +02:00
parent ba60f8749d
commit d35a113baa
5 changed files with 86 additions and 12 deletions

View File

@@ -1,6 +1,7 @@
"use strict";
const fetch = require("../../helpers/fetchWrapper");
const { getUrlParams } = require("../../helpers/url");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
@@ -52,6 +53,7 @@ class SaljicCrawler {
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxPages = maxPages
this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages;
}
@@ -231,6 +233,10 @@ class SaljicCrawler {
? parseInt(url.substring(46, url.length))
: null;
if (!agencyObjectId) {
throw { message : 'No agency object ID - URL changed?'}
}
//Extracting main properties
const propertySelectors = {
title:
@@ -239,11 +245,10 @@ class SaljicCrawler {
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
streetName:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
descriptions:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
latAndLong:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
"iframe"
};
const title = $(propertySelectors.title)
.text()
@@ -274,14 +279,26 @@ class SaljicCrawler {
.trim();
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
let tmpLatLong;
let latText;
let longText;
if (latAndLongSrc) {
tmpLatLong = latAndLongSrc.split("marker=")[1];
latText = tmpLatLong.split("%2C")[0];
longText = tmpLatLong.split("%2C")[1];
if (latAndLongSrc){
const mapParams = getUrlParams(latAndLongSrc);
if (mapParams) {
if (mapParams['marker']){
const marker = mapParams['marker'].split(',');
latText = marker[0] ? marker[0] : undefined;
longText = marker[1] ? marker[1] : undefined;
}else{
if (mapParams['mlat']) {
latText = mapParams['mlat'];
}
if (mapParams['mlon']) {
longText = mapParams['mlon'];
}
}
}
}
const locationLat = parseFloat(latText) || null;
const locationLong = parseFloat(longText) || null;
@@ -368,7 +385,7 @@ class SaljicCrawler {
numberOfRooms = parseInt(mainFieldValue);
break;
case "Broj spratova":
numberOfFloors = parseInt(mainFieldValue);
numberOfFloors = this.parseNumberOfFloors(mainFieldValue);
break;
case "Sprat":
floor = parseInt(mainFieldValue);
@@ -414,6 +431,9 @@ class SaljicCrawler {
)
.trim();
realEstateType = this.getAdCategoryId(categoryTmp);
if (!realEstateType) {
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
}
} else {
switch (additionalField) {
case "Internet":
@@ -570,7 +590,7 @@ class SaljicCrawler {
return data;
} catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
}
return null;
}
@@ -615,6 +635,21 @@ class SaljicCrawler {
}
}
parseNumberOfFloors(numberOfFloorsText) {
const tryNumericalValue = parseInt(numberOfFloorsText);
if (!isNaN(tryNumericalValue)){
return tryNumericalValue;
}
// Guess number of floors based on number of + sign concatenations
// e.g. P+S+Pt -> 3 floors
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
return numberOfFloorsText.split('+').length + 1
}
return null
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}

View File

@@ -7,6 +7,26 @@ const currentSearchRequest = async req => {
return await getSearchRequest(searchRequestId);
};
module.exports = {
currentSearchRequest
/**
 * Extracts the query-string parameters of a URL into a plain object.
 *
 * Everything after the first '?' is treated as the query. A trailing
 * '#fragment' is dropped, each 'key=value' pair is split on its FIRST '='
 * (so values may themselves contain '='), and both key and value are
 * percent-decoded. A key without a value maps to an empty string.
 * When a key appears more than once, the last occurrence wins.
 *
 * @param {*} url - URL to parse; anything other than a non-empty string yields undefined
 * @returns {Object<string, string>|undefined} decoded parameters, or undefined
 *   when `url` is not a non-empty string or contains no '?'
 */
const getUrlParams = function (url) {
  if (typeof url !== 'string' || url.length === 0) {
    return undefined;
  }

  const questionMarkIndex = url.indexOf('?');
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // A malformed escape sequence (e.g. a lone '%') makes decodeURIComponent
  // throw; fall back to the raw text instead of failing the whole parse.
  const safeDecode = text => {
    try {
      return decodeURIComponent(text);
    } catch (e) {
      return text;
    }
  };

  let query = url.substring(questionMarkIndex + 1);
  // The fragment is not part of the query string.
  const hashIndex = query.indexOf('#');
  if (hashIndex !== -1) {
    query = query.substring(0, hashIndex);
  }

  const params = {};
  for (const segment of query.split('&')) {
    // Skip empty segments produced by '?', '&&' or a trailing '&'.
    if (segment.length === 0) {
      continue;
    }
    const equalsIndex = segment.indexOf('=');
    const rawKey = equalsIndex === -1 ? segment : segment.substring(0, equalsIndex);
    const rawValue = equalsIndex === -1 ? '' : segment.substring(equalsIndex + 1);
    params[safeDecode(rawKey)] = safeDecode(rawValue);
  }
  return params;
};
module.exports = {
currentSearchRequest,
getUrlParams
};

View File

@@ -72,6 +72,7 @@ AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==SALJIC NEKRETNINE==
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values

View File

@@ -17,7 +17,8 @@
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
"test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
"test-rental-scraper": "cd test && node rentalScrapeTest.js",
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
},
"repository": {
"type": "git",

17
test/saljicScrapeTest.js Normal file
View File

@@ -0,0 +1,17 @@
"use strict";

// Manual smoke test for the Saljic crawler: scrapes the single ad whose URL
// is given as the first command-line argument and prints the result.
// Usage: npm run test-saljic-scraper -- URL_TO_SCRAPE

const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");

const urlToScrape = process.argv[2];

if (urlToScrape) {
  // NOTE(review): the crawler is constructed without arguments; presumably
  // scrapeAd() does not depend on the constructor options - confirm.
  const crawler = new saljicCrawler();

  (async () => {
    try {
      const data = await crawler.scrapeAd(urlToScrape);
      console.log("Scraped data:", data);
    } catch (error) {
      // Report the failure and signal it through the exit code instead of
      // leaving an unhandled promise rejection.
      console.error("Scraping failed:", error);
      process.exitCode = 1;
    }
  })();
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}