From d35a113baad823aec931d0e79f17e68445944f1c Mon Sep 17 00:00:00 2001 From: Bilal Date: Thu, 14 May 2020 19:01:19 +0200 Subject: [PATCH] Fix saljic crawler --- app/crawler/specificCrawlers/saljic.js | 53 +++++++++++++++++++++----- app/helpers/url.js | 24 +++++++++++- development.env | 1 + package.json | 3 +- test/saljicScrapeTest.js | 17 +++++++++ 5 files changed, 86 insertions(+), 12 deletions(-) create mode 100644 test/saljicScrapeTest.js diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 8afc751..3d5e30e 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -1,6 +1,7 @@ "use strict"; const fetch = require("../../helpers/fetchWrapper"); +const { getUrlParams } = require("../../helpers/url"); const cheerio = require("cheerio"); const moment = require("moment-timezone"); @@ -52,6 +53,7 @@ class SaljicCrawler { this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; + this.maxPages = maxPages this.maxResultsPerPage = maxResultsPerPage; this.delayBetweenPages = delayBetweenPages; } @@ -231,6 +233,10 @@ class SaljicCrawler { ? 
parseInt(url.substring(46, url.length)) : null; + if (!agencyObjectId) { + throw { message : 'No agency object ID - URL changed?'} + } + //Extracting main properties const propertySelectors = { title: @@ -239,11 +245,10 @@ class SaljicCrawler { "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", streetName: "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p", - descriptions: "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)", latAndLong: - "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe" + "iframe" }; const title = $(propertySelectors.title) .text() @@ -274,14 +279,26 @@ class SaljicCrawler { .trim(); const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); - let tmpLatLong; let latText; let longText; - if (latAndLongSrc) { - tmpLatLong = latAndLongSrc.split("marker=")[1]; - latText = tmpLatLong.split("%2C")[0]; - longText = tmpLatLong.split("%2C")[1]; + if (latAndLongSrc){ + const mapParams = getUrlParams(latAndLongSrc); + if (mapParams) { + if (mapParams['marker']){ + const marker = mapParams['marker'].split(','); + latText = marker[0] ? marker[0] : undefined; + longText = marker[1] ? 
marker[1] : undefined; + }else{ + if (mapParams['mlat']) { + latText = mapParams['mlat']; + } + if (mapParams['mlon']) { + longText = mapParams['mlon']; + } + } + } } + const locationLat = parseFloat(latText) || null; const locationLong = parseFloat(longText) || null; @@ -368,7 +385,7 @@ class SaljicCrawler { numberOfRooms = parseInt(mainFieldValue); break; case "Broj spratova": - numberOfFloors = parseInt(mainFieldValue); + numberOfFloors = this.parseNumberOfFloors(mainFieldValue); break; case "Sprat": floor = parseInt(mainFieldValue); @@ -414,6 +431,9 @@ class SaljicCrawler { ) .trim(); realEstateType = this.getAdCategoryId(categoryTmp); + if (!realEstateType) { + throw { message: 'No real estate type - page body not loaded correctly or page changed?' } + } } else { switch (additionalField) { case "Internet": @@ -570,7 +590,7 @@ class SaljicCrawler { return data; } catch (e) { - console.error("Exception caught: " + e.message, "\r\nURL:", url); + console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url); } return null; } @@ -615,6 +635,21 @@ class SaljicCrawler { } } + parseNumberOfFloors(numberOfFloorsText) { + const tryNumericalValue = parseInt(numberOfFloorsText); + if (!isNaN(tryNumericalValue)){ + return tryNumericalValue; + } + + // Guess number of floors based on number of + sign concatenations + // e.g. 
/**
 * Parses the query string of a URL into a plain key/value object.
 *
 * Everything after the first "?" and before a following "#" is split on "&".
 * Each segment is split on its FIRST "=" only, so values may contain "=".
 * Names and values are percent-decoded; when decoding fails the raw text is
 * kept. A name without "=" maps to an empty string, empty segments are
 * ignored, and when a name repeats the last occurrence wins.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {Object<string, string>|undefined} Decoded parameters, or
 *   undefined when `url` is not a non-empty string or contains no "?".
 */
const getUrlParams = function (url) {
  if (typeof url !== 'string' || url.length === 0) {
    return undefined;
  }

  const queryStart = url.indexOf('?');
  if (queryStart === -1) {
    return undefined;
  }

  // decodeURIComponent throws URIError on malformed escapes (e.g. "%E0%A4%A");
  // keep the raw text so one bad parameter cannot abort the whole parse.
  const safeDecode = text => {
    try {
      return decodeURIComponent(text);
    } catch (e) {
      return text;
    }
  };

  // A "#fragment" after the query is not part of it.
  const query = url.substring(queryStart + 1).split('#')[0];

  const params = {};
  for (const segment of query.split('&')) {
    if (segment.length === 0) {
      // Produced by an empty query, "&&" or a trailing "&".
      continue;
    }
    const separatorIndex = segment.indexOf('=');
    const rawName =
      separatorIndex === -1 ? segment : segment.substring(0, separatorIndex);
    const rawValue =
      separatorIndex === -1 ? '' : segment.substring(separatorIndex + 1);
    params[safeDecode(rawName)] = safeDecode(rawValue);
  }
  return params;
};
"use strict";

/**
 * Manual smoke test for the Saljic crawler: scrapes a single ad URL given on
 * the command line and prints whatever scrapeAd resolves to.
 *
 * Usage: npm run test-saljic-scraper -- URL_TO_SCRAPE
 */

const SaljicCrawler = require("../app/crawler/specificCrawlers/saljic");

// First CLI argument after the script path; undefined when none was given.
const urlToScrape = process.argv[2];

if (urlToScrape) {
  const crawler = new SaljicCrawler();

  (async () => {
    const data = await crawler.scrapeAd(urlToScrape);
    console.log("Scraped data:", data);
  })().catch(error => {
    // Without a handler a rejection here would be an unhandled promise
    // rejection; report it and signal failure through the exit code.
    console.error("Scrape failed:", error);
    process.exitCode = 1;
  });
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}