diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index f2964a3..eed5050 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -217,6 +217,8 @@ class OlxCrawler { //let numberOfParseErrors = 0; // do { try { + await this.sleep(this.delayBetweenAds); + const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); @@ -694,6 +696,8 @@ class OlxCrawler { console.error("Exception caught: " + e.message, "\r\nURL:", url); } // } while (hasParseErrors && numberOfParseErrors <= 1); + await this.sleep(this.delayBetweenAds); + return null; } diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 8afc751..3e3b371 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -84,6 +84,7 @@ class SaljicCrawler { for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; newRealEstates.push(...newRecords); @@ -217,8 +218,8 @@ class SaljicCrawler { } } - async scrapeAd(url, adType) { - console.log("[SALJIC] Scraping : ", url); + async scrapeAd(url, adTypeAttribute) { + //console.log("[SALJIC] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); @@ -234,7 +235,7 @@ class SaljicCrawler { //Extracting main properties const propertySelectors = { title: - "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2", + "div.content-wrap > div.container.clearfix.wpc > div.col-md-8.nobottommargin > div.single-post.nobottommargin > div.entry.clearfix > div.entry-title > h2", price: "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", streetName: @@ -245,6 +246,7 @@ class SaljicCrawler { latAndLong: "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe" }; + const title = $(propertySelectors.title) .text() .replace(/(\r\n|\n|\r)/gm, "") @@ -277,7 +279,8 @@ class SaljicCrawler { let tmpLatLong; let latText; let longText; - if (latAndLongSrc) { + + if (latAndLongSrc && latAndLongSrc.indexOf("openstreetmap") !== -1) { tmpLatLong = latAndLongSrc.split("marker=")[1]; latText = tmpLatLong.split("%2C")[0]; longText = tmpLatLong.split("%2C")[1]; @@ -330,6 +333,7 @@ class SaljicCrawler { let numberOfViewsKivi = null; let streetNumber = 0; let adStatus = status; + let adType = adTypeAttribute; let shortDescription = descriptions ? descriptions.substring(0, descriptions.indexOf(".")) : ""; @@ -413,6 +417,7 @@ class SaljicCrawler { additionalField.length ) .trim(); + realEstateType = this.getAdCategoryId(categoryTmp); } else { switch (additionalField) { @@ -568,10 +573,14 @@ class SaljicCrawler { numberOfViewsKivi }; + await this.sleep(1000); + return data; } catch (e) { console.error("Exception caught: " + e.message, "\r\nURL:", url); } + await this.sleep(1000); + return null; } diff --git a/package.json b/package.json index 3bc99a7..96d5274 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,7 @@ "checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js", "test-search": "cd test && node searchTest.js", "test-olx-scraper": "cd test && node olxScrapeTest.js", + "test-saljic-scraper": "cd test && node saljicScrapeTest.js", "test-rental-scraper": "cd test && node rentalScrapeTest.js" }, "repository": { diff --git a/test/saljicScrapeTest.js b/test/saljicScrapeTest.js new file mode 100644 index 0000000..384719c --- /dev/null +++ b/test/saljicScrapeTest.js @@ -0,0 +1,17 @@ +"use strict"; + +const saljicCrawler = require("../app/crawler/specificCrawlers/saljic"); + +const urlToScrape = process.argv[2] || undefined; + +if (urlToScrape) { + const crawler = new saljicCrawler(); + + (async () => { + const data = await crawler.scrapeAd(urlToScrape); + console.log("Scraped data:", data); + })(); +} else { + console.log("No URL to scrape. Use like this : "); + console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE"); +}