Changed delay between ads.

This commit is contained in:
Naida Vatric
2020-03-01 00:44:48 +01:00
parent e1651306eb
commit ccea5fe2aa
4 changed files with 35 additions and 4 deletions

View File

@@ -217,6 +217,8 @@ class OlxCrawler {
//let numberOfParseErrors = 0;
// do {
try {
await this.sleep(this.delayBetweenAds);
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
@@ -694,6 +696,8 @@ class OlxCrawler {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
// } while (hasParseErrors && numberOfParseErrors <= 1);
await this.sleep(this.delayBetweenAds);
return null;
}

View File

@@ -84,6 +84,7 @@ class SaljicCrawler {
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
@@ -217,8 +218,8 @@ class SaljicCrawler {
}
}
async scrapeAd(url, adType) {
console.log("[SALJIC] Scraping : ", url);
async scrapeAd(url, adTypeAttribute) {
//console.log("[SALJIC] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
@@ -234,7 +235,7 @@ class SaljicCrawler {
//Extracting main properties
const propertySelectors = {
title:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
"div.content-wrap > div.container.clearfix.wpc > div.col-md-8.nobottommargin > div.single-post.nobottommargin > div.entry.clearfix > div.entry-title > h2",
price:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
streetName:
@@ -245,6 +246,7 @@ class SaljicCrawler {
latAndLong:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
};
const title = $(propertySelectors.title)
.text()
.replace(/(\r\n|\n|\r)/gm, "")
@@ -277,7 +279,8 @@ class SaljicCrawler {
let tmpLatLong;
let latText;
let longText;
if (latAndLongSrc) {
if (latAndLongSrc && latAndLongSrc.indexOf("openstreetmap") !== -1) {
tmpLatLong = latAndLongSrc.split("marker=")[1];
latText = tmpLatLong.split("%2C")[0];
longText = tmpLatLong.split("%2C")[1];
@@ -330,6 +333,7 @@ class SaljicCrawler {
let numberOfViewsKivi = null;
let streetNumber = 0;
let adStatus = status;
let adType = adTypeAttribute;
let shortDescription = descriptions
? descriptions.substring(0, descriptions.indexOf("."))
: "";
@@ -413,6 +417,7 @@ class SaljicCrawler {
additionalField.length
)
.trim();
realEstateType = this.getAdCategoryId(categoryTmp);
} else {
switch (additionalField) {
@@ -568,10 +573,14 @@ class SaljicCrawler {
numberOfViewsKivi
};
await this.sleep(1000);
return data;
} catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url);
}
await this.sleep(1000);
return null;
}

View File

@@ -17,6 +17,7 @@
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
"test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-saljic-scraper": "cd test && node saljicScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
},
"repository": {

17
test/saljicScrapeTest.js Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
// Manual smoke test for the Saljic crawler: scrapes the single ad URL given
// on the command line and prints whatever scrapeAd resolves to.
// Usage: npm run test-saljic-scraper -- URL_TO_SCRAPE
const SaljicCrawler = require("../app/crawler/specificCrawlers/saljic");

// First CLI argument after the script path; undefined when none was passed.
const urlToScrape = process.argv[2];

if (urlToScrape) {
  (async () => {
    const crawler = new SaljicCrawler();
    // Only the URL is passed, so scrapeAd's second parameter is undefined here.
    const data = await crawler.scrapeAd(urlToScrape);
    console.log("Scraped data:", data);
  })().catch((err) => {
    // Attach a rejection handler so a failure is reported explicitly and the
    // process exits non-zero instead of leaving an unhandled rejection.
    console.error("Scrape test failed:", err);
    process.exitCode = 1;
  });
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}