Changed delay between ads.

This commit is contained in:
Naida Vatric
2020-03-01 00:44:48 +01:00
parent e1651306eb
commit ccea5fe2aa
4 changed files with 35 additions and 4 deletions

View File

@@ -217,6 +217,8 @@ class OlxCrawler {
//let numberOfParseErrors = 0; //let numberOfParseErrors = 0;
// do { // do {
try { try {
await this.sleep(this.delayBetweenAds);
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
const $ = cheerio.load(body); const $ = cheerio.load(body);
@@ -694,6 +696,8 @@ class OlxCrawler {
console.error("Exception caught: " + e.message, "\r\nURL:", url); console.error("Exception caught: " + e.message, "\r\nURL:", url);
} }
// } while (hasParseErrors && numberOfParseErrors <= 1); // } while (hasParseErrors && numberOfParseErrors <= 1);
await this.sleep(this.delayBetweenAds);
return null; return null;
} }

View File

@@ -84,6 +84,7 @@ class SaljicCrawler {
for (const [index, { value: singlePageResult }] of entries) { for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) { if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult); const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords } = saveResults; const { newRecords } = saveResults;
newRealEstates.push(...newRecords); newRealEstates.push(...newRecords);
@@ -217,8 +218,8 @@ class SaljicCrawler {
} }
} }
async scrapeAd(url, adType) { async scrapeAd(url, adTypeAttribute) {
console.log("[SALJIC] Scraping : ", url); //console.log("[SALJIC] Scraping : ", url);
try { try {
const adPageSource = await fetch(url); const adPageSource = await fetch(url);
const body = await adPageSource.text(); const body = await adPageSource.text();
@@ -234,7 +235,7 @@ class SaljicCrawler {
//Extracting main properties //Extracting main properties
const propertySelectors = { const propertySelectors = {
title: title:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2", "div.content-wrap > div.container.clearfix.wpc > div.col-md-8.nobottommargin > div.single-post.nobottommargin > div.entry.clearfix > div.entry-title > h2",
price: price:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins", "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
streetName: streetName:
@@ -245,6 +246,7 @@ class SaljicCrawler {
latAndLong: latAndLong:
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe" "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
}; };
const title = $(propertySelectors.title) const title = $(propertySelectors.title)
.text() .text()
.replace(/(\r\n|\n|\r)/gm, "") .replace(/(\r\n|\n|\r)/gm, "")
@@ -277,7 +279,8 @@ class SaljicCrawler {
let tmpLatLong; let tmpLatLong;
let latText; let latText;
let longText; let longText;
if (latAndLongSrc) {
if (latAndLongSrc && latAndLongSrc.indexOf("openstreetmap") !== -1) {
tmpLatLong = latAndLongSrc.split("marker=")[1]; tmpLatLong = latAndLongSrc.split("marker=")[1];
latText = tmpLatLong.split("%2C")[0]; latText = tmpLatLong.split("%2C")[0];
longText = tmpLatLong.split("%2C")[1]; longText = tmpLatLong.split("%2C")[1];
@@ -330,6 +333,7 @@ class SaljicCrawler {
let numberOfViewsKivi = null; let numberOfViewsKivi = null;
let streetNumber = 0; let streetNumber = 0;
let adStatus = status; let adStatus = status;
let adType = adTypeAttribute;
let shortDescription = descriptions let shortDescription = descriptions
? descriptions.substring(0, descriptions.indexOf(".")) ? descriptions.substring(0, descriptions.indexOf("."))
: ""; : "";
@@ -413,6 +417,7 @@ class SaljicCrawler {
additionalField.length additionalField.length
) )
.trim(); .trim();
realEstateType = this.getAdCategoryId(categoryTmp); realEstateType = this.getAdCategoryId(categoryTmp);
} else { } else {
switch (additionalField) { switch (additionalField) {
@@ -568,10 +573,14 @@ class SaljicCrawler {
numberOfViewsKivi numberOfViewsKivi
}; };
await this.sleep(1000);
return data; return data;
} catch (e) { } catch (e) {
console.error("Exception caught: " + e.message, "\r\nURL:", url); console.error("Exception caught: " + e.message, "\r\nURL:", url);
} }
await this.sleep(1000);
return null; return null;
} }

View File

@@ -17,6 +17,7 @@
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js", "checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
"test-search": "cd test && node searchTest.js", "test-search": "cd test && node searchTest.js",
"test-olx-scraper": "cd test && node olxScrapeTest.js", "test-olx-scraper": "cd test && node olxScrapeTest.js",
"test-saljic-scraper": "cd test && node saljicScrapeTest.js",
"test-rental-scraper": "cd test && node rentalScrapeTest.js" "test-rental-scraper": "cd test && node rentalScrapeTest.js"
}, },
"repository": { "repository": {

17
test/saljicScrapeTest.js Normal file
View File

@@ -0,0 +1,17 @@
"use strict";
// Manual smoke test for the Saljic crawler: scrapes the single ad URL passed
// on the command line and prints whatever scrapeAd resolves to.
//
// Usage: npm run test-saljic-scraper -- URL_TO_SCRAPE
const SaljicCrawler = require("../app/crawler/specificCrawlers/saljic");

// First user-supplied argument; undefined (falsy) when none was given.
const urlToScrape = process.argv[2];

if (urlToScrape) {
  (async () => {
    const crawler = new SaljicCrawler();
    // NOTE(review): only the URL is passed here; any further scrapeAd
    // parameters are left undefined - confirm that is acceptable for a
    // manual test run.
    const data = await crawler.scrapeAd(urlToScrape);
    console.log("Scraped data:", data);
  })().catch((err) => {
    // Without this handler a failure would only surface as an unhandled
    // promise rejection; report it and signal failure to the npm script.
    console.error("Scrape failed:", err);
    process.exitCode = 1;
  });
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}