Changed delay between ads.
This commit is contained in:
@@ -217,6 +217,8 @@ class OlxCrawler {
|
|||||||
//let numberOfParseErrors = 0;
|
//let numberOfParseErrors = 0;
|
||||||
// do {
|
// do {
|
||||||
try {
|
try {
|
||||||
|
await this.sleep(this.delayBetweenAds);
|
||||||
|
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
@@ -694,6 +696,8 @@ class OlxCrawler {
|
|||||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||||
}
|
}
|
||||||
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
// } while (hasParseErrors && numberOfParseErrors <= 1);
|
||||||
|
await this.sleep(this.delayBetweenAds);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -84,6 +84,7 @@ class SaljicCrawler {
|
|||||||
for (const [index, { value: singlePageResult }] of entries) {
|
for (const [index, { value: singlePageResult }] of entries) {
|
||||||
if (singlePageResult) {
|
if (singlePageResult) {
|
||||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||||
|
|
||||||
const { newRecords } = saveResults;
|
const { newRecords } = saveResults;
|
||||||
|
|
||||||
newRealEstates.push(...newRecords);
|
newRealEstates.push(...newRecords);
|
||||||
@@ -217,8 +218,8 @@ class SaljicCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async scrapeAd(url, adType) {
|
async scrapeAd(url, adTypeAttribute) {
|
||||||
console.log("[SALJIC] Scraping : ", url);
|
//console.log("[SALJIC] Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
@@ -234,7 +235,7 @@ class SaljicCrawler {
|
|||||||
//Extracting main properties
|
//Extracting main properties
|
||||||
const propertySelectors = {
|
const propertySelectors = {
|
||||||
title:
|
title:
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
|
"div.content-wrap > div.container.clearfix.wpc > div.col-md-8.nobottommargin > div.single-post.nobottommargin > div.entry.clearfix > div.entry-title > h2",
|
||||||
price:
|
price:
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
||||||
streetName:
|
streetName:
|
||||||
@@ -245,6 +246,7 @@ class SaljicCrawler {
|
|||||||
latAndLong:
|
latAndLong:
|
||||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
|
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
|
||||||
};
|
};
|
||||||
|
|
||||||
const title = $(propertySelectors.title)
|
const title = $(propertySelectors.title)
|
||||||
.text()
|
.text()
|
||||||
.replace(/(\r\n|\n|\r)/gm, "")
|
.replace(/(\r\n|\n|\r)/gm, "")
|
||||||
@@ -277,7 +279,8 @@ class SaljicCrawler {
|
|||||||
let tmpLatLong;
|
let tmpLatLong;
|
||||||
let latText;
|
let latText;
|
||||||
let longText;
|
let longText;
|
||||||
if (latAndLongSrc) {
|
|
||||||
|
if (latAndLongSrc && latAndLongSrc.indexOf("openstreetmap") !== -1) {
|
||||||
tmpLatLong = latAndLongSrc.split("marker=")[1];
|
tmpLatLong = latAndLongSrc.split("marker=")[1];
|
||||||
latText = tmpLatLong.split("%2C")[0];
|
latText = tmpLatLong.split("%2C")[0];
|
||||||
longText = tmpLatLong.split("%2C")[1];
|
longText = tmpLatLong.split("%2C")[1];
|
||||||
@@ -330,6 +333,7 @@ class SaljicCrawler {
|
|||||||
let numberOfViewsKivi = null;
|
let numberOfViewsKivi = null;
|
||||||
let streetNumber = 0;
|
let streetNumber = 0;
|
||||||
let adStatus = status;
|
let adStatus = status;
|
||||||
|
let adType = adTypeAttribute;
|
||||||
let shortDescription = descriptions
|
let shortDescription = descriptions
|
||||||
? descriptions.substring(0, descriptions.indexOf("."))
|
? descriptions.substring(0, descriptions.indexOf("."))
|
||||||
: "";
|
: "";
|
||||||
@@ -413,6 +417,7 @@ class SaljicCrawler {
|
|||||||
additionalField.length
|
additionalField.length
|
||||||
)
|
)
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
realEstateType = this.getAdCategoryId(categoryTmp);
|
realEstateType = this.getAdCategoryId(categoryTmp);
|
||||||
} else {
|
} else {
|
||||||
switch (additionalField) {
|
switch (additionalField) {
|
||||||
@@ -568,10 +573,14 @@ class SaljicCrawler {
|
|||||||
numberOfViewsKivi
|
numberOfViewsKivi
|
||||||
};
|
};
|
||||||
|
|
||||||
|
await this.sleep(1000);
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||||
}
|
}
|
||||||
|
await this.sleep(1000);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
||||||
"test-search": "cd test && node searchTest.js",
|
"test-search": "cd test && node searchTest.js",
|
||||||
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
||||||
|
"test-saljic-scraper": "cd test && node saljicScrapeTest.js",
|
||||||
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
|
"test-rental-scraper": "cd test && node rentalScrapeTest.js"
|
||||||
},
|
},
|
||||||
"repository": {
|
"repository": {
|
||||||
|
|||||||
17
test/saljicScrapeTest.js
Normal file
17
test/saljicScrapeTest.js
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const saljicCrawler = require("../app/crawler/specificCrawlers/saljic");

// Manual smoke test for the Saljic scraper: scrapes the single ad whose URL
// is given as the first CLI argument and prints the result.
//
// Usage: npm run test-saljic-scraper -- URL_TO_SCRAPE
const urlToScrape = process.argv[2];

if (urlToScrape) {
  const crawler = new saljicCrawler();

  (async () => {
    // Only the URL is passed; scrapeAd's second parameter is left undefined.
    const data = await crawler.scrapeAd(urlToScrape);
    console.log("Scraped data:", data);
  })().catch((e) => {
    // The IIFE's promise must not be left floating: report the failure and
    // signal it through the exit code instead of an unhandled rejection.
    console.error("Scrape test failed:", e);
    process.exitCode = 1;
  });
} else {
  console.log("No URL to scrape. Use like this : ");
  console.log("npm run test-saljic-scraper -- URL_TO_SCRAPE");
}
|
||||||
Reference in New Issue
Block a user