Implement RENT and REQUEST ad-type options for OLX; implement force-crawl option

This commit is contained in:
Bilal Catic
2019-10-30 15:03:59 +01:00
parent 97d93a3f37
commit 3abbed183e

View File

@@ -22,7 +22,8 @@ const OLX_ENUMS = {
OLX_AD_TYPE: { OLX_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "", [CRAWLER_AD_TYPE.ALL]: "",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje" [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje",
[CRAWLER_AD_TYPE.ONLY_REQUEST]: "&vrsta=samopotraznja"
}, },
OLX_AD_CATEGORY: { OLX_AD_CATEGORY: {
[AD_CATEGORY.FLAT.id]: "&kategorija=23", [AD_CATEGORY.FLAT.id]: "&kategorija=23",
@@ -38,6 +39,8 @@ const OLX_ENUMS = {
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
}; };
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
class OlxCrawler { class OlxCrawler {
constructor( constructor(
savers = [], savers = [],
@@ -99,7 +102,7 @@ class OlxCrawler {
"minute" "minute"
); );
if (stopCrawlingThisCategory) { if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
generatorsToRemove[index] = true; generatorsToRemove[index] = true;
// console.log("\tGenerator ", index + 1, "has no more new ads"); // console.log("\tGenerator ", index + 1, "has no more new ads");
break; break;
@@ -134,7 +137,7 @@ class OlxCrawler {
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart && urlCategoryPart) { if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
while (true) { while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage( const singlePageResults = await this.indexSinglePage(
@@ -212,7 +215,7 @@ class OlxCrawler {
title: "#naslovartikla", title: "#naslovartikla",
descriptions: ".artikal_detaljniopis_tekst", descriptions: ".artikal_detaljniopis_tekst",
category: category:
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" "#artikal_glavni_div > div.artikal_lijevo > div.artikal_kat > div > span:nth-child(3) > a > span"
}; };
const username = $(propertySelectors.username) const username = $(propertySelectors.username)
@@ -384,7 +387,7 @@ class OlxCrawler {
//========================================= //=========================================
const parsedCategory = this.getAdCategoryId(category); const parsedCategory = this.getAdCategoryId(category);
if (!parsedCategory) { if (!parsedCategory) {
throw { message: "Unknown ad category" }; throw { message: `Unknown ad category [${category}]` };
} }
const parsedAdType = this.getAdTypeId(adType); const parsedAdType = this.getAdTypeId(adType);
@@ -475,6 +478,8 @@ class OlxCrawler {
return AD_TYPE.AD_TYPE_SALE; return AD_TYPE.AD_TYPE_SALE;
case "Izdavanje": case "Izdavanje":
return AD_TYPE.AD_TYPE_RENT; return AD_TYPE.AD_TYPE_RENT;
case "Potražnja":
return AD_TYPE.AD_TYPE_RENT;
default: default:
return undefined; return undefined;
} }