Merge branch 'implement-renting-option' into 'master'
Implement renting option - crawler part. See merge request saburly/marketalarm/web!64
This commit was merged in pull request #64.
This commit is contained in:
@@ -60,7 +60,8 @@ const GARAGE_PRICE_SLIDER_OPTIONS = {
|
|||||||
|
|
||||||
const AD_TYPE = {
|
const AD_TYPE = {
|
||||||
AD_TYPE_SALE: "SALE",
|
AD_TYPE_SALE: "SALE",
|
||||||
AD_TYPE_RENT: "RENT"
|
AD_TYPE_RENT: "RENT",
|
||||||
|
AD_TYPE_REQUEST: "REQUEST"
|
||||||
};
|
};
|
||||||
|
|
||||||
const AD_CATEGORY = {
|
const AD_CATEGORY = {
|
||||||
@@ -140,7 +141,8 @@ const CRAWLER_AD_TYPE = {
|
|||||||
NONE: 0,
|
NONE: 0,
|
||||||
ALL: 1,
|
ALL: 1,
|
||||||
ONLY_SELL: 2,
|
ONLY_SELL: 2,
|
||||||
ONLY_RENT: 3
|
ONLY_RENT: 3,
|
||||||
|
ONLY_REQUEST: 4
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@@ -29,5 +29,6 @@ module.exports = {
|
|||||||
AKTIDO_CRAWLER_AD_CATEGORIES: transformedAktidoCrawlerAdCategories,
|
AKTIDO_CRAWLER_AD_CATEGORIES: transformedAktidoCrawlerAdCategories,
|
||||||
AKTIDO_IGNORED_USERNAMES: aktidoIgnoredUsernames || [],
|
AKTIDO_IGNORED_USERNAMES: aktidoIgnoredUsernames || [],
|
||||||
AKTIDO_DELAY_BETWEEN_PAGES:
|
AKTIDO_DELAY_BETWEEN_PAGES:
|
||||||
parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000
|
parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000,
|
||||||
|
AKTIDO_FORCE_CRAWL: !!parseInt(process.env.AKTIDO_FORCE_CRAWL)
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -33,5 +33,7 @@ module.exports = {
|
|||||||
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||||
OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories,
|
OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories,
|
||||||
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
|
||||||
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000
|
OLX_DELAY_BETWEEN_PAGES:
|
||||||
|
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
|
||||||
|
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -29,5 +29,6 @@ module.exports = {
|
|||||||
RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories,
|
RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories,
|
||||||
RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [],
|
RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [],
|
||||||
RENTAL_DELAY_BETWEEN_PAGES:
|
RENTAL_DELAY_BETWEEN_PAGES:
|
||||||
parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000
|
parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000,
|
||||||
|
RENTAL_FORCE_CRAWL: !!parseInt(process.env.RENTAL_FORCE_CRAWL)
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -39,6 +39,8 @@ const AKTIDO_ENUMS = {
|
|||||||
AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
|
AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const { AKTIDO_FORCE_CRAWL } = require("../specificConfigs/aktido");
|
||||||
|
|
||||||
class AktidoCrawler {
|
class AktidoCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
savers = [],
|
savers = [],
|
||||||
@@ -88,27 +90,13 @@ class AktidoCrawler {
|
|||||||
|
|
||||||
newRealEstates.push(...newRecords);
|
newRealEstates.push(...newRecords);
|
||||||
|
|
||||||
if (Array.isArray(newRecords) && newRecords.length === 0) {
|
if (
|
||||||
|
Array.isArray(newRecords) &&
|
||||||
|
newRecords.length === 0 &&
|
||||||
|
!AKTIDO_FORCE_CRAWL
|
||||||
|
) {
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// for (const existingRecord of existingRecords) {
|
|
||||||
// const { publishedDate, renewedDate } = existingRecord;
|
|
||||||
//
|
|
||||||
// const publishedDateMoment = moment.utc(publishedDate);
|
|
||||||
// const renewedDateMoment = moment.utc(renewedDate);
|
|
||||||
//
|
|
||||||
// const stopCrawlingThisCategory = publishedDateMoment.isSame(
|
|
||||||
// renewedDateMoment,
|
|
||||||
// "minute"
|
|
||||||
// );
|
|
||||||
//
|
|
||||||
// if (stopCrawlingThisCategory) {
|
|
||||||
// generatorsToRemove[index] = true;
|
|
||||||
// // console.log("\tGenerator ", index + 1, "has no more new ads");
|
|
||||||
// break;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
} else {
|
} else {
|
||||||
//Generator returned undefined, remove this generator from array
|
//Generator returned undefined, remove this generator from array
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
|
|||||||
@@ -22,7 +22,8 @@ const OLX_ENUMS = {
|
|||||||
OLX_AD_TYPE: {
|
OLX_AD_TYPE: {
|
||||||
[CRAWLER_AD_TYPE.ALL]: "",
|
[CRAWLER_AD_TYPE.ALL]: "",
|
||||||
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
|
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
|
||||||
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje"
|
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje",
|
||||||
|
[CRAWLER_AD_TYPE.ONLY_REQUEST]: "&vrsta=samopotraznja"
|
||||||
},
|
},
|
||||||
OLX_AD_CATEGORY: {
|
OLX_AD_CATEGORY: {
|
||||||
[AD_CATEGORY.FLAT.id]: "&kategorija=23",
|
[AD_CATEGORY.FLAT.id]: "&kategorija=23",
|
||||||
@@ -38,6 +39,8 @@ const OLX_ENUMS = {
|
|||||||
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||||
|
|
||||||
class OlxCrawler {
|
class OlxCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
savers = [],
|
savers = [],
|
||||||
@@ -99,7 +102,7 @@ class OlxCrawler {
|
|||||||
"minute"
|
"minute"
|
||||||
);
|
);
|
||||||
|
|
||||||
if (stopCrawlingThisCategory) {
|
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
||||||
break;
|
break;
|
||||||
@@ -134,7 +137,7 @@ class OlxCrawler {
|
|||||||
|
|
||||||
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||||
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||||
if (urlAdTypePart && urlCategoryPart) {
|
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||||
while (true) {
|
while (true) {
|
||||||
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||||
const singlePageResults = await this.indexSinglePage(
|
const singlePageResults = await this.indexSinglePage(
|
||||||
@@ -212,7 +215,7 @@ class OlxCrawler {
|
|||||||
title: "#naslovartikla",
|
title: "#naslovartikla",
|
||||||
descriptions: ".artikal_detaljniopis_tekst",
|
descriptions: ".artikal_detaljniopis_tekst",
|
||||||
category:
|
category:
|
||||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
|
"#artikal_glavni_div > div.artikal_lijevo > div.artikal_kat > div > span:nth-child(3) > a > span"
|
||||||
};
|
};
|
||||||
|
|
||||||
const username = $(propertySelectors.username)
|
const username = $(propertySelectors.username)
|
||||||
@@ -384,7 +387,7 @@ class OlxCrawler {
|
|||||||
//=========================================
|
//=========================================
|
||||||
const parsedCategory = this.getAdCategoryId(category);
|
const parsedCategory = this.getAdCategoryId(category);
|
||||||
if (!parsedCategory) {
|
if (!parsedCategory) {
|
||||||
throw { message: "Unknown ad category" };
|
throw { message: `Unknown ad category [${category}]` };
|
||||||
}
|
}
|
||||||
|
|
||||||
const parsedAdType = this.getAdTypeId(adType);
|
const parsedAdType = this.getAdTypeId(adType);
|
||||||
@@ -475,6 +478,8 @@ class OlxCrawler {
|
|||||||
return AD_TYPE.AD_TYPE_SALE;
|
return AD_TYPE.AD_TYPE_SALE;
|
||||||
case "Izdavanje":
|
case "Izdavanje":
|
||||||
return AD_TYPE.AD_TYPE_RENT;
|
return AD_TYPE.AD_TYPE_RENT;
|
||||||
|
case "Potražnja":
|
||||||
|
return AD_TYPE.AD_TYPE_RENT;
|
||||||
default:
|
default:
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,8 @@ const RENTAL_ENUMS = {
|
|||||||
RENTAL_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
|
RENTAL_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const { RENTAL_FORCE_CRAWL } = require("../specificConfigs/rental");
|
||||||
|
|
||||||
class RentalCrawler {
|
class RentalCrawler {
|
||||||
constructor(
|
constructor(
|
||||||
savers = [],
|
savers = [],
|
||||||
@@ -88,27 +90,13 @@ class RentalCrawler {
|
|||||||
|
|
||||||
newRealEstates.push(...newRecords);
|
newRealEstates.push(...newRecords);
|
||||||
|
|
||||||
if (Array.isArray(newRecords) && newRecords.length === 0) {
|
if (
|
||||||
|
Array.isArray(newRecords) &&
|
||||||
|
newRecords.length === 0 &&
|
||||||
|
!RENTAL_FORCE_CRAWL
|
||||||
|
) {
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// for (const existingRecord of existingRecords) {
|
|
||||||
// const { publishedDate, renewedDate } = existingRecord;
|
|
||||||
//
|
|
||||||
// const publishedDateMoment = moment.utc(publishedDate);
|
|
||||||
// const renewedDateMoment = moment.utc(renewedDate);
|
|
||||||
//
|
|
||||||
// const stopCrawlingThisCategory = publishedDateMoment.isSame(
|
|
||||||
// renewedDateMoment,
|
|
||||||
// "minute"
|
|
||||||
// );
|
|
||||||
//
|
|
||||||
// if (stopCrawlingThisCategory) {
|
|
||||||
// generatorsToRemove[index] = true;
|
|
||||||
// // console.log("\tGenerator ", index + 1, "has no more new ads");
|
|
||||||
// break;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
} else {
|
} else {
|
||||||
//Generator returned undefined, remove this generator from array
|
//Generator returned undefined, remove this generator from array
|
||||||
generatorsToRemove[index] = true;
|
generatorsToRemove[index] = true;
|
||||||
|
|||||||
Reference in New Issue
Block a user