Merge branch 'implement-renting-option' into 'master'

Implement renting option - crawler part

See merge request saburly/marketalarm/web!64
This commit was merged in pull request #64.
This commit is contained in:
Bilal Catic
2019-10-31 18:05:54 +00:00
7 changed files with 35 additions and 48 deletions

View File

@@ -60,7 +60,8 @@ const GARAGE_PRICE_SLIDER_OPTIONS = {
const AD_TYPE = { const AD_TYPE = {
AD_TYPE_SALE: "SALE", AD_TYPE_SALE: "SALE",
AD_TYPE_RENT: "RENT" AD_TYPE_RENT: "RENT",
AD_TYPE_REQUEST: "REQUEST"
}; };
const AD_CATEGORY = { const AD_CATEGORY = {
@@ -140,7 +141,8 @@ const CRAWLER_AD_TYPE = {
NONE: 0, NONE: 0,
ALL: 1, ALL: 1,
ONLY_SELL: 2, ONLY_SELL: 2,
ONLY_RENT: 3 ONLY_RENT: 3,
ONLY_REQUEST: 4
}; };
module.exports = { module.exports = {

View File

@@ -29,5 +29,6 @@ module.exports = {
AKTIDO_CRAWLER_AD_CATEGORIES: transformedAktidoCrawlerAdCategories, AKTIDO_CRAWLER_AD_CATEGORIES: transformedAktidoCrawlerAdCategories,
AKTIDO_IGNORED_USERNAMES: aktidoIgnoredUsernames || [], AKTIDO_IGNORED_USERNAMES: aktidoIgnoredUsernames || [],
AKTIDO_DELAY_BETWEEN_PAGES: AKTIDO_DELAY_BETWEEN_PAGES:
parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000 parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000,
AKTIDO_FORCE_CRAWL: !!parseInt(process.env.AKTIDO_FORCE_CRAWL)
}; };

View File

@@ -33,5 +33,7 @@ module.exports = {
OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE,
OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories,
OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [],
OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 OLX_DELAY_BETWEEN_PAGES:
parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000,
OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL)
}; };

View File

@@ -29,5 +29,6 @@ module.exports = {
RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories,
RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [],
RENTAL_DELAY_BETWEEN_PAGES: RENTAL_DELAY_BETWEEN_PAGES:
parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000,
RENTAL_FORCE_CRAWL: !!parseInt(process.env.RENTAL_FORCE_CRAWL)
}; };

View File

@@ -39,6 +39,8 @@ const AKTIDO_ENUMS = {
AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
}; };
const { AKTIDO_FORCE_CRAWL } = require("../specificConfigs/aktido");
class AktidoCrawler { class AktidoCrawler {
constructor( constructor(
savers = [], savers = [],
@@ -88,27 +90,13 @@ class AktidoCrawler {
newRealEstates.push(...newRecords); newRealEstates.push(...newRecords);
if (Array.isArray(newRecords) && newRecords.length === 0) { if (
Array.isArray(newRecords) &&
newRecords.length === 0 &&
!AKTIDO_FORCE_CRAWL
) {
generatorsToRemove[index] = true; generatorsToRemove[index] = true;
} }
// for (const existingRecord of existingRecords) {
// const { publishedDate, renewedDate } = existingRecord;
//
// const publishedDateMoment = moment.utc(publishedDate);
// const renewedDateMoment = moment.utc(renewedDate);
//
// const stopCrawlingThisCategory = publishedDateMoment.isSame(
// renewedDateMoment,
// "minute"
// );
//
// if (stopCrawlingThisCategory) {
// generatorsToRemove[index] = true;
// // console.log("\tGenerator ", index + 1, "has no more new ads");
// break;
// }
// }
} else { } else {
//Generator returned undefined, remove this generator from array //Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true; generatorsToRemove[index] = true;

View File

@@ -22,7 +22,8 @@ const OLX_ENUMS = {
OLX_AD_TYPE: { OLX_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "", [CRAWLER_AD_TYPE.ALL]: "",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje" [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje",
[CRAWLER_AD_TYPE.ONLY_REQUEST]: "&vrsta=samopotraznja"
}, },
OLX_AD_CATEGORY: { OLX_AD_CATEGORY: {
[AD_CATEGORY.FLAT.id]: "&kategorija=23", [AD_CATEGORY.FLAT.id]: "&kategorija=23",
@@ -38,6 +39,8 @@ const OLX_ENUMS = {
OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm"
}; };
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
class OlxCrawler { class OlxCrawler {
constructor( constructor(
savers = [], savers = [],
@@ -99,7 +102,7 @@ class OlxCrawler {
"minute" "minute"
); );
if (stopCrawlingThisCategory) { if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
generatorsToRemove[index] = true; generatorsToRemove[index] = true;
// console.log("\tGenerator ", index + 1, "has no more new ads"); // console.log("\tGenerator ", index + 1, "has no more new ads");
break; break;
@@ -134,7 +137,7 @@ class OlxCrawler {
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
if (urlAdTypePart && urlCategoryPart) { if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
while (true) { while (true) {
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
const singlePageResults = await this.indexSinglePage( const singlePageResults = await this.indexSinglePage(
@@ -212,7 +215,7 @@ class OlxCrawler {
title: "#naslovartikla", title: "#naslovartikla",
descriptions: ".artikal_detaljniopis_tekst", descriptions: ".artikal_detaljniopis_tekst",
category: category:
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" "#artikal_glavni_div > div.artikal_lijevo > div.artikal_kat > div > span:nth-child(3) > a > span"
}; };
const username = $(propertySelectors.username) const username = $(propertySelectors.username)
@@ -384,7 +387,7 @@ class OlxCrawler {
//========================================= //=========================================
const parsedCategory = this.getAdCategoryId(category); const parsedCategory = this.getAdCategoryId(category);
if (!parsedCategory) { if (!parsedCategory) {
throw { message: "Unknown ad category" }; throw { message: `Unknown ad category [${category}]` };
} }
const parsedAdType = this.getAdTypeId(adType); const parsedAdType = this.getAdTypeId(adType);
@@ -475,6 +478,8 @@ class OlxCrawler {
return AD_TYPE.AD_TYPE_SALE; return AD_TYPE.AD_TYPE_SALE;
case "Izdavanje": case "Izdavanje":
return AD_TYPE.AD_TYPE_RENT; return AD_TYPE.AD_TYPE_RENT;
case "Potražnja":
return AD_TYPE.AD_TYPE_RENT;
default: default:
return undefined; return undefined;
} }

View File

@@ -39,6 +39,8 @@ const RENTAL_ENUMS = {
RENTAL_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" RENTAL_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
}; };
const { RENTAL_FORCE_CRAWL } = require("../specificConfigs/rental");
class RentalCrawler { class RentalCrawler {
constructor( constructor(
savers = [], savers = [],
@@ -88,27 +90,13 @@ class RentalCrawler {
newRealEstates.push(...newRecords); newRealEstates.push(...newRecords);
if (Array.isArray(newRecords) && newRecords.length === 0) { if (
Array.isArray(newRecords) &&
newRecords.length === 0 &&
!RENTAL_FORCE_CRAWL
) {
generatorsToRemove[index] = true; generatorsToRemove[index] = true;
} }
// for (const existingRecord of existingRecords) {
// const { publishedDate, renewedDate } = existingRecord;
//
// const publishedDateMoment = moment.utc(publishedDate);
// const renewedDateMoment = moment.utc(renewedDate);
//
// const stopCrawlingThisCategory = publishedDateMoment.isSame(
// renewedDateMoment,
// "minute"
// );
//
// if (stopCrawlingThisCategory) {
// generatorsToRemove[index] = true;
// // console.log("\tGenerator ", index + 1, "has no more new ads");
// break;
// }
// }
} else { } else {
//Generator returned undefined, remove this generator from array //Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true; generatorsToRemove[index] = true;