From 3bb67a4db979143dc77b69e89981688e9d647e40 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 30 Oct 2019 15:02:31 +0100 Subject: [PATCH 1/5] add REQUEST category --- app/common/enums.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/app/common/enums.js b/app/common/enums.js index 9742f21..aa61b84 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -60,7 +60,8 @@ const GARAGE_PRICE_SLIDER_OPTIONS = { const AD_TYPE = { AD_TYPE_SALE: "SALE", - AD_TYPE_RENT: "RENT" + AD_TYPE_RENT: "RENT", + AD_TYPE_REQUEST: "REQUEST" }; const AD_CATEGORY = { @@ -140,7 +141,8 @@ const CRAWLER_AD_TYPE = { NONE: 0, ALL: 1, ONLY_SELL: 2, - ONLY_RENT: 3 + ONLY_RENT: 3, + ONLY_REQUEST: 4 }; module.exports = { From 97d93a3f375951f1149b0f97132d9fba6deacb01 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 30 Oct 2019 15:02:54 +0100 Subject: [PATCH 2/5] add force crawl ENV option for OLX --- app/crawler/specificConfigs/olx.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/app/crawler/specificConfigs/olx.js b/app/crawler/specificConfigs/olx.js index 53ca727..150ec16 100644 --- a/app/crawler/specificConfigs/olx.js +++ b/app/crawler/specificConfigs/olx.js @@ -33,5 +33,7 @@ module.exports = { OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], - OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 + OLX_DELAY_BETWEEN_PAGES: + parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000, + OLX_FORCE_CRAWL: !!parseInt(process.env.OLX_FORCE_CRAWL) }; From 3abbed183ebc837013ab1e6976662bad4d6ce87e Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 30 Oct 2019 15:03:59 +0100 Subject: [PATCH 3/5] implement RENT and REQUEST option for OLX; implement force crawl option --- app/crawler/specificCrawlers/olx.js | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git 
a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index d7176d1..bb08b95 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -22,7 +22,8 @@ const OLX_ENUMS = { OLX_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "", [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", - [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje" + [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje", + [CRAWLER_AD_TYPE.ONLY_REQUEST]: "&vrsta=samopotraznja" }, OLX_AD_CATEGORY: { [AD_CATEGORY.FLAT.id]: "&kategorija=23", @@ -38,6 +39,8 @@ const OLX_ENUMS = { OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; +const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); + class OlxCrawler { constructor( savers = [], @@ -99,7 +102,7 @@ class OlxCrawler { "minute" ); - if (stopCrawlingThisCategory) { + if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) { generatorsToRemove[index] = true; // console.log("\tGenerator ", index + 1, "has no more new ads"); break; @@ -134,7 +137,7 @@ class OlxCrawler { const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; - if (urlAdTypePart && urlCategoryPart) { + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const singlePageResults = await this.indexSinglePage( @@ -212,7 +215,7 @@ class OlxCrawler { title: "#naslovartikla", descriptions: ".artikal_detaljniopis_tekst", category: - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" + "#artikal_glavni_div > div.artikal_lijevo > div.artikal_kat > div > span:nth-child(3) > a > span" }; const username = $(propertySelectors.username) @@ -384,7 +387,7 @@ class OlxCrawler { //========================================= const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { - throw { message: 
"Unknown ad category" }; + throw { message: `Unknown ad category [${category}]` }; } const parsedAdType = this.getAdTypeId(adType); @@ -475,6 +478,8 @@ class OlxCrawler { return AD_TYPE.AD_TYPE_SALE; case "Izdavanje": return AD_TYPE.AD_TYPE_RENT; + case "Potražnja": + return AD_TYPE.AD_TYPE_REQUEST; default: return undefined; } From fa712ce97d7e11b260bf5104fbb20cb2b293a4fd Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 30 Oct 2019 15:53:11 +0100 Subject: [PATCH 4/5] implement RENT option for Rental; implement force crawl option --- app/crawler/specificConfigs/rental.js | 3 ++- app/crawler/specificCrawlers/rental.js | 26 +++++++------------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/app/crawler/specificConfigs/rental.js b/app/crawler/specificConfigs/rental.js index 8930d64..103723e 100644 --- a/app/crawler/specificConfigs/rental.js +++ b/app/crawler/specificConfigs/rental.js @@ -29,5 +29,6 @@ module.exports = { RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], RENTAL_DELAY_BETWEEN_PAGES: - parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 + parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000, + RENTAL_FORCE_CRAWL: !!parseInt(process.env.RENTAL_FORCE_CRAWL) }; diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index b73278e..6293d1f 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -39,6 +39,8 @@ const RENTAL_ENUMS = { RENTAL_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" }; +const { RENTAL_FORCE_CRAWL } = require("../specificConfigs/rental"); + class RentalCrawler { constructor( savers = [], @@ -88,27 +90,13 @@ class RentalCrawler { newRealEstates.push(...newRecords); - if (Array.isArray(newRecords) && newRecords.length === 0) { + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !RENTAL_FORCE_CRAWL + ) { generatorsToRemove[index] = true; } - - 
// for (const existingRecord of existingRecords) { - // const { publishedDate, renewedDate } = existingRecord; - // - // const publishedDateMoment = moment.utc(publishedDate); - // const renewedDateMoment = moment.utc(renewedDate); - // - // const stopCrawlingThisCategory = publishedDateMoment.isSame( - // renewedDateMoment, - // "minute" - // ); - // - // if (stopCrawlingThisCategory) { - // generatorsToRemove[index] = true; - // // console.log("\tGenerator ", index + 1, "has no more new ads"); - // break; - // } - // } } else { //Generator returned undefined, remove this generator from array generatorsToRemove[index] = true; From ecc5b174a075d9e501c9fc2effe9f5b9dd168362 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 30 Oct 2019 17:23:43 +0100 Subject: [PATCH 5/5] implement RENT option for Aktido; implement force crawl option --- app/crawler/specificConfigs/aktido.js | 3 ++- app/crawler/specificCrawlers/aktido.js | 26 +++++++------------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/app/crawler/specificConfigs/aktido.js b/app/crawler/specificConfigs/aktido.js index bd06645..092f09d 100644 --- a/app/crawler/specificConfigs/aktido.js +++ b/app/crawler/specificConfigs/aktido.js @@ -29,5 +29,6 @@ module.exports = { AKTIDO_CRAWLER_AD_CATEGORIES: transformedAktidoCrawlerAdCategories, AKTIDO_IGNORED_USERNAMES: aktidoIgnoredUsernames || [], AKTIDO_DELAY_BETWEEN_PAGES: - parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000 + parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000, + AKTIDO_FORCE_CRAWL: !!parseInt(process.env.AKTIDO_FORCE_CRAWL) }; diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index a2ea43d..6512f15 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -39,6 +39,8 @@ const AKTIDO_ENUMS = { AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" }; +const { AKTIDO_FORCE_CRAWL } = require("../specificConfigs/aktido"); + class 
AktidoCrawler { constructor( savers = [], @@ -88,27 +90,13 @@ class AktidoCrawler { newRealEstates.push(...newRecords); - if (Array.isArray(newRecords) && newRecords.length === 0) { + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !AKTIDO_FORCE_CRAWL + ) { generatorsToRemove[index] = true; } - - // for (const existingRecord of existingRecords) { - // const { publishedDate, renewedDate } = existingRecord; - // - // const publishedDateMoment = moment.utc(publishedDate); - // const renewedDateMoment = moment.utc(renewedDate); - // - // const stopCrawlingThisCategory = publishedDateMoment.isSame( - // renewedDateMoment, - // "minute" - // ); - // - // if (stopCrawlingThisCategory) { - // generatorsToRemove[index] = true; - // // console.log("\tGenerator ", index + 1, "has no more new ads"); - // break; - // } - // } } else { //Generator returned undefined, remove this generator from array generatorsToRemove[index] = true;