From 935ae60ae1cb592d03e063ecec5d3c45d76e2147 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 24 Oct 2019 16:57:23 +0200 Subject: [PATCH] move specific crawler config to the separated files --- app/crawler/crawl.js | 4 +- app/crawler/crawlerConfig.js | 68 +------------------ app/crawler/specificConfigs/olx.js | 37 ++++++++++ app/crawler/specificConfigs/rental.js | 33 +++++++++ .../{specific => specificCrawlers}/olx.js | 0 .../{specific => specificCrawlers}/rental.js | 0 test/olxScrapeTest.js | 2 +- 7 files changed, 75 insertions(+), 69 deletions(-) create mode 100644 app/crawler/specificConfigs/olx.js create mode 100644 app/crawler/specificConfigs/rental.js rename app/crawler/{specific => specificCrawlers}/olx.js (100%) rename app/crawler/{specific => specificCrawlers}/rental.js (100%) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index b219e01..ac4825d 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -5,8 +5,8 @@ All environment specific configuration is read here and passed to the crawlers and savers. */ -const OlxCrawler = require("./specific/olx"); -const RentalCrawler = require("./specific/rental"); +const OlxCrawler = require("./specificCrawlers/olx"); +const RentalCrawler = require("./specificCrawlers/rental"); const { OLX_CONFIG, RENTAL_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index a58575d..29c68f2 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -1,72 +1,8 @@ "use strict"; require("dotenv").config({ path: __dirname + "/./../../.env" }); -const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); -const olxCrawlerAdType = - process.env.OLX_CRAWLER_AD_TYPE !== undefined - ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] - : null; - -const rentalCrawlerAdType = - process.env.RENTAL_CRAWLER_AD_TYPE !== undefined - ? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE] - : null; - -const olxParsedCrawlerAdCategories = - process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined - ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => - category.trim() - ) - : ["FLAT", "HOUSE"]; - -const rentalParsedCrawlerAdCategories = - process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined - ? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category => - category.trim() - ) - : ["FLAT", "HOUSE"]; - -const olxIgnoredUsernames = - process.env.OLX_IGNORED_USERNAMES !== undefined - ? process.env.OLX_IGNORED_USERNAMES.split(",").map(username => - username.trim() - ) - : []; - -const rentalIgnoredUsernames = []; - -const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories - .map(categoryName => - AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined - ) - .filter(category => !!category); - -const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories - .map(categoryName => - AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined - ) - .filter(category => !!category); - -const OLX_CONFIG = { - OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500, - OLX_MAX_RESULTS_PER_PAGE: - parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, - OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, - OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, - OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], - OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 -}; - -const RENTAL_CONFIG = { - RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500, - RENTAL_MAX_RESULTS_PER_PAGE: - parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50, - RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE, - RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, - RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], - RENTAL_DELAY_BETWEEN_PAGES: - parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 -}; +const OLX_CONFIG = require("./specificConfigs/olx"); +const RENTAL_CONFIG = require("./specificConfigs/rental"); module.exports = { OLX_CONFIG, diff --git a/app/crawler/specificConfigs/olx.js b/app/crawler/specificConfigs/olx.js new file mode 100644 index 0000000..53ca727 --- /dev/null +++ b/app/crawler/specificConfigs/olx.js @@ -0,0 +1,37 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const olxCrawlerAdType = + process.env.OLX_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] + : null; + +const olxParsedCrawlerAdCategories = + process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const olxIgnoredUsernames = + process.env.OLX_IGNORED_USERNAMES !== undefined + ? process.env.OLX_IGNORED_USERNAMES.split(",").map(username => + username.trim() + ) + : []; + +const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500, + OLX_MAX_RESULTS_PER_PAGE: + parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, + OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, + OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, + OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], + OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 +}; diff --git a/app/crawler/specificConfigs/rental.js b/app/crawler/specificConfigs/rental.js new file mode 100644 index 0000000..8930d64 --- /dev/null +++ b/app/crawler/specificConfigs/rental.js @@ -0,0 +1,33 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const rentalCrawlerAdType = + process.env.RENTAL_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE] + : null; + +const rentalParsedCrawlerAdCategories = + process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const rentalIgnoredUsernames = []; + +const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500, + RENTAL_MAX_RESULTS_PER_PAGE: + parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50, + RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE, + RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, + RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], + RENTAL_DELAY_BETWEEN_PAGES: + parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 +}; diff --git a/app/crawler/specific/olx.js b/app/crawler/specificCrawlers/olx.js similarity index 100% rename from app/crawler/specific/olx.js rename to app/crawler/specificCrawlers/olx.js diff --git a/app/crawler/specific/rental.js b/app/crawler/specificCrawlers/rental.js similarity index 100% rename from app/crawler/specific/rental.js rename to app/crawler/specificCrawlers/rental.js diff --git a/test/olxScrapeTest.js b/test/olxScrapeTest.js index d95dde6..745dbcb 100644 --- a/test/olxScrapeTest.js +++ b/test/olxScrapeTest.js @@ -1,6 +1,6 @@ "use strict"; -const olxCrawler = require("../app/crawler/specific/olx"); +const olxCrawler = require("../app/crawler/specificCrawlers/olx"); const urlToScrape = process.argv[2] || undefined;