diff --git a/app/common/enums.js b/app/common/enums.js index f72d0bd..1e46f03 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -133,7 +133,8 @@ const AD_STATUS = { }; const AD_AGENCY = { - OLX: "OLX" + OLX: "OLX", + RENTAL: "RENTAL" }; const CRAWLER_AD_TYPE = { diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 65d94ad..b219e01 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -6,30 +6,49 @@ passed to the crawlers and savers. */ const OlxCrawler = require("./specific/olx"); -const { OLX_CONFIG } = require("./crawlerConfig"); +const RentalCrawler = require("./specific/rental"); + +const { OLX_CONFIG, RENTAL_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); -const crawlers = [ - new OlxCrawler( - [new PostgresSaver()], - OLX_CONFIG.OLX_CRAWLER_AD_TYPE, - OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, - OLX_CONFIG.OLX_MAX_PAGES, - OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, - OLX_CONFIG.OLX_IGNORED_USERNAMES, - OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES - ) -]; - async function crawlAll() { - for (let crawler of crawlers) { + const postgresSaver = new PostgresSaver(); + + const crawlers = [ + new OlxCrawler( + [postgresSaver], + OLX_CONFIG.OLX_CRAWLER_AD_TYPE, + OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, + OLX_CONFIG.OLX_MAX_PAGES, + OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, + OLX_CONFIG.OLX_IGNORED_USERNAMES, + OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES + ), + new RentalCrawler( + [postgresSaver], + RENTAL_CONFIG.RENTAL_CRAWLER_AD_TYPE, + RENTAL_CONFIG.RENTAL_CRAWLER_AD_CATEGORIES, + RENTAL_CONFIG.RENTAL_MAX_PAGES, + RENTAL_CONFIG.RENTAL_MAX_RESULTS_PER_PAGE, + RENTAL_CONFIG.RENTAL_IGNORED_USERNAMES, + RENTAL_CONFIG.RENTAL_DELAY_BETWEEN_PAGES + ) + ]; + + const newRealEstates = []; + + for (const crawler of crawlers) { try { - return await crawler.crawl(); + const newRealEstatesFromSingleCrawler = await crawler.crawl(); + if (Array.isArray(newRealEstatesFromSingleCrawler)) { + 
newRealEstates.push(...newRealEstatesFromSingleCrawler); + } } catch (e) { console.log("Error crawling. Trying next crawler! ", e); - return []; } } + + return newRealEstates; } module.exports = { diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index 3b2abef..a58575d 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -7,6 +7,11 @@ const olxCrawlerAdType = ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] : null; +const rentalCrawlerAdType = + process.env.RENTAL_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE] + : null; + const olxParsedCrawlerAdCategories = process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => @@ -14,6 +19,13 @@ const olxParsedCrawlerAdCategories = ) : ["FLAT", "HOUSE"]; +const rentalParsedCrawlerAdCategories = + process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + const olxIgnoredUsernames = process.env.OLX_IGNORED_USERNAMES !== undefined ? process.env.OLX_IGNORED_USERNAMES.split(",").map(username => @@ -21,7 +33,15 @@ const olxIgnoredUsernames = ) : []; -const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories +const rentalIgnoredUsernames = []; + +const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories .map(categoryName => AD_CATEGORY[categoryName] ? 
AD_CATEGORY[categoryName].id : undefined ) @@ -32,11 +52,23 @@ const OLX_CONFIG = { OLX_MAX_RESULTS_PER_PAGE: parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, - OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories, + OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 }; -module.exports = { - OLX_CONFIG +const RENTAL_CONFIG = { + RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500, + RENTAL_MAX_RESULTS_PER_PAGE: + parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50, + RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE, + RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, + RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], + RENTAL_DELAY_BETWEEN_PAGES: + parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 +}; + +module.exports = { + OLX_CONFIG, + RENTAL_CONFIG }; diff --git a/development.env b/development.env index 0221eaf..656ed15 100644 --- a/development.env +++ b/development.env @@ -30,3 +30,10 @@ OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check commo OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore OLX_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page +#==RENTAL== +RENTAL_MAX_PAGES=Restrict crawler to this number of pages +RENTAL_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved +RENTAL_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values +RENTAL_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values 
+RENTAL_IGNORED_USERNAMES=!!! This is not used for rental crawler !!! +RENTAL_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page