Files
old-web/app/crawler/crawl.js

93 lines
2.8 KiB
JavaScript
Raw Normal View History

"use strict";
/*
Entry point for crawling functionality
All communication between crawlers and savers is here
All environment specific configuration is read here and
passed to the crawlers and savers.
*/
const OlxCrawler = require("./specificCrawlers/olx");
const RentalCrawler = require("./specificCrawlers/rental");
2019-10-25 10:54:08 +02:00
const ProstorCrawler = require("./specificCrawlers/prostor");
const AktidoCrawler = require("./specificCrawlers/aktido");
2020-01-29 01:09:53 +01:00
const SaljicCrawler = require("./specificCrawlers/saljic");
2019-10-25 10:54:08 +02:00
const {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
2020-01-29 01:09:53 +01:00
AKTIDO_CONFIG,
SALJIC_CONFIG
2019-10-25 10:54:08 +02:00
} = require("./crawlerConfig");
const PostgresSaver = require("./savers/postgres");
async function crawlAll() {
const postgresSaver = new PostgresSaver();
const crawlers = [
new OlxCrawler(
[postgresSaver],
OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES,
OLX_CONFIG.OLX_MAX_PAGES,
OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
OLX_CONFIG.OLX_IGNORED_USERNAMES,
OLX_CONFIG.OLX_DELAY_BETWEEN_PAGES
),
new RentalCrawler(
[postgresSaver],
RENTAL_CONFIG.RENTAL_CRAWLER_AD_TYPE,
RENTAL_CONFIG.RENTAL_CRAWLER_AD_CATEGORIES,
RENTAL_CONFIG.RENTAL_MAX_PAGES,
RENTAL_CONFIG.RENTAL_MAX_RESULTS_PER_PAGE,
RENTAL_CONFIG.RENTAL_IGNORED_USERNAMES,
RENTAL_CONFIG.RENTAL_DELAY_BETWEEN_PAGES
2019-10-25 10:54:08 +02:00
),
new ProstorCrawler(
[postgresSaver],
PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_TYPE,
PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_CATEGORIES,
PROSTOR_CONFIG.PROSTOR_MAX_PAGES,
PROSTOR_CONFIG.PROSTOR_MAX_RESULTS_PER_PAGE,
PROSTOR_CONFIG.PROSTOR_IGNORED_USERNAMES,
PROSTOR_CONFIG.PROSTOR_DELAY_BETWEEN_PAGES
),
new AktidoCrawler(
[postgresSaver],
AKTIDO_CONFIG.AKTIDO_CRAWLER_AD_TYPE,
AKTIDO_CONFIG.AKTIDO_CRAWLER_AD_CATEGORIES,
AKTIDO_CONFIG.AKTIDO_MAX_PAGES,
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
2020-01-29 01:09:53 +01:00
),
new SaljicCrawler(
[postgresSaver],
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
SALJIC_CONFIG.SALJIC_MAX_PAGES,
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
)
];
const newRealEstates = [];
for (const crawler of crawlers) {
try {
const newRealEstatesFromSingleCrawler = await crawler.crawl();
if (Array.isArray(newRealEstatesFromSingleCrawler)) {
newRealEstates.push(...newRealEstatesFromSingleCrawler);
}
} catch (e) {
console.log("Error crawling. Trying next crawler! ", e);
}
}
return newRealEstates;
}
module.exports = {
crawlAll
};