Prepared config files.

This commit is contained in:
Naida Vatric
2020-01-29 01:09:53 +01:00
parent 259799144e
commit b11f18696f
6 changed files with 139 additions and 3 deletions

View File

@@ -223,7 +223,8 @@ const AD_AGENCY = {
OLX: "OLX",
RENTAL: "RENTAL",
PROSTOR: "PROSTOR",
AKTIDO: "AKTIDO"
AKTIDO: "AKTIDO",
SALJIC: "SALJIC"
};
const CRAWLER_AD_TYPE = {

View File

@@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx");
const RentalCrawler = require("./specificCrawlers/rental");
const ProstorCrawler = require("./specificCrawlers/prostor");
const AktidoCrawler = require("./specificCrawlers/aktido");
const SaljicCrawler = require("./specificCrawlers/saljic");
const {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
AKTIDO_CONFIG
AKTIDO_CONFIG,
SALJIC_CONFIG
} = require("./crawlerConfig");
const PostgresSaver = require("./savers/postgres");
@@ -57,6 +59,15 @@ async function crawlAll() {
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
),
new SaljicCrawler(
[postgresSaver],
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
SALJIC_CONFIG.SALJIC_MAX_PAGES,
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
)
];

View File

@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
const RENTAL_CONFIG = require("./specificConfigs/rental");
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
const SALJIC_CONFIG = require("./specificConfigs/saljic");
module.exports = {
OLX_CONFIG,
RENTAL_CONFIG,
PROSTOR_CONFIG,
AKTIDO_CONFIG
AKTIDO_CONFIG,
SALJIC_CONFIG
};

View File

@@ -0,0 +1,34 @@
"use strict";
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
const saljicCrawlerAdType =
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
: null;
const saljicParsedCrawlerAdCategories =
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
category.trim()
)
: ["FLAT", "HOUSE"];
const saljicIgnoredUsernames = [];
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
.map(categoryName =>
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
)
.filter(category => !!category);
module.exports = {
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
SALJIC_MAX_RESULTS_PER_PAGE:
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
SALJIC_DELAY_BETWEEN_PAGES:
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
};

View File

@@ -0,0 +1,83 @@
"use strict";
const fetch = require("node-fetch");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE
} = require("../../config/appConfig");
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
const SALJIC_ENUMS = {
SALJIC_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
},
SALJIC_AD_CATEGORY: {
[AD_CATEGORY.ALL.id]: "&input_kategorija=",
[AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
[AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
[AD_CATEGORY.LAND.id]: "&input_kategorija=5", //3 and 4 also gradjevinsko
[AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
[AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
[AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
//[AD_CATEGORY.COTTAGE.id]: ""
}
};
class SaljicCrawler {
constructor(
savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
maxPages = 5000,
maxResultsPerPage = 5000,
ignoredUsernames = [],
delayBetweenPages = 1000
) {
this.savers = savers;
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages;
}
async crawl() {
//
console.log("Saljic URL: ", this.baseUrl);
}
//======= HELPER FUNCTIONS =============
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results) {
const savers = this.savers;
// for (const saver of savers) {
// await saver.save(results);
// }
//For now, we use only Postgres saver, so ...
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
}
module.exports = SaljicCrawler;

View File

@@ -59,3 +59,8 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
AKTIDO_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
#==SALJIC NEKRETNINE==
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found