Prepared config files.
This commit is contained in:
@@ -223,7 +223,8 @@ const AD_AGENCY = {
|
||||
OLX: "OLX",
|
||||
RENTAL: "RENTAL",
|
||||
PROSTOR: "PROSTOR",
|
||||
AKTIDO: "AKTIDO"
|
||||
AKTIDO: "AKTIDO",
|
||||
SALJIC: "SALJIC"
|
||||
};
|
||||
|
||||
const CRAWLER_AD_TYPE = {
|
||||
|
||||
@@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx");
|
||||
const RentalCrawler = require("./specificCrawlers/rental");
|
||||
const ProstorCrawler = require("./specificCrawlers/prostor");
|
||||
const AktidoCrawler = require("./specificCrawlers/aktido");
|
||||
const SaljicCrawler = require("./specificCrawlers/saljic");
|
||||
|
||||
const {
|
||||
OLX_CONFIG,
|
||||
RENTAL_CONFIG,
|
||||
PROSTOR_CONFIG,
|
||||
AKTIDO_CONFIG
|
||||
AKTIDO_CONFIG,
|
||||
SALJIC_CONFIG
|
||||
} = require("./crawlerConfig");
|
||||
const PostgresSaver = require("./savers/postgres");
|
||||
|
||||
@@ -57,6 +59,15 @@ async function crawlAll() {
|
||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||
),
|
||||
new SaljicCrawler(
|
||||
[postgresSaver],
|
||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||
)
|
||||
];
|
||||
|
||||
|
||||
@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
|
||||
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
||||
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
||||
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
||||
const SALJIC_CONFIG = require("./specificConfigs/saljic");
|
||||
|
||||
module.exports = {
|
||||
OLX_CONFIG,
|
||||
RENTAL_CONFIG,
|
||||
PROSTOR_CONFIG,
|
||||
AKTIDO_CONFIG
|
||||
AKTIDO_CONFIG,
|
||||
SALJIC_CONFIG
|
||||
};
|
||||
|
||||
34
app/crawler/specificConfigs/saljic.js
Normal file
34
app/crawler/specificConfigs/saljic.js
Normal file
@@ -0,0 +1,34 @@
|
||||
"use strict";
|
||||
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
|
||||
|
||||
const saljicCrawlerAdType =
|
||||
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
|
||||
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
|
||||
: null;
|
||||
|
||||
const saljicParsedCrawlerAdCategories =
|
||||
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
|
||||
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||
category.trim()
|
||||
)
|
||||
: ["FLAT", "HOUSE"];
|
||||
|
||||
const saljicIgnoredUsernames = [];
|
||||
|
||||
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
|
||||
.map(categoryName =>
|
||||
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
|
||||
)
|
||||
.filter(category => !!category);
|
||||
|
||||
module.exports = {
|
||||
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
|
||||
SALJIC_MAX_RESULTS_PER_PAGE:
|
||||
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
|
||||
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
|
||||
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
|
||||
SALJIC_DELAY_BETWEEN_PAGES:
|
||||
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
|
||||
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
|
||||
};
|
||||
83
app/crawler/specificCrawlers/saljic.js
Normal file
83
app/crawler/specificCrawlers/saljic.js
Normal file
@@ -0,0 +1,83 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("node-fetch");
|
||||
const cheerio = require("cheerio");
|
||||
const moment = require("moment-timezone");
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
AD_CATEGORY,
|
||||
AD_AGENCY,
|
||||
AD_STATUS,
|
||||
CRAWLER_AD_TYPE,
|
||||
FURNISHING_TYPE,
|
||||
HEATING_TYPE
|
||||
} = require("../../common/enums");
|
||||
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
DEFAULT_TIMEZONE
|
||||
} = require("../../config/appConfig");
|
||||
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
||||
|
||||
// Query-string fragment for each crawler ad type; an empty value applies no
// type filter.
const saljicAdTypeQueryByCrawlerType = {
  [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
  [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
  [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
};

// Query-string fragment for each ad category id; an empty value applies no
// category filter. AD_CATEGORY.COTTAGE has no mapping here.
const saljicCategoryQueryByCategoryId = {
  [AD_CATEGORY.ALL.id]: "&input_kategorija=",
  [AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
  [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
  // Site codes 3 and 4 are also building land ("gradjevinsko").
  [AD_CATEGORY.LAND.id]: "&input_kategorija=5",
  [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
  [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
  [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
};

// Lookup tables that translate the application's enums into the query
// parameters used by the Saljic search URL.
const SALJIC_ENUMS = {
  SALJIC_AD_TYPE: saljicAdTypeQueryByCrawlerType,
  SALJIC_AD_CATEGORY: saljicCategoryQueryByCategoryId
};
|
||||
|
||||
/**
 * Crawler for the saljicnekretnine.ba real-estate search.
 *
 * The crawl itself is not implemented yet: `crawl()` only logs the base URL.
 * The constructor stores every setting it is given so that the crawl logic
 * can read them from the instance.
 */
class SaljicCrawler {
  /**
   * @param {Array<{save: Function}>} savers - Objects exposing `save(results)`;
   *   only the first one is used by `saveCrawledResults`.
   * @param {*} crawlerAdTypes - A CRAWLER_AD_TYPE value selecting which ad
   *   types to crawl.
   * @param {Array} crawlerAdCategories - Categories to crawl.
   *   NOTE(review): the default holds whole AD_CATEGORY entries, while
   *   SALJIC_ENUMS.SALJIC_AD_CATEGORY is keyed by `.id` - confirm which form
   *   the crawl logic expects.
   * @param {number} maxPages - Upper bound on the number of pages to crawl.
   * @param {number} maxResultsPerPage - Number of ads requested per page.
   * @param {string[]} ignoredUsernames - Usernames whose ads should be skipped.
   * @param {number} delayBetweenPages - Milliseconds to wait between pages.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    // Previously accepted but silently dropped; kept so the configured limits
    // are available on the instance.
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Entry point of the crawl. Currently a placeholder that only logs the
   * search URL.
   *
   * @returns {Promise<void>}
   */
  async crawl() {
    console.log("Saljic URL: ", this.baseUrl);
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Resolves after roughly `ms` milliseconds.
   *
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Persists crawled results through the first configured saver and returns
   * whatever that saver returns.
   *
   * Only the first saver is used on purpose: for now it is the Postgres
   * saver, and its return value gives the caller the sequelize options and
   * information about the inserted data.
   *
   * @param {*} results - Crawled data handed to the saver unchanged.
   * @returns {Promise<*>} The result of the saver's `save` call.
   * @throws {TypeError} When no saver has been configured.
   */
  async saveCrawledResults(results) {
    const [primarySaver] = this.savers;

    if (!primarySaver) {
      throw new TypeError("SaljicCrawler: no saver configured");
    }

    return primarySaver.save(results);
  }
}
|
||||
|
||||
module.exports = SaljicCrawler;
|
||||
@@ -59,3 +59,8 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
|
||||
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||
#==SALJIC NEKRETNINE==
|
||||
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
||||
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||
Reference in New Issue
Block a user