Prepared config files.
This commit is contained in:
@@ -223,7 +223,8 @@ const AD_AGENCY = {
|
|||||||
OLX: "OLX",
|
OLX: "OLX",
|
||||||
RENTAL: "RENTAL",
|
RENTAL: "RENTAL",
|
||||||
PROSTOR: "PROSTOR",
|
PROSTOR: "PROSTOR",
|
||||||
AKTIDO: "AKTIDO"
|
AKTIDO: "AKTIDO",
|
||||||
|
SALJIC: "SALJIC"
|
||||||
};
|
};
|
||||||
|
|
||||||
const CRAWLER_AD_TYPE = {
|
const CRAWLER_AD_TYPE = {
|
||||||
|
|||||||
@@ -9,12 +9,14 @@ const OlxCrawler = require("./specificCrawlers/olx");
|
|||||||
const RentalCrawler = require("./specificCrawlers/rental");
|
const RentalCrawler = require("./specificCrawlers/rental");
|
||||||
const ProstorCrawler = require("./specificCrawlers/prostor");
|
const ProstorCrawler = require("./specificCrawlers/prostor");
|
||||||
const AktidoCrawler = require("./specificCrawlers/aktido");
|
const AktidoCrawler = require("./specificCrawlers/aktido");
|
||||||
|
const SaljicCrawler = require("./specificCrawlers/saljic");
|
||||||
|
|
||||||
const {
|
const {
|
||||||
OLX_CONFIG,
|
OLX_CONFIG,
|
||||||
RENTAL_CONFIG,
|
RENTAL_CONFIG,
|
||||||
PROSTOR_CONFIG,
|
PROSTOR_CONFIG,
|
||||||
AKTIDO_CONFIG
|
AKTIDO_CONFIG,
|
||||||
|
SALJIC_CONFIG
|
||||||
} = require("./crawlerConfig");
|
} = require("./crawlerConfig");
|
||||||
const PostgresSaver = require("./savers/postgres");
|
const PostgresSaver = require("./savers/postgres");
|
||||||
|
|
||||||
@@ -57,6 +59,15 @@ async function crawlAll() {
|
|||||||
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE,
|
||||||
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES,
|
||||||
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES
|
||||||
|
),
|
||||||
|
new SaljicCrawler(
|
||||||
|
[postgresSaver],
|
||||||
|
SALJIC_CONFIG.SALJIC_CRAWLER_AD_TYPE,
|
||||||
|
SALJIC_CONFIG.SALJIC_CRAWLER_AD_CATEGORIES,
|
||||||
|
SALJIC_CONFIG.SALJIC_MAX_PAGES,
|
||||||
|
SALJIC_CONFIG.SALJIC_MAX_RESULTS_PER_PAGE,
|
||||||
|
SALJIC_CONFIG.SALJIC_IGNORED_USERNAMES,
|
||||||
|
SALJIC_CONFIG.SALJIC_DELAY_BETWEEN_PAGES
|
||||||
)
|
)
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ const OLX_CONFIG = require("./specificConfigs/olx");
|
|||||||
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
const RENTAL_CONFIG = require("./specificConfigs/rental");
|
||||||
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
const PROSTOR_CONFIG = require("./specificConfigs/prostor");
|
||||||
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
const AKTIDO_CONFIG = require("./specificConfigs/aktido");
|
||||||
|
const SALJIC_CONFIG = require("./specificConfigs/saljic");
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
OLX_CONFIG,
|
OLX_CONFIG,
|
||||||
RENTAL_CONFIG,
|
RENTAL_CONFIG,
|
||||||
PROSTOR_CONFIG,
|
PROSTOR_CONFIG,
|
||||||
AKTIDO_CONFIG
|
AKTIDO_CONFIG,
|
||||||
|
SALJIC_CONFIG
|
||||||
};
|
};
|
||||||
|
|||||||
34
app/crawler/specificConfigs/saljic.js
Normal file
34
app/crawler/specificConfigs/saljic.js
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
"use strict";
|
||||||
|
const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums");
|
||||||
|
|
||||||
|
const saljicCrawlerAdType =
|
||||||
|
process.env.SALJIC_CRAWLER_AD_TYPE !== undefined
|
||||||
|
? CRAWLER_AD_TYPE[process.env.SALJIC_CRAWLER_AD_TYPE]
|
||||||
|
: null;
|
||||||
|
|
||||||
|
const saljicParsedCrawlerAdCategories =
|
||||||
|
process.env.SALJIC_CRAWLER_AD_CATEGORIES !== undefined
|
||||||
|
? process.env.SALJIC_CRAWLER_AD_CATEGORIES.split(",").map(category =>
|
||||||
|
category.trim()
|
||||||
|
)
|
||||||
|
: ["FLAT", "HOUSE"];
|
||||||
|
|
||||||
|
const saljicIgnoredUsernames = [];
|
||||||
|
|
||||||
|
const transformedSaljicCrawlerAdCategories = saljicParsedCrawlerAdCategories
|
||||||
|
.map(categoryName =>
|
||||||
|
AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined
|
||||||
|
)
|
||||||
|
.filter(category => !!category);
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
SALJIC_MAX_PAGES: parseInt(process.env.SALJIC_MAX_PAGES) || 100,
|
||||||
|
SALJIC_MAX_RESULTS_PER_PAGE:
|
||||||
|
parseInt(process.env.SALJIC_MAX_RESULTS_PER_PAGE) || 5000,
|
||||||
|
SALJIC_CRAWLER_AD_TYPE: saljicCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||||
|
SALJIC_CRAWLER_AD_CATEGORIES: transformedSaljicCrawlerAdCategories,
|
||||||
|
SALJIC_IGNORED_USERNAMES: saljicIgnoredUsernames || [],
|
||||||
|
SALJIC_DELAY_BETWEEN_PAGES:
|
||||||
|
parseInt(process.env.SALJIC_DELAY_BETWEEN_PAGES) || 1000,
|
||||||
|
SALJIC_FORCE_CRAWL: !!parseInt(process.env.SALJIC_FORCE_CRAWL)
|
||||||
|
};
|
||||||
83
app/crawler/specificCrawlers/saljic.js
Normal file
83
app/crawler/specificCrawlers/saljic.js
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
"use strict";
|
||||||
|
|
||||||
|
const fetch = require("node-fetch");
|
||||||
|
const cheerio = require("cheerio");
|
||||||
|
const moment = require("moment-timezone");
|
||||||
|
|
||||||
|
const {
|
||||||
|
AD_TYPE,
|
||||||
|
AD_CATEGORY,
|
||||||
|
AD_AGENCY,
|
||||||
|
AD_STATUS,
|
||||||
|
CRAWLER_AD_TYPE,
|
||||||
|
FURNISHING_TYPE,
|
||||||
|
HEATING_TYPE
|
||||||
|
} = require("../../common/enums");
|
||||||
|
|
||||||
|
const {
|
||||||
|
PRINT_CRAWLER_DEBUG,
|
||||||
|
DEFAULT_TIMEZONE
|
||||||
|
} = require("../../config/appConfig");
|
||||||
|
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
||||||
|
|
||||||
|
// Query-string fragment selecting the ad type, keyed by CRAWLER_AD_TYPE value.
// An empty "input_vrsta" value is used for CRAWLER_AD_TYPE.ALL.
const SALJIC_AD_TYPE = {
  [CRAWLER_AD_TYPE.ALL]: "&input_vrsta=",
  [CRAWLER_AD_TYPE.ONLY_SELL]: "&input_vrsta=1",
  [CRAWLER_AD_TYPE.ONLY_RENT]: "&input_vrsta=2"
};

// Query-string fragment selecting the category, keyed by AD_CATEGORY id.
// There is no mapping for AD_CATEGORY.COTTAGE yet.
const SALJIC_AD_CATEGORY = {
  [AD_CATEGORY.ALL.id]: "&input_kategorija=",
  [AD_CATEGORY.FLAT.id]: "&input_kategorija=15",
  [AD_CATEGORY.HOUSE.id]: "&input_kategorija=9",
  // Original note: site categories 3 and 4 are also building ("gradjevinsko") land.
  [AD_CATEGORY.LAND.id]: "&input_kategorija=5",
  [AD_CATEGORY.OFFICE.id]: "&input_kategorija=8",
  [AD_CATEGORY.APARTMENT.id]: "&input_kategorija=1",
  [AD_CATEGORY.GARAGE.id]: "&input_kategorija=2"
};

// Lookup tables translating the project's enums into Saljic search parameters.
const SALJIC_ENUMS = {
  SALJIC_AD_TYPE,
  SALJIC_AD_CATEGORY
};
|
||||||
|
|
||||||
|
/**
 * Crawler for the Saljic real-estate site (search endpoint stored in
 * `baseUrl`).
 *
 * The crawl itself is not implemented yet: `crawl()` only logs the base URL.
 */
class SaljicCrawler {
  /**
   * Stores the crawler configuration on the instance.
   *
   * @param {Array<{save: Function}>} savers - Objects used to persist crawled
   *   results; only the first one is currently used by saveCrawledResults().
   * @param {*} crawlerAdTypes - A CRAWLER_AD_TYPE value selecting which ads to
   *   crawl.
   * @param {Array} crawlerAdCategories - Categories to crawl.
   *   NOTE(review): the default holds AD_CATEGORY objects, while the config
   *   module and SALJIC_ENUMS work with AD_CATEGORY ids - confirm which form
   *   crawl() should expect once it is implemented.
   * @param {number} maxPages - Maximum number of pages to crawl.
   * @param {number} maxResultsPerPage - Number of results requested per page.
   * @param {Array<string>} ignoredUsernames - Usernames whose ads are skipped.
   * @param {number} delayBetweenPages - Delay in milliseconds between pages.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    // maxPages and ignoredUsernames were accepted but never stored before;
    // keep them so the configured values are not silently lost.
    this.maxPages = maxPages;
    this.maxResultsPerPage = maxResultsPerPage;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Entry point of the crawler. Placeholder: currently only logs the base URL.
   *
   * @returns {Promise<void>}
   */
  async crawl() {
    console.log("Saljic URL: ", this.baseUrl);
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Resolves after roughly `ms` milliseconds.
   *
   * @param {number} ms - Time to wait in milliseconds.
   * @returns {Promise<void>}
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Persists crawled results through the first configured saver and returns
   * whatever that saver's save() returns.
   *
   * Only the first saver is used on purpose: per the original note, just the
   * Postgres saver exists for now, and returning its result lets callers use
   * the sequelize options/information produced when data is inserted.
   *
   * @param {*} results - Crawled results, passed to the saver unchanged.
   * @returns {Promise<*>} The result of the saver's save() call.
   * @throws {TypeError} When no saver has been configured.
   */
  async saveCrawledResults(results) {
    const primarySaver = this.savers[0];

    if (!primarySaver) {
      // Same error type as the original failure, but with an actionable message.
      throw new TypeError("SaljicCrawler: no saver configured, cannot save results");
    }

    return primarySaver.save(results);
  }
}
|
||||||
|
|
||||||
|
module.exports = SaljicCrawler;
|
||||||
@@ -59,3 +59,8 @@ AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to
|
|||||||
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
||||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
|
#==SALJIC NEKRETNINE==
|
||||||
|
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
||||||
|
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||||
|
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||||
|
SALJIC_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||||
Reference in New Issue
Block a user