implement crawler for Prostor agency
This commit is contained in:
@@ -7,8 +7,13 @@
|
||||
*/
|
||||
const OlxCrawler = require("./specificCrawlers/olx");
|
||||
const RentalCrawler = require("./specificCrawlers/rental");
|
||||
const ProstorCrawler = require("./specificCrawlers/prostor");
|
||||
|
||||
const { OLX_CONFIG, RENTAL_CONFIG } = require("./crawlerConfig");
|
||||
const {
|
||||
OLX_CONFIG,
|
||||
RENTAL_CONFIG,
|
||||
PROSTOR_CONFIG
|
||||
} = require("./crawlerConfig");
|
||||
const PostgresSaver = require("./savers/postgres");
|
||||
|
||||
async function crawlAll() {
|
||||
@@ -32,6 +37,15 @@ async function crawlAll() {
|
||||
RENTAL_CONFIG.RENTAL_MAX_RESULTS_PER_PAGE,
|
||||
RENTAL_CONFIG.RENTAL_IGNORED_USERNAMES,
|
||||
RENTAL_CONFIG.RENTAL_DELAY_BETWEEN_PAGES
|
||||
),
|
||||
new ProstorCrawler(
|
||||
[postgresSaver],
|
||||
PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_TYPE,
|
||||
PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_CATEGORIES,
|
||||
PROSTOR_CONFIG.PROSTOR_MAX_PAGES,
|
||||
PROSTOR_CONFIG.PROSTOR_MAX_RESULTS_PER_PAGE,
|
||||
PROSTOR_CONFIG.PROSTOR_IGNORED_USERNAMES,
|
||||
PROSTOR_CONFIG.PROSTOR_DELAY_BETWEEN_PAGES
|
||||
)
|
||||
];
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ const transformedProstorCrawlerAdCategories = prostorParsedCrawlerAdCategories
|
||||
module.exports = {
|
||||
PROSTOR_MAX_PAGES: parseInt(process.env.PROSTOR_MAX_PAGES) || 100,
|
||||
PROSTOR_MAX_RESULTS_PER_PAGE:
|
||||
parseInt(process.env.PROSTOR_MAX_RESULTS_PER_PAGE) || 50,
|
||||
parseInt(process.env.PROSTOR_MAX_RESULTS_PER_PAGE) || 5000,
|
||||
PROSTOR_CRAWLER_AD_TYPE: prostorCrawlerAdType || CRAWLER_AD_TYPE.NONE,
|
||||
PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories,
|
||||
PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [],
|
||||
|
||||
248
app/crawler/specificCrawlers/prostor.js
Normal file
248
app/crawler/specificCrawlers/prostor.js
Normal file
@@ -0,0 +1,248 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("node-fetch");
|
||||
const cheerio = require("cheerio");
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
AD_CATEGORY,
|
||||
AD_AGENCY,
|
||||
AD_STATUS,
|
||||
CRAWLER_AD_TYPE
|
||||
} = require("../../common/enums");
|
||||
|
||||
/**
 * Lookup tables used to build Prostor search URLs.
 *
 * - PROSTOR_AD_TYPE: query-string fragment for each CRAWLER_AD_TYPE value.
 * - PROSTOR_AD_CATEGORY: query-string fragment for each AD_CATEGORY id
 *   (an empty string means "no category filter").
 * - PROSTOR_*_DATE_FORMAT: date format strings (not referenced by the
 *   crawler class in this file).
 */
const PROSTOR_ENUMS = {
  PROSTOR_AD_TYPE: {},
  PROSTOR_AD_CATEGORY: {},
  PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss",
  PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
};

// Ad type -> "action" query parameter.
PROSTOR_ENUMS.PROSTOR_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "&action=0";
PROSTOR_ENUMS.PROSTOR_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&action=1";
PROSTOR_ENUMS.PROSTOR_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&action=2";

// Ad category id -> "type" query parameter.
// NOTE: AD_CATEGORY.COTTAGE has no mapping here (its entry was left disabled).
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.ALL.id] = "";
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.FLAT.id] = "&type=7";
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.HOUSE.id] = "&type=8";
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.LAND.id] = "&type=10";
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.OFFICE.id] = "&type=9";
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.APARTMENT.id] = "&type=11";
PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[AD_CATEGORY.GARAGE.id] = "&type=14";
|
||||
|
||||
/**
 * Crawler for the Prostor real-estate agency (https://prostor.ba).
 *
 * The search page embeds its results as a JavaScript array literal
 * (`var markers = [...]`) inside an inline <script> element. For every
 * configured category the crawler downloads the search page, extracts that
 * array, normalises each entry and passes the result to the first saver.
 */
class ProstorCrawler {
  /**
   * @param {Array} savers - Saver objects exposing `save(results)`; only the
   *   first one is used.
   * @param {*} crawlerAdTypes - A CRAWLER_AD_TYPE value selecting sale/rent/all.
   * @param {Array} crawlerAdCategories - Categories to crawl, given either as
   *   AD_CATEGORY ids or as AD_CATEGORY objects carrying an `id`.
   * @param {number} maxPages - Stored; currently not used by this crawler.
   * @param {number} maxResultsPerPage - Upper bound of ads saved per category.
   * @param {Array} ignoredUsernames - Stored; currently not used by this crawler.
   * @param {number} delayBetweenPages - Stored; currently not used by this crawler.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://prostor.ba/pretraga";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    this.maxResultsPerPage = maxResultsPerPage;
    // Kept on the instance instead of being silently discarded.
    this.maxPages = maxPages;
    this.ignoredUsernames = ignoredUsernames;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Crawls every configured category and saves the extracted ads.
   *
   * @returns {Promise<Array>} Records the saver reported as new.
   */
  async crawl() {
    const newRealEstates = [];
    const crawlAdCategories = this.crawlerAdCategories;

    if (!crawlAdCategories) {
      return newRealEstates;
    }

    const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
    if (urlAdTypePart === undefined) {
      return newRealEstates;
    }

    for (const adCategory of crawlAdCategories) {
      // The lookup table is keyed by category id, while the constructor
      // default holds whole category objects - accept both forms.
      const categoryId = ProstorCrawler.resolveCategoryId(adCategory);
      const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[categoryId];
      if (urlCategoryPart === undefined) {
        continue;
      }

      const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
      const singleCategoryResults = await this.extractRealEstates(
        urlPageToCrawl
      );

      const resultsSubset = (singleCategoryResults || []).slice(
        0,
        this.maxResultsPerPage
      );

      const saveResults = await this.saveCrawledResults(resultsSubset);
      const newRecords = saveResults && saveResults.newRecords;
      if (Array.isArray(newRecords)) {
        newRealEstates.push(...newRecords);
      }
    }

    return newRealEstates;
  }

  /**
   * Downloads one search page and returns its ads in normalised form.
   * Never rejects: any failure is logged and yields an empty array.
   *
   * @param {string} url - Search page URL.
   * @returns {Promise<Array<Object>>} Normalised ads (possibly empty).
   */
  async extractRealEstates(url) {
    console.log("[PROSTOR] Index page : ", url);

    try {
      const res = await fetch(url);
      if (!res.ok) {
        throw new Error(`Unexpected HTTP status ${res.status}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);

      const scriptElement = $(
        "body > div > div.container-fluid > script:nth-child(7)"
      );
      const scriptNode = scriptElement[0];
      const textNode =
        scriptNode && scriptNode.children && scriptNode.children[0];
      const scriptData = textNode && textNode.data;

      if (!scriptData) {
        // Previously this path fell through and returned undefined.
        throw new Error("Can't find ad data script element");
      }

      let realEstates;
      try {
        realEstates = ProstorCrawler.extractMarkersJson(scriptData);
      } catch (e) {
        console.log(e);
        throw new Error("Can't find ad data JSON");
      }

      const transformedRealEstates = [];
      for (const realEstate of realEstates) {
        const transformedRealEstate = ProstorCrawler.transformRealEstateData(
          realEstate
        );
        if (transformedRealEstate) {
          transformedRealEstates.push(transformedRealEstate);
        }
      }

      return transformedRealEstates;
    } catch (e) {
      console.error("[PROSTOR] Exception caught:", e.message);
      return [];
    }
  }

  /**
   * Cuts the ad array out of the inline script source and parses it.
   *
   * The script starts with `var map; var markers = [{...` and the array ends
   * with `...}, ]; map = new...`. The trailing comma is not valid JSON, so
   * the text is cut before it and the closing bracket is appended manually.
   *
   * @param {string} scriptData - Text content of the inline script.
   * @returns {Array<Object>} Raw ad records.
   * @throws {Error} When the expected markers are missing or the JSON is invalid.
   */
  static extractMarkersJson(scriptData) {
    const prefix = "var map; var markers = ";
    const endMarker = ", ]; map = new";
    const legacyStartOffset = 23; // length of `prefix`

    const prefixIndex = scriptData.indexOf(prefix);
    const jsonStartIndex =
      prefixIndex > -1 ? prefixIndex + prefix.length : legacyStartOffset;
    const jsonEndIndex = scriptData.indexOf(endMarker);

    if (jsonEndIndex < jsonStartIndex) {
      throw new Error("Something is wrong with JSON data or data is moved");
    }

    const jsonData = scriptData.substring(jsonStartIndex, jsonEndIndex) + "]";
    const parsed = JSON.parse(jsonData);
    if (!Array.isArray(parsed)) {
      throw new Error("Something is wrong with JSON data or data is moved");
    }
    return parsed;
  }

  /**
   * Converts one raw Prostor record into the common ad shape.
   *
   * `link` has the form /actionType/realEstateType/location/realEstateID,
   * e.g. /prodaja/stan/stup/9556, so splitting on "/" gives
   * ['', actionType, realEstateType, location, realEstateID].
   *
   * @param {Object} realEstateData - Raw record from the page's marker array.
   * @returns {Object|null} Normalised ad, or null when the record is
   *   malformed or its ad type / category is not recognised.
   */
  static transformRealEstateData(realEstateData) {
    // Declared outside the try block so the catch handler can log it.
    let url;

    try {
      const { lat, lng, property_name, price, size, link } = realEstateData;

      const linkParts = link.split("/");
      url = `https://prostor.ba${link}`;

      const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
      const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
      const prostorId = linkParts[4];

      // Compare against undefined explicitly: an enum value of 0 is valid.
      if (adType === undefined || realEstateType === undefined || !prostorId) {
        return null;
      }

      // Dots are stripped as thousands separators from string prices only;
      // a numeric price is used as is.
      const numericPrice =
        typeof price === "number"
          ? price
          : parseFloat(String(price).replace(/\./g, ""));

      return {
        url,
        agencyObjectId: prostorId,
        originAgencyName: AD_AGENCY.PROSTOR,
        realEstateType,
        adType,
        title: property_name,
        price: numericPrice || null,
        area: parseFloat(size),
        gardenSize: null,
        shortDescription: "",
        longDescription: "",
        streetNumber: 0,
        streetName: "",
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat: lat,
        locationLong: lng,
        adStatus: AD_STATUS.STATUS_NORMAL,
        publishedDate: null,
        renewedDate: null
      };
    } catch (e) {
      console.error(
        "[PROSTOR] Exception caught: " + e.message,
        "\r\nURL:",
        url
      );
      return null;
    }
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Returns the category id for either a category id or a category object.
   *
   * @param {*} adCategory - AD_CATEGORY id, or an object with an `id` property.
   * @returns {*} The category id.
   */
  static resolveCategoryId(adCategory) {
    if (adCategory !== null && typeof adCategory === "object") {
      return adCategory.id;
    }
    return adCategory;
  }

  /**
   * Maps the category segment of a Prostor link to an AD_CATEGORY id.
   *
   * @param {string} categoryText - e.g. "stan", "kuca".
   * @returns {*} The matching AD_CATEGORY id, or undefined when unknown.
   */
  static getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "stan":
        return AD_CATEGORY.FLAT.id;
      case "kuca":
        return AD_CATEGORY.HOUSE.id;
      case "apartman":
        return AD_CATEGORY.APARTMENT.id;
      case "poslovni-prostor":
        return AD_CATEGORY.OFFICE.id;
      case "garaza":
        return AD_CATEGORY.GARAGE.id;
      case "zemljiste":
        return AD_CATEGORY.LAND.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the action segment of a Prostor link to an AD_TYPE value.
   *
   * @param {string} adTypeText - "prodaja" or "najam".
   * @returns {*} The matching AD_TYPE value, or undefined when unknown.
   */
  static getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "prodaja":
        return AD_TYPE.AD_TYPE_SALE;
      case "najam":
        return AD_TYPE.AD_TYPE_RENT;
      default:
        return undefined;
    }
  }

  /**
   * Persists crawled ads through the first configured saver.
   *
   * Only the first saver is used, so that its return value (information
   * about the inserted data) can be handed back to the caller.
   *
   * @param {Array<Object>} results - Normalised ads to save.
   * @returns {Promise<Object>} The saver's result; `{ newRecords: [] }` when
   *   no saver is configured.
   */
  async saveCrawledResults(results) {
    const primarySaver = this.savers && this.savers[0];
    if (!primarySaver) {
      return { newRecords: [] };
    }
    return primarySaver.save(results);
  }
}
|
||||
|
||||
module.exports = ProstorCrawler;
|
||||
Reference in New Issue
Block a user