// File viewer metadata (not part of the program): 637 lines, 19 KiB, JavaScript.
"use strict";
|
|
|
|
const fetch = require("../../helpers/fetchWrapper");
|
|
const cheerio = require("cheerio");
|
|
const moment = require("moment-timezone");
|
|
const FormData = require("form-data");
|
|
|
|
const {
|
|
AD_TYPE,
|
|
AD_CATEGORY,
|
|
AD_AGENCY,
|
|
AD_STATUS,
|
|
CRAWLER_AD_TYPE,
|
|
FURNISHING_TYPE,
|
|
HEATING_TYPE
|
|
} = require("../../common/enums");
|
|
|
|
const {
|
|
PRINT_CRAWLER_DEBUG,
|
|
DEFAULT_TIMEZONE,
|
|
PROSTOR_LOGIN
|
|
} = require("../../config/appConfig");
|
|
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
|
|
|
|
// Query-string fragment selecting the kind of offer, keyed by CRAWLER_AD_TYPE value.
const PROSTOR_AD_TYPE_QUERY_PARTS = {
  [CRAWLER_AD_TYPE.ALL]: "&action=0",
  [CRAWLER_AD_TYPE.ONLY_SELL]: "&action=1",
  [CRAWLER_AD_TYPE.ONLY_RENT]: "&action=2"
};

// Query-string fragment selecting the real-estate category, keyed by AD_CATEGORY id.
// The empty string means "no category filter".
const PROSTOR_AD_CATEGORY_QUERY_PARTS = {
  [AD_CATEGORY.ALL.id]: "",
  [AD_CATEGORY.FLAT.id]: "&type=7",
  [AD_CATEGORY.HOUSE.id]: "&type=8",
  [AD_CATEGORY.LAND.id]: "&type=10",
  [AD_CATEGORY.OFFICE.id]: "&type=9",
  [AD_CATEGORY.APARTMENT.id]: "&type=11",
  [AD_CATEGORY.GARAGE.id]: "&type=14"
  //[AD_CATEGORY.COTTAGE.id]: ""
};

// Site-specific lookup tables and date formats used when talking to prostor.ba.
const PROSTOR_ENUMS = {
  PROSTOR_AD_TYPE: PROSTOR_AD_TYPE_QUERY_PARTS,
  PROSTOR_AD_CATEGORY: PROSTOR_AD_CATEGORY_QUERY_PARTS,
  PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss",
  PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
};
|
|
|
|
/**
 * Crawler for real-estate ads published on prostor.ba.
 *
 * Flow of a crawl: obtain cookies, log in, download the search page of every
 * requested category (the whole listing is embedded in that page as a JSON
 * array), then scrape the individual ad pages in slices of
 * `maxResultsPerPage` and hand every slice to the configured saver.
 */
class ProstorCrawler {
  /**
   * @param {Array<{save: Function}>} savers - Persistence back-ends; only the first one is used.
   * @param {*} crawlerAdTypes - One CRAWLER_AD_TYPE value selecting sale/rent/all.
   * @param {Array} crawlerAdCategories - AD_CATEGORY entries (objects with an `id`) or bare category ids.
   * @param {number} maxPages - Accepted for signature compatibility; not used by this crawler.
   * @param {number} maxResultsPerPage - Number of ads scraped (in parallel) per slice.
   * @param {Array} ignoredUsernames - Accepted for signature compatibility; not used by this crawler.
   * @param {number} delayBetweenPages - Pause in milliseconds between two slices.
   */
  constructor(
    savers = [],
    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
    crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
    maxPages = 5000,
    maxResultsPerPage = 5000,
    ignoredUsernames = [],
    delayBetweenPages = 1000
  ) {
    this.savers = savers;
    this.baseUrl = "https://prostor.ba/pretraga";
    this.crawlerAdTypes = crawlerAdTypes;
    this.crawlerAdCategories = crawlerAdCategories;
    this.maxResultsPerPage = maxResultsPerPage;
    this.delayBetweenPages = delayBetweenPages;
  }

  /**
   * Runs a full crawl over all configured categories.
   *
   * Every category is driven by its own async generator; in each round one
   * slice per category is scraped and saved. A category is dropped once its
   * generator is exhausted or once a slice produced no new records (unless
   * PROSTOR_FORCE_CRAWL is set).
   *
   * @returns {Promise<Array>} All records the saver reported as new.
   */
  async crawl() {
    const newRealEstates = [];
    const crawlAdCategories = this.crawlerAdCategories;

    // The cookies fetched here are sent with the login request and with all later requests.
    const prostorCookie = await this.getCookies();
    const loggedIn = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);

    // Crawl only if the login was successful.
    if (!crawlAdCategories || !loggedIn) {
      return newRealEstates;
    }

    let indexGenerators = [];
    for (const adCategory of crawlAdCategories) {
      indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
    }

    while (indexGenerators.length > 0) {
      const singlePageResults = await Promise.all(
        indexGenerators.map(indexGenerator => indexGenerator.next())
      );
      const finishedGenerators = new Set();

      for (const [index, { value: singlePageResult }] of singlePageResults.entries()) {
        if (!singlePageResult) {
          // Generator returned undefined: this category has no more pages.
          finishedGenerators.add(index);
          continue;
        }

        const saveResults = await this.saveCrawledResults(singlePageResult);
        const newRecords = saveResults ? saveResults.newRecords : undefined;
        if (!Array.isArray(newRecords)) {
          // Saver did not report new records; nothing to collect and no reason to stop.
          continue;
        }

        newRealEstates.push(...newRecords);
        if (newRecords.length === 0 && !PROSTOR_FORCE_CRAWL) {
          // Nothing new in this slice, so the rest of the category is already known.
          finishedGenerators.add(index);
        }
      }

      indexGenerators = indexGenerators.filter(
        (indexGenerator, index) => !finishedGenerators.has(index)
      );

      // Pause only when another round of requests will actually follow.
      if (indexGenerators.length > 0) {
        await this.sleep(this.delayBetweenPages);
      }
    }

    return newRealEstates;
  }

  /**
   * Async generator yielding the scraped ads of one category, slice by slice.
   *
   * @param {Object|number|string} adCategory - AD_CATEGORY entry (its `id` is used) or a bare category id.
   * @param {string} prostorCookie - Cookie header value sent with every request.
   * @yields {Array<Object>} Non-empty list of successfully scraped ads of one slice.
   */
  async *categoryIndexer(adCategory, prostorCookie) {
    // PROSTOR_AD_CATEGORY is keyed by category id, while callers (including the
    // constructor default) may hand over the whole AD_CATEGORY entry.
    const categoryId =
      adCategory !== null && typeof adCategory === "object"
        ? adCategory.id
        : adCategory;

    const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
    const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[categoryId];
    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
      return undefined;
    }

    const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
    const listOfAllRealEstates = await this.extractRealEstates(
      urlPageToCrawl,
      prostorCookie
    );
    if (!Array.isArray(listOfAllRealEstates)) {
      return undefined;
    }

    let elementToStartIndexFrom = 0;
    while (elementToStartIndexFrom < listOfAllRealEstates.length) {
      const realEstatesForSinglePage = listOfAllRealEstates.slice(
        elementToStartIndexFrom,
        elementToStartIndexFrom + this.maxResultsPerPage
      );
      if (realEstatesForSinglePage.length === 0) {
        return undefined;
      }
      elementToStartIndexFrom += realEstatesForSinglePage.length;

      const singlePageResults = await this.indexSinglePage(
        realEstatesForSinglePage,
        prostorCookie
      );
      // scrapeAd resolves to null for ads it could not process.
      const filteredSinglePageResults = singlePageResults.filter(Boolean);
      if (filteredSinglePageResults.length === 0) {
        return undefined;
      }

      yield filteredSinglePageResults;
    }

    return undefined;
  }

  /**
   * Scrapes all ads of one slice in parallel.
   *
   * @param {Array<Object>} realEstatesList - Listing entries as returned by extractRealEstates.
   * @param {string} prostorCookie - Cookie header value.
   * @returns {Promise<Array<Object|null>>} One result per entry (null for failed ads), or [] on an unexpected error.
   */
  async indexSinglePage(realEstatesList, prostorCookie) {
    try {
      return await Promise.all(
        realEstatesList.map(realEstate => this.scrapeAd(realEstate, prostorCookie))
      );
    } catch (e) {
      console.log(
        "[PROSTOR] Error crawling ads : ",
        e.message || "UNKNOWN ERROR"
      );
      return [];
    }
  }

  /**
   * Downloads one ad page and converts it into the common ad record.
   *
   * @param {Object} realEstate - Listing entry with lat, lng, property_name, price, size, link and status.
   * @param {string} prostorCookie - Cookie header value.
   * @returns {Promise<Object|null>} The ad record, or null when the link is not recognised,
   *   the page contains no properties, or any error occurs.
   */
  async scrapeAd(realEstate, prostorCookie) {
    const { lat, lng, property_name, price, size, link, status } = realEstate;

    // Status information is already part of the listing entry.
    const adStatus = ProstorCrawler.getStatusId(status);
    const url = `https://prostor.ba${link}`;

    try {
      // link has the form /actionType/realEstateType/location/realEstateID,
      // e.g. /prodaja/stan/stup/9556, so linkParts is
      // ['', 'actionType', 'realEstateType', 'location', 'realEstateID'].
      const linkParts = link.split("/");
      const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
      const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
      const prostorId = linkParts[4];

      // Validate before fetching so that unsupported ads cost no request.
      if (adType === undefined || realEstateType === undefined || !prostorId) {
        return null;
      }

      const adPageSource = await fetch(url, {
        headers: { Cookie: prostorCookie }
      });
      const body = await adPageSource.text();
      const $ = cheerio.load(body);

      const allDataSelector =
        "body > div > div.container-fluid > div > div.column-right > table > tbody";
      const dataTable = $(allDataSelector);
      const realEstateProperties = {};

      // "<title>: <value>" paragraphs; split on the first colon only so that
      // values which themselves contain a colon are kept whole.
      dataTable.find("p").each((i, element) => {
        const text = $(element).text();
        const separatorIndex = text.indexOf(":");
        if (separatorIndex === -1) {
          realEstateProperties[text.trim().toLowerCase()] = undefined;
          return;
        }
        const propertyTitle = text
          .slice(0, separatorIndex)
          .trim()
          .toLowerCase();
        realEstateProperties[propertyTitle] = text
          .slice(separatorIndex + 1)
          .trim()
          .toLowerCase();
      });

      // Feature badges carry no value; their presence means "yes".
      dataTable.find("div.mb-2").each((i, element) => {
        const featureName = $(element)
          .text()
          .trim()
          .toLowerCase();
        realEstateProperties[featureName] = true;
      });

      // Nothing usable was parsed (no keys, or only keys without a value).
      const hasNoProperties = Object.values(realEstateProperties).every(
        value => value === undefined
      );
      if (hasNoProperties) {
        return null;
      }

      // First truthy property among the given keys, otherwise null.
      const firstPresent = (...keys) => {
        for (const key of keys) {
          if (realEstateProperties[key]) {
            return realEstateProperties[key];
          }
        }
        return null;
      };

      const { floor, numberOfFloors } = ProstorCrawler.parseFloorInfo(
        realEstateProperties["sprat"]
      );

      const detectedHeatingType = ProstorCrawler.getHeatingTypeId(
        realEstateProperties
      );
      const heatingType =
        detectedHeatingType === undefined ? null : detectedHeatingType;

      let furnishingType;
      if (realEstateProperties["namješteno"]) {
        furnishingType = FURNISHING_TYPE.FURNISHED.id;
      } else if (realEstateProperties["polunamješteno"]) {
        furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
      } else {
        furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
      }

      // Price uses "." as thousands separator; a numeric price is taken as is.
      let parsedPrice = null;
      if (typeof price === "number") {
        parsedPrice = price || null;
      } else if (price != null) {
        parsedPrice = parseFloat(String(price).replace(/\./g, "")) || null;
      }

      const areaValue = parseFloat(size);
      const parsedArea = Number.isNaN(areaValue) ? null : areaValue;

      return {
        url,
        agencyObjectId: prostorId,
        originAgencyName: AD_AGENCY.PROSTOR,
        realEstateType,
        adType,
        title: property_name,
        price: parsedPrice,
        area: parsedArea,
        gardenSize: null,
        shortDescription: "",
        longDescription: null,
        streetNumber: 0,
        streetName: realEstateProperties["adresa"],
        locality: "",
        municipality: "",
        city: "",
        region: "",
        entity: "",
        country: "",
        locationLat: lat,
        locationLong: lng,
        adStatus,
        numberOfRooms: ProstorCrawler.sumRoomCounts(
          realEstateProperties["broj soba"],
          realEstateProperties["broj spavaćih soba"]
        ),
        numberOfFloors,
        floor,
        accessRoadType: null,
        heatingType,
        furnishingType,
        balcony: firstPresent("balkon", "terasa", "lođa"),
        newBuilding: linkParts[1] === "novogradnja",
        elevator: firstPresent("lift"),
        water: firstPresent("voda"),
        electricity: firstPresent("električna energija"),
        drainageSystem: firstPresent("kanalizacija"),
        registeredInZkBooks: null,
        recentlyAdapted: null,
        parking: firstPresent("parking"),
        garage: firstPresent("garaža"),
        gas: firstPresent("plin"),
        antiTheftDoor: firstPresent("blindo vrata"),
        airCondition: firstPresent("klima"),
        phoneConnection: firstPresent("telefon"),
        // "kablovksa tv" is the misspelled key used previously; kept as a fallback.
        cableTV: firstPresent("kablovska tv", "kablovksa tv"),
        internet: firstPresent("internet", "adsl"),
        basementAttic: firstPresent("podrum"),
        storeRoom: firstPresent("ostava"),
        videoSurveillance: firstPresent("video nadzor"),
        alarm: firstPresent("alarm"),
        suitableForStudents: null,
        includingBills: null,
        animalsAllowed: null,
        pool: firstPresent("bazen"),
        urbanPlanPermit: null,
        buildingPermit: null,
        utilityConnection: null,
        distanceToRiver: null,
        numberOfViewsAgency: null
      };
    } catch (e) {
      console.error(
        "[PROSTOR] Exception caught: " + e.message,
        "\r\nURL:",
        url
      );
      return null;
    }
  }

  /**
   * Downloads a search page and extracts the embedded list of real estates.
   *
   * The page contains a script starting with `var map; var markers = [{...`
   * and continuing with `...}, ]; map = new...`; the JSON array in between is
   * the listing. The trailing comma is cut off and the bracket re-added
   * because the comma would break JSON.parse.
   *
   * @param {string} url - Search page URL.
   * @param {string} prostorCookie - Cookie header value.
   * @returns {Promise<Array<Object>>} Listing entries, or [] when nothing could be extracted.
   */
  async extractRealEstates(url, prostorCookie) {
    if (PRINT_CRAWLER_DEBUG) {
      console.log("[PROSTOR] Index page : ", url);
    }

    try {
      const res = await fetch(url, {
        headers: { Cookie: prostorCookie }
      });
      const body = await res.text();
      const $ = cheerio.load(body);

      const scriptElement = $(
        "body > div > div.container-fluid > script:nth-child(7)"
      );
      const scriptNode = scriptElement[0];
      const scriptData =
        scriptNode &&
        scriptNode.children &&
        scriptNode.children[0] &&
        scriptNode.children[0].data;

      if (!scriptData) {
        console.error("[PROSTOR] Markers script not found on page:", url);
        return [];
      }

      const jsonStartMarker = "var markers = ";
      const jsonEndMarker = ", ]; map = new";
      const markerIndex = scriptData.indexOf(jsonStartMarker);
      const jsonEndIndex =
        markerIndex === -1
          ? -1
          : scriptData.indexOf(jsonEndMarker, markerIndex);

      if (jsonEndIndex === -1) {
        throw new Error("Something is wrong with JSON data or data is moved");
      }

      const jsonData = `${scriptData.substring(
        markerIndex + jsonStartMarker.length,
        jsonEndIndex
      )}]`;
      const realEstates = JSON.parse(jsonData);

      return Array.isArray(realEstates) ? realEstates : [];
    } catch (e) {
      console.error(
        "[PROSTOR] Exception caught:",
        e.message || "UNKNOWN MESSAGE"
      );
      return [];
    }
  }

  //======= HELPER FUNCTIONS =============

  /**
   * Parses the lower-cased value of the "sprat" (floor) property.
   *
   * Known shapes: "3/3", "1 - 2/2", "pr - 7/7", "-2/0". With two parts
   * separated by " - " the first part is the floor and the total number of
   * floors follows the "/" of the second part.
   *
   * @param {*} floorProperty - Property value; anything but a string yields nulls.
   * @returns {{floor: (number|null), numberOfFloors: (number|null)}}
   */
  static parseFloorInfo(floorProperty) {
    const result = { floor: null, numberOfFloors: null };
    if (typeof floorProperty !== "string") {
      // Houses, land and similar ads have no floor information at all.
      return result;
    }

    const floorsArray = floorProperty.split(" - ");
    let floorText;
    let totalFloorsText;
    if (floorsArray.length === 1) {
      [floorText, totalFloorsText] = floorsArray[0].split("/");
    } else if (floorsArray.length === 2) {
      floorText = floorsArray[0];
      totalFloorsText = floorsArray[1].split("/")[1];
    } else {
      // Unknown format; leave both values empty.
      return result;
    }

    result.numberOfFloors = parseInt(totalFloorsText, 10) || null;

    floorText = floorText.trim();
    const numericFloor = Math.round(parseFloat(floorText));
    if (!Number.isNaN(numericFloor)) {
      result.floor = numericFloor;
      return result;
    }

    // Textual representation of the floor, like "pr" or "su".
    switch (floorText) {
      case "pr":
        result.floor = 0;
        break;
      case "su":
        result.floor = -1;
        break;
      default:
        console.log(
          "[PROSTOR] Unknown textual representation of floor : ",
          floorText
        );
        result.floor = null;
    }
    return result;
  }

  /**
   * Adds up the room counts that could be parsed.
   *
   * @param {...*} rawCounts - Raw property values; unparsable ones are ignored.
   * @returns {number|null} The sum, or null when nothing was parsable or the sum is 0.
   */
  static sumRoomCounts(...rawCounts) {
    const total = rawCounts
      .map(rawCount => parseFloat(rawCount))
      .filter(count => !Number.isNaN(count))
      .reduce((sum, count) => sum + count, 0);
    return total || null;
  }

  /**
   * Maps the category segment of an ad link to an AD_CATEGORY id.
   *
   * @param {string} categoryText - e.g. "stan", "kuca".
   * @returns {*} The category id, or undefined for unknown segments.
   */
  static getAdCategoryId(categoryText) {
    switch (categoryText) {
      case "stan":
        return AD_CATEGORY.FLAT.id;
      case "kuca":
        return AD_CATEGORY.HOUSE.id;
      case "apartman":
        return AD_CATEGORY.APARTMENT.id;
      case "poslovni-prostor":
        return AD_CATEGORY.OFFICE.id;
      case "garaza":
        return AD_CATEGORY.GARAGE.id;
      case "zemljiste":
        return AD_CATEGORY.LAND.id;
      default:
        return undefined;
    }
  }

  /**
   * Maps the action segment of an ad link to an AD_TYPE string id.
   *
   * @param {string} adTypeText - "prodaja", "najam" or "novogradnja".
   * @returns {*} The ad type string id, or undefined for unknown segments.
   */
  static getAdTypeId(adTypeText) {
    switch (adTypeText) {
      case "prodaja":
      case "novogradnja":
        return AD_TYPE.AD_TYPE_SALE.stringId;
      case "najam":
        return AD_TYPE.AD_TYPE_RENT.stringId;
      default:
        return undefined;
    }
  }

  /**
   * Finds the heating type among the property names of an ad.
   *
   * @param {Object} realEstateProperties - Parsed ad properties (lower-cased keys).
   * @returns {*} Id of the first recognised heating type, or undefined if none is present.
   */
  static getHeatingTypeId(realEstateProperties) {
    for (const property of Object.keys(realEstateProperties)) {
      switch (property) {
        case "centralno toplane":
          return HEATING_TYPE.CENTRAL_CITY.id;
        case "etažno plinsko":
          return HEATING_TYPE.CENTRAL_GAS.id;
        case "termo blok":
        case "podno grijanje":
          return HEATING_TYPE.OTHER.id;
        case "etažno električno":
        case "konvektori":
          return HEATING_TYPE.ELECTRICITY.id;
        case "plinske peći":
          return HEATING_TYPE.GAS.id;
        case "vlastita kotlovnica":
          return HEATING_TYPE.CENTRAL_BOILER.id;
        case "toplotna pumpa":
          return HEATING_TYPE.HEAT_PUMP.id;
        case "kamin":
          return HEATING_TYPE.WOOD.id;
        default:
        // Not a heating-related property; keep looking.
      }
    }
    return undefined;
  }

  /**
   * Maps the status text of a listing entry to an AD_STATUS value.
   *
   * @param {string} statusText - Status label from the listing ("" means no special status).
   * @returns {*} The matching AD_STATUS value; unknown labels are logged and treated as normal.
   */
  static getStatusId(statusText) {
    switch (statusText) {
      case "":
        return AD_STATUS.STATUS_NORMAL;
      case "Rezervisano":
        return AD_STATUS.STATUS_RESERVED;
      case "Prodano":
        return AD_STATUS.STATUS_SOLD;
      case "Iznajmljeno":
        return AD_STATUS.STATUS_RENTED;
      case "VIP ponuda":
        return AD_STATUS.STATUS_VIP;
      default:
        console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
        return AD_STATUS.STATUS_NORMAL;
    }
  }

  /**
   * Resolves after the given number of milliseconds.
   *
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  async sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Persists one slice of scraped ads using the first configured saver.
   *
   * Only the first saver is used so that its result (crawl() reads the
   * `newRecords` array from it) can be passed back to the caller.
   *
   * @param {Array<Object>} results - Scraped ad records.
   * @returns {Promise<Object>} Whatever the saver's save() resolves to.
   * @throws {Error} When no usable saver is configured.
   */
  async saveCrawledResults(results) {
    const saver = Array.isArray(this.savers) ? this.savers[0] : undefined;
    if (!saver || typeof saver.save !== "function") {
      throw new Error("[PROSTOR] No saver configured, cannot save crawled results");
    }
    return saver.save(results);
  }

  /**
   * Logs in to prostor.ba so that later requests carrying the same cookies
   * are made as a logged-in user.
   *
   * @param {{EMAIL: string, PASSWORD: string}} PROSTOR_LOGIN - Credentials.
   * @param {string} prostorCookie - Cookie header value obtained from getCookies().
   * @returns {Promise<boolean>} true when the response page greets the user, false otherwise
   *   (including network or parsing errors).
   */
  async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
    try {
      const formData = new FormData();
      formData.append("email", PROSTOR_LOGIN.EMAIL);
      formData.append("password", PROSTOR_LOGIN.PASSWORD);

      const page = await fetch("https://prostor.ba/moj-prostor/prijava", {
        method: "POST",
        body: formData,
        headers: { Cookie: prostorCookie }
      });
      const resp = await page.text();
      const $ = cheerio.load(resp);

      // The page shown after a successful login has a "Dobrodošli" heading.
      if (
        $("h1")
          .text()
          .includes("Dobrodošli")
      ) {
        console.log("[PROSTOR]: Crawler loged in!");
        return true;
      }

      console.log("[PROSTOR]: Crawler login failed - wrong credentials!");
      return false;
    } catch (err) {
      console.log("[PROSTOR]: Crawler login error ", err);
      return false;
    }
  }

  /**
   * Requests the login page and collects the cookies it sets.
   *
   * @returns {Promise<string>} "name=value" pairs joined by ";" (attributes such as
   *   Path or Expires are stripped); "" when the response sets no cookies.
   */
  async getCookies() {
    const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", {
      headers: { Cookie: "" }
    });
    const raw = getResponse.headers.raw()["set-cookie"] || [];
    return raw.map(datastring => datastring.split(";")[0]).join(";");
  }
}
|
|
|
|
module.exports = ProstorCrawler;
|