Files
old-web/app/crawler/specificCrawlers/prostor.js
2020-02-20 19:49:29 +01:00

637 lines
19 KiB
JavaScript

"use strict";
const fetch = require("../../helpers/fetchWrapper");
const cheerio = require("cheerio");
const moment = require("moment-timezone");
const FormData = require("form-data");
const {
AD_TYPE,
AD_CATEGORY,
AD_AGENCY,
AD_STATUS,
CRAWLER_AD_TYPE,
FURNISHING_TYPE,
HEATING_TYPE
} = require("../../common/enums");
const {
PRINT_CRAWLER_DEBUG,
DEFAULT_TIMEZONE,
PROSTOR_LOGIN
} = require("../../config/appConfig");
const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
const PROSTOR_ENUMS = {
PROSTOR_AD_TYPE: {
[CRAWLER_AD_TYPE.ALL]: "&action=0",
[CRAWLER_AD_TYPE.ONLY_SELL]: "&action=1",
[CRAWLER_AD_TYPE.ONLY_RENT]: "&action=2"
},
PROSTOR_AD_CATEGORY: {
[AD_CATEGORY.ALL.id]: "",
[AD_CATEGORY.FLAT.id]: "&type=7",
[AD_CATEGORY.HOUSE.id]: "&type=8",
[AD_CATEGORY.LAND.id]: "&type=10",
[AD_CATEGORY.OFFICE.id]: "&type=9",
[AD_CATEGORY.APARTMENT.id]: "&type=11",
[AD_CATEGORY.GARAGE.id]: "&type=14"
//[AD_CATEGORY.COTTAGE.id]: ""
},
PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss",
PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss"
};
class ProstorCrawler {
constructor(
savers = [],
crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE],
maxPages = 5000,
maxResultsPerPage = 5000,
ignoredUsernames = [],
delayBetweenPages = 1000
) {
this.savers = savers;
this.baseUrl = "https://prostor.ba/pretraga";
this.crawlerAdTypes = crawlerAdTypes;
this.crawlerAdCategories = crawlerAdCategories;
this.maxResultsPerPage = maxResultsPerPage;
this.delayBetweenPages = delayBetweenPages;
}
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
//We need session cookie to use login privileges
const prostorCookie = await this.getCookies();
//New tag to check if crawler loged in
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
const newRealEstates = [];
//Crawl only if login was successful
if (crawlAdCategories && login) {
const indexGenerators = [];
for (const adCategory of crawlAdCategories) {
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
}
let done = false;
while (!done) {
const categoryIndexerPromises = [];
const generatorsToRemove = [];
for (const indexGenerator of indexGenerators) {
categoryIndexerPromises.push(indexGenerator.next());
generatorsToRemove.push(false);
}
const singlePageResults = await Promise.all(categoryIndexerPromises);
const entries = singlePageResults.entries();
for (const [index, { value: singlePageResult }] of entries) {
if (singlePageResult) {
const saveResults = await this.saveCrawledResults(singlePageResult);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
if (
Array.isArray(newRecords) &&
newRecords.length === 0 &&
!PROSTOR_FORCE_CRAWL
) {
generatorsToRemove[index] = true;
}
} else {
//Generator returned undefined, remove this generator from array
generatorsToRemove[index] = true;
// console.log("Generator ", index + 1, "has no more pages");
}
}
// console.log("Generators state : ", generatorsToRemove);
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
if (generatorsToRemove[i]) {
// console.log("\tRemove generator ", i + 1);
indexGenerators.splice(i, 1);
}
}
if (indexGenerators.length === 0) {
done = true;
}
await this.sleep(this.delayBetweenPages);
}
}
return newRealEstates;
}
async *categoryIndexer(adCategory, prostorCookie) {
const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=0${urlAdTypePart}${urlCategoryPart}`;
const listOfAllRealEstates = await this.extractRealEstates(
urlPageToCrawl,
prostorCookie
);
let elementToStartIndexFrom = 0;
while (true) {
const realEstatesForSinglePage = listOfAllRealEstates.slice(
elementToStartIndexFrom,
elementToStartIndexFrom + this.maxResultsPerPage
);
if (realEstatesForSinglePage.length > 0) {
elementToStartIndexFrom += realEstatesForSinglePage.length;
const singlePageResults = await this.indexSinglePage(
realEstatesForSinglePage,
prostorCookie
);
const filteredSinglePageResults = singlePageResults.filter(
singleResult => !!singleResult
);
if (
Array.isArray(filteredSinglePageResults) &&
filteredSinglePageResults.length > 0
) {
yield filteredSinglePageResults;
} else {
return undefined;
}
} else {
return undefined;
}
}
} else {
return undefined;
}
}
async indexSinglePage(realEstatesList, prostorCookie) {
const asyncActions = [];
for (const realEstate of realEstatesList) {
asyncActions.push(this.scrapeAd(realEstate, prostorCookie));
}
try {
return await Promise.all(asyncActions);
} catch (e) {
console.log(
"[PROSTOR] Error crawling ads : ",
e.message || "UNKNOWN ERROR"
);
return [];
}
}
async scrapeAd(realEstate, prostorCookie) {
const { lat, lng, property_name, price, size, link, status } = realEstate;
//Status information is given already in realestate list
const adStatus = ProstorCrawler.getStatusId(status);
const url = `https://prostor.ba${link}`;
// console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await adPageSource.text();
const $ = cheerio.load(body);
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
if (!adType || !realEstateType || !prostorId) {
return null;
}
const allDataSelector =
"body > div > div.container-fluid > div > div.column-right > table > tbody";
const realEstateProperties = {};
$(allDataSelector)
.find("p")
.each((i, element) => {
const propertyElement = $(element)
.text()
.split(":")
.map(text => text.trim().toLowerCase());
const propertyTitle = propertyElement[0];
realEstateProperties[propertyTitle] = propertyElement[1];
});
$(allDataSelector)
.find("div.mb-2")
.each((i, element) => {
const propertyElement = $(element)
.text()
.trim()
.toLowerCase();
realEstateProperties[propertyElement] = true;
});
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
return null;
}
let numberOfRooms =
parseFloat(realEstateProperties["broj soba"]) +
parseFloat(realEstateProperties["broj spavaćih soba"]) || null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties),
furnishingType = null,
balcony =
realEstateProperties["balkon"] ||
realEstateProperties["terasa"] ||
realEstateProperties["lođa"] ||
null,
newBuilding = linkParts[1] === "novogradnja",
elevator = realEstateProperties["lift"] || null,
water = realEstateProperties["voda"] || null,
electricity = realEstateProperties["električna energija"] || null,
drainageSystem = realEstateProperties["kanalizacija"] || null,
registeredInZkBooks = null,
recentlyAdapted = null,
parking = realEstateProperties["parking"] || null,
garage = realEstateProperties["garaža"] || null,
gas = realEstateProperties["plin"] || null,
antiTheftDoor = realEstateProperties["blindo vrata"] || null,
airCondition = realEstateProperties["klima"] || null,
phoneConnection = realEstateProperties["telefon"] || null,
cableTV = realEstateProperties["kablovksa tv"] || null,
internet =
realEstateProperties["internet"] ||
realEstateProperties["adsl"] ||
null,
basementAttic = realEstateProperties["podrum"] || null,
storeRoom = realEstateProperties["ostava"] || null,
videoSurveillance = realEstateProperties["video nadzor"],
alarm = realEstateProperties["alarm"] || null,
suitableForStudents = null,
includingBills = null,
animalsAllowed = null,
pool = realEstateProperties["bazen"] || null,
urbanPlanPermit = null,
buildingPermit = null,
utilityConnection = null,
distanceToRiver = null,
numberOfViewsAgency = null;
// Floor versions (there are possibly more versions) :
// Sprat: 3/3
// Sprat: 1 - 2/2
// Sprat: Pr - 7/7
// Sprat: -2/0
// If there are two parts, that represents more real estates are sold
// numberOfFloors is contained in second part, after / sign
const floorsArray = realEstateProperties["sprat"].split(" - ");
let floorText = "";
if (floorsArray.length === 1) {
const floorDescription = floorsArray[0].split("/");
numberOfFloors = parseInt(floorDescription[1]) || null;
floorText = floorDescription[0];
floor = Math.round(parseFloat(floorText));
} else if (floorsArray.length === 2) {
const floorDescription = floorsArray[1].split("/");
numberOfFloors = parseInt(floorDescription[1]) || null;
floorText = floorsArray[0];
floor = Math.round(parseFloat(floorText));
} else {
// This is something strange
}
if (isNaN(floor)) {
// It was textual representation of floor, like "Pr", "Su" or similar
switch (floorText) {
case "pr":
floor = 0;
break;
case "su":
floor = -1;
break;
default:
console.log(
"[PROSTOR] Unknown textual representation of floor : ",
floorText
);
floor = null;
}
}
if (realEstateProperties["namješteno"]) {
furnishingType = FURNISHING_TYPE.FURNISHED.id;
} else if (realEstateProperties["polunamješteno"]) {
furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
} else {
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
}
const title = property_name;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
const gardenSize = null;
const longDescription = null;
const data = {
url,
agencyObjectId: prostorId,
originAgencyName: AD_AGENCY.PROSTOR,
realEstateType,
adType,
title,
price: parsedPrice,
area: parsedArea,
gardenSize,
shortDescription: "",
longDescription: longDescription,
streetNumber: 0,
streetName: realEstateProperties["adresa"],
locality: "",
municipality: "",
city: "",
region: "",
entity: "",
country: "",
locationLat: lat,
locationLong: lng,
adStatus,
numberOfRooms,
numberOfFloors,
floor,
accessRoadType,
heatingType,
furnishingType,
balcony,
newBuilding,
elevator,
water,
electricity,
drainageSystem,
registeredInZkBooks,
recentlyAdapted,
parking,
garage,
gas,
antiTheftDoor,
airCondition,
phoneConnection,
cableTV,
internet,
basementAttic,
storeRoom,
videoSurveillance,
alarm,
suitableForStudents,
includingBills,
animalsAllowed,
pool,
urbanPlanPermit,
buildingPermit,
utilityConnection,
distanceToRiver,
numberOfViewsAgency
};
return data;
} catch (e) {
console.error(
"[PROSTOR] Exception caught: " + e.message,
"\r\nURL:",
url
);
return null;
}
}
async extractRealEstates(url, prostorCookie) {
if (PRINT_CRAWLER_DEBUG) {
console.log("[PROSTOR] Index page : ", url);
}
try {
const res = await fetch(url, {
headers: { Cookie: prostorCookie }
});
const body = await res.text();
const $ = cheerio.load(body);
const scriptElement = $(
"body > div > div.container-fluid > script:nth-child(7)"
);
if (
scriptElement[0] &&
scriptElement[0].children &&
scriptElement[0].children[0] &&
scriptElement[0].children[0].data
) {
const scriptData = scriptElement[0].children[0].data;
try {
// script element data contains JS code and we need to extract only data for realEstates
// data string starts with : var map; var markers = [{"r ...
// so we remove first 23 characters
//
// real estate JSON data ends with ...}, ]; map = new...
// so we need to find index of that substring to know where to stop
// we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually
const jsonEndIndex = scriptData.indexOf(", ]; map = new");
if (jsonEndIndex > -1) {
const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
const realEstates = JSON.parse(jsonData);
// const transformedRealEstates = [];
//
// for (const realEstate of realEstates) {
// const transformedRealEstate = ProstorCrawler.transformRealEstateData(
// realEstate
// );
// if (transformedRealEstate) {
// transformedRealEstates.push(transformedRealEstate);
// }
// }
//
// return transformedRealEstates;
return realEstates;
} else {
throw {
message: "Something is wrong with JSON data or data is moved"
};
}
} catch (e) {
console.log(e);
throw e;
}
}
} catch (e) {
console.error(
"[PROSTOR] Exception caught:",
e.message || "UNKNOWN MESSAGE"
);
return [];
}
}
//======= HELPER FUNCTIONS =============
static getAdCategoryId(categoryText) {
switch (categoryText) {
case "stan":
return AD_CATEGORY.FLAT.id;
case "kuca":
return AD_CATEGORY.HOUSE.id;
case "apartman":
return AD_CATEGORY.APARTMENT.id;
case "poslovni-prostor":
return AD_CATEGORY.OFFICE.id;
case "garaza":
return AD_CATEGORY.GARAGE.id;
case "zemljiste":
return AD_CATEGORY.LAND.id;
default:
return undefined;
}
}
static getAdTypeId(adTypeText) {
switch (adTypeText) {
case "prodaja":
return AD_TYPE.AD_TYPE_SALE.stringId;
case "najam":
return AD_TYPE.AD_TYPE_RENT.stringId;
case "novogradnja":
return AD_TYPE.AD_TYPE_SALE.stringId;
default:
return undefined;
}
}
static getHeatingTypeId(realEstateProperties) {
const realEstatePropertiesKeys = Object.keys(realEstateProperties);
for (const property of realEstatePropertiesKeys) {
switch (property) {
case "centralno toplane":
return HEATING_TYPE.CENTRAL_CITY.id;
case "etažno plinsko":
return HEATING_TYPE.CENTRAL_GAS.id;
case "termo blok":
case "podno grijanje":
return HEATING_TYPE.OTHER.id;
case "etažno električno":
case "konvektori":
return HEATING_TYPE.ELECTRICITY.id;
case "plinske peći":
return HEATING_TYPE.GAS.id;
case "vlastita kotlovnica":
return HEATING_TYPE.CENTRAL_BOILER.id;
case "toplotna pumpa":
return HEATING_TYPE.HEAT_PUMP.id;
case "kamin":
return HEATING_TYPE.WOOD.id;
default:
//console.log("[PROSTOR] Nepoznato >>> [", property, "]");
}
}
}
static getStatusId(statusText) {
switch (statusText) {
case "":
return AD_STATUS.STATUS_NORMAL;
case "Rezervisano":
return AD_STATUS.STATUS_RESERVED;
case "Prodano":
return AD_STATUS.STATUS_SOLD;
case "Iznajmljeno":
return AD_STATUS.STATUS_RENTED;
case "VIP ponuda":
return AD_STATUS.STATUS_VIP;
default:
console.log("[PROSTOR] Unknown AD_STATUS : [", statusText, "]");
return AD_STATUS.STATUS_NORMAL;
}
}
async sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
async saveCrawledResults(results) {
const savers = this.savers;
// for (const saver of savers) {
// await saver.save(results);
// }
//For now, we use only Postgres saver, so ...
return savers[0].save(results);
//so that we can use some sequelize options and information when data is inserted
}
async loginForScraping(PROSTOR_LOGIN, prostorCookie) {
let formData = new FormData();
formData.append("email", PROSTOR_LOGIN.EMAIL);
formData.append("password", PROSTOR_LOGIN.PASSWORD);
return fetch("https://prostor.ba/moj-prostor/prijava", {
method: "POST",
body: formData,
headers: { Cookie: prostorCookie }
})
.then(page => {
return page.text();
})
.then(resp => {
const $ = cheerio.load(resp);
if (
$("h1")
.text()
.indexOf("Dobrodošli") !== -1
) {
console.log("[PROSTOR]: Crawler loged in!");
return true;
} else {
console.log("[PROSTOR]: Crawler login failed - wrong credentials!");
return false;
}
})
.catch(err => {
console.log("[PROSTOR]: Crawler login error ", err);
});
}
async getCookies() {
const getResponse = await fetch("https://prostor.ba/moj-prostor/prijava", {
headers: { Cookie: "" }
});
const raw = getResponse.headers.raw()["set-cookie"];
const cookie = raw
.map(datastring => {
const data = datastring.split(";");
const cookieData = data[0];
return cookieData;
})
.join(";");
return cookie;
}
}
module.exports = ProstorCrawler;