WIP Started saljic crawler.
This commit is contained in:
@@ -57,12 +57,504 @@ class SaljicCrawler {
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
//
|
||||
console.log("Saljic URL: ", this.baseUrl);
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const newRealEstates = [];
|
||||
|
||||
if (crawlAdCategories) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
indexGenerators.push(this.categoryIndexer(adCategory));
|
||||
}
|
||||
//
|
||||
console.log(indexGenerators);
|
||||
//
|
||||
let done = false;
|
||||
while (!done) {
|
||||
const categoryIndexerPromises = [];
|
||||
const generatorsToRemove = [];
|
||||
for (const indexGenerator of indexGenerators) {
|
||||
categoryIndexerPromises.push(indexGenerator.next());
|
||||
generatorsToRemove.push(false);
|
||||
}
|
||||
|
||||
const singlePageResults = await Promise.all(categoryIndexerPromises);
|
||||
const entries = singlePageResults.entries();
|
||||
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
const { newRecords } = saveResults;
|
||||
|
||||
newRealEstates.push(...newRecords);
|
||||
|
||||
if (
|
||||
Array.isArray(newRecords) &&
|
||||
newRecords.length === 0 &&
|
||||
!SALJIC_FORCE_CRAWL
|
||||
) {
|
||||
generatorsToRemove[index] = true;
|
||||
}
|
||||
} else {
|
||||
//Generator returned undefined, remove this generator from array
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("Generator ", index + 1, "has no more pages");
|
||||
}
|
||||
}
|
||||
|
||||
// console.log("Generators state : ", generatorsToRemove);
|
||||
for (let i = generatorsToRemove.length - 1; i >= 0; i--) {
|
||||
if (generatorsToRemove[i]) {
|
||||
// console.log("\tRemove generator ", i + 1);
|
||||
indexGenerators.splice(i, 1);
|
||||
}
|
||||
}
|
||||
if (indexGenerators.length === 0) {
|
||||
done = true;
|
||||
}
|
||||
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
}
|
||||
}
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async *categoryIndexer(adCategory) {
|
||||
const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory];
|
||||
|
||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||
const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}`;
|
||||
const listOfAllRealEstates = await this.extractRealEstates(
|
||||
urlPageToCrawl
|
||||
);
|
||||
|
||||
let elementToStartIndexFrom = 0;
|
||||
while (true) {
|
||||
const realEstatesForSinglePage = listOfAllRealEstates.slice(
|
||||
elementToStartIndexFrom,
|
||||
elementToStartIndexFrom + this.maxResultsPerPage
|
||||
);
|
||||
|
||||
if (realEstatesForSinglePage.length > 0) {
|
||||
elementToStartIndexFrom += realEstatesForSinglePage.length;
|
||||
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
realEstatesForSinglePage
|
||||
);
|
||||
|
||||
const filteredSinglePageResults = singlePageResults.filter(
|
||||
singleResult => !!singleResult
|
||||
);
|
||||
|
||||
if (
|
||||
Array.isArray(filteredSinglePageResults) &&
|
||||
filteredSinglePageResults.length > 0
|
||||
) {
|
||||
yield filteredSinglePageResults;
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
async indexSinglePage(realEstatesList) {
|
||||
const asyncActions = [];
|
||||
for (const realEstate of realEstatesList) {
|
||||
asyncActions.push(this.scrapeAd(realEstate));
|
||||
}
|
||||
|
||||
try {
|
||||
return await Promise.all(asyncActions);
|
||||
} catch (e) {
|
||||
console.log(
|
||||
"[SALJIC] Error crawling ads : ",
|
||||
e.message || "UNKNOWN ERROR"
|
||||
);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(realEstate) {
|
||||
const { lat, lng, property_name, price, size, link, status } = realEstate;
|
||||
const url = `https://www.saljicnekretnine.ba/v2/${link}`;
|
||||
// console.log("[SALJIC] Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
// ?? Ovo se mora promijeniti
|
||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||
// general form is : /actionType/realEstateType/location/realEstateID
|
||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||
|
||||
const linkParts = link.split("/");
|
||||
|
||||
const adType = SaljicCrawler.getAdTypeId(linkParts[1]);
|
||||
const realEstateType = SaljicCrawler.getAdCategoryId(linkParts[2]);
|
||||
const prostorId = linkParts[4];
|
||||
|
||||
if (!adType || !realEstateType || !prostorId) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const allDataSelector =
|
||||
"body > div > div.container-fluid > div > div.column-right > table > tbody";
|
||||
|
||||
const realEstateProperties = {};
|
||||
|
||||
$(allDataSelector)
|
||||
.find("p")
|
||||
.each((i, element) => {
|
||||
const propertyElement = $(element)
|
||||
.text()
|
||||
.split(":")
|
||||
.map(text => text.trim().toLowerCase());
|
||||
|
||||
const propertyTitle = propertyElement[0];
|
||||
realEstateProperties[propertyTitle] = propertyElement[1];
|
||||
});
|
||||
|
||||
$(allDataSelector)
|
||||
.find("div.mb-2")
|
||||
.each((i, element) => {
|
||||
const propertyElement = $(element)
|
||||
.text()
|
||||
.trim()
|
||||
.toLowerCase();
|
||||
|
||||
realEstateProperties[propertyElement] = true;
|
||||
});
|
||||
|
||||
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let numberOfRooms =
|
||||
parseFloat(realEstateProperties["broj soba"]) +
|
||||
parseFloat(realEstateProperties["broj spavaćih soba"]) || null,
|
||||
numberOfFloors = null,
|
||||
floor = null,
|
||||
accessRoadType = null,
|
||||
heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties),
|
||||
furnishingType = null,
|
||||
balcony =
|
||||
realEstateProperties["balkon"] ||
|
||||
realEstateProperties["terasa"] ||
|
||||
realEstateProperties["lođa"] ||
|
||||
null,
|
||||
newBuilding = linkParts[1] === "novogradnja",
|
||||
elevator = realEstateProperties["lift"] || null,
|
||||
water = realEstateProperties["voda"] || null,
|
||||
electricity = realEstateProperties["električna energija"] || null,
|
||||
drainageSystem = realEstateProperties["kanalizacija"] || null,
|
||||
registeredInZkBooks = null,
|
||||
recentlyAdapted = null,
|
||||
parking = realEstateProperties["parking"] || null,
|
||||
garage = realEstateProperties["garaža"] || null,
|
||||
gas = realEstateProperties["plin"] || null,
|
||||
antiTheftDoor = realEstateProperties["blindo vrata"] || null,
|
||||
airCondition = realEstateProperties["klima"] || null,
|
||||
phoneConnection = realEstateProperties["telefon"] || null,
|
||||
cableTV = realEstateProperties["kablovksa tv"] || null,
|
||||
internet =
|
||||
realEstateProperties["internet"] ||
|
||||
realEstateProperties["adsl"] ||
|
||||
null,
|
||||
basementAttic = realEstateProperties["podrum"] || null,
|
||||
storeRoom = realEstateProperties["ostava"] || null,
|
||||
videoSurveillance = realEstateProperties["video nadzor"],
|
||||
alarm = realEstateProperties["alarm"] || null,
|
||||
suitableForStudents = null,
|
||||
includingBills = null,
|
||||
animalsAllowed = null,
|
||||
pool = realEstateProperties["bazen"] || null,
|
||||
urbanPlanPermit = null,
|
||||
buildingPermit = null,
|
||||
utilityConnection = null,
|
||||
distanceToRiver = null,
|
||||
numberOfViewsAgency = null;
|
||||
|
||||
// Floor versions (there are possibly more versions) :
|
||||
// Sprat: 3/3
|
||||
// Sprat: 1 - 2/2
|
||||
// Sprat: Pr - 7/7
|
||||
// Sprat: -2/0
|
||||
// If there are two parts, that represents more real estates are sold
|
||||
// numberOfFloors is contained in second part, after / sign
|
||||
|
||||
const floorsArray = realEstateProperties["sprat"].split(" - ");
|
||||
let floorText = "";
|
||||
if (floorsArray.length === 1) {
|
||||
const floorDescription = floorsArray[0].split("/");
|
||||
numberOfFloors = parseInt(floorDescription[1]) || null;
|
||||
floorText = floorDescription[0];
|
||||
floor = Math.round(parseFloat(floorText));
|
||||
} else if (floorsArray.length === 2) {
|
||||
const floorDescription = floorsArray[1].split("/");
|
||||
numberOfFloors = parseInt(floorDescription[1]) || null;
|
||||
floorText = floorsArray[0];
|
||||
floor = Math.round(parseFloat(floorText));
|
||||
} else {
|
||||
// This is something strange
|
||||
}
|
||||
|
||||
if (isNaN(floor)) {
|
||||
// It was textual representation of floor, like "Pr", "Su" or similar
|
||||
switch (floorText) {
|
||||
case "pr":
|
||||
floor = 0;
|
||||
break;
|
||||
case "su":
|
||||
floor = -1;
|
||||
break;
|
||||
default:
|
||||
console.log(
|
||||
"[SALJIC] Unknown textual representation of floor : ",
|
||||
floorText
|
||||
);
|
||||
floor = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (realEstateProperties["namješteno"]) {
|
||||
furnishingType = FURNISHING_TYPE.FURNISHED.id;
|
||||
} else if (realEstateProperties["polunamješteno"]) {
|
||||
furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id;
|
||||
} else {
|
||||
furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id;
|
||||
}
|
||||
|
||||
const adStatus = SaljicCrawler.getStatusId(status);
|
||||
const title = property_name;
|
||||
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
||||
const parsedArea = parseFloat(size);
|
||||
const gardenSize = null;
|
||||
const longDescription = null;
|
||||
|
||||
const data = {
|
||||
url,
|
||||
agencyObjectId: prostorId,
|
||||
originAgencyName: AD_AGENCY.SALJIC,
|
||||
realEstateType,
|
||||
adType,
|
||||
title,
|
||||
price: parsedPrice,
|
||||
area: parsedArea,
|
||||
gardenSize,
|
||||
shortDescription: "",
|
||||
longDescription: longDescription,
|
||||
streetNumber: 0,
|
||||
streetName: realEstateProperties["adresa"],
|
||||
locality: "",
|
||||
municipality: "",
|
||||
city: "",
|
||||
region: "",
|
||||
entity: "",
|
||||
country: "",
|
||||
locationLat: lat,
|
||||
locationLong: lng,
|
||||
adStatus,
|
||||
numberOfRooms,
|
||||
numberOfFloors,
|
||||
floor,
|
||||
accessRoadType,
|
||||
heatingType,
|
||||
furnishingType,
|
||||
balcony,
|
||||
newBuilding,
|
||||
elevator,
|
||||
water,
|
||||
electricity,
|
||||
drainageSystem,
|
||||
registeredInZkBooks,
|
||||
recentlyAdapted,
|
||||
parking,
|
||||
garage,
|
||||
gas,
|
||||
antiTheftDoor,
|
||||
airCondition,
|
||||
phoneConnection,
|
||||
cableTV,
|
||||
internet,
|
||||
basementAttic,
|
||||
storeRoom,
|
||||
videoSurveillance,
|
||||
alarm,
|
||||
suitableForStudents,
|
||||
includingBills,
|
||||
animalsAllowed,
|
||||
pool,
|
||||
urbanPlanPermit,
|
||||
buildingPermit,
|
||||
utilityConnection,
|
||||
distanceToRiver,
|
||||
numberOfViewsAgency
|
||||
};
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
async extractRealEstates(url) {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("[SALJIC] Index page : ", url);
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(url);
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
console.log("SALJIC: $", $);
|
||||
|
||||
const scriptElement = $(
|
||||
"body > div > div.container-fluid > script:nth-child(7)"
|
||||
);
|
||||
//
|
||||
//console.log(scriptElement[0]);
|
||||
//console.log(scriptElement[0].children);
|
||||
if (
|
||||
scriptElement[0] &&
|
||||
scriptElement[0].children &&
|
||||
scriptElement[0].children[0] &&
|
||||
scriptElement[0].children[0].data
|
||||
) {
|
||||
const scriptData = scriptElement[0].children[0].data;
|
||||
//
|
||||
console.log(scriptData);
|
||||
try {
|
||||
// script element data contains JS code and we need to extract only data for realEstates
|
||||
// data string starts with : var map; var markers = [{"r ...
|
||||
// so we remove first 23 characters
|
||||
//
|
||||
// real estate JSON data ends with ...}, ]; map = new...
|
||||
// so we need to find index of that substring to know where to stop
|
||||
// we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually
|
||||
|
||||
const jsonEndIndex = scriptData.indexOf(", ]; map = new");
|
||||
if (jsonEndIndex > -1) {
|
||||
const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
|
||||
const realEstates = JSON.parse(jsonData);
|
||||
|
||||
// const transformedRealEstates = [];
|
||||
//
|
||||
// for (const realEstate of realEstates) {
|
||||
// const transformedRealEstate = SaljicCrawler.transformRealEstateData(
|
||||
// realEstate
|
||||
// );
|
||||
// if (transformedRealEstate) {
|
||||
// transformedRealEstates.push(transformedRealEstate);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return transformedRealEstates;
|
||||
return realEstates;
|
||||
} else {
|
||||
throw {
|
||||
message: "Something is wrong with JSON data or data is moved"
|
||||
};
|
||||
}
|
||||
} catch (e) {
|
||||
console.log(e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(
|
||||
"[SALJIC] Exception caught:",
|
||||
e.message || "UNKNOWN MESSAGE"
|
||||
);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
//======= HELPER FUNCTIONS =============
|
||||
|
||||
static getAdCategoryId(categoryText) {
|
||||
switch (categoryText) {
|
||||
case "stan":
|
||||
return AD_CATEGORY.FLAT.id;
|
||||
case "kuca":
|
||||
return AD_CATEGORY.HOUSE.id;
|
||||
case "apartman":
|
||||
return AD_CATEGORY.APARTMENT.id;
|
||||
case "poslovni-prostor":
|
||||
return AD_CATEGORY.OFFICE.id;
|
||||
case "garaza":
|
||||
return AD_CATEGORY.GARAGE.id;
|
||||
case "zemljiste":
|
||||
return AD_CATEGORY.LAND.id;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
static getAdTypeId(adTypeText) {
|
||||
switch (adTypeText) {
|
||||
case "prodaja":
|
||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||
case "najam":
|
||||
return AD_TYPE.AD_TYPE_RENT.stringId;
|
||||
case "novogradnja":
|
||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
static getHeatingTypeId(realEstateProperties) {
|
||||
const realEstatePropertiesKeys = Object.keys(realEstateProperties);
|
||||
for (const property of realEstatePropertiesKeys) {
|
||||
switch (property) {
|
||||
case "centralno toplane":
|
||||
return HEATING_TYPE.CENTRAL_CITY.id;
|
||||
case "etažno plinsko":
|
||||
return HEATING_TYPE.CENTRAL_GAS.id;
|
||||
case "termo blok":
|
||||
case "podno grijanje":
|
||||
return HEATING_TYPE.OTHER.id;
|
||||
case "etažno električno":
|
||||
case "konvektori":
|
||||
return HEATING_TYPE.ELECTRICITY.id;
|
||||
case "plinske peći":
|
||||
return HEATING_TYPE.GAS.id;
|
||||
case "vlastita kotlovnica":
|
||||
return HEATING_TYPE.CENTRAL_BOILER.id;
|
||||
case "toplotna pumpa":
|
||||
return HEATING_TYPE.HEAT_PUMP.id;
|
||||
case "kamin":
|
||||
return HEATING_TYPE.WOOD.id;
|
||||
default:
|
||||
//console.log("[SALJIC] Nepoznato >>> [", property, "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static getStatusId(statusText) {
|
||||
switch (statusText) {
|
||||
case "":
|
||||
return AD_STATUS.STATUS_NORMAL;
|
||||
case "Rezervisano":
|
||||
return AD_STATUS.STATUS_RESERVED;
|
||||
case "Prodano":
|
||||
return AD_STATUS.STATUS_SOLD;
|
||||
case "Iznajmljeno":
|
||||
return AD_STATUS.STATUS_RENTED;
|
||||
default:
|
||||
console.log("[SALJIC] Unknown AD_STATUS : [", statusText, "]");
|
||||
return AD_STATUS.STATUS_NORMAL;
|
||||
}
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user