add additional fields to the Prostor crawler
This commit is contained in:
@@ -56,35 +56,6 @@ class ProstorCrawler {
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
}
|
||||
|
||||
async crawlOld() {
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
const newRealEstates = [];
|
||||
|
||||
if (crawlAdCategories) {
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
const urlAdTypePart =
|
||||
PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
|
||||
const singleCategoryResults = await this.extractRealEstates(
|
||||
urlPageToCrawl
|
||||
);
|
||||
|
||||
const resultsSubset = singleCategoryResults.slice(
|
||||
0,
|
||||
this.maxResultsPerPage
|
||||
);
|
||||
|
||||
const saveResults = await this.saveCrawledResults(resultsSubset);
|
||||
const { newRecords } = saveResults;
|
||||
newRealEstates.push(...newRecords);
|
||||
}
|
||||
}
|
||||
}
|
||||
return newRealEstates;
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
@@ -210,20 +181,67 @@ class ProstorCrawler {
|
||||
async scrapeAd(realEstate) {
|
||||
const { lat, lng, property_name, price, size, link } = realEstate;
|
||||
const url = `https://prostor.ba${link}`;
|
||||
console.log("[PROSTOR] Scraping : ", url);
|
||||
// console.log("[PROSTOR] Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
let numberOfRooms = null,
|
||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||
// general form is : /actionType/realEstateType/location/realEstateID
|
||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||
|
||||
const linkParts = link.split("/");
|
||||
|
||||
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
||||
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
||||
const prostorId = linkParts[4];
|
||||
|
||||
if (!adType || !realEstateType || !prostorId) {
|
||||
console.log(
|
||||
"adType: ",
|
||||
adType,
|
||||
" reType: ",
|
||||
realEstateType,
|
||||
" prostorId: ",
|
||||
prostorId,
|
||||
"url: ",
|
||||
url
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
const allDataSelector =
|
||||
"body > div > div.container-fluid > div > div.column-right > table > tbody";
|
||||
|
||||
const realEstateProperties = {};
|
||||
|
||||
$(allDataSelector)
|
||||
.find("p")
|
||||
.each((i, elem) => {
|
||||
const propertyElement = $(elem)
|
||||
.text()
|
||||
.split(":")
|
||||
.map(text => text.trim());
|
||||
|
||||
const propertyTitle = propertyElement[0];
|
||||
realEstateProperties[propertyTitle] = propertyElement[1];
|
||||
});
|
||||
|
||||
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
|
||||
return null;
|
||||
}
|
||||
|
||||
let numberOfRooms =
|
||||
parseFloat(realEstateProperties["Broj soba"]) +
|
||||
parseFloat(realEstateProperties["Broj spavaćih soba"]) || null,
|
||||
numberOfFloors = null,
|
||||
floor = null,
|
||||
accessRoadType = null,
|
||||
heatingType = null,
|
||||
furnishingType = null,
|
||||
balcony = null,
|
||||
newBuilding = null,
|
||||
newBuilding = linkParts[1] === "novogradnja",
|
||||
elevator = null,
|
||||
water = null,
|
||||
electricity = null,
|
||||
@@ -252,28 +270,46 @@ class ProstorCrawler {
|
||||
distanceToRiver = null,
|
||||
numberOfViewsAgency = null;
|
||||
|
||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||
// general form is : /actionType/realEstateType/location/realEstateID
|
||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||
// Floor versions (there are possibly more versions) :
|
||||
// Sprat: 3/3
|
||||
// Sprat: 1 - 2/2
|
||||
// Sprat: Pr - 7/7
|
||||
// Sprat: -2/0
|
||||
// If there are two parts, that represents more real estates are sold
|
||||
// numberOfFloors is contained in second part, after / sign
|
||||
|
||||
const linkParts = link.split("/");
|
||||
const floorsArray = realEstateProperties["Sprat"].split(" - ");
|
||||
let floorText = "";
|
||||
if (floorsArray.length === 1) {
|
||||
const floorDescription = floorsArray[0].split("/");
|
||||
numberOfFloors = parseInt(floorDescription[1]) || null;
|
||||
floorText = floorDescription[0];
|
||||
floor = Math.round(parseFloat(floorText));
|
||||
} else if (floorsArray.length === 2) {
|
||||
const floorDescription = floorsArray[1].split("/");
|
||||
numberOfFloors = parseInt(floorDescription[1]) || null;
|
||||
floorText = floorsArray[0];
|
||||
floor = Math.round(parseFloat(floorText));
|
||||
} else {
|
||||
// This is something strange
|
||||
}
|
||||
|
||||
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
||||
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
||||
const prostorId = linkParts[4];
|
||||
|
||||
if (!adType || !realEstateType || !prostorId) {
|
||||
console.log(
|
||||
"adType: ",
|
||||
adType,
|
||||
" reType: ",
|
||||
realEstateType,
|
||||
" prostorId: ",
|
||||
prostorId,
|
||||
"url: ",
|
||||
url
|
||||
);
|
||||
return null;
|
||||
if (isNaN(floor)) {
|
||||
// It was textual representation of floor, like "Pr", "Su" or similar
|
||||
switch (floorText.toLowerCase()) {
|
||||
case "pr":
|
||||
floor = 0;
|
||||
break;
|
||||
case "su":
|
||||
floor = -1;
|
||||
break;
|
||||
default:
|
||||
console.log(
|
||||
"[PROSTOR] Unknown textual representation of floor : ",
|
||||
floorText
|
||||
);
|
||||
floor = null;
|
||||
}
|
||||
}
|
||||
|
||||
const adStatus = AD_STATUS.STATUS_NORMAL;
|
||||
@@ -282,19 +318,6 @@ class ProstorCrawler {
|
||||
const parsedArea = parseFloat(size);
|
||||
const gardenSize = null;
|
||||
const longDescription = null;
|
||||
const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||
if (!publishedDateMoment.isValid()) {
|
||||
throw {
|
||||
message: `Invalid published date`
|
||||
};
|
||||
}
|
||||
|
||||
const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE);
|
||||
if (!renewedDateMoment.isValid()) {
|
||||
throw {
|
||||
message: `Invalid renewed date`
|
||||
};
|
||||
}
|
||||
|
||||
const data = {
|
||||
url,
|
||||
@@ -309,7 +332,7 @@ class ProstorCrawler {
|
||||
shortDescription: "",
|
||||
longDescription: longDescription,
|
||||
streetNumber: 0,
|
||||
streetName: "",
|
||||
streetName: realEstateProperties["Adresa"],
|
||||
locality: "",
|
||||
municipality: "",
|
||||
city: "",
|
||||
@@ -319,8 +342,6 @@ class ProstorCrawler {
|
||||
locationLat: lat,
|
||||
locationLong: lng,
|
||||
adStatus,
|
||||
publishedDate: publishedDateMoment.toISOString(),
|
||||
renewedDate: renewedDateMoment.toISOString(),
|
||||
numberOfRooms,
|
||||
numberOfFloors,
|
||||
floor,
|
||||
@@ -437,67 +458,6 @@ class ProstorCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
static transformRealEstateData(realEstateData) {
|
||||
try {
|
||||
const { lat, lng, property_name, price, size, link } = realEstateData;
|
||||
|
||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||
// general form is : /actionType/realEstateType/location/realEstateID
|
||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||
|
||||
const linkParts = link.split("/");
|
||||
|
||||
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
||||
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
||||
const prostorId = linkParts[4];
|
||||
const url = `https://prostor.ba${link}`;
|
||||
|
||||
if (!adType || !realEstateType || !prostorId) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const adStatus = AD_STATUS.STATUS_NORMAL;
|
||||
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
||||
const parsedArea = parseFloat(size);
|
||||
|
||||
const data = {
|
||||
url,
|
||||
agencyObjectId: prostorId,
|
||||
originAgencyName: AD_AGENCY.PROSTOR,
|
||||
realEstateType,
|
||||
adType,
|
||||
title: property_name,
|
||||
price: parsedPrice,
|
||||
area: parsedArea,
|
||||
gardenSize: null,
|
||||
shortDescription: "",
|
||||
longDescription: "",
|
||||
streetNumber: 0,
|
||||
streetName: "",
|
||||
locality: "",
|
||||
municipality: "",
|
||||
city: "",
|
||||
region: "",
|
||||
entity: "",
|
||||
country: "",
|
||||
locationLat: lat,
|
||||
locationLong: lng,
|
||||
adStatus,
|
||||
publishedDate: null,
|
||||
renewedDate: null
|
||||
};
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error(
|
||||
"[PROSTOR] Exception caught: " + e.message,
|
||||
"\r\nURL:",
|
||||
url
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
//======= HELPER FUNCTIONS =============
|
||||
|
||||
static getAdCategoryId(categoryText) {
|
||||
@@ -525,6 +485,8 @@ class ProstorCrawler {
|
||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||
case "najam":
|
||||
return AD_TYPE.AD_TYPE_RENT.stringId;
|
||||
case "novogradnja":
|
||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||
default:
|
||||
return undefined;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user