add additional fields to the Prostor crawler
This commit is contained in:
@@ -56,35 +56,6 @@ class ProstorCrawler {
|
|||||||
this.delayBetweenPages = delayBetweenPages;
|
this.delayBetweenPages = delayBetweenPages;
|
||||||
}
|
}
|
||||||
|
|
||||||
async crawlOld() {
|
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
|
||||||
const newRealEstates = [];
|
|
||||||
|
|
||||||
if (crawlAdCategories) {
|
|
||||||
for (const adCategory of crawlAdCategories) {
|
|
||||||
const urlAdTypePart =
|
|
||||||
PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
|
|
||||||
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
|
|
||||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
|
||||||
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
|
|
||||||
const singleCategoryResults = await this.extractRealEstates(
|
|
||||||
urlPageToCrawl
|
|
||||||
);
|
|
||||||
|
|
||||||
const resultsSubset = singleCategoryResults.slice(
|
|
||||||
0,
|
|
||||||
this.maxResultsPerPage
|
|
||||||
);
|
|
||||||
|
|
||||||
const saveResults = await this.saveCrawledResults(resultsSubset);
|
|
||||||
const { newRecords } = saveResults;
|
|
||||||
newRealEstates.push(...newRecords);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return newRealEstates;
|
|
||||||
}
|
|
||||||
|
|
||||||
async crawl() {
|
async crawl() {
|
||||||
const crawlAdCategories = this.crawlerAdCategories;
|
const crawlAdCategories = this.crawlerAdCategories;
|
||||||
|
|
||||||
@@ -210,20 +181,67 @@ class ProstorCrawler {
|
|||||||
async scrapeAd(realEstate) {
|
async scrapeAd(realEstate) {
|
||||||
const { lat, lng, property_name, price, size, link } = realEstate;
|
const { lat, lng, property_name, price, size, link } = realEstate;
|
||||||
const url = `https://prostor.ba${link}`;
|
const url = `https://prostor.ba${link}`;
|
||||||
console.log("[PROSTOR] Scraping : ", url);
|
// console.log("[PROSTOR] Scraping : ", url);
|
||||||
try {
|
try {
|
||||||
const adPageSource = await fetch(url);
|
const adPageSource = await fetch(url);
|
||||||
const body = await adPageSource.text();
|
const body = await adPageSource.text();
|
||||||
const $ = cheerio.load(body);
|
const $ = cheerio.load(body);
|
||||||
|
|
||||||
let numberOfRooms = null,
|
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||||
|
// general form is : /actionType/realEstateType/location/realEstateID
|
||||||
|
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||||
|
|
||||||
|
const linkParts = link.split("/");
|
||||||
|
|
||||||
|
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
||||||
|
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
||||||
|
const prostorId = linkParts[4];
|
||||||
|
|
||||||
|
if (!adType || !realEstateType || !prostorId) {
|
||||||
|
console.log(
|
||||||
|
"adType: ",
|
||||||
|
adType,
|
||||||
|
" reType: ",
|
||||||
|
realEstateType,
|
||||||
|
" prostorId: ",
|
||||||
|
prostorId,
|
||||||
|
"url: ",
|
||||||
|
url
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const allDataSelector =
|
||||||
|
"body > div > div.container-fluid > div > div.column-right > table > tbody";
|
||||||
|
|
||||||
|
const realEstateProperties = {};
|
||||||
|
|
||||||
|
$(allDataSelector)
|
||||||
|
.find("p")
|
||||||
|
.each((i, elem) => {
|
||||||
|
const propertyElement = $(elem)
|
||||||
|
.text()
|
||||||
|
.split(":")
|
||||||
|
.map(text => text.trim());
|
||||||
|
|
||||||
|
const propertyTitle = propertyElement[0];
|
||||||
|
realEstateProperties[propertyTitle] = propertyElement[1];
|
||||||
|
});
|
||||||
|
|
||||||
|
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
let numberOfRooms =
|
||||||
|
parseFloat(realEstateProperties["Broj soba"]) +
|
||||||
|
parseFloat(realEstateProperties["Broj spavaćih soba"]) || null,
|
||||||
numberOfFloors = null,
|
numberOfFloors = null,
|
||||||
floor = null,
|
floor = null,
|
||||||
accessRoadType = null,
|
accessRoadType = null,
|
||||||
heatingType = null,
|
heatingType = null,
|
||||||
furnishingType = null,
|
furnishingType = null,
|
||||||
balcony = null,
|
balcony = null,
|
||||||
newBuilding = null,
|
newBuilding = linkParts[1] === "novogradnja",
|
||||||
elevator = null,
|
elevator = null,
|
||||||
water = null,
|
water = null,
|
||||||
electricity = null,
|
electricity = null,
|
||||||
@@ -252,28 +270,46 @@ class ProstorCrawler {
|
|||||||
distanceToRiver = null,
|
distanceToRiver = null,
|
||||||
numberOfViewsAgency = null;
|
numberOfViewsAgency = null;
|
||||||
|
|
||||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
// Floor versions (there are possibly more versions) :
|
||||||
// general form is : /actionType/realEstateType/location/realEstateID
|
// Sprat: 3/3
|
||||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
// Sprat: 1 - 2/2
|
||||||
|
// Sprat: Pr - 7/7
|
||||||
|
// Sprat: -2/0
|
||||||
|
// If there are two parts, that represents more real estates are sold
|
||||||
|
// numberOfFloors is contained in second part, after / sign
|
||||||
|
|
||||||
const linkParts = link.split("/");
|
const floorsArray = realEstateProperties["Sprat"].split(" - ");
|
||||||
|
let floorText = "";
|
||||||
|
if (floorsArray.length === 1) {
|
||||||
|
const floorDescription = floorsArray[0].split("/");
|
||||||
|
numberOfFloors = parseInt(floorDescription[1]) || null;
|
||||||
|
floorText = floorDescription[0];
|
||||||
|
floor = Math.round(parseFloat(floorText));
|
||||||
|
} else if (floorsArray.length === 2) {
|
||||||
|
const floorDescription = floorsArray[1].split("/");
|
||||||
|
numberOfFloors = parseInt(floorDescription[1]) || null;
|
||||||
|
floorText = floorsArray[0];
|
||||||
|
floor = Math.round(parseFloat(floorText));
|
||||||
|
} else {
|
||||||
|
// This is something strange
|
||||||
|
}
|
||||||
|
|
||||||
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
if (isNaN(floor)) {
|
||||||
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
// It was textual representation of floor, like "Pr", "Su" or similar
|
||||||
const prostorId = linkParts[4];
|
switch (floorText.toLowerCase()) {
|
||||||
|
case "pr":
|
||||||
if (!adType || !realEstateType || !prostorId) {
|
floor = 0;
|
||||||
console.log(
|
break;
|
||||||
"adType: ",
|
case "su":
|
||||||
adType,
|
floor = -1;
|
||||||
" reType: ",
|
break;
|
||||||
realEstateType,
|
default:
|
||||||
" prostorId: ",
|
console.log(
|
||||||
prostorId,
|
"[PROSTOR] Unknown textual representation of floor : ",
|
||||||
"url: ",
|
floorText
|
||||||
url
|
);
|
||||||
);
|
floor = null;
|
||||||
return null;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const adStatus = AD_STATUS.STATUS_NORMAL;
|
const adStatus = AD_STATUS.STATUS_NORMAL;
|
||||||
@@ -282,19 +318,6 @@ class ProstorCrawler {
|
|||||||
const parsedArea = parseFloat(size);
|
const parsedArea = parseFloat(size);
|
||||||
const gardenSize = null;
|
const gardenSize = null;
|
||||||
const longDescription = null;
|
const longDescription = null;
|
||||||
const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE);
|
|
||||||
if (!publishedDateMoment.isValid()) {
|
|
||||||
throw {
|
|
||||||
message: `Invalid published date`
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE);
|
|
||||||
if (!renewedDateMoment.isValid()) {
|
|
||||||
throw {
|
|
||||||
message: `Invalid renewed date`
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
url,
|
url,
|
||||||
@@ -309,7 +332,7 @@ class ProstorCrawler {
|
|||||||
shortDescription: "",
|
shortDescription: "",
|
||||||
longDescription: longDescription,
|
longDescription: longDescription,
|
||||||
streetNumber: 0,
|
streetNumber: 0,
|
||||||
streetName: "",
|
streetName: realEstateProperties["Adresa"],
|
||||||
locality: "",
|
locality: "",
|
||||||
municipality: "",
|
municipality: "",
|
||||||
city: "",
|
city: "",
|
||||||
@@ -319,8 +342,6 @@ class ProstorCrawler {
|
|||||||
locationLat: lat,
|
locationLat: lat,
|
||||||
locationLong: lng,
|
locationLong: lng,
|
||||||
adStatus,
|
adStatus,
|
||||||
publishedDate: publishedDateMoment.toISOString(),
|
|
||||||
renewedDate: renewedDateMoment.toISOString(),
|
|
||||||
numberOfRooms,
|
numberOfRooms,
|
||||||
numberOfFloors,
|
numberOfFloors,
|
||||||
floor,
|
floor,
|
||||||
@@ -437,67 +458,6 @@ class ProstorCrawler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static transformRealEstateData(realEstateData) {
|
|
||||||
try {
|
|
||||||
const { lat, lng, property_name, price, size, link } = realEstateData;
|
|
||||||
|
|
||||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
|
||||||
// general form is : /actionType/realEstateType/location/realEstateID
|
|
||||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
|
||||||
|
|
||||||
const linkParts = link.split("/");
|
|
||||||
|
|
||||||
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
|
|
||||||
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
|
|
||||||
const prostorId = linkParts[4];
|
|
||||||
const url = `https://prostor.ba${link}`;
|
|
||||||
|
|
||||||
if (!adType || !realEstateType || !prostorId) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const adStatus = AD_STATUS.STATUS_NORMAL;
|
|
||||||
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
|
|
||||||
const parsedArea = parseFloat(size);
|
|
||||||
|
|
||||||
const data = {
|
|
||||||
url,
|
|
||||||
agencyObjectId: prostorId,
|
|
||||||
originAgencyName: AD_AGENCY.PROSTOR,
|
|
||||||
realEstateType,
|
|
||||||
adType,
|
|
||||||
title: property_name,
|
|
||||||
price: parsedPrice,
|
|
||||||
area: parsedArea,
|
|
||||||
gardenSize: null,
|
|
||||||
shortDescription: "",
|
|
||||||
longDescription: "",
|
|
||||||
streetNumber: 0,
|
|
||||||
streetName: "",
|
|
||||||
locality: "",
|
|
||||||
municipality: "",
|
|
||||||
city: "",
|
|
||||||
region: "",
|
|
||||||
entity: "",
|
|
||||||
country: "",
|
|
||||||
locationLat: lat,
|
|
||||||
locationLong: lng,
|
|
||||||
adStatus,
|
|
||||||
publishedDate: null,
|
|
||||||
renewedDate: null
|
|
||||||
};
|
|
||||||
|
|
||||||
return data;
|
|
||||||
} catch (e) {
|
|
||||||
console.error(
|
|
||||||
"[PROSTOR] Exception caught: " + e.message,
|
|
||||||
"\r\nURL:",
|
|
||||||
url
|
|
||||||
);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//======= HELPER FUNCTIONS =============
|
//======= HELPER FUNCTIONS =============
|
||||||
|
|
||||||
static getAdCategoryId(categoryText) {
|
static getAdCategoryId(categoryText) {
|
||||||
@@ -525,6 +485,8 @@ class ProstorCrawler {
|
|||||||
return AD_TYPE.AD_TYPE_SALE.stringId;
|
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||||
case "najam":
|
case "najam":
|
||||||
return AD_TYPE.AD_TYPE_RENT.stringId;
|
return AD_TYPE.AD_TYPE_RENT.stringId;
|
||||||
|
case "novogradnja":
|
||||||
|
return AD_TYPE.AD_TYPE_SALE.stringId;
|
||||||
default:
|
default:
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user