add additional fields to the Prostor crawler

This commit is contained in:
Bilal Catic
2019-11-14 02:09:42 +01:00
parent 618dcd217e
commit c13857bc09

View File

@@ -56,35 +56,6 @@ class ProstorCrawler {
this.delayBetweenPages = delayBetweenPages;
}
async crawlOld() {
const crawlAdCategories = this.crawlerAdCategories;
const newRealEstates = [];
if (crawlAdCategories) {
for (const adCategory of crawlAdCategories) {
const urlAdTypePart =
PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
const singleCategoryResults = await this.extractRealEstates(
urlPageToCrawl
);
const resultsSubset = singleCategoryResults.slice(
0,
this.maxResultsPerPage
);
const saveResults = await this.saveCrawledResults(resultsSubset);
const { newRecords } = saveResults;
newRealEstates.push(...newRecords);
}
}
}
return newRealEstates;
}
async crawl() {
const crawlAdCategories = this.crawlerAdCategories;
@@ -210,20 +181,67 @@ class ProstorCrawler {
async scrapeAd(realEstate) {
const { lat, lng, property_name, price, size, link } = realEstate;
const url = `https://prostor.ba${link}`;
console.log("[PROSTOR] Scraping : ", url);
// console.log("[PROSTOR] Scraping : ", url);
try {
const adPageSource = await fetch(url);
const body = await adPageSource.text();
const $ = cheerio.load(body);
let numberOfRooms = null,
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
if (!adType || !realEstateType || !prostorId) {
console.log(
"adType: ",
adType,
" reType: ",
realEstateType,
" prostorId: ",
prostorId,
"url: ",
url
);
return null;
}
const allDataSelector =
"body > div > div.container-fluid > div > div.column-right > table > tbody";
const realEstateProperties = {};
$(allDataSelector)
.find("p")
.each((i, elem) => {
const propertyElement = $(elem)
.text()
.split(":")
.map(text => text.trim());
const propertyTitle = propertyElement[0];
realEstateProperties[propertyTitle] = propertyElement[1];
});
if (JSON.stringify(realEstateProperties) === JSON.stringify({})) {
return null;
}
let numberOfRooms =
parseFloat(realEstateProperties["Broj soba"]) +
parseFloat(realEstateProperties["Broj spavaćih soba"]) || null,
numberOfFloors = null,
floor = null,
accessRoadType = null,
heatingType = null,
furnishingType = null,
balcony = null,
newBuilding = null,
newBuilding = linkParts[1] === "novogradnja",
elevator = null,
water = null,
electricity = null,
@@ -252,28 +270,46 @@ class ProstorCrawler {
distanceToRiver = null,
numberOfViewsAgency = null;
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
// Floor versions (there are possibly more versions) :
// Sprat: 3/3
// Sprat: 1 - 2/2
// Sprat: Pr - 7/7
// Sprat: -2/0
// If there are two parts, that represents more real estates are sold
// numberOfFloors is contained in second part, after / sign
const linkParts = link.split("/");
const floorsArray = realEstateProperties["Sprat"].split(" - ");
let floorText = "";
if (floorsArray.length === 1) {
const floorDescription = floorsArray[0].split("/");
numberOfFloors = parseInt(floorDescription[1]) || null;
floorText = floorDescription[0];
floor = Math.round(parseFloat(floorText));
} else if (floorsArray.length === 2) {
const floorDescription = floorsArray[1].split("/");
numberOfFloors = parseInt(floorDescription[1]) || null;
floorText = floorsArray[0];
floor = Math.round(parseFloat(floorText));
} else {
// This is something strange
}
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
if (!adType || !realEstateType || !prostorId) {
console.log(
"adType: ",
adType,
" reType: ",
realEstateType,
" prostorId: ",
prostorId,
"url: ",
url
);
return null;
if (isNaN(floor)) {
// It was textual representation of floor, like "Pr", "Su" or similar
switch (floorText.toLowerCase()) {
case "pr":
floor = 0;
break;
case "su":
floor = -1;
break;
default:
console.log(
"[PROSTOR] Unknown textual representation of floor : ",
floorText
);
floor = null;
}
}
const adStatus = AD_STATUS.STATUS_NORMAL;
@@ -282,19 +318,6 @@ class ProstorCrawler {
const parsedArea = parseFloat(size);
const gardenSize = null;
const longDescription = null;
const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE);
if (!publishedDateMoment.isValid()) {
throw {
message: `Invalid published date`
};
}
const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE);
if (!renewedDateMoment.isValid()) {
throw {
message: `Invalid renewed date`
};
}
const data = {
url,
@@ -309,7 +332,7 @@ class ProstorCrawler {
shortDescription: "",
longDescription: longDescription,
streetNumber: 0,
streetName: "",
streetName: realEstateProperties["Adresa"],
locality: "",
municipality: "",
city: "",
@@ -319,8 +342,6 @@ class ProstorCrawler {
locationLat: lat,
locationLong: lng,
adStatus,
publishedDate: publishedDateMoment.toISOString(),
renewedDate: renewedDateMoment.toISOString(),
numberOfRooms,
numberOfFloors,
floor,
@@ -437,67 +458,6 @@ class ProstorCrawler {
}
}
static transformRealEstateData(realEstateData) {
try {
const { lat, lng, property_name, price, size, link } = realEstateData;
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
// general form is : /actionType/realEstateType/location/realEstateID
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
const linkParts = link.split("/");
const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
const prostorId = linkParts[4];
const url = `https://prostor.ba${link}`;
if (!adType || !realEstateType || !prostorId) {
return null;
}
const adStatus = AD_STATUS.STATUS_NORMAL;
const parsedPrice = parseFloat(price.replace(/\./g, "")) || null;
const parsedArea = parseFloat(size);
const data = {
url,
agencyObjectId: prostorId,
originAgencyName: AD_AGENCY.PROSTOR,
realEstateType,
adType,
title: property_name,
price: parsedPrice,
area: parsedArea,
gardenSize: null,
shortDescription: "",
longDescription: "",
streetNumber: 0,
streetName: "",
locality: "",
municipality: "",
city: "",
region: "",
entity: "",
country: "",
locationLat: lat,
locationLong: lng,
adStatus,
publishedDate: null,
renewedDate: null
};
return data;
} catch (e) {
console.error(
"[PROSTOR] Exception caught: " + e.message,
"\r\nURL:",
url
);
return null;
}
}
//======= HELPER FUNCTIONS =============
static getAdCategoryId(categoryText) {
@@ -525,6 +485,8 @@ class ProstorCrawler {
return AD_TYPE.AD_TYPE_SALE.stringId;
case "najam":
return AD_TYPE.AD_TYPE_RENT.stringId;
case "novogradnja":
return AD_TYPE.AD_TYPE_SALE.stringId;
default:
return undefined;
}