From c13857bc096616fe4061b1bf8a4cc37dad08a340 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 14 Nov 2019 02:09:42 +0100 Subject: [PATCH] add additional fields to the Prostor crawler --- app/crawler/specificCrawlers/prostor.js | 220 ++++++++++-------------- 1 file changed, 91 insertions(+), 129 deletions(-) diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index dbd1b7e..7737591 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -56,35 +56,6 @@ class ProstorCrawler { this.delayBetweenPages = delayBetweenPages; } - async crawlOld() { - const crawlAdCategories = this.crawlerAdCategories; - const newRealEstates = []; - - if (crawlAdCategories) { - for (const adCategory of crawlAdCategories) { - const urlAdTypePart = - PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; - const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; - if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { - const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`; - const singleCategoryResults = await this.extractRealEstates( - urlPageToCrawl - ); - - const resultsSubset = singleCategoryResults.slice( - 0, - this.maxResultsPerPage - ); - - const saveResults = await this.saveCrawledResults(resultsSubset); - const { newRecords } = saveResults; - newRealEstates.push(...newRecords); - } - } - } - return newRealEstates; - } - async crawl() { const crawlAdCategories = this.crawlerAdCategories; @@ -210,20 +181,67 @@ class ProstorCrawler { async scrapeAd(realEstate) { const { lat, lng, property_name, price, size, link } = realEstate; const url = `https://prostor.ba${link}`; - console.log("[PROSTOR] Scraping : ", url); + // console.log("[PROSTOR] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); - let numberOfRooms = null, + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = ProstorCrawler.getAdTypeId(linkParts[1]); + const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + + if (!adType || !realEstateType || !prostorId) { + console.log( + "adType: ", + adType, + " reType: ", + realEstateType, + " prostorId: ", + prostorId, + "url: ", + url + ); + return null; + } + + const allDataSelector = + "body > div > div.container-fluid > div > div.column-right > table > tbody"; + + const realEstateProperties = {}; + + $(allDataSelector) + .find("p") + .each((i, elem) => { + const propertyElement = $(elem) + .text() + .split(":") + .map(text => text.trim()); + + const propertyTitle = propertyElement[0]; + realEstateProperties[propertyTitle] = propertyElement[1]; + }); + + if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { + return null; + } + + let numberOfRooms = + parseFloat(realEstateProperties["Broj soba"]) + + parseFloat(realEstateProperties["Broj spavaćih soba"]) || null, numberOfFloors = null, floor = null, accessRoadType = null, heatingType = null, furnishingType = null, balcony = null, - newBuilding = null, + newBuilding = linkParts[1] === "novogradnja", elevator = null, water = null, electricity = null, @@ -252,28 +270,46 @@ class ProstorCrawler { distanceToRiver = null, numberOfViewsAgency = null; - // link contains part of the URL in the format of : /prodaja/stan/stup/9556 - // general form is : /actionType/realEstateType/location/realEstateID - // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + // Floor versions (there are possibly more versions) : + // Sprat: 3/3 + // Sprat: 1 - 2/2 + // Sprat: Pr - 7/7 + // Sprat: -2/0 + // If there are two parts, that represents more real estates are sold + // numberOfFloors is contained in second part, after / sign - const linkParts = link.split("/"); + const floorsArray = realEstateProperties["Sprat"].split(" - "); + let floorText = ""; + if (floorsArray.length === 1) { + const floorDescription = floorsArray[0].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorDescription[0]; + floor = Math.round(parseFloat(floorText)); + } else if (floorsArray.length === 2) { + const floorDescription = floorsArray[1].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorsArray[0]; + floor = Math.round(parseFloat(floorText)); + } else { + // This is something strange + } - const adType = ProstorCrawler.getAdTypeId(linkParts[1]); - const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); - const prostorId = linkParts[4]; - - if (!adType || !realEstateType || !prostorId) { - console.log( - "adType: ", - adType, - " reType: ", - realEstateType, - " prostorId: ", - prostorId, - "url: ", - url - ); - return null; + if (isNaN(floor)) { + // It was textual representation of floor, like "Pr", "Su" or similar + switch (floorText.toLowerCase()) { + case "pr": + floor = 0; + break; + case "su": + floor = -1; + break; + default: + console.log( + "[PROSTOR] Unknown textual representation of floor : ", + floorText + ); + floor = null; + } } const adStatus = AD_STATUS.STATUS_NORMAL; @@ -282,19 +318,6 @@ class ProstorCrawler { const parsedArea = parseFloat(size); const gardenSize = null; const longDescription = null; - const publishedDateMoment = moment.tz(DEFAULT_TIMEZONE); - if (!publishedDateMoment.isValid()) { - throw { - message: `Invalid published date` - }; - } - - const renewedDateMoment = moment.tz(DEFAULT_TIMEZONE); - if (!renewedDateMoment.isValid()) { - throw { - message: `Invalid renewed date` - }; - } const data = { url, @@ -309,7 +332,7 @@ class ProstorCrawler { shortDescription: "", longDescription: longDescription, streetNumber: 0, - streetName: "", + streetName: realEstateProperties["Adresa"], locality: "", municipality: "", city: "", @@ -319,8 +342,6 @@ class ProstorCrawler { locationLat: lat, locationLong: lng, adStatus, - publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString(), numberOfRooms, numberOfFloors, floor, @@ -437,67 +458,6 @@ class ProstorCrawler { } } - static transformRealEstateData(realEstateData) { - try { - const { lat, lng, property_name, price, size, link } = realEstateData; - - // link contains part of the URL in the format of : /prodaja/stan/stup/9556 - // general form is : /actionType/realEstateType/location/realEstateID - // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] - - const linkParts = link.split("/"); - - const adType = ProstorCrawler.getAdTypeId(linkParts[1]); - const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); - const prostorId = linkParts[4]; - const url = `https://prostor.ba${link}`; - - if (!adType || !realEstateType || !prostorId) { - return null; - } - - const adStatus = AD_STATUS.STATUS_NORMAL; - const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; - const parsedArea = parseFloat(size); - - const data = { - url, - agencyObjectId: prostorId, - originAgencyName: AD_AGENCY.PROSTOR, - realEstateType, - adType, - title: property_name, - price: parsedPrice, - area: parsedArea, - gardenSize: null, - shortDescription: "", - longDescription: "", - streetNumber: 0, - streetName: "", - locality: "", - municipality: "", - city: "", - region: "", - entity: "", - country: "", - locationLat: lat, - locationLong: lng, - adStatus, - publishedDate: null, - renewedDate: null - }; - - return data; - } catch (e) { - console.error( - "[PROSTOR] Exception caught: " + e.message, - "\r\nURL:", - url - ); - return null; - } - } - //======= HELPER FUNCTIONS ============= static getAdCategoryId(categoryText) { @@ -525,6 +485,8 @@ class ProstorCrawler { return AD_TYPE.AD_TYPE_SALE.stringId; case "najam": return AD_TYPE.AD_TYPE_RENT.stringId; + case "novogradnja": + return AD_TYPE.AD_TYPE_SALE.stringId; default: return undefined; }