From 1ba7cf8531057ae25d911fe76f8f0d8f31d77b4b Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Fri, 31 Jan 2020 22:03:39 +0100 Subject: [PATCH] Added crawler for Saljic nekretnine. --- app/crawler/specificCrawlers/saljic.js | 382 ++++++++++--------------- 1 file changed, 154 insertions(+), 228 deletions(-) diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 95c24bb..37c569e 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -174,6 +174,25 @@ class SaljicCrawler { } }); + let adTypesTmp = []; + + $("#shop") + .find(".product") + .each((i, elem) => { + const adType = $(elem) + .find(".trakica-search-page") + .text() + .trim(); + if (adType) { + adTypesTmp.push(adType); + } + }); + + //Converting to AD_TYPE + const adTypes = adTypesTmp.map(adTypeText => { + return this.getAdTypeId(adTypeText); + }); + //Converting to absolute URLs const hrefsAbs = hrefs.map(link => { return "https://www.saljicnekretnine.ba" + link; @@ -186,7 +205,7 @@ class SaljicCrawler { const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { - asyncScraping.push(this.scrapeAd(hrefsAbs[i])); + asyncScraping.push(this.scrapeAd(hrefsAbs[i], adTypes[i])); } const scrapedData = await Promise.all(asyncScraping); @@ -198,16 +217,19 @@ class SaljicCrawler { } } - async scrapeAd(url) { + async scrapeAd(url, adType) { console.log("[SALJIC] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); - // ??? treba li nesto za status - let status = AD_STATUS.STATUS_NORMAL; + // No information for status ex. PRODAN + const status = AD_STATUS.STATUS_NORMAL; + //Extracting agency ID from url + const agencyObjectId = parseInt(url.substring(46, url.length)); + //Extracting main properties const propertySelectors = { title: "div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2", @@ -227,7 +249,6 @@ class SaljicCrawler { .replace(/ {1,}/g, " ") .trim(); - console.log("Title:", title); const priceText = $(propertySelectors.price) .text() .replace(/(\r\n|\n|\r)/gm, "") @@ -240,18 +261,14 @@ class SaljicCrawler { priceText.substring(8, priceText.length - 3).replace(",", "") ); - console.log("Price:", price); - const streetName = $(propertySelectors.streetName) .text() .replace(/(\r\n|\n|\r)/gm, "") .trim(); - console.log("Street:", streetName); const descriptions = $(propertySelectors.descriptions) .text() .trim(); - console.log("Description:", descriptions); const latAndLongSrc = $(propertySelectors.latAndLong).attr("src"); const latText = latAndLongSrc.substring( @@ -264,8 +281,6 @@ class SaljicCrawler { ); const locationLat = parseFloat(latText) || null; const locationLong = parseFloat(longText) || null; - console.log("Lat:", locationLat); - console.log("Long:", locationLong); //====== DETAIL INFORMATION FIELDS ========== let area, @@ -306,6 +321,8 @@ class SaljicCrawler { distanceToRiver = null; let publishedDate = null; let renewedDate = null; + let realEstateType; + let numberOfViewsAgency = null; //Extracting data - Glavne karakteristike let mainFieldIndex = 1; @@ -346,18 +363,15 @@ class SaljicCrawler { recentlyAdapted = true; break; case "Broj parking mjesta": - `${month}/${day}/${year}`; parking = true; break; case "Dostupno od": const day = mainFieldValue.substring(0, 2); const month = mainFieldValue.substring(3, 5); const year = mainFieldValue.substring(6, mainFieldValue.length); - console.log(`${month}/${day}/${year}`); publishedDate = new Date(`${month}/${day}/${year}`); break; default: - // console.log(fieldTitle, " = ", fieldValue); break; } @@ -367,39 +381,121 @@ class SaljicCrawler { mainFieldIndex++; } while (true); - console.log("Area:", area); - console.log("Garden size:", gardenSize); - console.log("Number of rooms:", numberOfRooms); - console.log("Number of floors", numberOfFloors); - console.log("Floor:", floor); - console.log("Adapted:", recentlyAdapted); - console.log("Parking:", parking); - console.log("Published date:", publishedDate); + //Extracting data - Sadrzaji + let additionalFieldIndex = 1; + do { + const additionalFieldSelector = `div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.col-md-12.bottommargin > ul > li.border-color.col-md-5.col-md-offset-1.col-md-pull-1.list-group-item-bottom:nth-child(${additionalFieldIndex})`; - //const category = $(propertySelectors.category) - //.text() - //.trim(); + const additionalField = $(additionalFieldSelector) + .text() + .trim(); + + if (additionalFieldIndex === 1) { + //Extracting data of real estate type + const categoryTmp = additionalField + .replace(/[\n\r\t]/gm, "") + .substring( + additionalField.indexOf("Kategorija") + 10, + additionalField.length + ) + .trim(); + realEstateType = this.getAdCategoryId(categoryTmp); + } else { + switch (additionalField) { + case "Internet": + internet = true; + break; + case "Garaža": + garage = true; + break; + case "Klima": + airCondition = true; + break; + case "Balkon": + balcony = true; + break; + case "Ostava": + storeRoom = true; + break; + case "Podrum": + basementAttic = true; + break; + case "Blindirana vrata": + antiTheftDoor = true; + break; + case "Voda": + water = true; + break; + case "Kablovska": + cableTV = true; + break; + case "Uknjiženo": + registeredInZkBooks = true; + break; + case "Grijanje - centralno": + heatingType = HEATING_TYPE.CENTRAL_CITY.id; + break; + case "Grijanje - plin": + heatingType = HEATING_TYPE.GAS.id; + break; + case "Grijanje - struja": + heatingType = HEATING_TYPE.ELECTRICITY.id; + break; + case "Grijanje": + heatingType = HEATING_TYPE.OTHER.id; + break; + case "Plin": + gas = true; + break; + case "Namješten": + furnishingType = FURNISHING_TYPE.FURNISHED.id; + break; + case "Alarm": + alarm = true; + break; + case "Video nadzor": + videoSurveillance = true; + break; + case "Lift": + elevator = true; + break; + case "Novogradnja": + newBuilding = true; + break; + + default: + break; + } + } + + if (additionalField === "") { + break; + } + additionalFieldIndex++; + } while (true); + + //If no published date it takes current date of crawling + if (publishedDate) { + renewedDate = new Date(); + } else { + publishedDate = new Date(); + renewedDate = new Date(); + } const data = { url, - agencyObjectId: olxId, - originAgencyName: AD_AGENCY.OLX, - realEstateType: parsedCategory, - adType: parsedAdType, + agencyObjectId, + originAgencyName: AD_AGENCY.SALJIC, + realEstateType, + adType, title, - price: parsedPrice, - area: parsedArea, - gardenSize: parsedGardenSize, - shortDescription: descriptions - .first() - .text() - .trim(), - longDescription: descriptions - .last() - .text() - .trim(), + price, + area, + gardenSize, + shortDescription: descriptions.substring(0, descriptions.indexOf(".")), + longDescription: descriptions, streetNumber: 0, - streetName: "", + streetName, locality: "", municipality: "", city: "", @@ -409,8 +505,8 @@ class SaljicCrawler { locationLat, locationLong, adStatus: status, - publishedDate: publishedDateMoment.toISOString(), - renewedDate: renewedDateMoment.toISOString(), + publishedDate, + renewedDate, numberOfRooms, numberOfFloors, floor, @@ -447,7 +543,7 @@ class SaljicCrawler { distanceToRiver, numberOfViewsAgency }; - + console.log(data); return data; } catch (e) { console.error("Exception caught: " + e.message, "\r\nURL:", url); @@ -459,19 +555,25 @@ class SaljicCrawler { getAdCategoryId(categoryText) { switch (categoryText) { - case "Stanovi": + case "Stan": return AD_CATEGORY.FLAT.id; - case "Zemljišta": + case "Građevinsko zemljiste": return AD_CATEGORY.LAND.id; - case "Kuće": + case "Industrijsko zemljiste": + return AD_CATEGORY.LAND.id; + case "Poljoprivredno zemljiste": + return AD_CATEGORY.LAND.id; + case "Kuća": return AD_CATEGORY.HOUSE.id; - case "Poslovni prostori": + case "Poslovni prostor": + return AD_CATEGORY.OFFICE.id; + case "Kancelarije": return AD_CATEGORY.OFFICE.id; case "Apartmani": return AD_CATEGORY.APARTMENT.id; - case "Garaže": + case "Garaža": return AD_CATEGORY.GARAGE.id; - case "Vikendice": + case "Vikendica": return AD_CATEGORY.COTTAGE.id; default: return undefined; @@ -480,191 +582,15 @@ class SaljicCrawler { getAdTypeId(adTypeText) { switch (adTypeText) { - case "Prodaja": + case "PRODAJA": return AD_TYPE.AD_TYPE_SALE.stringId; - case "Izdavanje": + case "NAJAM": return AD_TYPE.AD_TYPE_RENT.stringId; - case "Potražnja": - return AD_TYPE.AD_TYPE_REQUEST.stringId; default: return undefined; } } - getHeatingTypeId(heatingTypeText) { - switch (heatingTypeText) { - case "struja": - return HEATING_TYPE.ELECTRICITY.id; - case "plin": - return HEATING_TYPE.GAS.id; - case "drva": - return HEATING_TYPE.WOOD.id; - case "centralno (gradsko)": - return HEATING_TYPE.CENTRAL_CITY.id; - case "centralno (kotlovnica)": - return HEATING_TYPE.CENTRAL_BOILER.id; - case "centralno (plin)": - return HEATING_TYPE.CENTRAL_GAS.id; - case "nije uvedeno": - return HEATING_TYPE.NO_HEATING.id; - case "ostalo": - return HEATING_TYPE.OTHER.id; - case "drugo": - return HEATING_TYPE.OTHER.id; - default: - console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); - return null; - } - } - - getFurnishingTypeId(furnishingTypeText) { - switch (furnishingTypeText) { - case "namješten": - return FURNISHING_TYPE.FURNISHED.id; - case "polunamješten": - return FURNISHING_TYPE.HALF_FURNISHED.id; - case "nenamješten": - return FURNISHING_TYPE.NOT_FURNISHED.id; - case "": - return FURNISHING_TYPE.FURNISHED.id; - default: - console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); - return null; - } - } - - getAccessRoadTypeId(accessRoadTypeText) { - switch (accessRoadTypeText) { - case "asfalt": - return ACCESS_ROAD_TYPE.ASPHALT.id; - case "beton": - return ACCESS_ROAD_TYPE.CONCRETE.id; - case "makadam": - return ACCESS_ROAD_TYPE.MACADAM.id; - case "ostalo": - return ACCESS_ROAD_TYPE.OTHER.id; - default: - console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); - return null; - } - } - - parseArea(areaText) { - if (!areaText) { - return NaN; - } - const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; - const textWithOnlyOneDecimalDot = areaText - .replace(",", ".") - .replace(removeDotsExceptLastOneRegex, ""); - - return parseFloat(textWithOnlyOneDecimalDot); - } - - parsePrice(priceText) { - if (!priceText) { - return NaN; - } - const formattedPriceText = priceText.replace(".", "").replace(",", "."); - return parseFloat(formattedPriceText); - } - - parseNumberOfRooms(numberOfRoomsText, categoryId) { - if (categoryId === AD_CATEGORY.FLAT.id) { - switch (numberOfRoomsText) { - case "garsonjera": - return 0; - case "jednosoban (1)": - return 1; - case "jednoiposoban (1.5)": - return 1.5; - case "dvosoban (2)": - return 2; - case "trosoban (3)": - return 3; - case "četverosoban (4)": - return 4; - case "petosoban i više": - return 5; - default: - console.log( - "broj soba [stan] = NEPOZNATO [", - numberOfRoomsText, - ", ", - categoryId, - "]" - ); - return null; - } - } - - if ( - categoryId === AD_CATEGORY.HOUSE.id || - categoryId === AD_CATEGORY.COTTAGE.id || - categoryId === AD_CATEGORY.APARTMENT.id || - categoryId === AD_CATEGORY.OFFICE.id - ) { - return parseInt(numberOfRoomsText) || null; - } - - console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); - return null; - } - - parseNumberOfFloors(numberOfFloorsText, categoryId) { - if ( - categoryId === AD_CATEGORY.HOUSE.id || - categoryId === AD_CATEGORY.COTTAGE.id - ) { - return parseInt(numberOfFloorsText) || null; - } - - if (categoryId === AD_CATEGORY.OFFICE.id) { - if ( - numberOfFloorsText === "suteren" || - numberOfFloorsText === "prizemlje" - ) { - return 0; - } - if (numberOfFloorsText === "6+") { - return 7; - } - return parseInt(numberOfFloorsText) || null; - } - - console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); - return null; - } - - parseFloorNumber(floorText, categoryId) { - if ( - categoryId === AD_CATEGORY.FLAT.id || - categoryId === AD_CATEGORY.APARTMENT.id - ) { - if ( - floorText === "suteren" || - floorText === "prizemlje" || - floorText === "visoko prizemlje" - ) { - return 0; - } - return parseInt(floorText) || null; - } - - if (categoryId === AD_CATEGORY.OFFICE.id) { - if (floorText === "zaseban objekat") { - return null; - } - if (floorText === "prizemlje" || floorText === "visoko prizemlje") { - return 0; - } - return parseInt(floorText) || null; - } - - console.log("sprat = NEPOZNATO [", floorText, "]"); - return null; - } - async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); }