diff --git a/app/crawler/specificCrawlers/saljic.js b/app/crawler/specificCrawlers/saljic.js index 407a558..0e912fc 100644 --- a/app/crawler/specificCrawlers/saljic.js +++ b/app/crawler/specificCrawlers/saljic.js @@ -57,12 +57,504 @@ class SaljicCrawler { } async crawl() { - // - console.log("Saljic URL: ", this.baseUrl); + const crawlAdCategories = this.crawlerAdCategories; + + const newRealEstates = []; + + if (crawlAdCategories) { + const indexGenerators = []; + for (const adCategory of crawlAdCategories) { + indexGenerators.push(this.categoryIndexer(adCategory)); + } + // + console.log(indexGenerators); + // + let done = false; + while (!done) { + const categoryIndexerPromises = []; + const generatorsToRemove = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); + } + + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); + + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; + + newRealEstates.push(...newRecords); + + if ( + Array.isArray(newRecords) && + newRecords.length === 0 && + !SALJIC_FORCE_CRAWL + ) { + generatorsToRemove[index] = true; + } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // console.log("Generator ", index + 1, "has no more pages"); + } + } + + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + await this.sleep(this.delayBetweenPages); + } + } + return newRealEstates; + } + + async *categoryIndexer(adCategory) { + const urlAdTypePart = SALJIC_ENUMS.SALJIC_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = SALJIC_ENUMS.SALJIC_AD_CATEGORY[adCategory]; + + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { + const urlPageToCrawl = `${this.baseUrl}?order_by=${urlAdTypePart}${urlCategoryPart}`; + const listOfAllRealEstates = await this.extractRealEstates( + urlPageToCrawl + ); + + let elementToStartIndexFrom = 0; + while (true) { + const realEstatesForSinglePage = listOfAllRealEstates.slice( + elementToStartIndexFrom, + elementToStartIndexFrom + this.maxResultsPerPage + ); + + if (realEstatesForSinglePage.length > 0) { + elementToStartIndexFrom += realEstatesForSinglePage.length; + + const singlePageResults = await this.indexSinglePage( + realEstatesForSinglePage + ); + + const filteredSinglePageResults = singlePageResults.filter( + singleResult => !!singleResult + ); + + if ( + Array.isArray(filteredSinglePageResults) && + filteredSinglePageResults.length > 0 + ) { + yield filteredSinglePageResults; + } else { + return undefined; + } + } else { + return undefined; + } + } + } else { + return undefined; + } + } + + async indexSinglePage(realEstatesList) { + const asyncActions = []; + for (const realEstate of realEstatesList) { + asyncActions.push(this.scrapeAd(realEstate)); + } + + try { + return await Promise.all(asyncActions); + } catch (e) { + console.log( + "[SALJIC] Error crawling ads : ", + e.message || "UNKNOWN ERROR" + ); + return []; + } + } + + async scrapeAd(realEstate) { + const { lat, lng, property_name, price, size, link, status } = realEstate; + const url = `https://www.saljicnekretnine.ba/v2/${link}`; + // console.log("[SALJIC] Scraping : ", url); + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + // ?? Ovo se mora promijeniti + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = SaljicCrawler.getAdTypeId(linkParts[1]); + const realEstateType = SaljicCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + + if (!adType || !realEstateType || !prostorId) { + return null; + } + + const allDataSelector = + "body > div > div.container-fluid > div > div.column-right > table > tbody"; + + const realEstateProperties = {}; + + $(allDataSelector) + .find("p") + .each((i, element) => { + const propertyElement = $(element) + .text() + .split(":") + .map(text => text.trim().toLowerCase()); + + const propertyTitle = propertyElement[0]; + realEstateProperties[propertyTitle] = propertyElement[1]; + }); + + $(allDataSelector) + .find("div.mb-2") + .each((i, element) => { + const propertyElement = $(element) + .text() + .trim() + .toLowerCase(); + + realEstateProperties[propertyElement] = true; + }); + + if (JSON.stringify(realEstateProperties) === JSON.stringify({})) { + return null; + } + + let numberOfRooms = + parseFloat(realEstateProperties["broj soba"]) + + parseFloat(realEstateProperties["broj spavaćih soba"]) || null, + numberOfFloors = null, + floor = null, + accessRoadType = null, + heatingType = ProstorCrawler.getHeatingTypeId(realEstateProperties), + furnishingType = null, + balcony = + realEstateProperties["balkon"] || + realEstateProperties["terasa"] || + realEstateProperties["lođa"] || + null, + newBuilding = linkParts[1] === "novogradnja", + elevator = realEstateProperties["lift"] || null, + water = realEstateProperties["voda"] || null, + electricity = realEstateProperties["električna energija"] || null, + drainageSystem = realEstateProperties["kanalizacija"] || null, + registeredInZkBooks = null, + recentlyAdapted = null, + parking = realEstateProperties["parking"] || null, + garage = realEstateProperties["garaža"] || null, + gas = realEstateProperties["plin"] || null, + antiTheftDoor = realEstateProperties["blindo vrata"] || null, + airCondition = realEstateProperties["klima"] || null, + phoneConnection = realEstateProperties["telefon"] || null, + cableTV = realEstateProperties["kablovksa tv"] || null, + internet = + realEstateProperties["internet"] || + realEstateProperties["adsl"] || + null, + basementAttic = realEstateProperties["podrum"] || null, + storeRoom = realEstateProperties["ostava"] || null, + videoSurveillance = realEstateProperties["video nadzor"], + alarm = realEstateProperties["alarm"] || null, + suitableForStudents = null, + includingBills = null, + animalsAllowed = null, + pool = realEstateProperties["bazen"] || null, + urbanPlanPermit = null, + buildingPermit = null, + utilityConnection = null, + distanceToRiver = null, + numberOfViewsAgency = null; + + // Floor versions (there are possibly more versions) : + // Sprat: 3/3 + // Sprat: 1 - 2/2 + // Sprat: Pr - 7/7 + // Sprat: -2/0 + // If there are two parts, that represents more real estates are sold + // numberOfFloors is contained in second part, after / sign + + const floorsArray = realEstateProperties["sprat"].split(" - "); + let floorText = ""; + if (floorsArray.length === 1) { + const floorDescription = floorsArray[0].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorDescription[0]; + floor = Math.round(parseFloat(floorText)); + } else if (floorsArray.length === 2) { + const floorDescription = floorsArray[1].split("/"); + numberOfFloors = parseInt(floorDescription[1]) || null; + floorText = floorsArray[0]; + floor = Math.round(parseFloat(floorText)); + } else { + // This is something strange + } + + if (isNaN(floor)) { + // It was textual representation of floor, like "Pr", "Su" or similar + switch (floorText) { + case "pr": + floor = 0; + break; + case "su": + floor = -1; + break; + default: + console.log( + "[SALJIC] Unknown textual representation of floor : ", + floorText + ); + floor = null; + } + } + + if (realEstateProperties["namješteno"]) { + furnishingType = FURNISHING_TYPE.FURNISHED.id; + } else if (realEstateProperties["polunamješteno"]) { + furnishingType = FURNISHING_TYPE.HALF_FURNISHED.id; + } else { + furnishingType = FURNISHING_TYPE.NOT_FURNISHED.id; + } + + const adStatus = SaljicCrawler.getStatusId(status); + const title = property_name; + const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; + const parsedArea = parseFloat(size); + const gardenSize = null; + const longDescription = null; + + const data = { + url, + agencyObjectId: prostorId, + originAgencyName: AD_AGENCY.SALJIC, + realEstateType, + adType, + title, + price: parsedPrice, + area: parsedArea, + gardenSize, + shortDescription: "", + longDescription: longDescription, + streetNumber: 0, + streetName: realEstateProperties["adresa"], + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat: lat, + locationLong: lng, + adStatus, + numberOfRooms, + numberOfFloors, + floor, + accessRoadType, + heatingType, + furnishingType, + balcony, + newBuilding, + elevator, + water, + electricity, + drainageSystem, + registeredInZkBooks, + recentlyAdapted, + parking, + garage, + gas, + antiTheftDoor, + airCondition, + phoneConnection, + cableTV, + internet, + basementAttic, + storeRoom, + videoSurveillance, + alarm, + suitableForStudents, + includingBills, + animalsAllowed, + pool, + urbanPlanPermit, + buildingPermit, + utilityConnection, + distanceToRiver, + numberOfViewsAgency + }; + + return data; + } catch (e) { + console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url); + return null; + } + } + + async extractRealEstates(url) { + if (PRINT_CRAWLER_DEBUG) { + console.log("[SALJIC] Index page : ", url); + } + + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + console.log("SALJIC: $", $); + + const scriptElement = $( + "body > div > div.container-fluid > script:nth-child(7)" + ); + // + //console.log(scriptElement[0]); + //console.log(scriptElement[0].children); + if ( + scriptElement[0] && + scriptElement[0].children && + scriptElement[0].children[0] && + scriptElement[0].children[0].data + ) { + const scriptData = scriptElement[0].children[0].data; + // + console.log(scriptData); + try { + // script element data contains JS code and we need to extract only data for realEstates + // data string starts with : var map; var markers = [{"r ... + // so we remove first 23 characters + // + // real estate JSON data ends with ...}, ]; map = new... + // so we need to find index of that substring to know where to stop + // we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually + + const jsonEndIndex = scriptData.indexOf(", ]; map = new"); + if (jsonEndIndex > -1) { + const jsonData = scriptData.substring(23, jsonEndIndex) + "]"; + const realEstates = JSON.parse(jsonData); + + // const transformedRealEstates = []; + // + // for (const realEstate of realEstates) { + // const transformedRealEstate = SaljicCrawler.transformRealEstateData( + // realEstate + // ); + // if (transformedRealEstate) { + // transformedRealEstates.push(transformedRealEstate); + // } + // } + // + // return transformedRealEstates; + return realEstates; + } else { + throw { + message: "Something is wrong with JSON data or data is moved" + }; + } + } catch (e) { + console.log(e); + throw e; + } + } + } catch (e) { + console.error( + "[SALJIC] Exception caught:", + e.message || "UNKNOWN MESSAGE" + ); + return []; + } } //======= HELPER FUNCTIONS ============= + static getAdCategoryId(categoryText) { + switch (categoryText) { + case "stan": + return AD_CATEGORY.FLAT.id; + case "kuca": + return AD_CATEGORY.HOUSE.id; + case "apartman": + return AD_CATEGORY.APARTMENT.id; + case "poslovni-prostor": + return AD_CATEGORY.OFFICE.id; + case "garaza": + return AD_CATEGORY.GARAGE.id; + case "zemljiste": + return AD_CATEGORY.LAND.id; + default: + return undefined; + } + } + + static getAdTypeId(adTypeText) { + switch (adTypeText) { + case "prodaja": + return AD_TYPE.AD_TYPE_SALE.stringId; + case "najam": + return AD_TYPE.AD_TYPE_RENT.stringId; + case "novogradnja": + return AD_TYPE.AD_TYPE_SALE.stringId; + default: + return undefined; + } + } + + static getHeatingTypeId(realEstateProperties) { + const realEstatePropertiesKeys = Object.keys(realEstateProperties); + for (const property of realEstatePropertiesKeys) { + switch (property) { + case "centralno toplane": + return HEATING_TYPE.CENTRAL_CITY.id; + case "etažno plinsko": + return HEATING_TYPE.CENTRAL_GAS.id; + case "termo blok": + case "podno grijanje": + return HEATING_TYPE.OTHER.id; + case "etažno električno": + case "konvektori": + return HEATING_TYPE.ELECTRICITY.id; + case "plinske peći": + return HEATING_TYPE.GAS.id; + case "vlastita kotlovnica": + return HEATING_TYPE.CENTRAL_BOILER.id; + case "toplotna pumpa": + return HEATING_TYPE.HEAT_PUMP.id; + case "kamin": + return HEATING_TYPE.WOOD.id; + default: + //console.log("[SALJIC] Nepoznato >>> [", property, "]"); + } + } + } + + static getStatusId(statusText) { + switch (statusText) { + case "": + return AD_STATUS.STATUS_NORMAL; + case "Rezervisano": + return AD_STATUS.STATUS_RESERVED; + case "Prodano": + return AD_STATUS.STATUS_SOLD; + case "Iznajmljeno": + return AD_STATUS.STATUS_RENTED; + default: + console.log("[SALJIC] Unknown AD_STATUS : [", statusText, "]"); + return AD_STATUS.STATUS_NORMAL; + } + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); }