"use strict"; const fetch = require("../../helpers/fetchWrapper"); const { logDebug } = require("../../helpers/log"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); const { AD_TYPE, AD_CATEGORY, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE, HEATING_TYPE, FURNISHING_TYPE, ACCESS_ROAD_TYPE } = require("../../common/enums"); const { DEFAULT_TIMEZONE, PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); const OLX_ENUMS = { OLX_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "", [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje", [CRAWLER_AD_TYPE.ONLY_REQUEST]: "&vrsta=samopotraznja" }, OLX_AD_CATEGORY: { [AD_CATEGORY.FLAT.id]: "&kategorija=23", [AD_CATEGORY.HOUSE.id]: "&kategorija=24", [AD_CATEGORY.LAND.id]: "&kategorija=29", [AD_CATEGORY.OFFICE.id]: "&kategorija=25", [AD_CATEGORY.APARTMENT.id]: "&kategorija=27", [AD_CATEGORY.GARAGE.id]: "&kategorija=30", [AD_CATEGORY.COTTAGE.id]: "&kategorija=26" }, MAX_DETAIL_FIELDS: 30, OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm", OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); const chunk = (array, size = 10) => { let i, j ,temparray; const result = [] for (i=0,j=array.length; i= 0; i--) { if (generatorsToRemove[i]) { // console.log("\tRemove generator ", i + 1); indexGenerators.splice(i, 1); } } if (indexGenerators.length === 0) { done = true; } await this.sleep(this.delayBetweenPages); } } return newRealEstates; } async *categoryIndexer(adCategory) { try { let pageToIndex = 1; const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const singlePageResults = await this.indexSinglePage( urlPageToCrawl, this.maxResultsPerPage ); await this.sleep(this.delayBetweenPages); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; } else { return undefined; } ++pageToIndex; if (pageToIndex === this.maxPages) { return undefined; } } } else { return undefined; } } catch (e) { console.log('Error inside generator: ', e); } } async indexSinglePage(url, maxResultsPerPage) { if (PRINT_CRAWLER_DEBUG) { console.log("[OLX] Index page : ", url); } try { const res = await fetch(url); logDebug("Got category results for: ", url); const body = await res.text(); logDebug("Got category results text for: ", url); const $ = cheerio.load(body); let hrefs = []; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); if (href) { hrefs.push(href); } }); let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { asyncScraping.push(hrefs[i]); } const allChunks = chunk(asyncScraping, 2); const dataResults = [] for (let i = 0; i < allChunks.length; i++) { const singleChunk = allChunks[i]; const promises = singleChunk.map(c => this.scrapeAd(c)) const chunkResults = await Promise.all(promises); await this.sleep(this.delayBetweenPages); dataResults.push(...chunkResults); logDebug("Chunk results len:", chunkResults.length); } const filteredScrapedData = dataResults.filter(adData => !!adData); logDebug("Filtered scraped data length: ", filteredScrapedData.length); return filteredScrapedData; } catch (e) { console.error("Exception caught, index single page: " + e); return []; } } async scrapeAd(url) { logDebug("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; if (body.indexOf(' div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", title: "#naslovartikla", descriptions: ".artikal_detaljniopis_tekst", category: "#artikal_glavni_div > div.artikal_lijevo > div.artikal_kat > div > span:nth-child(3) > a > span" }; const username = $(propertySelectors.username) .text() .trim(); if (this.ignoredUsernames.includes((username || "").toLowerCase())) { return null; } const title = $(propertySelectors.title) .text() .trim(); const descriptions = $(propertySelectors.descriptions); const category = $(propertySelectors.category) .text() .trim(); //====== PRICE DETECTION AND EXTRACTION ===== let price = null; const priceHeader = $("#pc > p.n").text().trim(); const priceValue = $("#pc > p:nth-child(2)").text().trim(); price = priceValue; if (priceHeader.indexOf('Hitn') !== -1) { // Urgent price status = AD_STATUS.STATUS_URGENT; } const discountPriceTag = $("#artikal_glavni_div > div.artikal_lijevo > p:nth-child(4)").text().trim(); if (discountPriceTag.indexOf('Akcij') !== -1) { status = AD_STATUS.STATUS_DISCOUNTED; const discountPriceValues = $("#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p").text().trim(); // discountPriceValues contain string like "10.000 KM 7.500 KM" // First price is regular, second is currently active (discounted) price const bothPrices = discountPriceValues.split('KM'); // Now, currently active price is second element of bothPrices array price = bothPrices[1] ? bothPrices[1].trim() : null; } //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; let numberOfViewsAgency = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored for (let possibleId = 1; possibleId <= 30; possibleId++) { const adTypeFieldTitle = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` ) .text() .trim(); if (adTypeFieldTitle === "Vrsta oglasa") { otherInformationDivId = possibleId; break; } } if (!otherInformationDivId) { throw { message: "Other information DIV could not be found" }; } const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) .text() .trim(); const publishedDateMoment = moment.tz( publishedDate, OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!publishedDateMoment.isValid()) { throw { message: "Invalid published date ! Check parsing format" }; } const renewedDate = $(renewedDateFullValueSelector) .data("content") .trim(); const renewedDateMoment = moment.tz( renewedDate, OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!renewedDateMoment) { throw { message: "Invalid renewed date ! Check how parser parsed renewed date text" }; } adType = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` ) .text() .trim(); const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: `Unknown ad category [${category}]` }; } const parsedAdType = this.getAdTypeId(adType); if (!parsedAdType) { throw { message: "Unknown ad type" }; } const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) .text() .trim(); olxId = $(`${olxIdFieldSelector} > div.df2`) .text() .trim(); numberOfViewsAgency = parseInt( $(numberOfViewsAgencyValueSelector) .text() .trim() ); if (olxIdFieldTitle !== "OLX ID") { throw { message: "Cannot find correct OLX ID" }; } //=========================================== //====== DETAIL INFORMATION FIELDS ========== let area, gardenSize, numberOfRooms = null, numberOfFloors = null, floor = null, accessRoadType = null, heatingType = null, furnishingType = null, balcony = null, newBuilding = null, elevator = null, water = null, electricity = null, drainageSystem = null, registeredInZkBooks = null, recentlyAdapted = null, parking = null, garage = null, gas = null, antiTheftDoor = null, airCondition = null, phoneConnection = null, cableTV = null, internet = null, basementAttic = null, storeRoom = null, videoSurveillance = null, alarm = null, suitableForStudents = null, includingBills = null, animalsAllowed = null, pool = null, urbanPlanPermit = null, buildingPermit = null, utilityConnection = null, distanceToRiver = null; let fieldIndex = 1; do { const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; const fieldTitleSelector = `${fieldSelector} > div.df1`; const fieldValueSelector = `${fieldSelector} > div.df2`; const fieldTitle = $(fieldTitleSelector) .text() .trim() .toLowerCase(); const fieldValue = $(fieldValueSelector) .text() .trim() .toLowerCase(); switch (fieldTitle) { case "kvadrata": area = fieldValue; break; case "okućnica (kvadratura)": gardenSize = fieldValue; break; case "broj soba": numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); break; case "broj prostorija": numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); break; case "broj spratova": numberOfFloors = this.parseNumberOfFloors( fieldValue, parsedCategory ); break; case "sprat": floor = this.parseFloorNumber(fieldValue, parsedCategory); break; case "vrsta grijanja": heatingType = this.getHeatingTypeId(fieldValue); break; case "namješten?": furnishingType = this.getFurnishingTypeId(fieldValue); break; case "namješten": furnishingType = FURNISHING_TYPE.FURNISHED.id; break; case "namještena": furnishingType = FURNISHING_TYPE.FURNISHED.id; break; case "voda": water = true; break; case "struja": electricity = true; break; case "kanalizacija": drainageSystem = fieldValue !== "nema"; break; case "godina izgradnje": newBuilding = newBuilding || fieldValue === "novogradnja"; break; case "kućni ljubimci": animalsAllowed = fieldValue === "da"; break; case "uknjiženo / zk": registeredInZkBooks = true; break; case "uknjiženo (zk)": registeredInZkBooks = true; break; case "novogradnja": newBuilding = true; break; case "nedavno adaptiran": recentlyAdapted = true; break; case "nedavno adaptirana": recentlyAdapted = true; break; case "balkon": balcony = true; break; case "lift": elevator = true; break; case "parking": parking = true; break; case "garaža": garage = true; break; case "plin": gas = true; break; case "blindirana vrata": antiTheftDoor = true; break; case "klima": airCondition = true; break; case "telefonski priključak": phoneConnection = true; break; case "kablovska tv": cableTV = true; break; case "internet": internet = true; break; case "podrum/tavan": basementAttic = true; break; case "ostava/špajz": storeRoom = true; break; case "video nadzor": videoSurveillance = true; break; case "alarm": alarm = true; break; case "za studente": suitableForStudents = true; break; case "uključen trošak režija": includingBills = true; break; case "građevinska dozvola": buildingPermit = true; break; case "komunalni priključak": utilityConnection = true; break; case "urbanistička dozvola": urbanPlanPermit = true; break; case "udaljenost od rijeke (m)": distanceToRiver = parseInt(fieldValue) || null; break; case "prilaz": accessRoadType = this.getAccessRoadTypeId(fieldValue); break; case "bazen": pool = true; break; case "iznajmljeno": status = AD_STATUS.STATUS_RENTED; break; default: // console.log(fieldTitle, " = ", fieldValue); break; } if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { break; } } while (true); //=========================================== //========================================= const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; const latLngRegex = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const locationLatLngMatches = latLngRegex.exec(body); let locationLat = null; let locationLong = null; if (locationLatLngMatches && locationLatLngMatches.length >= 3) { locationLat = parseFloat(locationLatLngMatches[1]) || null; locationLong = parseFloat(locationLatLngMatches[2]) || null; } if ( title.indexOf("[PRODANO]") !== -1 || title.indexOf("[ZAVRŠENO]") !== -1 ) { status = AD_STATUS.STATUS_SOLD; } const data = { url, agencyObjectId: olxId, originAgencyName: AD_AGENCY.OLX, realEstateType: parsedCategory, adType: parsedAdType, title, price: parsedPrice, area: parsedArea, gardenSize: parsedGardenSize, shortDescription: descriptions .first() .text() .trim(), longDescription: descriptions .last() .text() .trim(), streetNumber: 0, streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", locationLat, locationLong, adStatus: status, publishedDate: publishedDateMoment.toISOString(), renewedDate: renewedDateMoment.toISOString(), numberOfRooms, numberOfFloors, floor, accessRoadType, heatingType, furnishingType, balcony, newBuilding, elevator, water, electricity, drainageSystem, registeredInZkBooks, recentlyAdapted, parking, garage, gas, antiTheftDoor, airCondition, phoneConnection, cableTV, internet, basementAttic, storeRoom, videoSurveillance, alarm, suitableForStudents, includingBills, animalsAllowed, pool, urbanPlanPermit, buildingPermit, utilityConnection, distanceToRiver, numberOfViewsAgency }; // //console.log("Scraped data:", data); return data; } catch (e) { console.error("Exception caught scrapeAd : " + e.message, "\r\nURL:", url); } return null; } //======= HELPER FUNCTIONS ============= getAdCategoryId(categoryText) { switch (categoryText) { case "Stanovi": return AD_CATEGORY.FLAT.id; case "Zemljišta": return AD_CATEGORY.LAND.id; case "Kuće": return AD_CATEGORY.HOUSE.id; case "Poslovni prostori": return AD_CATEGORY.OFFICE.id; case "Apartmani": return AD_CATEGORY.APARTMENT.id; case "Garaže": return AD_CATEGORY.GARAGE.id; case "Vikendice": return AD_CATEGORY.COTTAGE.id; default: return undefined; } } getAdTypeId(adTypeText) { switch (adTypeText) { case "Prodaja": return AD_TYPE.AD_TYPE_SALE.stringId; case "Izdavanje": return AD_TYPE.AD_TYPE_RENT.stringId; case "Potražnja": return AD_TYPE.AD_TYPE_REQUEST.stringId; default: return undefined; } } getHeatingTypeId(heatingTypeText) { switch (heatingTypeText) { case "struja": return HEATING_TYPE.ELECTRICITY.id; case "plin": return HEATING_TYPE.GAS.id; case "drva": return HEATING_TYPE.WOOD.id; case "centralno (gradsko)": return HEATING_TYPE.CENTRAL_CITY.id; case "centralno (kotlovnica)": return HEATING_TYPE.CENTRAL_BOILER.id; case "centralno (plin)": return HEATING_TYPE.CENTRAL_GAS.id; case "nije uvedeno": return HEATING_TYPE.NO_HEATING.id; case "ostalo": return HEATING_TYPE.OTHER.id; case "drugo": return HEATING_TYPE.OTHER.id; default: console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); return null; } } getFurnishingTypeId(furnishingTypeText) { switch (furnishingTypeText) { case "namješten": return FURNISHING_TYPE.FURNISHED.id; case "polunamješten": return FURNISHING_TYPE.HALF_FURNISHED.id; case "nenamješten": return FURNISHING_TYPE.NOT_FURNISHED.id; case "": return FURNISHING_TYPE.FURNISHED.id; default: console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); return null; } } getAccessRoadTypeId(accessRoadTypeText) { switch (accessRoadTypeText) { case "asfalt": return ACCESS_ROAD_TYPE.ASPHALT.id; case "beton": return ACCESS_ROAD_TYPE.CONCRETE.id; case "makadam": return ACCESS_ROAD_TYPE.MACADAM.id; case "ostalo": return ACCESS_ROAD_TYPE.OTHER.id; default: console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); return null; } } parseArea(areaText) { if (!areaText) { return NaN; } const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; const textWithOnlyOneDecimalDot = areaText .replace(",", ".") .replace(removeDotsExceptLastOneRegex, ""); return parseFloat(textWithOnlyOneDecimalDot); } parsePrice(priceText) { if (!priceText) { return NaN; } if (priceText === "Po dogovoru") { return null; } const formattedPriceText = priceText.replace(".", "").replace(",", "."); return parseFloat(formattedPriceText); } parseNumberOfRooms(numberOfRoomsText, categoryId) { if (categoryId === AD_CATEGORY.FLAT.id) { switch (numberOfRoomsText) { case "garsonjera": return 0; case "jednosoban (1)": return 1; case "jednoiposoban (1.5)": return 1.5; case "dvosoban (2)": return 2; case "trosoban (3)": return 3; case "četverosoban (4)": return 4; case "petosoban i više": return 5; default: console.log( "broj soba [stan] = NEPOZNATO [", numberOfRoomsText, ", ", categoryId, "]" ); return null; } } if ( categoryId === AD_CATEGORY.HOUSE.id || categoryId === AD_CATEGORY.COTTAGE.id || categoryId === AD_CATEGORY.APARTMENT.id || categoryId === AD_CATEGORY.OFFICE.id ) { return parseInt(numberOfRoomsText) || null; } console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); return null; } parseNumberOfFloors(numberOfFloorsText, categoryId) { if ( categoryId === AD_CATEGORY.HOUSE.id || categoryId === AD_CATEGORY.COTTAGE.id ) { return parseInt(numberOfFloorsText) || null; } if (categoryId === AD_CATEGORY.OFFICE.id) { if ( numberOfFloorsText === "suteren" || numberOfFloorsText === "prizemlje" ) { return 0; } if (numberOfFloorsText === "6+") { return 7; } return parseInt(numberOfFloorsText) || null; } console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); return null; } parseFloorNumber(floorText, categoryId) { if ( categoryId === AD_CATEGORY.FLAT.id || categoryId === AD_CATEGORY.APARTMENT.id ) { if ( floorText === "suteren" || floorText === "prizemlje" || floorText === "visoko prizemlje" ) { return 0; } return parseInt(floorText) || null; } if (categoryId === AD_CATEGORY.OFFICE.id) { if (floorText === "zaseban objekat") { return null; } if (floorText === "prizemlje" || floorText === "visoko prizemlje") { return 0; } return parseInt(floorText) || null; } console.log("sprat = NEPOZNATO [", floorText, "]"); return null; } async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } async saveCrawledResults(results) { const savers = this.savers; // for (const saver of savers) { // await saver.save(results); // } //For now, we use only Postgres saver, so ... return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } module.exports = OlxCrawler;