"use strict"; const fetch = require("../../helpers/fetchWrapper"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); const { AD_TYPE, AD_CATEGORY, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE, HEATING_TYPE, FURNISHING_TYPE, ACCESS_ROAD_TYPE } = require("../../common/enums"); const { DEFAULT_TIMEZONE, PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); const OLX_ENUMS = { OLX_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "", [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje", [CRAWLER_AD_TYPE.ONLY_REQUEST]: "&vrsta=samopotraznja" }, OLX_AD_CATEGORY: { [AD_CATEGORY.FLAT.id]: "&kategorija=23", [AD_CATEGORY.HOUSE.id]: "&kategorija=24", [AD_CATEGORY.LAND.id]: "&kategorija=29", [AD_CATEGORY.OFFICE.id]: "&kategorija=25", [AD_CATEGORY.APARTMENT.id]: "&kategorija=27", [AD_CATEGORY.GARAGE.id]: "&kategorija=30", [AD_CATEGORY.COTTAGE.id]: "&kategorija=26" }, MAX_DETAIL_FIELDS: 30, OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm", OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; const { OLX_FORCE_CRAWL, OLX_DELAY_BETWEEN_ADS } = require("../specificConfigs/olx"); class OlxCrawler { constructor( savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], maxPages = 1000, maxResultsPerPage = 100, ignoredUsernames = [], delayBetweenPages = 1000, delayBetweenAds = OLX_DELAY_BETWEEN_ADS ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; this.ignoredUsernames = ignoredUsernames; this.delayBetweenPages = delayBetweenPages; this.delayBetweenAds = delayBetweenAds; } async crawl() { const crawlAdCategories = this.crawlerAdCategories; const newRealEstates = []; if (crawlAdCategories) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { indexGenerators.push(this.categoryIndexer(adCategory)); } let done = false; while (!done) { const categoryIndexerPromises = []; const generatorsToRemove = []; for (const indexGenerator of indexGenerators) { categoryIndexerPromises.push(indexGenerator.next()); generatorsToRemove.push(false); } const singlePageResults = await Promise.all(categoryIndexerPromises); const entries = singlePageResults.entries(); for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { const saveResults = await this.saveCrawledResults(singlePageResult); const { newRecords, existingRecords } = saveResults; newRealEstates.push(...newRecords); for (const existingRecord of existingRecords) { const { publishedDate, renewedDate } = existingRecord; const publishedDateMoment = moment.utc(publishedDate); const renewedDateMoment = moment.utc(renewedDate); const stopCrawlingThisCategory = publishedDateMoment.isSame( renewedDateMoment, "minute" ); if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) { generatorsToRemove[index] = true; // console.log("\tGenerator ", index + 1, "has no more new ads"); break; } } } else { //Generator returned undefined, remove this generator from array generatorsToRemove[index] = true; // console.log("Generator ", index + 1, "has no more pages"); } } // console.log("Generators state : ", generatorsToRemove); for (let i = generatorsToRemove.length - 1; i >= 0; i--) { if (generatorsToRemove[i]) { // console.log("\tRemove generator ", i + 1); indexGenerators.splice(i, 1); } } if (indexGenerators.length === 0) { done = true; } await this.sleep(this.delayBetweenPages); } } return newRealEstates; } async *categoryIndexer(adCategory) { let pageToIndex = 1; const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const singlePageResults = await this.indexSinglePage( urlPageToCrawl, this.maxResultsPerPage ); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; } else { return undefined; } ++pageToIndex; if (pageToIndex === this.maxPages) { return undefined; } } } else { return undefined; } } async indexSinglePage(url, maxResultsPerPage) { if (PRINT_CRAWLER_DEBUG) { console.log("[OLX] Index page : ", url); } try { const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); if (href) { hrefs.push(href); } }); let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { asyncScraping.push(this.scrapeAd(hrefs[i])); //Delaying next scrape ad request to avoid ScraperAPI server error asyncScraping.push(this.sleep(this.delayBetweenAds)); } const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); return filteredScrapedData; } catch (e) { console.error("Exception caught:" + e); return []; } } async scrapeAd(url) { console.log("Scraping : ", url); // let hasParseErrors = false; //let numberOfParseErrors = 0; // do { try { await this.sleep(this.delayBetweenAds); const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; const propertySelectors = { username: "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", title: "#naslovartikla", descriptions: ".artikal_detaljniopis_tekst", category: "#artikal_glavni_div > div.artikal_lijevo > div.artikal_kat > div > span:nth-child(3) > a > span" }; const username = $(propertySelectors.username) .text() .trim(); if (this.ignoredUsernames.includes((username || "").toLowerCase())) { return null; } const title = $(propertySelectors.title) .text() .trim(); const descriptions = $(propertySelectors.descriptions); const category = $(propertySelectors.category) .text() .trim(); //====== PRICE DETECTION AND EXTRACTION ===== let price = null; let normalPrice = null; let urgentPrice = null; const normalPriceValue = $("#pc > p:nth-child(2)") .text() .trim(); const urgentPriceValue = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" ) .text() .trim(); //Debug //console.log("Title:", title); //console.log("Url scraped:", url); // console.log("Normal price value:", normalPriceValue); // console.log("Urgent price value:", urgentPriceValue); // if (normalPriceValue && normalPriceValue.length > 0) { normalPrice = normalPriceValue .replace(/\r\n|\n|\r/gm, "") .replace("KM", "") .trim(); if ( $("#pc > p.n") .text() .indexOf("Hitna") !== -1 ) { status = AD_STATUS.STATUS_URGENT; } else { status = AD_STATUS.STATUS_NORMAL; } } else { // console.log("Body:", body); // throw { message: "Can't find normal price" }; } if (urgentPriceValue && urgentPriceValue.length > 0) { const priceValues = urgentPriceValue.replace("Cijena", "").split("KM"); //priceValues will contain values like ["100000", "90000", ...], second element is urgent price if (priceValues.length > 0) { if (priceValues[0].trim().indexOf("Hitno") != -1) { urgentPrice = priceValues[0].replace("Hitno", "").trim(); status = AD_STATUS.STATUS_URGENT; } else { urgentPrice = priceValues[0].trim(); } } else { throw { message: "Can't find urgent price" }; } } price = status === AD_STATUS.STATUS_URGENT ? urgentPrice : normalPrice; //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; let numberOfViewsAgency = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored for (let possibleId = 1; possibleId <= 30; possibleId++) { const adTypeFieldTitle = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` ) .text() .trim(); if (adTypeFieldTitle === "Vrsta oglasa") { otherInformationDivId = possibleId; break; } } if (!otherInformationDivId) { throw { message: "Other information DIV could not be found" }; } const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; const numberOfViewsAgencyValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(6) > div.df2`; const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) .text() .trim(); const publishedDateMoment = moment.tz( publishedDate, OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!publishedDateMoment.isValid()) { throw { message: "Invalid published date ! Check parsing format" }; } const renewedDate = $(renewedDateFullValueSelector) .data("content") .trim(); const renewedDateMoment = moment.tz( renewedDate, OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!renewedDateMoment) { throw { message: "Invalid renewed date ! Check how parser parsed renewed date text" }; } adType = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` ) .text() .trim(); const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: `Unknown ad category [${category}]` }; } const parsedAdType = this.getAdTypeId(adType); if (!parsedAdType) { throw { message: "Unknown ad type" }; } const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) .text() .trim(); olxId = $(`${olxIdFieldSelector} > div.df2`) .text() .trim(); numberOfViewsAgency = parseInt( $(numberOfViewsAgencyValueSelector) .text() .trim() ); if (olxIdFieldTitle !== "OLX ID") { throw { message: "Cannot find correct OLX ID" }; } //=========================================== //====== DETAIL INFORMATION FIELDS ========== let area, gardenSize, numberOfRooms = null, numberOfFloors = null, floor = null, accessRoadType = null, heatingType = null, furnishingType = null, balcony = null, newBuilding = null, elevator = null, water = null, electricity = null, drainageSystem = null, registeredInZkBooks = null, recentlyAdapted = null, parking = null, garage = null, gas = null, antiTheftDoor = null, airCondition = null, phoneConnection = null, cableTV = null, internet = null, basementAttic = null, storeRoom = null, videoSurveillance = null, alarm = null, suitableForStudents = null, includingBills = null, animalsAllowed = null, pool = null, urbanPlanPermit = null, buildingPermit = null, utilityConnection = null, distanceToRiver = null; let fieldIndex = 1; do { const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; const fieldTitleSelector = `${fieldSelector} > div.df1`; const fieldValueSelector = `${fieldSelector} > div.df2`; const fieldTitle = $(fieldTitleSelector) .text() .trim() .toLowerCase(); const fieldValue = $(fieldValueSelector) .text() .trim() .toLowerCase(); switch (fieldTitle) { case "kvadrata": area = fieldValue; break; case "okućnica (kvadratura)": gardenSize = fieldValue; break; case "broj soba": numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); break; case "broj prostorija": numberOfRooms = this.parseNumberOfRooms(fieldValue, parsedCategory); break; case "broj spratova": numberOfFloors = this.parseNumberOfFloors( fieldValue, parsedCategory ); break; case "sprat": floor = this.parseFloorNumber(fieldValue, parsedCategory); break; case "vrsta grijanja": heatingType = this.getHeatingTypeId(fieldValue); break; case "namješten?": furnishingType = this.getFurnishingTypeId(fieldValue); break; case "namješten": furnishingType = FURNISHING_TYPE.FURNISHED.id; break; case "namještena": furnishingType = FURNISHING_TYPE.FURNISHED.id; break; case "voda": water = true; break; case "struja": electricity = true; break; case "kanalizacija": drainageSystem = fieldValue !== "nema"; break; case "godina izgradnje": newBuilding = newBuilding || fieldValue === "novogradnja"; break; case "kućni ljubimci": animalsAllowed = fieldValue === "da"; break; case "uknjiženo / zk": registeredInZkBooks = true; break; case "uknjiženo (zk)": registeredInZkBooks = true; break; case "novogradnja": newBuilding = true; break; case "nedavno adaptiran": recentlyAdapted = true; break; case "nedavno adaptirana": recentlyAdapted = true; break; case "balkon": balcony = true; break; case "lift": elevator = true; break; case "parking": parking = true; break; case "garaža": garage = true; break; case "plin": gas = true; break; case "blindirana vrata": antiTheftDoor = true; break; case "klima": airCondition = true; break; case "telefonski priključak": phoneConnection = true; break; case "kablovska tv": cableTV = true; break; case "internet": internet = true; break; case "podrum/tavan": basementAttic = true; break; case "ostava/špajz": storeRoom = true; break; case "video nadzor": videoSurveillance = true; break; case "alarm": alarm = true; break; case "za studente": suitableForStudents = true; break; case "uključen trošak režija": includingBills = true; break; case "građevinska dozvola": buildingPermit = true; break; case "komunalni priključak": utilityConnection = true; break; case "urbanistička dozvola": urbanPlanPermit = true; break; case "udaljenost od rijeke (m)": distanceToRiver = parseInt(fieldValue) || null; break; case "prilaz": accessRoadType = this.getAccessRoadTypeId(fieldValue); break; case "bazen": pool = true; break; case "iznajmljeno": status = AD_STATUS.STATUS_RENTED; break; default: // console.log(fieldTitle, " = ", fieldValue); break; } if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { break; } } while (true); //=========================================== //========================================= const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; const latLngRegex = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const locationLatLngMatches = latLngRegex.exec(body); let locationLat = null; let locationLong = null; if (locationLatLngMatches && locationLatLngMatches.length >= 3) { locationLat = parseFloat(locationLatLngMatches[1]) || null; locationLong = parseFloat(locationLatLngMatches[2]) || null; } if ( title.indexOf("[PRODANO]") !== -1 || title.indexOf("[ZAVRŠENO]") !== -1 ) { status = AD_STATUS.STATUS_SOLD; } const data = { url, agencyObjectId: olxId, originAgencyName: AD_AGENCY.OLX, realEstateType: parsedCategory, adType: parsedAdType, title, price: parsedPrice, area: parsedArea, gardenSize: parsedGardenSize, shortDescription: descriptions .first() .text() .trim(), longDescription: descriptions .last() .text() .trim(), streetNumber: 0, streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", locationLat, locationLong, adStatus: status, publishedDate: publishedDateMoment.toISOString(), renewedDate: renewedDateMoment.toISOString(), numberOfRooms, numberOfFloors, floor, accessRoadType, heatingType, furnishingType, balcony, newBuilding, elevator, water, electricity, drainageSystem, registeredInZkBooks, recentlyAdapted, parking, garage, gas, antiTheftDoor, airCondition, phoneConnection, cableTV, internet, basementAttic, storeRoom, videoSurveillance, alarm, suitableForStudents, includingBills, animalsAllowed, pool, urbanPlanPermit, buildingPermit, utilityConnection, distanceToRiver, numberOfViewsAgency }; // //console.log("Scraped data:", data); //Delay between real estate ads to avoid error from Scraper API // await this.sleep(this.delayBetweenAds); return data; } catch (e) { // hasParseErrors = true; // numberOfParseErrors++; console.error("Exception caught: " + e.message, "\r\nURL:", url); } // } while (hasParseErrors && numberOfParseErrors <= 1); await this.sleep(this.delayBetweenAds); return null; } //======= HELPER FUNCTIONS ============= getAdCategoryId(categoryText) { switch (categoryText) { case "Stanovi": return AD_CATEGORY.FLAT.id; case "Zemljišta": return AD_CATEGORY.LAND.id; case "Kuće": return AD_CATEGORY.HOUSE.id; case "Poslovni prostori": return AD_CATEGORY.OFFICE.id; case "Apartmani": return AD_CATEGORY.APARTMENT.id; case "Garaže": return AD_CATEGORY.GARAGE.id; case "Vikendice": return AD_CATEGORY.COTTAGE.id; default: return undefined; } } getAdTypeId(adTypeText) { switch (adTypeText) { case "Prodaja": return AD_TYPE.AD_TYPE_SALE.stringId; case "Izdavanje": return AD_TYPE.AD_TYPE_RENT.stringId; case "Potražnja": return AD_TYPE.AD_TYPE_REQUEST.stringId; default: return undefined; } } getHeatingTypeId(heatingTypeText) { switch (heatingTypeText) { case "struja": return HEATING_TYPE.ELECTRICITY.id; case "plin": return HEATING_TYPE.GAS.id; case "drva": return HEATING_TYPE.WOOD.id; case "centralno (gradsko)": return HEATING_TYPE.CENTRAL_CITY.id; case "centralno (kotlovnica)": return HEATING_TYPE.CENTRAL_BOILER.id; case "centralno (plin)": return HEATING_TYPE.CENTRAL_GAS.id; case "nije uvedeno": return HEATING_TYPE.NO_HEATING.id; case "ostalo": return HEATING_TYPE.OTHER.id; case "drugo": return HEATING_TYPE.OTHER.id; default: console.log("grijanje = NEPOZNATO [", heatingTypeText, "]"); return null; } } getFurnishingTypeId(furnishingTypeText) { switch (furnishingTypeText) { case "namješten": return FURNISHING_TYPE.FURNISHED.id; case "polunamješten": return FURNISHING_TYPE.HALF_FURNISHED.id; case "nenamješten": return FURNISHING_TYPE.NOT_FURNISHED.id; case "": return FURNISHING_TYPE.FURNISHED.id; default: console.log("namješten = NEPOZNATO [", furnishingTypeText, "]"); return null; } } getAccessRoadTypeId(accessRoadTypeText) { switch (accessRoadTypeText) { case "asfalt": return ACCESS_ROAD_TYPE.ASPHALT.id; case "beton": return ACCESS_ROAD_TYPE.CONCRETE.id; case "makadam": return ACCESS_ROAD_TYPE.MACADAM.id; case "ostalo": return ACCESS_ROAD_TYPE.OTHER.id; default: console.log("pristup = NEPOZNATO [", accessRoadTypeText, "]"); return null; } } parseArea(areaText) { if (!areaText) { return NaN; } const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; const textWithOnlyOneDecimalDot = areaText .replace(",", ".") .replace(removeDotsExceptLastOneRegex, ""); return parseFloat(textWithOnlyOneDecimalDot); } parsePrice(priceText) { if (!priceText) { return NaN; } if (priceText === "Po dogovoru") { return null; } const formattedPriceText = priceText.replace(".", "").replace(",", "."); return parseFloat(formattedPriceText); } parseNumberOfRooms(numberOfRoomsText, categoryId) { if (categoryId === AD_CATEGORY.FLAT.id) { switch (numberOfRoomsText) { case "garsonjera": return 0; case "jednosoban (1)": return 1; case "jednoiposoban (1.5)": return 1.5; case "dvosoban (2)": return 2; case "trosoban (3)": return 3; case "četverosoban (4)": return 4; case "petosoban i više": return 5; default: console.log( "broj soba [stan] = NEPOZNATO [", numberOfRoomsText, ", ", categoryId, "]" ); return null; } } if ( categoryId === AD_CATEGORY.HOUSE.id || categoryId === AD_CATEGORY.COTTAGE.id || categoryId === AD_CATEGORY.APARTMENT.id || categoryId === AD_CATEGORY.OFFICE.id ) { return parseInt(numberOfRoomsText) || null; } console.log("broj soba = NEPOZNATO [", numberOfRoomsText, "]"); return null; } parseNumberOfFloors(numberOfFloorsText, categoryId) { if ( categoryId === AD_CATEGORY.HOUSE.id || categoryId === AD_CATEGORY.COTTAGE.id ) { return parseInt(numberOfFloorsText) || null; } if (categoryId === AD_CATEGORY.OFFICE.id) { if ( numberOfFloorsText === "suteren" || numberOfFloorsText === "prizemlje" ) { return 0; } if (numberOfFloorsText === "6+") { return 7; } return parseInt(numberOfFloorsText) || null; } console.log("broj spratova = NEPOZNATO [", numberOfFloorsText, "]"); return null; } parseFloorNumber(floorText, categoryId) { if ( categoryId === AD_CATEGORY.FLAT.id || categoryId === AD_CATEGORY.APARTMENT.id ) { if ( floorText === "suteren" || floorText === "prizemlje" || floorText === "visoko prizemlje" ) { return 0; } return parseInt(floorText) || null; } if (categoryId === AD_CATEGORY.OFFICE.id) { if (floorText === "zaseban objekat") { return null; } if (floorText === "prizemlje" || floorText === "visoko prizemlje") { return 0; } return parseInt(floorText) || null; } console.log("sprat = NEPOZNATO [", floorText, "]"); return null; } async sleep(ms) { // console.log("Sleep for:", ms); return new Promise(resolve => setTimeout(resolve, ms)); } async saveCrawledResults(results) { const savers = this.savers; // for (const saver of savers) { // await saver.save(results); // } //For now, we use only Postgres saver, so ... return savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } module.exports = OlxCrawler;