"use strict"; let fetch = require("node-fetch"); let cheerio = require("cheerio"); const { AD_TYPE, AD_CATEGORY, IGNORED_USERNAMES, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE } = require("../../common/enums"); const OLX_ENUMS = { OLX_AD_TYPE: {}, OLX_AD_CATEGORY: {}, MAX_DETAIL_FIELDS: 30 }; OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = ""; OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja"; OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje"; OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23"; OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24"; OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29"; OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25"; OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27"; OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30"; class OlxCrawler { constructor( fromPage = 1, toPage = 10, maxResults = 1000, savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [ AD_CATEGORY.CATEGORY_FLAT, AD_CATEGORY.CATEGORY_HOUSE ] ) { this.fromPage = fromPage; this.toPage = toPage; this.maxResults = maxResults; this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; } async crawl() { console.log("[OLX] Crawler started"); const crawlAdTypes = this.crawlerAdTypes; const crawlAdCategories = this.crawlerAdCategories; const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`; if (crawlAdCategories && crawlAdTypes) { const asyncPagesIndexingByCategory = []; for (const adCategory of crawlAdCategories) { asyncPagesIndexingByCategory.push( this.indexPages( `${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}` ) ); } await Promise.all(asyncPagesIndexingByCategory); } console.log("[OLX] Crawler finished"); } async indexPages(url) { const startPage = this.fromPage; const endPage = this.toPage; const maxResultsPerPage = this.maxResults; for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { const singlePageResults = await this.indexSinglePage( url, pageNumber, maxResultsPerPage ); await this.saveCrawledResults(singlePageResults); await this.sleep(5000); } } async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) { try { const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`; const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; const singlePageResults = []; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); if (href) { hrefs.push(href); } }); let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; for (let i = 0; i < actualNoOfResults; i++) { console.log(`Scraping : ${hrefs[i]}`); const adData = await this.scrapeAd(hrefs[i]); if (adData) { singlePageResults.push(adData); } await this.sleep(500); } return singlePageResults; } catch (e) { console.error("Exception caught:" + e); } } async scrapeAd(url) { try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; const username = $( "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" ).text(); if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { return null; } const title = $("#naslovartikla").text(); const descriptions = $(".artikal_detaljniopis_tekst"); const category = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" ).text(); //====== PRICE DETECTION AND EXTRACTION ===== let price = null; const normalPriceValue = $("#pc > p:nth-child(2)").text(); const urgentPriceValue = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" ).text(); if (normalPriceValue && normalPriceValue.length > 0) { price = normalPriceValue; if ( $("#pc > p.n") .text() .indexOf("Hitna") !== -1 ) { status = AD_STATUS.STATUS_URGENT; } else { status = AD_STATUS.STATUS_NORMAL; } } else if (urgentPriceValue && urgentPriceValue.length > 0) { const priceValues = urgentPriceValue.split("KM"); //priceValues will contain values like ["100000", "90000", ...], second element is urgent price if (priceValues.length > 1) { price = priceValues[1].trim(); status = AD_STATUS.STATUS_DISCOUNTED; } else { throw { message: "Can't find urgent price" }; } } else { throw { message: "Can't find price (it is not normal nor urgent price ?)" }; } //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored for (let possibleId = 10; possibleId <= 20; possibleId++) { const adTypeFieldTitle = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` ) .text() .trim(); if (adTypeFieldTitle === "Vrsta oglasa") { otherInformationDivId = possibleId; break; } } if (!otherInformationDivId) { throw { message: "Other information DIV could not be found" }; } const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; adType = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` ) .text() .trim(); const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) .text() .trim(); olxId = $(`${olxIdFieldSelector} > div.df2`) .text() .trim(); if (olxIdFieldTitle !== "OLX ID") { throw { message: "Cannot find correct OLX ID" }; } //=========================================== //====== DETAIL INFORMATION FIELDS ========== let area = null; let gardenSize = null; let fieldIndex = 1; do { const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; const fieldTitleSelector = `${fieldSelector} > div.df1`; const fieldValueSelector = `${fieldSelector} > div.df2`; const fieldTitle = $(fieldTitleSelector) .text() .trim(); const fieldValue = $(fieldValueSelector) .text() .trim(); switch (fieldTitle) { case "Kvadrata": area = fieldValue; break; case "Okućnica (kvadratura)": gardenSize = fieldValue; break; } if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { break; } } while (true); //=========================================== //====== UNUSED FIELDS FOR NOW ============== const time = $("time").attr("datetime"); const numberOfViews = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" ).text(); //=========================================== //========================================= const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: "Unknown ad category" }; } const parsedAdType = this.getAdTypeId(adType); if (!parsedAdType) { throw { message: "Unknown ad type" }; } const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; const latLngRegex = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const locationLatLngMatches = latLngRegex.exec(body); let locationLat = null; let locationLong = null; if (locationLatLngMatches && locationLatLngMatches.length >= 3) { locationLat = parseFloat(locationLatLngMatches[1]) || null; locationLong = parseFloat(locationLatLngMatches[2]) || null; } const data = { url, agencyObjectId: olxId, originAgencyName: AD_AGENCY.OLX, realEstateType: this.getAdCategoryId(category), adType: parsedAdType, title, price: parsedPrice, area: parsedArea, gardenSize: parsedGardenSize, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), streetNumber: 0, streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", locationLat, locationLong, adStatus: status }; return data; } catch (e) { console.error("Exception caught: " + e.message, "\r\nURL:", url); } return null; } //======= HELPER FUNCTIONS ============= getAdCategoryId(categoryText) { switch (categoryText) { case "Stanovi": return AD_CATEGORY.CATEGORY_FLAT; case "Zemljišta": return AD_CATEGORY.CATEGORY_LAND; case "Kuće": return AD_CATEGORY.CATEGORY_HOUSE; case "Poslovni prostori": return AD_CATEGORY.CATEGORY_OFFICE; default: return undefined; } } getAdTypeId(adTypeText) { switch (adTypeText) { case "Prodaja": return AD_TYPE.AD_TYPE_SALE; case "Izdavanje": return AD_TYPE.AD_TYPE_RENT; default: return undefined; } } parseArea(areaText) { if (!areaText) { return NaN; } const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; const textWithOnlyOneDecimalDot = areaText .replace(",", ".") .replace(removeDotsExceptLastOneRegex, ""); return parseFloat(textWithOnlyOneDecimalDot); } parsePrice(priceText) { if (!priceText) { return NaN; } const formattedPriceText = priceText.replace(".", "").replace(",", "."); return parseFloat(formattedPriceText); } async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } async saveCrawledResults(results) { const savers = this.savers; for (const saver of savers) { await saver.save(results); } } } module.exports = OlxCrawler;