"use strict"; const fetch = require("node-fetch"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); const { AD_TYPE, AD_CATEGORY, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE } = require("../../common/enums"); const { DEFAULT_TIMEZONE } = require("../../config/appConfig"); const OLX_ENUMS = { OLX_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "", [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", [CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje" }, OLX_AD_CATEGORY: { [AD_CATEGORY.FLAT.id]: "&kategorija=23", [AD_CATEGORY.HOUSE.id]: "&kategorija=24", [AD_CATEGORY.LAND.id]: "&kategorija=29", [AD_CATEGORY.OFFICE.id]: "&kategorija=25", [AD_CATEGORY.APARTMENT.id]: "&kategorija=27", [AD_CATEGORY.GARAGE.id]: "&kategorija=30", [AD_CATEGORY.COTTAGE.id]: "&kategorija=26" }, MAX_DETAIL_FIELDS: 30, OLX_PUBLISHED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm", OLX_RENEWED_DATE_FORMAT: "DD.MM.YYYY. u HH:mm" }; class OlxCrawler { constructor( savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], maxPages = 1000, maxResultsPerPage = 100, ignoredUsernames = [], delayBetweenPages = 1000 ) { this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; this.ignoredUsernames = ignoredUsernames; this.delayBetweenPages = delayBetweenPages; } async crawl() { const crawlAdCategories = this.crawlerAdCategories; const newRealEstates = []; if (crawlAdCategories) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { indexGenerators.push(this.categoryIndexer(adCategory)); } let done = false; while (!done) { const categoryIndexerPromises = []; const generatorsToRemove = []; for (const indexGenerator of indexGenerators) { categoryIndexerPromises.push(indexGenerator.next()); generatorsToRemove.push(false); } const singlePageResults = await Promise.all(categoryIndexerPromises); const entries = singlePageResults.entries(); for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { const saveResults = await this.saveCrawledResults(singlePageResult); const { newRecords, existingRecords } = saveResults; newRealEstates.push(...newRecords); for (const existingRecord of existingRecords) { const { publishedDate, renewedDate } = existingRecord; const publishedDateMoment = moment.utc(publishedDate); const renewedDateMoment = moment.utc(renewedDate); const stopCrawlingThisCategory = publishedDateMoment.isSame( renewedDateMoment, "minute" ); if (stopCrawlingThisCategory) { generatorsToRemove[index] = true; // console.log("\tGenerator ", index + 1, "has no more new ads"); break; } } } else { //Generator returned undefined, remove this generator from array generatorsToRemove[index] = true; // console.log("Generator ", index + 1, "has no more pages"); } } // console.log("Generators state : ", generatorsToRemove); for (let i = generatorsToRemove.length - 1; i >= 0; i--) { if (generatorsToRemove[i]) { // console.log("\tRemove generator ", i + 1); indexGenerators.splice(i, 1); } } if (indexGenerators.length === 0) { done = true; } await this.sleep(this.delayBetweenPages); } } return newRealEstates; } async *categoryIndexer(adCategory) { let pageToIndex = 1; const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; if (urlAdTypePart && urlCategoryPart) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; const singlePageResults = await this.indexSinglePage( urlPageToCrawl, this.maxResultsPerPage ); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; } else { return undefined; } ++pageToIndex; if (pageToIndex === this.maxPages) { return undefined; } } } else { return undefined; } } async indexSinglePage(url, maxResultsPerPage) { try { const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; $("#rezultatipretrage") .find(".listitem") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); if (href) { hrefs.push(href); } }); let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { asyncScraping.push(this.scrapeAd(hrefs[i])); } const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); return filteredScrapedData; } catch (e) { console.error("Exception caught:" + e); return []; } } async scrapeAd(url) { // console.log("Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); let status = AD_STATUS.STATUS_NORMAL; const propertySelectors = { username: "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span", title: "#naslovartikla", descriptions: ".artikal_detaljniopis_tekst", category: "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" }; const username = $(propertySelectors.username) .text() .trim(); if (this.ignoredUsernames.includes((username || "").toLowerCase())) { return null; } const title = $(propertySelectors.title) .text() .trim(); const descriptions = $(propertySelectors.descriptions); const category = $(propertySelectors.category) .text() .trim(); //====== PRICE DETECTION AND EXTRACTION ===== let price = null; const normalPriceValue = $("#pc > p:nth-child(2)").text(); const urgentPriceValue = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" ) .text() .trim(); if (normalPriceValue && normalPriceValue.length > 0) { price = normalPriceValue; if ( $("#pc > p.n") .text() .indexOf("Hitna") !== -1 ) { status = AD_STATUS.STATUS_URGENT; } else { status = AD_STATUS.STATUS_NORMAL; } } else if (urgentPriceValue && urgentPriceValue.length > 0) { const priceValues = urgentPriceValue.split("KM"); //priceValues will contain values like ["100000", "90000", ...], second element is urgent price if (priceValues.length > 1) { price = priceValues[1].trim(); status = AD_STATUS.STATUS_DISCOUNTED; } else { throw { message: "Can't find urgent price" }; } } else { throw { message: "Can't find price (it is not normal nor urgent price ?)" }; } //====== OTHER AD INFORMATION =============== let adType = null; let olxId = null; let otherInformationDivId; //We need to locate DIV ID where other information are stored for (let possibleId = 10; possibleId <= 20; possibleId++) { const adTypeFieldTitle = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` ) .text() .trim(); if (adTypeFieldTitle === "Vrsta oglasa") { otherInformationDivId = possibleId; break; } } if (!otherInformationDivId) { throw { message: "Other information DIV could not be found" }; } const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; const publishedDateValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(3) > div.df2.neanimiraj > time`; const renewedDateFullValueSelector = `#artikal_glavni_div > div.artikal_lijevo > div.op.ob.pop`; const publishedDate = $(publishedDateValueSelector) .text() .trim(); const publishedDateMoment = moment.tz( publishedDate, OLX_ENUMS.OLX_PUBLISHED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!publishedDateMoment.isValid()) { throw { message: "Invalid published date ! Check parsing format" }; } const renewedDate = $(renewedDateFullValueSelector) .data("content") .trim(); const renewedDateMoment = moment.tz( renewedDate, OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!renewedDateMoment) { throw { message: "Invalid renewed date ! Check how parser parsed renewed date text" }; } adType = $( `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` ) .text() .trim(); const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) .text() .trim(); olxId = $(`${olxIdFieldSelector} > div.df2`) .text() .trim(); if (olxIdFieldTitle !== "OLX ID") { throw { message: "Cannot find correct OLX ID" }; } //=========================================== //====== DETAIL INFORMATION FIELDS ========== let area = null; let gardenSize = null; let fieldIndex = 1; do { const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; const fieldTitleSelector = `${fieldSelector} > div.df1`; const fieldValueSelector = `${fieldSelector} > div.df2`; const fieldTitle = $(fieldTitleSelector) .text() .trim(); const fieldValue = $(fieldValueSelector) .text() .trim(); switch (fieldTitle) { case "Kvadrata": area = fieldValue; break; case "Okućnica (kvadratura)": gardenSize = fieldValue; break; } if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { break; } } while (true); //=========================================== //====== UNUSED FIELDS FOR NOW ============== const time = $("time").attr("datetime"); const numberOfViews = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" ) .text() .trim(); //=========================================== //========================================= const parsedCategory = this.getAdCategoryId(category); if (!parsedCategory) { throw { message: "Unknown ad category" }; } const parsedAdType = this.getAdTypeId(adType); if (!parsedAdType) { throw { message: "Unknown ad type" }; } const parsedArea = this.parseArea(area) || null; const parsedGardenSize = this.parseArea(gardenSize) || null; const parsedPrice = this.parsePrice(price) || null; const latLngRegex = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const locationLatLngMatches = latLngRegex.exec(body); let locationLat = null; let locationLong = null; if (locationLatLngMatches && locationLatLngMatches.length >= 3) { locationLat = parseFloat(locationLatLngMatches[1]) || null; locationLong = parseFloat(locationLatLngMatches[2]) || null; } const data = { url, agencyObjectId: olxId, originAgencyName: AD_AGENCY.OLX, realEstateType: parsedCategory, adType: parsedAdType, title, price: parsedPrice, area: parsedArea, gardenSize: parsedGardenSize, shortDescription: descriptions .first() .text() .trim(), longDescription: descriptions .last() .text() .trim(), streetNumber: 0, streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", locationLat, locationLong, adStatus: status, publishedDate: publishedDateMoment.toISOString(), renewedDate: renewedDateMoment.toISOString() }; return data; } catch (e) { console.error("Exception caught: " + e.message, "\r\nURL:", url); } return null; } //======= HELPER FUNCTIONS ============= getAdCategoryId(categoryText) { switch (categoryText) { case "Stanovi": return AD_CATEGORY.FLAT.id; case "Zemljišta": return AD_CATEGORY.LAND.id; case "Kuće": return AD_CATEGORY.HOUSE.id; case "Poslovni prostori": return AD_CATEGORY.OFFICE.id; case "Apartmani": return AD_CATEGORY.APARTMENT.id; case "Garaže": return AD_CATEGORY.GARAGE.id; case "Vikendice": return AD_CATEGORY.COTTAGE.id; default: return undefined; } } getAdTypeId(adTypeText) { switch (adTypeText) { case "Prodaja": return AD_TYPE.AD_TYPE_SALE; case "Izdavanje": return AD_TYPE.AD_TYPE_RENT; default: return undefined; } } parseArea(areaText) { if (!areaText) { return NaN; } const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; const textWithOnlyOneDecimalDot = areaText .replace(",", ".") .replace(removeDotsExceptLastOneRegex, ""); return parseFloat(textWithOnlyOneDecimalDot); } parsePrice(priceText) { if (!priceText) { return NaN; } const formattedPriceText = priceText.replace(".", "").replace(",", "."); return parseFloat(formattedPriceText); } parseRenewedDate(renewedDateText) { const currentMoment = moment.tz(DEFAULT_TIMEZONE); if (renewedDateText.includes("Prije mjesec dana")) { return currentMoment.add(-1, "month"); } if (renewedDateText.includes("Jučer")) { return currentMoment.add(-1, "day"); } if (renewedDateText.includes("Prije sat")) { return currentMoment.add(-1, "hour"); } if (renewedDateText.includes("dan")) { // format for this case should be "Prije N dana" or "Prije N dan" const dateParts = renewedDateText.split(" "); if (dateParts[0] === "Prije") { const numberOfDays = parseInt(dateParts[1]); return currentMoment.add(-1 * numberOfDays, "days"); } else { return undefined; } } if (renewedDateText.includes("sat")) { const dateParts = renewedDateText.split(" "); const parsedHours = dateParts && dateParts.length > 2 ? parseInt(dateParts[1]) : undefined; if (!parsedHours) { return undefined; } return currentMoment.add(-1 * parsedHours, "hours"); } const todayVariations = ["min", "sekund", "maloprije"]; for (const todayVariation of todayVariations) { if (renewedDateText.includes(todayVariation)) { return currentMoment; } } const renewedDateMoment = moment.tz( renewedDateText, OLX_ENUMS.OLX_RENEWED_DATE_FORMAT, DEFAULT_TIMEZONE ); return renewedDateMoment.isValid() ? renewedDateMoment : undefined; } async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } async saveCrawledResults(results) { const savers = this.savers; // for (const saver of savers) { // await saver.save(results); // } //For now, we use only Postgres saver, so ... return await savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } module.exports = OlxCrawler;