"use strict"; const fetch = require("node-fetch"); const cheerio = require("cheerio"); const Promise = require("bluebird"); const moment = require("moment-timezone"); const htmlToText = require("html-to-text"); const { AD_TYPE, AD_CATEGORY, AD_AGENCY, AD_STATUS, CRAWLER_AD_TYPE } = require("../../common/enums"); const { DEFAULT_TIMEZONE, PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); const AKTIDO_ENUMS = { AKTIDO_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "/prodaja-1/najam-2", [CRAWLER_AD_TYPE.ONLY_SELL]: "/prodaja-1", [CRAWLER_AD_TYPE.ONLY_RENT]: "/najam-2" }, AKTIDO_AD_CATEGORY: { [AD_CATEGORY.ALL.id]: "", [AD_CATEGORY.FLAT.id]: "/tip-2", [AD_CATEGORY.HOUSE.id]: "/tip-1", [AD_CATEGORY.LAND.id]: "/tip-5", [AD_CATEGORY.OFFICE.id]: "/tip-4", [AD_CATEGORY.APARTMENT.id]: "/tip-3", [AD_CATEGORY.GARAGE.id]: "/tip-6" //[AD_CATEGORY.COTTAGE.id]: "" }, AKTIDO_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss", AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" }; const { AKTIDO_FORCE_CRAWL } = require("../specificConfigs/aktido"); class AktidoCrawler { constructor( savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], maxPages = 1000, maxResultsPerPage = 100, ignoredUsernames = [], delayBetweenPages = 1000 ) { this.savers = savers; this.baseUrl = "https://www.aktido.ba/pretraga/sortiraj-date_DESC"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; this.maxPages = maxPages; this.maxResultsPerPage = maxResultsPerPage; this.delayBetweenPages = delayBetweenPages; } async crawl() { const crawlAdCategories = this.crawlerAdCategories; const newRealEstates = []; if (crawlAdCategories) { const indexGenerators = []; for (const adCategory of crawlAdCategories) { indexGenerators.push(this.categoryIndexer(adCategory)); } let done = false; while (!done) { const categoryIndexerPromises = []; const generatorsToRemove = []; for (const indexGenerator of indexGenerators) { categoryIndexerPromises.push(indexGenerator.next()); generatorsToRemove.push(false); } const singlePageResults = await Promise.all(categoryIndexerPromises); const entries = singlePageResults.entries(); for (const [index, { value: singlePageResult }] of entries) { if (singlePageResult) { const saveResults = await this.saveCrawledResults(singlePageResult); const { newRecords } = saveResults; newRealEstates.push(...newRecords); if ( Array.isArray(newRecords) && newRecords.length === 0 && !AKTIDO_FORCE_CRAWL ) { generatorsToRemove[index] = true; } } else { //Generator returned undefined, remove this generator from array generatorsToRemove[index] = true; // console.log("Generator ", index + 1, "has no more pages"); } } // console.log("Generators state : ", generatorsToRemove); for (let i = generatorsToRemove.length - 1; i >= 0; i--) { if (generatorsToRemove[i]) { // console.log("\tRemove generator ", i + 1); indexGenerators.splice(i, 1); } } if (indexGenerators.length === 0) { done = true; } await this.sleep(this.delayBetweenPages); } } return newRealEstates; } async *categoryIndexer(adCategory) { let pageToIndex = 1; const urlAdTypePart = AKTIDO_ENUMS.AKTIDO_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = AKTIDO_ENUMS.AKTIDO_AD_CATEGORY[adCategory]; if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}/stranica-${pageToIndex}`; const singlePageResults = await this.indexSinglePage( urlPageToCrawl, this.maxResultsPerPage ); if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { yield singlePageResults; } else { return undefined; } ++pageToIndex; if (pageToIndex === this.maxPages) { return undefined; } } } else { return undefined; } } async indexSinglePage(url, maxResultsPerPage) { if (PRINT_CRAWLER_DEBUG) { console.log("[AKTIDO] Index page : ", url); } try { const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); let hrefs = []; $( "body > div > div.container > div.row > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div.row.box-items.group-grid-view" ) .find(".moreInfo") .each((i, elem) => { const href = $(elem) .find("a") .first() .attr("href"); if (href) { hrefs.push(href); } }); let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { asyncScraping.push(this.scrapeAd(hrefs[i])); } const scrapedData = await Promise.all(asyncScraping); const filteredScrapedData = scrapedData.filter(adData => !!adData); return filteredScrapedData; } catch (e) { console.error("[AKTIDO] Exception caught:" + e); return []; } } async scrapeAd(url) { // console.log("[AKTIDO] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); const $ = cheerio.load(body); const mapElementParent = $(".box-map").parent(); const scriptElement = $("script", mapElementParent); if ( scriptElement[0] && scriptElement[0].children && scriptElement[0].children[0] && scriptElement[0].children[0].data ) { let extractedData; try { //data string starts with : var json_map_data = [{"r ... //so we remove first 20 characters const jsonData = scriptElement[0].children[0].data.substring(20); const parsedJsonData = JSON.parse(jsonData); extractedData = parsedJsonData[0]; } catch (e) { throw { message: "Can't find ad data JSON" }; } const aktidoId = extractedData["re_realEstates_id"]; const adCategory = this.getKiviCategoryIdFromAktidoId( parseInt(extractedData["re_types_id"]) ); if (!adCategory) { throw { message: `Invalid category : ${extractedData["re_types_id"]}` }; } const adType = this.getKiviAdTypeFromAktidoActionId( parseInt(extractedData["re_action_id"]) ); if (!adType) { throw { message: `Invalid ad type : ${extractedData["re_action_id"]}` }; } const title = extractedData["re_realEstates_portalName"]; const extractedPrice = parseFloat( extractedData["re_realEstates_price"] ); const price = extractedPrice ? extractedPrice : null; const area = parseFloat(extractedData["re_realEstates_area"]); const gardenSize = parseFloat( extractedData["re_realEstates_fieldArea"] ); const longDescription = htmlToText.fromString( extractedData["re_realEstates_description"] ); const locationLong = extractedData["re_realEstates_longitude"]; const locationLat = extractedData["re_realEstates_latitude"]; const publishedDateMoment = moment.tz( extractedData["re_realEstates_inserted"], AKTIDO_ENUMS.AKTIDO_PUBLISHED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!publishedDateMoment.isValid()) { throw { message: `Invalid published date : ${ extractedData["re_realEstates_inserted"] }` }; } const renewedDateMoment = moment.tz( extractedData["re_realEstates_edited"], AKTIDO_ENUMS.AKTIDO_RENEWED_DATE_FORMAT, DEFAULT_TIMEZONE ); if (!renewedDateMoment.isValid()) { throw { message: `Invalid renewed date : ${ extractedData["re_realEstates_edited"] }` }; } const adStatus = AD_STATUS.STATUS_NORMAL; const data = { url, agencyObjectId: aktidoId, originAgencyName: AD_AGENCY.AKTIDO, realEstateType: adCategory, adType, title, price, area, gardenSize, shortDescription: "", longDescription: longDescription, streetNumber: 0, streetName: "", locality: "", municipality: "", city: "", region: "", entity: "", country: "", locationLat, locationLong, adStatus, publishedDate: publishedDateMoment.toISOString(), renewedDate: renewedDateMoment.toISOString() }; return data; } else { console.log("[AKTIDO] No JSON data for this ad : ", url); return null; } } catch (e) { console.error("[AKTIDO] Exception caught: " + e.message, "\r\nURL:", url); return null; } return null; } //======= HELPER FUNCTIONS ============= getKiviCategoryIdFromAktidoId(aktidoCategoryId) { switch (aktidoCategoryId) { case 1: return AD_CATEGORY.HOUSE.id; case 2: return AD_CATEGORY.FLAT.id; case 3: return AD_CATEGORY.APARTMENT.id; case 4: return AD_CATEGORY.OFFICE.id; case 5: return AD_CATEGORY.LAND.id; case 6: return AD_CATEGORY.GARAGE.id; default: return undefined; } } getKiviAdTypeFromAktidoActionId(actionId) { switch (actionId) { case 1: return AD_TYPE.AD_TYPE_SALE.stringId; case 2: return AD_TYPE.AD_TYPE_RENT.stringId; default: return undefined; } } async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } async saveCrawledResults(results) { const savers = this.savers; // for (const saver of savers) { // await saver.save(results); // } //For now, we use only Postgres saver, so ... return await savers[0].save(results); //so that we can use some sequelize options and information when data is inserted } } module.exports = AktidoCrawler;