From f899c96dc67d06243a6ab913323685ecfe198f44 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 28 Oct 2019 09:14:45 +0100 Subject: [PATCH 1/4] add crawler and crawler config for Aktido agency --- app/common/enums.js | 3 +- app/crawler/crawl.js | 13 +- app/crawler/crawlerConfig.js | 4 +- app/crawler/specificConfigs/aktido.js | 33 +++ app/crawler/specificCrawlers/aktido.js | 381 +++++++++++++++++++++++++ development.env | 7 + 6 files changed, 438 insertions(+), 3 deletions(-) create mode 100644 app/crawler/specificConfigs/aktido.js create mode 100644 app/crawler/specificCrawlers/aktido.js diff --git a/app/common/enums.js b/app/common/enums.js index 9a59359..93545a6 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -138,7 +138,8 @@ const AD_STATUS = { const AD_AGENCY = { OLX: "OLX", RENTAL: "RENTAL", - PROSTOR: "PROSTOR" + PROSTOR: "PROSTOR", + AKTIDO: "AKTIDO" }; const CRAWLER_AD_TYPE = { diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 002e17a..82411b6 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -8,11 +8,13 @@ const OlxCrawler = require("./specificCrawlers/olx"); const RentalCrawler = require("./specificCrawlers/rental"); const ProstorCrawler = require("./specificCrawlers/prostor"); +const AktidoCrawler = require("./specificCrawlers/aktido"); const { OLX_CONFIG, RENTAL_CONFIG, - PROSTOR_CONFIG + PROSTOR_CONFIG, + AKTIDO_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); @@ -46,6 +48,15 @@ async function crawlAll() { PROSTOR_CONFIG.PROSTOR_MAX_RESULTS_PER_PAGE, PROSTOR_CONFIG.PROSTOR_IGNORED_USERNAMES, PROSTOR_CONFIG.PROSTOR_DELAY_BETWEEN_PAGES + ), + new AktidoCrawler( + [postgresSaver], + AKTIDO_CONFIG.AKTIDO_CRAWLER_AD_TYPE, + AKTIDO_CONFIG.AKTIDO_CRAWLER_AD_CATEGORIES, + AKTIDO_CONFIG.AKTIDO_MAX_PAGES, + AKTIDO_CONFIG.AKTIDO_MAX_RESULTS_PER_PAGE, + AKTIDO_CONFIG.AKTIDO_IGNORED_USERNAMES, + AKTIDO_CONFIG.AKTIDO_DELAY_BETWEEN_PAGES ) ]; diff --git 
a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index 1818ccb..ee98e44 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -4,9 +4,11 @@ require("dotenv").config({ path: __dirname + "/./../../.env" }); const OLX_CONFIG = require("./specificConfigs/olx"); const RENTAL_CONFIG = require("./specificConfigs/rental"); const PROSTOR_CONFIG = require("./specificConfigs/prostor"); +const AKTIDO_CONFIG = require("./specificConfigs/aktido"); module.exports = { OLX_CONFIG, RENTAL_CONFIG, - PROSTOR_CONFIG + PROSTOR_CONFIG, + AKTIDO_CONFIG }; diff --git a/app/crawler/specificConfigs/aktido.js b/app/crawler/specificConfigs/aktido.js new file mode 100644 index 0000000..bd06645 --- /dev/null +++ b/app/crawler/specificConfigs/aktido.js @@ -0,0 +1,33 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const aktidoCrawlerAdType = + process.env.AKTIDO_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.AKTIDO_CRAWLER_AD_TYPE] + : null; + +const aktidoParsedCrawlerAdCategories = + process.env.AKTIDO_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.AKTIDO_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const aktidoIgnoredUsernames = []; + +const transformedAktidoCrawlerAdCategories = aktidoParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? 
AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + AKTIDO_MAX_PAGES: parseInt(process.env.AKTIDO_MAX_PAGES) || 500, + AKTIDO_MAX_RESULTS_PER_PAGE: + parseInt(process.env.AKTIDO_MAX_RESULTS_PER_PAGE) || 50, + AKTIDO_CRAWLER_AD_TYPE: aktidoCrawlerAdType || CRAWLER_AD_TYPE.NONE, + AKTIDO_CRAWLER_AD_CATEGORIES: transformedAktidoCrawlerAdCategories, + AKTIDO_IGNORED_USERNAMES: aktidoIgnoredUsernames || [], + AKTIDO_DELAY_BETWEEN_PAGES: + parseInt(process.env.AKTIDO_DELAY_BETWEEN_PAGES) || 1000 +}; diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js new file mode 100644 index 0000000..6ea44e9 --- /dev/null +++ b/app/crawler/specificCrawlers/aktido.js @@ -0,0 +1,381 @@ +"use strict"; + +const fetch = require("node-fetch"); +const cheerio = require("cheerio"); +const Promise = require("bluebird"); +const moment = require("moment-timezone"); +const htmlToText = require("html-to-text"); + +const { + AD_TYPE, + AD_CATEGORY, + AD_AGENCY, + AD_STATUS, + CRAWLER_AD_TYPE +} = require("../../common/enums"); + +const { + DEFAULT_TIMEZONE, + PRINT_CRAWLER_DEBUG +} = require("../../config/appConfig"); + +const AKTIDO_ENUMS = { + AKTIDO_AD_TYPE: { + [CRAWLER_AD_TYPE.ALL]: "/prodaja-1/najam-2", + [CRAWLER_AD_TYPE.ONLY_SELL]: "/prodaja-1", + [CRAWLER_AD_TYPE.ONLY_RENT]: "/najam-2" + }, + AKTIDO_AD_CATEGORY: { + [AD_CATEGORY.FLAT.id]: "/tip-2", + [AD_CATEGORY.HOUSE.id]: "/tip-1", + [AD_CATEGORY.LAND.id]: "/tip-5", + [AD_CATEGORY.OFFICE.id]: "/tip-4", + [AD_CATEGORY.APARTMENT.id]: "/tip-3", + [AD_CATEGORY.GARAGE.id]: "/tip-6" + //[AD_CATEGORY.COTTAGE.id]: "" + }, + AKTIDO_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss", + AKTIDO_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" +}; + +class AktidoCrawler { + constructor( + savers = [], + crawlerAdTypes = CRAWLER_AD_TYPE.ALL, + crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], + maxPages = 1000, + maxResultsPerPage = 100, + ignoredUsernames = 
[], + delayBetweenPages = 1000 + ) { + this.savers = savers; + this.baseUrl = "https://www.aktido.ba/pretraga/sortiraj-date_DESC"; + this.crawlerAdTypes = crawlerAdTypes; + this.crawlerAdCategories = crawlerAdCategories; + this.maxPages = maxPages; + this.maxResultsPerPage = maxResultsPerPage; + this.delayBetweenPages = delayBetweenPages; + } + + async crawl() { + const crawlAdCategories = this.crawlerAdCategories; + + const newRealEstates = []; + + if (crawlAdCategories) { + const indexGenerators = []; + for (const adCategory of crawlAdCategories) { + indexGenerators.push(this.categoryIndexer(adCategory)); + } + + let done = false; + while (!done) { + const categoryIndexerPromises = []; + const generatorsToRemove = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + generatorsToRemove.push(false); + } + + const singlePageResults = await Promise.all(categoryIndexerPromises); + const entries = singlePageResults.entries(); + + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + const saveResults = await this.saveCrawledResults(singlePageResult); + const { newRecords } = saveResults; + + newRealEstates.push(...newRecords); + + if (Array.isArray(newRecords) && newRecords.length === 0) { + generatorsToRemove[index] = true; + } + + // for (const existingRecord of existingRecords) { + // const { publishedDate, renewedDate } = existingRecord; + // + // const publishedDateMoment = moment.utc(publishedDate); + // const renewedDateMoment = moment.utc(renewedDate); + // + // const stopCrawlingThisCategory = publishedDateMoment.isSame( + // renewedDateMoment, + // "minute" + // ); + // + // if (stopCrawlingThisCategory) { + // generatorsToRemove[index] = true; + // // console.log("\tGenerator ", index + 1, "has no more new ads"); + // break; + // } + // } + } else { + //Generator returned undefined, remove this generator from array + generatorsToRemove[index] = true; + // 
console.log("Generator ", index + 1, "has no more pages"); + } + } + + // console.log("Generators state : ", generatorsToRemove); + for (let i = generatorsToRemove.length - 1; i >= 0; i--) { + if (generatorsToRemove[i]) { + // console.log("\tRemove generator ", i + 1); + indexGenerators.splice(i, 1); + } + } + if (indexGenerators.length === 0) { + done = true; + } + + await this.sleep(this.delayBetweenPages); + } + } + return newRealEstates; + } + + async *categoryIndexer(adCategory) { + let pageToIndex = 1; + + const urlAdTypePart = AKTIDO_ENUMS.AKTIDO_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = AKTIDO_ENUMS.AKTIDO_AD_CATEGORY[adCategory]; + if (urlAdTypePart && urlCategoryPart) { + while (true) { + const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}/stranica-${pageToIndex}`; + const singlePageResults = await this.indexSinglePage( + urlPageToCrawl, + this.maxResultsPerPage + ); + + if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { + yield singlePageResults; + } else { + return undefined; + } + + ++pageToIndex; + if (pageToIndex === this.maxPages) { + return undefined; + } + } + } else { + return undefined; + } + } + + async indexSinglePage(url, maxResultsPerPage) { + if (PRINT_CRAWLER_DEBUG) { + console.log("[AKTIDO] Index page : ", url); + } + + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + let hrefs = []; + + $( + "body > div > div.container > div.row > div.col-xs-12.col-sm-12.col-md-12.col-lg-9.content-main > div.row.box-items.group-grid-view" + ) + .find(".moreInfo") + .each((i, elem) => { + const href = $(elem) + .find("a") + .first() + .attr("href"); + if (href) { + hrefs.push(href); + } + }); + + let actualNoOfResults = + hrefs.length <= maxResultsPerPage ? 
hrefs.length : maxResultsPerPage; + + const asyncScraping = []; + for (let i = 0; i < actualNoOfResults; i++) { + asyncScraping.push(this.scrapeAd(hrefs[i])); + } + + const scrapedData = await Promise.all(asyncScraping); + const filteredScrapedData = scrapedData.filter(adData => !!adData); + return filteredScrapedData; + } catch (e) { + console.error("[AKTIDO] Exception caught:" + e); + return []; + } + } + + async scrapeAd(url) { + console.log("[AKTIDO] Scraping : ", url); + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + + const mapElementParent = $(".box-map").parent(); + const scriptElement = $("script", mapElementParent); + if ( + scriptElement[0] && + scriptElement[0].children && + scriptElement[0].children[0] && + scriptElement[0].children[0].data + ) { + let extractedData; + try { + //data string starts with : var json_map_data = [{"r ... + //so we remove first 20 characters + + const jsonData = scriptElement[0].children[0].data.substring(20); + const parsedJsonData = JSON.parse(jsonData); + extractedData = parsedJsonData[0]; + } catch (e) { + throw { message: "Can't find ad data JSON" }; + } + + const aktidoId = extractedData["re_realEstates_id"]; + const adCategory = this.getKiviCategoryIdFromAktidoId( + parseInt(extractedData["re_types_id"]) + ); + if (!adCategory) { + throw { + message: `Invalid category : ${extractedData["re_types_id"]}` + }; + } + const adType = this.getKiviAdTypeFromAktidoActionId( + parseInt(extractedData["re_action_id"]) + ); + if (!adType) { + throw { + message: `Invalid ad type : ${extractedData["re_action_id"]}` + }; + } + + const title = extractedData["re_realEstates_portalName"]; + const extractedPrice = parseFloat( + extractedData["re_realEstates_price"] + ); + const price = extractedPrice ? 
extractedPrice : null; + const area = parseFloat(extractedData["re_realEstates_area"]); + const gardenSize = parseFloat( + extractedData["re_realEstates_fieldArea"] + ); + const longDescription = htmlToText.fromString( + extractedData["re_realEstates_description"] + ); + const locationLong = extractedData["re_realEstates_longitude"]; + const locationLat = extractedData["re_realEstates_latitude"]; + const publishedDateMoment = moment.tz( + extractedData["re_realEstates_inserted"], + AKTIDO_ENUMS.AKTIDO_PUBLISHED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + if (!publishedDateMoment.isValid()) { + throw { + message: `Invalid published date : ${ + extractedData["re_realEstates_inserted"] + }` + }; + } + + const renewedDateMoment = moment.tz( + extractedData["re_realEstates_edited"], + AKTIDO_ENUMS.AKTIDO_RENEWED_DATE_FORMAT, + DEFAULT_TIMEZONE + ); + if (!renewedDateMoment.isValid()) { + throw { + message: `Invalid renewed date : ${ + extractedData["re_realEstates_edited"] + }` + }; + } + + const adStatus = AD_STATUS.STATUS_NORMAL; + + const data = { + url, + agencyObjectId: aktidoId, + originAgencyName: AD_AGENCY.AKTIDO, + realEstateType: adCategory, + adType, + title, + price, + area, + gardenSize, + shortDescription: "", + longDescription: longDescription, + streetNumber: 0, + streetName: "", + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat, + locationLong, + adStatus, + publishedDate: publishedDateMoment.toISOString(), + renewedDate: renewedDateMoment.toISOString() + }; + + return data; + } else { + console.log("[AKTIDO] No JSON data for this ad : ", url); + return null; + } + } catch (e) { + console.error("[AKTIDO] Exception caught: " + e.message, "\r\nURL:", url); + return null; + } + return null; + } + + //======= HELPER FUNCTIONS ============= + + getKiviCategoryIdFromAktidoId(aktidoCategoryId) { + switch (aktidoCategoryId) { + case 1: + return AD_CATEGORY.HOUSE.id; + case 2: + return AD_CATEGORY.FLAT.id; + case 
3: + return AD_CATEGORY.APARTMENT.id; + case 4: + return AD_CATEGORY.OFFICE.id; + case 5: + return AD_CATEGORY.LAND.id; + case 6: + return AD_CATEGORY.GARAGE.id; + default: + return undefined; + } + } + + getKiviAdTypeFromAktidoActionId(actionId) { + switch (actionId) { + case 1: + return AD_TYPE.AD_TYPE_SALE; + case 2: + return AD_TYPE.AD_TYPE_RENT; + default: + return undefined; + } + } + + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + + async saveCrawledResults(results) { + const savers = this.savers; + + // for (const saver of savers) { + // await saver.save(results); + // } + + //For now, we use only Postgres saver, so ... + return await savers[0].save(results); + //so that we can use some sequelize options and information when data is inserted + } +} + +module.exports = AktidoCrawler; diff --git a/development.env b/development.env index 0bd2939..f80dfe1 100644 --- a/development.env +++ b/development.env @@ -45,3 +45,10 @@ PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check c PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!! +#==AKTIDO== +AKTIDO_MAX_PAGES=Restrict crawler to this number of pages +AKTIDO_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved +AKTIDO_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values +AKTIDO_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values +AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!! 
+AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page -- 2.47.3 From 5b6886f52bdb5bd86302af159b58468f2b3e89b5 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 28 Oct 2019 09:20:03 +0100 Subject: [PATCH 2/4] add ALL categories option for Aktido agency --- app/crawler/specificCrawlers/aktido.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index 6ea44e9..af1aaff 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -26,6 +26,7 @@ const AKTIDO_ENUMS = { [CRAWLER_AD_TYPE.ONLY_RENT]: "/najam-2" }, AKTIDO_AD_CATEGORY: { + [AD_CATEGORY.ALL.id]: "", [AD_CATEGORY.FLAT.id]: "/tip-2", [AD_CATEGORY.HOUSE.id]: "/tip-1", [AD_CATEGORY.LAND.id]: "/tip-5", @@ -137,7 +138,7 @@ class AktidoCrawler { const urlAdTypePart = AKTIDO_ENUMS.AKTIDO_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = AKTIDO_ENUMS.AKTIDO_AD_CATEGORY[adCategory]; - if (urlAdTypePart && urlCategoryPart) { + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}/stranica-${pageToIndex}`; const singlePageResults = await this.indexSinglePage( -- 2.47.3 From 2c2fcd648f0e2a35bd5fa39f24631bfc570e544c Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 28 Oct 2019 09:23:51 +0100 Subject: [PATCH 3/4] remove scrapeAd logging --- app/crawler/specificCrawlers/aktido.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/crawler/specificCrawlers/aktido.js b/app/crawler/specificCrawlers/aktido.js index af1aaff..a2ea43d 100644 --- a/app/crawler/specificCrawlers/aktido.js +++ b/app/crawler/specificCrawlers/aktido.js @@ -205,7 +205,7 @@ class AktidoCrawler { } async scrapeAd(url) { - console.log("[AKTIDO] Scraping : ", url); + // console.log("[AKTIDO] Scraping : ", url); try { const adPageSource = await fetch(url); const body = 
await adPageSource.text(); -- 2.47.3 From 1e36cb8423a267b7ee7b12f0875862c1ffb450e7 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 28 Oct 2019 09:24:08 +0100 Subject: [PATCH 4/4] add ALL category option for Rental agency --- app/crawler/specificCrawlers/rental.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index 45e572d..b73278e 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -26,6 +26,7 @@ const RENTAL_ENUMS = { [CRAWLER_AD_TYPE.ONLY_RENT]: "/najam-2" }, RENTAL_AD_CATEGORY: { + [AD_CATEGORY.ALL.id]: "", [AD_CATEGORY.FLAT.id]: "/tip-2", [AD_CATEGORY.HOUSE.id]: "/tip-1", [AD_CATEGORY.LAND.id]: "/tip-5", @@ -137,7 +138,7 @@ class RentalCrawler { const urlAdTypePart = RENTAL_ENUMS.RENTAL_AD_TYPE[this.crawlerAdTypes]; const urlCategoryPart = RENTAL_ENUMS.RENTAL_AD_CATEGORY[adCategory]; - if (urlAdTypePart && urlCategoryPart) { + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { while (true) { const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}/stranica-${pageToIndex}`; const singlePageResults = await this.indexSinglePage( @@ -204,7 +205,7 @@ class RentalCrawler { } async scrapeAd(url) { - // console.log("[RENTAL] Scraping : ", url); + console.log("[RENTAL] Scraping : ", url); try { const adPageSource = await fetch(url); const body = await adPageSource.text(); -- 2.47.3