diff --git a/app/helpers/awsEmail.js b/app/helpers/awsEmail.js index 922bd0d..b7ad3ed 100644 --- a/app/helpers/awsEmail.js +++ b/app/helpers/awsEmail.js @@ -2,7 +2,6 @@ const dotenv = require('dotenv').config(); const { getRealEstateTypeEnum } = require('./enums'); const { getRegionName, getMunicipalityName } = require('./codes'); -const db = require('../models/index'); const { allRERequestByUiid } = require('./db/dbHelper'); var AWS = require('aws-sdk'); const TEMPLATE_NAME = "MarketAlertTemplate" @@ -102,11 +101,11 @@ const sendBulkEmail = async (marketAlerts) => { groupedRERequests = []; - let RERequestUuids = marketAlerts.map(marketAlert => marketAlert.request); + const RERequestUuidsMaped = marketAlerts.map(marketAlert => marketAlert.request); - RERequestUuids = Array.from(new Set(RERequestUuids)); + const RERequestUuidsArray = Array.from(new Set(RERequestUuidsMaped)); - RERequestUuids = RERequestUuids.map(marketAlert => { + const RERequestUuids = RERequestUuidsArray.map(marketAlert => { return { uniqueId: marketAlert } }); diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 18d25bb..3d918cd 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -13,20 +13,264 @@ module.exports = class OlxCrawler { this.maxResults = maxResults; } - async indexSingle(url, email, uuid) { + async indexPages(urls) { + const indexers = []; + + urls.forEach(url => { + indexers.push(new Indexer(url)); + }); + + return Promise.map(indexers, function (indexer) { + return indexer.indexWithPagination(); + }).then(async (results) => { + return results + }) + } + + async crawl() { + console.log("OLX CRAWLER: start crawl"); + + const filteredResults = []; + const realestateRequests = await allRERequest(); + console.log("OLX CRAWLER: found " + realestateRequests.length + "subscribed RealEstateRequests"); + const urls = this.createRequestUrls(realestateRequests); + let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); + console.log("Final crawler results"); + console.log(results[0].length); + + for (const finalResult of results[0]) { + + if (null !== finalResult) { + if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { + const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); + + if (pointInsideBoundingBox[0].length !== 0) { + filteredResults.push(finalResult); + } + } + } + } + + console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); + return filteredResults; + } + + createRequestUrls(realestateRequests) { + const urls = [] + + for (const request of realestateRequests) { + const realsestateType = "kategorija=" + getRealEstateTypeEnum(request.realEstateType).olxCategory; + const region = "kanton=" + getRegion(request.region).olxid; + const municipality = "grad%5B%5D=" + getMunicipality(request.region, request.municipality).olxid; + const sizeMin = "kvadrata_min=" + request.sizeMin; + const sizeMax = "kvadrata_max=" + request.sizeMax; + const priceMin = "od=" + request.priceMin; + const priceMax = "do=" + request.priceMax; + + const olxUrl = { + url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}&stranica=`, + email: request.email, + uuid: request.uniqueId + } + console.log(olxUrl.url); + urls.push(olxUrl); + } + + return urls; + } +}; + + +class Indexer { + + /** + * + * @param {String|Array} olxUrl single or array of objects containing url email and uuid + * @param {Array} hrefResutls array contaning urls from crawler results + */ + + constructor(olxUrl, hrefResutls) { + this.olxUrl = olxUrl; + this.hrefResutls = hrefResutls; + } + + async indexWithPagination(pageNumber = 1) { + + console.log("This is olxUrl:" + this.olxUrl.url); + const pageNr = this.olxUrl.url.match(/\d+$/); + const indexers = this.prepareIndexers(pageNumber ? [pageNumber] : pageNr); + try { - const res = await fetch(url); + + return Promise.map(indexers.indexers, function (indexer) { + return indexer.indexPage(pageNumber); + }).then(async (results) => { + let hasResults = false; + + results.forEach(result => { + if (!hasResults) { + console.log("No results detected") + hasResults = result.hasResults + } + }); + + if (!hasResults) { + console.log("HAS NO MORE RESULTS, stop the paging, there are some results and they should contain only HREFS"); + console.log(results.length); + const singlePageIndexers = this.prepareHrefIndexers(results); + if (singlePageIndexers.length === 0) { + console.log("THERE IS NOT EVEN SINGLE RESULT"); + return [] + } + + return Promise.map(singlePageIndexers, function (indexer) { + return indexer.indexSingle(); + }).then(async (results) => { + console.log("SinglePageMethod in HAS NO RESULTS, MarketAralms"); + console.log(results.length); + return results; + }); + + } else { + console.log("HAS MORE RESULTS, should only contain HREFS"); + console.log(results.length); + const newResults = await this.indexWithPagination(results[0].pageNumber + 10); + const singlePageIndexers = this.prepareHrefIndexers(results); + + const newerResults = await Promise.map(singlePageIndexers, function (indexer) { + return indexer.indexSingle(); + }).then(async (results) => { + console.log("SinglePageMethod HAS RESULTS, should contain MarketAlerts only"); + console.log(results.length); + return results; + }); + + Array.prototype.push.apply(newResults, newerResults); + return newResults; + + } + }); + } catch (e) { + console.error("Error has accured", e); + } + + } + + prepareIndexers(pageNr) { + console.log("Entering prepareIndexers : page nr - " + pageNr); + const indexers = []; + let lastPageNumber; + if (pageNr) { + for (let index = Number(pageNr[0]); index <= Number(pageNr[0]) + 10; index++) { + lastPageNumber = index; + const newOlxUrl = { + url: this.olxUrl.url.replace(/\d+$/, "") + index, + email: this.olxUrl.email, + uuid: this.olxUrl.uuid + } + indexers.push(new Indexer(newOlxUrl)); + + } + } else { + for (let index = 1; index <= 10; index++) { + lastPageNumber = index; + const newOlxUrl = { + url: this.olxUrl.url + index, + email: this.olxUrl.email, + uuid: this.olxUrl.uuid + } + indexers.push(new Indexer(newOlxUrl)); + } + } + return { + indexers: indexers, + lastPageNumber: lastPageNumber + }; + } + + prepareHrefIndexers(results) { + const indexers = [] + + if (!Array.isArray(results)) { + results.hrefs.forEach(href => { + const newOlxUrl = { + url: href, + email: results.olxUrl.email, + uuid: results.olxUrl.uuid + } + + indexers.push(new Indexer(newOlxUrl)); + }); + + } else { + + + results.forEach(result => { + + if (result !== null && result.hasOwnProperty('hrefs')) { + result.hrefs.forEach(href => { + // console.log(href); + const newOlxUrl = { + url: href, + email: result.olxUrl.email, + uuid: result.olxUrl.uuid + } + + indexers.push(new Indexer(newOlxUrl)); + }) + } + + }); + } + + return indexers; + } + + async indexPage(pageNumber) { + console.log("Page number in index page, max page number :") + console.log(pageNumber); + + try { + + console.log("Indexing page: " + this.olxUrl.url); + const res = await fetch(this.olxUrl.url); + const body = await res.text(); + const $ = cheerio.load(body); + const hrefs = []; + let hasResults = false + + $('#rezultatipretrage').find('.listitem').each((i, elem) => { + hasResults = true + const href = $(elem).find('a').first().attr('href'); + hrefs.push(href); + }); + + console.log("this is hrefs for olxUrl" + this.olxUrl.url); + console.log("NUMBER OF HREFS " + hrefs.length); + + return { + hrefs: hrefs, + hasResults: hasResults, + pageNumber: pageNumber, + olxUrl: this.olxUrl + } + } catch (e) { + console.error('Exception caught:' + e); + } + } + + async indexSingle() { + try { + console.log("Index single"); + console.log(this.olxUrl.url); + + if (this.olxUrl.url === undefined) { + return {} + } + const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); - //TODO figure out what to do with username - const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); - - // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { - // return null; - // } - - //TODO remove properties that are not needed, and add some if they are missing const title = $('#naslovartikla').text().trim(); const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); @@ -37,43 +281,17 @@ module.exports = class OlxCrawler { const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); - const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); const time = $('time').attr('datetime'); const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); const descriptions = $('.artikal_detaljniopis_tekst'); - // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; const imgRe = /href":("[^"]*")/g; const matches = latLngRe.exec(body); let lng = '', lat = ''; - - - const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) const parsePrice = (price) => parseFloat(price.replace(".", "")) - - // TODO we dont save images ?? - - // const images = []; - // const imgMatches = body.match(imgRe); - - // for (let i = 0; imgMatches && i < imgMatches.length; i++) { - // let img = imgMatches[i].replace("href\":", "") - // img = img.replace("\"", ""); - // img = img.replace("\"", ""); - // images.push(img); - // } - - // const uploadPromises = images.map(img => { - // const imgFixed = eval(`'${img}'`); - // return cloudinary.uploader.upload(eval(`'${img}'`)); - // }); - - // const uploadResults = await Promise.all(uploadPromises); - // const cloudinaryImages = uploadResults.map(ur => ur.url); - if (matches && matches.length >= 3) { lat = matches[1]; lng = matches[2]; @@ -81,18 +299,16 @@ module.exports = class OlxCrawler { const parsedPrice = parsePrice(price); - console.log(location); const locationArray = location.split(","); const region = locationArray[0]; const municipality = locationArray[1]; const data = { realEstateType: this.getCategoryId(realEstateType), - email: email, - uuid: uuid, + email: this.olxUrl.email, + uuid: this.olxUrl.uuid, olxId: olxId, - // category: category, - url, + url: this.olxUrl.url, title, price: isNaN(parsedPrice) ? 0 : parsedPrice, size: parseFloat(size), @@ -100,14 +316,12 @@ module.exports = class OlxCrawler { address, region, municipality, - // adType: AD_TYPE_SALE, time, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), lat, lng, loc: [parseFloat(lat), parseFloat(lng)], - // images: cloudinaryImages }; return data; @@ -118,41 +332,6 @@ module.exports = class OlxCrawler { return null; } - async indexPage(olxUrl, maxResults = 1000) { - try { - //TODO fix paging - // console.log('Starting to index page: ' + pageNr); - // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; - - const res = await fetch(olxUrl.url); - const body = await res.text(); - const $ = cheerio.load(body); - const hrefs = []; - const results = []; - - $('#rezultatipretrage').find('.listitem').each((i, elem) => { - const href = $(elem).find('a').first().attr('href'); - hrefs.push(href); - }); - - let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; - - for (let i = 0; i < hrefs.length; i++) { - console.log(`indexing: ${hrefs[i]}`); - - const singleData = await this.indexSingle(hrefs[i], olxUrl.email, olxUrl.uuid); - - if (singleData) { - results.push(singleData); - } - } - - return results; - } catch (e) { - console.error('Exception caught:' + e); - } - } - getCategoryId(category) { switch (category) { @@ -169,70 +348,5 @@ module.exports = class OlxCrawler { return ''; } } - - async indexPages(urls, start, end, maxResults = 1000) { - //TODO fix paging - // let results = {}; - // for (let i = start; i <= end; i++) { - // let result = await this.indexPage(i, maxResults); - // Object.assign(results, result) - // } - // return results; - - let results = []; - for (let url of urls) { - let result = await this.indexPage(url, maxResults); - results.push(result); - } - return results; - } - - async crawl() { - console.log("OLX CRAWLER: start crawl"); - - const filteredResults = []; - const realestateRequests = await allRERequest(); - console.log("OLX CRAWLER: found " + realestateRequests.length + "subscribed RealEstateRequests"); - const urls = this.createRequestUrls(realestateRequests); - let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); - - for (const result of results) { - for (const finalResult of result) { - if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { - const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); - - if (pointInsideBoundingBox[0].length !== 0) { - filteredResults.push(finalResult); - } - } - } - } - console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); - return filteredResults; - } - - createRequestUrls(realestateRequests) { - const urls = [] - - for (const request of realestateRequests) { - const realsestateType = "kategorija=" + getRealEstateTypeEnum(request.realEstateType).olxCategory; - const region = "kanton=" + getRegion(request.region).olxid; - const municipality = "grad%5B%5D=" + getMunicipality(request.region, request.municipality).olxid; - const sizeMin = "kvadrata_min=" + request.sizeMin; - const sizeMax = "kvadrata_max=" + request.sizeMax; - const priceMin = "od=" + request.priceMin; - const priceMax = "do=" + request.priceMax; - - const olxUrl = { - url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}`, - email: request.email, - uuid: request.uniqueId - } - console.log(olxUrl.url); - urls.push(olxUrl); - } - - return urls; - } -}; +}