diff --git a/app/helpers/awsEmail.js b/app/helpers/awsEmail.js index 922bd0d..b7ad3ed 100644 --- a/app/helpers/awsEmail.js +++ b/app/helpers/awsEmail.js @@ -2,7 +2,6 @@ const dotenv = require('dotenv').config(); const { getRealEstateTypeEnum } = require('./enums'); const { getRegionName, getMunicipalityName } = require('./codes'); -const db = require('../models/index'); const { allRERequestByUiid } = require('./db/dbHelper'); var AWS = require('aws-sdk'); const TEMPLATE_NAME = "MarketAlertTemplate" @@ -102,11 +101,11 @@ const sendBulkEmail = async (marketAlerts) => { groupedRERequests = []; - let RERequestUuids = marketAlerts.map(marketAlert => marketAlert.request); + const RERequestUuidsMaped = marketAlerts.map(marketAlert => marketAlert.request); - RERequestUuids = Array.from(new Set(RERequestUuids)); + const RERequestUuidsArray = Array.from(new Set(RERequestUuidsMaped)); - RERequestUuids = RERequestUuids.map(marketAlert => { + const RERequestUuids = RERequestUuidsArray.map(marketAlert => { return { uniqueId: marketAlert } }); diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 18d25bb..c0fa043 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -179,12 +179,34 @@ module.exports = class OlxCrawler { // } // return results; - let results = []; - for (let url of urls) { - let result = await this.indexPage(url, maxResults); - results.push(result); - } - return results; + // let results = []; + const indexers= []; + // let it = 3 + // for (let url of urls) { + // // let result = await this.indexPage(url, maxResults); + // // results.push(result); + // it++ + // indexers.push(new Indexer(url)); + + // } + + urls.forEach(url => { + indexers.push(new Indexer(url)); + }); + + return Promise.map(indexers, function (indexer) { + return indexer.indexWithPagination(); + }).then(async (results) => { + return results + }) + + + // let results = []; + // for (let url of urls) { + // let result = await this.indexPage(url, maxResults); + // results.push(result); + // } + // return results; } async crawl() { @@ -196,19 +218,21 @@ module.exports = class OlxCrawler { const urls = this.createRequestUrls(realestateRequests); let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); - for (const result of results) { - for (const finalResult of result) { - if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { - const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); + console.log(results[0]); - if (pointInsideBoundingBox[0].length !== 0) { - filteredResults.push(finalResult); - } - } - } - } - console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); - return filteredResults; + // for (const result of results) { + // for (const finalResult of result) { + // if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { + // const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); + + // if (pointInsideBoundingBox[0].length !== 0) { + // filteredResults.push(finalResult); + // } + // } + // } + // } + // console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); + // return filteredResults; } createRequestUrls(realestateRequests) { @@ -224,7 +248,7 @@ module.exports = class OlxCrawler { const priceMax = "do=" + request.priceMax; const olxUrl = { - url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}`, + url: `https://www.olx.ba/pretraga?${realsestateType}&id=2&stanje=0&vrstapregleda=tabela&sort_order=desc&${region}&${municipality}&${priceMin}&${priceMax}&vrsta=samoprodaja&${sizeMin}&${sizeMax}&stranica=`, email: request.email, uuid: request.uniqueId } @@ -236,3 +260,302 @@ module.exports = class OlxCrawler { } }; + +class Indexer { + +/** + * + * @param {String|Array} olxUrl single or array of URLS + * @param {String} email + * @param {Strung} uuid + */ + + constructor(olxUrl, email, uuid) { + this.olxUrl = olxUrl + this.email = email + this.uuid = uuid + } + + async indexWithPagination(pageNumber = 1){ + + let definitiveResults = []; + console.log("This is olxUrl:" + this.olxUrl.url); + const pageNr = this.olxUrl.url.match(/\d+$/); + const indexers = this.iteratePages(pageNumber ? [pageNumber] : pageNr); + console.log("This is page number:" + pageNr); + // if (pageNr && Number(pageNr[0]) >= 20){ + // console.log("Returning page numbermax"); + // return; + // } + + // if (pageNumber && pageNumber >= 30){ + // console.log("Returning page numbermax"); + // return; + // } + + try{ + var lastPageNumber = indexers.lastPageNumber; + + return Promise.map(indexers.indexers, function (indexer) { + console.log("Page number before async"); + console.log(pageNumber); + return indexer.indexPage(pageNumber); + }).then(async (results) => { + let hasResults = false; + results.forEach(result => { + if (!hasResults){ + console.log("No results detected") + hasResults = result.hasResults + } + }); + + + // console.log(results); + console.log("PageNumber"); + console.log(pageNumber); + if (!hasResults){ + console.log("HAS NO RESULTS"); + // console.log(results); + return results + } else { + console.log("HAS RESULTS"); + console.log(" Results PageNumber"); + console.log(results[0].pageNumber) + const newResults = await this.indexWithPagination(results[0].pageNumber + 10); + // console.log(newResults); + Array.prototype.push.apply(results,newResults); + return results; + // console.log("Idnexing seccond"); + // // this.olxUrl.url = this.olxUrl.url.replace(/\d+$/, "")+lastPageNumber + // // // return await this.indexWithPagination(); + // // Array.prototype.push.apply(definitiveResults,results); + // definitiveResults = results; + // console.log("This is FIRST RESULT array"); + // // console.log(results); + // console.log("This is definitive array after first results"); + // console.log(definitiveResults); + // const pageNr2 = [10] + // const indexers2 = this.iteratePages([pageNr2[0]]); + // console.log("Indexers 2" + indexers2); + // console.log( indexers2); + // // const moreIndexers = this.iteratePages() + // return Promise.map(indexers2.indexers, function (seccondIndexer) { + // console.log("Indexing seccond indexPage()"); + // return seccondIndexer.indexPage(); + // }).then(async (results) => { + // console.log("Indexing seccond results "); + // Array.prototype.push.apply(definitiveResults,results); + // console.log("This is SECCOND RESULT array"); + // // console.log(results); + // console.log("This is definitive array after SECCOND results"); + // console.log(definitiveResults); + // return definitiveResults; + // }); + } + }); + } catch(e) { + console.error("Error has accured", e); + } + + } + + iteratePages(pageNr) { + console.log("Entering iterate pages: page nr - " + pageNr ); + const indexers = []; + let lastPageNumber; + if (pageNr){ + for (let index = Number(pageNr[0]); index <= Number(pageNr[0]) + 10; index++) { + lastPageNumber = index; + + // console.log("width number") + // console.log("page number: " + Number(pageNr[0])) + // console.log("index: " + index ) + console.log(this.olxUrl.url.replace(/\d+$/, "")+index) + indexers.push(new Indexer({url : this.olxUrl.url.replace(/\d+$/, "")+index})) + + } + } else { + for (let index = 1; index <= 10; index++) { + lastPageNumber = index; + // console.log("widouth number") + // console.log("index: " + index ) + // console.log(this.olxUrl.url+index) + indexers.push(new Indexer({url :this.olxUrl.url+index})) + } + } + return {indexers : indexers, lastPageNumber : lastPageNumber}; + } + + async indexPage(pageNumber) { + console.log("Page number in index page") + console.log(pageNumber); + + try { + //TODO fix paging + // console.log('Starting to index page: ' + pageNr); + // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; + console.log("Indexing page: " + this.olxUrl.url); + const res = await fetch(this.olxUrl.url); + const body = await res.text(); + const $ = cheerio.load(body); + const hrefs = []; + const results = []; + let hasResults = false + + $('#rezultatipretrage').find('.listitem').each((i, elem) => { + hasResults = true + const href = $(elem).find('a').first().attr('href'); + hrefs.push(href); + }); + + // let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; + + // for (let i = 0; i < hrefs.length; i++) { + // console.log(`indexing: ${hrefs[i]}`); + + // const singleData = await this.indexSingle(hrefs[i], this.olxUrl.email, this.olxUrl.uuid); + + // if (singleData) { + // results.push(singleData); + // } + // } + + // await this.sleep(this.olxUrl); + // console.log('Finished indexing PAGE'); + // const singleIndex = [new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl)] + // return Promise.map(singleIndex, function (indexer) { + // return indexer.indexSingle(); + // }).then(async (results) => { + + // return results + // }) + console.log("this is hrefs for olxUrl" + this.olxUrl.url); + + console.log("NUMBER OF HREFS " + hrefs.length); + // console.log("HREFS " + hrefs); + console.log("HAS NO MORE RESULTS " + hasResults); + + return {hrefs : hrefs, hasResults : hasResults, pageNumber : pageNumber} + + // return results; + } catch (e) { + console.error('Exception caught:' + e); + } + } + + async indexAllHrefs () { + + } + + async indexSingle() { + // try { + // const res = await fetch(url); + // const body = await res.text(); + // const $ = cheerio.load(body); + + // //TODO figure out what to do with username + // const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); + + // // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { + // // return null; + // // } + + // //TODO remove properties that are not needed, and add some if they are missing + // const title = $('#naslovartikla').text(); + // const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + + // const price = $('#pc > p:nth-child(2)').text(); + // const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); + // const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); + // const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + // const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); + // const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); + + // const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); + // const time = $('time').attr('datetime'); + // const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); + + // const descriptions = $('.artikal_detaljniopis_tekst'); + // // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); + // const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; + // const imgRe = /href":("[^"]*")/g; + // const matches = latLngRe.exec(body); + // let lng = '', + // lat = ''; + + + // const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) + // const parsePrice = (price) => parseFloat(price.replace(".", "")) + + + // // TODO we dont save images ?? + + // // const images = []; + // // const imgMatches = body.match(imgRe); + + // // for (let i = 0; imgMatches && i < imgMatches.length; i++) { + // // let img = imgMatches[i].replace("href\":", "") + // // img = img.replace("\"", ""); + // // img = img.replace("\"", ""); + // // images.push(img); + // // } + + // // const uploadPromises = images.map(img => { + // // const imgFixed = eval(`'${img}'`); + // // return cloudinary.uploader.upload(eval(`'${img}'`)); + // // }); + + // // const uploadResults = await Promise.all(uploadPromises); + // // const cloudinaryImages = uploadResults.map(ur => ur.url); + + // if (matches && matches.length >= 3) { + // lat = matches[1]; + // lng = matches[2]; + // } + + // const parsedPrice = parsePrice(price); + + // const locationArray = location.split(","); + // const region = locationArray[0]; + // const municipality = locationArray[1]; + + // const data = { + // realEstateType: this.getCategoryId(realEstateType), + // email : email, + // olxId: olxId, + // // category: category, + // url, + // title, + // price: isNaN(parsedPrice) ? price : parsedPrice, + // size: parseFloat(size), + // gardenSize: parseFloat(gardenSize), + // address, + // region, + // municipality, + // // adType: AD_TYPE_SALE, + // time, + // shortDescription: descriptions.first().text(), + // longDescription: descriptions.last().text(), + // lat, + // lng, + // loc: [parseFloat(lat), parseFloat(lng)], + // // images: cloudinaryImages + // }; + + // return data; + // } catch (e) { + // console.error('Exception caught: ' + e.message); + // } + + // return null; + await this.sleep(this.olxUrl); + console.log("Finished indexing single page"); + return {}; + } + + async sleep(ms) { + console.log(ms); + return new Promise(resolve => setTimeout(resolve, ms)); + } +} +