diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index 365813f..7ab4609 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -171,13 +171,20 @@ module.exports = class OlxCrawler { // return results; let results = []; + const indexers= []; + let it = 3 for (let url of urls) { - let result = await this.indexPage(url, maxResults); - // Object.assign(results, result) - results.push(result); - // await this.sleep(5000); + // let result = await this.indexPage(url, maxResults); + // results.push(result); + it++ + indexers.push(new Indexer(it * 2000)); + } - return results; + return Promise.map(indexers, function (indexer) { + return indexer.indexPage(); + }).then(async (results) => { + return results + }) } async crawl() { @@ -198,11 +205,15 @@ module.exports = class OlxCrawler { } } } - + // await this.sleep(10000); console.log(filteredResults); return filteredResults; } + async sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + createRequestUrls(realestateRequests) { const urls = [] @@ -227,3 +238,170 @@ module.exports = class OlxCrawler { } }; + +class Indexer { + + constructor(olxUrl, email, uuid) { + this.olxUrl = olxUrl + this.email = email + this.uuid = uuid + } + + async indexPage() { + + try { + // console.log('Starting to index page: ' + pageNr); + // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; + + // const res = await fetch(this.olxUrl.url); + // const body = await res.text(); + // const $ = cheerio.load(body); + // const hrefs = []; + // const results = []; + + // $('#rezultatipretrage').find('.listitem').each((i, elem) => { + // const href = $(elem).find('a').first().attr('href'); + // hrefs.push(href); + // }); + + // let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; + + // for (let i = 0; i < hrefs.length; i++) { + // console.log(`indexing: ${hrefs[i]}`); + + // const singleData = await this.indexSingle(hrefs[i], this.olxUrl.email); + + // if (singleData) { + // results.push(singleData); + // } + // // await this.sleep(500); + // } + await this.sleep(this.olxUrl); + console.log('Finished indexing PAGE'); + const singleIndex = [new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl)] + return Promise.map(singleIndex, function (indexer) { + return indexer.indexSingle(); + }).then(async (results) => { + + return results + }) + + + + // return results; + } catch (e) { + console.error('Exception caught:' + e); + } + } + + async indexSingle() { + // try { + // const res = await fetch(url); + // const body = await res.text(); + // const $ = cheerio.load(body); + + // //TODO figure out what to do with username + // const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); + + // // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { + // // return null; + // // } + + // //TODO remove properties that are not needed, and add some if they are missing + // const title = $('#naslovartikla').text(); + // const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + + // const price = $('#pc > p:nth-child(2)').text(); + // const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); + // const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); + // const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + // const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); + // const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); + + // const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); + // const time = $('time').attr('datetime'); + // const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); + + // const descriptions = $('.artikal_detaljniopis_tekst'); + // // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); + // const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; + // const imgRe = /href":("[^"]*")/g; + // const matches = latLngRe.exec(body); + // let lng = '', + // lat = ''; + + + // const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) + // const parsePrice = (price) => parseFloat(price.replace(".", "")) + + + // // TODO we dont save images ?? + + // // const images = []; + // // const imgMatches = body.match(imgRe); + + // // for (let i = 0; imgMatches && i < imgMatches.length; i++) { + // // let img = imgMatches[i].replace("href\":", "") + // // img = img.replace("\"", ""); + // // img = img.replace("\"", ""); + // // images.push(img); + // // } + + // // const uploadPromises = images.map(img => { + // // const imgFixed = eval(`'${img}'`); + // // return cloudinary.uploader.upload(eval(`'${img}'`)); + // // }); + + // // const uploadResults = await Promise.all(uploadPromises); + // // const cloudinaryImages = uploadResults.map(ur => ur.url); + + // if (matches && matches.length >= 3) { + // lat = matches[1]; + // lng = matches[2]; + // } + + // const parsedPrice = parsePrice(price); + + // const locationArray = location.split(","); + // const region = locationArray[0]; + // const municipality = locationArray[1]; + + // const data = { + // realEstateType: this.getCategoryId(realEstateType), + // email : email, + // olxId: olxId, + // // category: category, + // url, + // title, + // price: isNaN(parsedPrice) ? price : parsedPrice, + // size: parseFloat(size), + // gardenSize: parseFloat(gardenSize), + // address, + // region, + // municipality, + // // adType: AD_TYPE_SALE, + // time, + // shortDescription: descriptions.first().text(), + // longDescription: descriptions.last().text(), + // lat, + // lng, + // loc: [parseFloat(lat), parseFloat(lng)], + // // images: cloudinaryImages + // }; + + // return data; + // } catch (e) { + // console.error('Exception caught: ' + e.message); + // } + + // return null; + await this.sleep(this.olxUrl); + console.log("Finished indexing single page"); + return {}; + } + + async sleep(ms) { + console.log(ms); + return new Promise(resolve => setTimeout(resolve, ms)); + } +} diff --git a/app/helpers/crawlers/olxPagerIndex.js b/app/helpers/crawlers/olxPagerIndex.js new file mode 100644 index 0000000..e69de29 diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js index 49ad166..2e2b417 100644 --- a/app/services/crawlerService.js +++ b/app/services/crawlerService.js @@ -4,9 +4,13 @@ const OlxCrawler = require("../helpers/crawlers/olxClawler"); const db = require("../models/index"); const olxCrawler = new OlxCrawler(1, 2, 3); +const olxCrawler1 = new OlxCrawler(1, 2, 3); +const olxCrawler2 = new OlxCrawler(1, 2, 3); const crawlers = [ olxCrawler, + olxCrawler1, + olxCrawler2, ]; async function crawlAll() {