From 039b1a63764e82b53101b90aeb8ce5cbfa97606b Mon Sep 17 00:00:00 2001 From: Nedim Uka Date: Fri, 5 Jul 2019 17:09:39 +0200 Subject: [PATCH] Optimiset crawlers , and pagingation --- app/helpers/crawlers/olxClawler.js | 629 ++++++++++------------------- 1 file changed, 210 insertions(+), 419 deletions(-) diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js index c0fa043..3d918cd 100644 --- a/app/helpers/crawlers/olxClawler.js +++ b/app/helpers/crawlers/olxClawler.js @@ -13,182 +13,8 @@ module.exports = class OlxCrawler { this.maxResults = maxResults; } - async indexSingle(url, email, uuid) { - try { - const res = await fetch(url); - const body = await res.text(); - const $ = cheerio.load(body); - - //TODO figure out what to do with username - const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); - - // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { - // return null; - // } - - //TODO remove properties that are not needed, and add some if they are missing - const title = $('#naslovartikla').text().trim(); - const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); - - const price = $('#pc > p:nth-child(2)').text(); - const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); - const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); - const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); - const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); - const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); - - const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); - const time = $('time').attr('datetime'); - const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); - - const descriptions = $('.artikal_detaljniopis_tekst'); - // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); - const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; - const imgRe = /href":("[^"]*")/g; - const matches = latLngRe.exec(body); - let lng = '', - lat = ''; - - - const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) - const parsePrice = (price) => parseFloat(price.replace(".", "")) - - - // TODO we dont save images ?? - - // const images = []; - // const imgMatches = body.match(imgRe); - - // for (let i = 0; imgMatches && i < imgMatches.length; i++) { - // let img = imgMatches[i].replace("href\":", "") - // img = img.replace("\"", ""); - // img = img.replace("\"", ""); - // images.push(img); - // } - - // const uploadPromises = images.map(img => { - // const imgFixed = eval(`'${img}'`); - // return cloudinary.uploader.upload(eval(`'${img}'`)); - // }); - - // const uploadResults = await Promise.all(uploadPromises); - // const cloudinaryImages = uploadResults.map(ur => ur.url); - - if (matches && matches.length >= 3) { - lat = matches[1]; - lng = matches[2]; - } - - const parsedPrice = parsePrice(price); - - console.log(location); - const locationArray = location.split(","); - const region = locationArray[0]; - const municipality = locationArray[1]; - - const data = { - realEstateType: this.getCategoryId(realEstateType), - email: email, - uuid: uuid, - olxId: olxId, - // category: category, - url, - title, - price: isNaN(parsedPrice) ? 0 : parsedPrice, - size: parseFloat(size), - gardenSize: isNaN(parseFloat(gardenSize)) ? 0 : parseFloat(gardenSize), - address, - region, - municipality, - // adType: AD_TYPE_SALE, - time, - shortDescription: descriptions.first().text(), - longDescription: descriptions.last().text(), - lat, - lng, - loc: [parseFloat(lat), parseFloat(lng)], - // images: cloudinaryImages - }; - - return data; - } catch (e) { - console.error('Exception caught: ' + e.message); - } - - return null; - } - - async indexPage(olxUrl, maxResults = 1000) { - try { - //TODO fix paging - // console.log('Starting to index page: ' + pageNr); - // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; - - const res = await fetch(olxUrl.url); - const body = await res.text(); - const $ = cheerio.load(body); - const hrefs = []; - const results = []; - - $('#rezultatipretrage').find('.listitem').each((i, elem) => { - const href = $(elem).find('a').first().attr('href'); - hrefs.push(href); - }); - - let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; - - for (let i = 0; i < hrefs.length; i++) { - console.log(`indexing: ${hrefs[i]}`); - - const singleData = await this.indexSingle(hrefs[i], olxUrl.email, olxUrl.uuid); - - if (singleData) { - results.push(singleData); - } - } - - return results; - } catch (e) { - console.error('Exception caught:' + e); - } - } - - getCategoryId(category) { - - switch (category) { - case 'Stanovi': - return 'stan'; - - case 'Vikendice': - return 'vikendica' - - case 'Kuće': - return 'kuca'; - - default: - return ''; - } - } - - async indexPages(urls, start, end, maxResults = 1000) { - //TODO fix paging - // let results = {}; - // for (let i = start; i <= end; i++) { - // let result = await this.indexPage(i, maxResults); - // Object.assign(results, result) - // } - // return results; - - // let results = []; - const indexers= []; - // let it = 3 - // for (let url of urls) { - // // let result = await this.indexPage(url, maxResults); - // // results.push(result); - // it++ - // indexers.push(new Indexer(url)); - - // } + async indexPages(urls) { + const indexers = []; urls.forEach(url => { indexers.push(new Indexer(url)); @@ -198,15 +24,7 @@ module.exports = class OlxCrawler { return indexer.indexWithPagination(); }).then(async (results) => { return results - }) - - - // let results = []; - // for (let url of urls) { - // let result = await this.indexPage(url, maxResults); - // results.push(result); - // } - // return results; + }) } async crawl() { @@ -217,22 +35,24 @@ module.exports = class OlxCrawler { console.log("OLX CRAWLER: found " + realestateRequests.length + "subscribed RealEstateRequests"); const urls = this.createRequestUrls(realestateRequests); let results = await this.indexPages(urls, this.fromPage, this.toPage, this.maxResults); + console.log("Final crawler results"); + console.log(results[0].length); - console.log(results[0]); + for (const finalResult of results[0]) { - // for (const result of results) { - // for (const finalResult of result) { - // if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { - // const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); + if (null !== finalResult) { + if (finalResult.lat !== undefined && finalResult.lat !== null && finalResult.lat !== "") { + const pointInsideBoundingBox = await findPointInsideBoundingBox([finalResult.lng, finalResult.lat], finalResult.email); - // if (pointInsideBoundingBox[0].length !== 0) { - // filteredResults.push(finalResult); - // } - // } - // } - // } - // console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); - // return filteredResults; + if (pointInsideBoundingBox[0].length !== 0) { + filteredResults.push(finalResult); + } + } + } + } + + console.log("OLX CRAWLER: number of olx crawler results, after geo location filtering: " + filteredResults.length); + return filteredResults; } createRequestUrls(realestateRequests) { @@ -262,144 +82,161 @@ module.exports = class OlxCrawler { class Indexer { - -/** - * - * @param {String|Array} olxUrl single or array of URLS - * @param {String} email - * @param {Strung} uuid - */ - constructor(olxUrl, email, uuid) { - this.olxUrl = olxUrl - this.email = email - this.uuid = uuid + /** + * + * @param {String|Array} olxUrl single or array of objects containing url email and uuid + * @param {Array} hrefResutls array contaning urls from crawler results + */ + + constructor(olxUrl, hrefResutls) { + this.olxUrl = olxUrl; + this.hrefResutls = hrefResutls; } - async indexWithPagination(pageNumber = 1){ + async indexWithPagination(pageNumber = 1) { - let definitiveResults = []; - console.log("This is olxUrl:" + this.olxUrl.url); - const pageNr = this.olxUrl.url.match(/\d+$/); - const indexers = this.iteratePages(pageNumber ? [pageNumber] : pageNr); - console.log("This is page number:" + pageNr); - // if (pageNr && Number(pageNr[0]) >= 20){ - // console.log("Returning page numbermax"); - // return; - // } + console.log("This is olxUrl:" + this.olxUrl.url); + const pageNr = this.olxUrl.url.match(/\d+$/); + const indexers = this.prepareIndexers(pageNumber ? [pageNumber] : pageNr); - // if (pageNumber && pageNumber >= 30){ - // console.log("Returning page numbermax"); - // return; - // } - - try{ - var lastPageNumber = indexers.lastPageNumber; + try { return Promise.map(indexers.indexers, function (indexer) { - console.log("Page number before async"); - console.log(pageNumber); return indexer.indexPage(pageNumber); }).then(async (results) => { let hasResults = false; + results.forEach(result => { - if (!hasResults){ + if (!hasResults) { console.log("No results detected") - hasResults = result.hasResults + hasResults = result.hasResults } }); - - // console.log(results); - console.log("PageNumber"); - console.log(pageNumber); - if (!hasResults){ - console.log("HAS NO RESULTS"); - // console.log(results); - return results + if (!hasResults) { + console.log("HAS NO MORE RESULTS, stop the paging, there are some results and they should contain only HREFS"); + console.log(results.length); + const singlePageIndexers = this.prepareHrefIndexers(results); + if (singlePageIndexers.length === 0) { + console.log("THERE IS NOT EVEN SINGLE RESULT"); + return [] + } + + return Promise.map(singlePageIndexers, function (indexer) { + return indexer.indexSingle(); + }).then(async (results) => { + console.log("SinglePageMethod in HAS NO RESULTS, MarketAralms"); + console.log(results.length); + return results; + }); + } else { - console.log("HAS RESULTS"); - console.log(" Results PageNumber"); - console.log(results[0].pageNumber) + console.log("HAS MORE RESULTS, should only contain HREFS"); + console.log(results.length); const newResults = await this.indexWithPagination(results[0].pageNumber + 10); - // console.log(newResults); - Array.prototype.push.apply(results,newResults); - return results; - // console.log("Idnexing seccond"); - // // this.olxUrl.url = this.olxUrl.url.replace(/\d+$/, "")+lastPageNumber - // // // return await this.indexWithPagination(); - // // Array.prototype.push.apply(definitiveResults,results); - // definitiveResults = results; - // console.log("This is FIRST RESULT array"); - // // console.log(results); - // console.log("This is definitive array after first results"); - // console.log(definitiveResults); - // const pageNr2 = [10] - // const indexers2 = this.iteratePages([pageNr2[0]]); - // console.log("Indexers 2" + indexers2); - // console.log( indexers2); - // // const moreIndexers = this.iteratePages() - // return Promise.map(indexers2.indexers, function (seccondIndexer) { - // console.log("Indexing seccond indexPage()"); - // return seccondIndexer.indexPage(); - // }).then(async (results) => { - // console.log("Indexing seccond results "); - // Array.prototype.push.apply(definitiveResults,results); - // console.log("This is SECCOND RESULT array"); - // // console.log(results); - // console.log("This is definitive array after SECCOND results"); - // console.log(definitiveResults); - // return definitiveResults; - // }); - } - }); - } catch(e) { + const singlePageIndexers = this.prepareHrefIndexers(results); + + const newerResults = await Promise.map(singlePageIndexers, function (indexer) { + return indexer.indexSingle(); + }).then(async (results) => { + console.log("SinglePageMethod HAS RESULTS, should contain MarketAlerts only"); + console.log(results.length); + return results; + }); + + Array.prototype.push.apply(newResults, newerResults); + return newResults; + + } + }); + } catch (e) { console.error("Error has accured", e); } } - iteratePages(pageNr) { - console.log("Entering iterate pages: page nr - " + pageNr ); + prepareIndexers(pageNr) { + console.log("Entering prepareIndexers : page nr - " + pageNr); const indexers = []; let lastPageNumber; - if (pageNr){ + if (pageNr) { for (let index = Number(pageNr[0]); index <= Number(pageNr[0]) + 10; index++) { lastPageNumber = index; + const newOlxUrl = { + url: this.olxUrl.url.replace(/\d+$/, "") + index, + email: this.olxUrl.email, + uuid: this.olxUrl.uuid + } + indexers.push(new Indexer(newOlxUrl)); - // console.log("width number") - // console.log("page number: " + Number(pageNr[0])) - // console.log("index: " + index ) - console.log(this.olxUrl.url.replace(/\d+$/, "")+index) - indexers.push(new Indexer({url : this.olxUrl.url.replace(/\d+$/, "")+index})) - } } else { for (let index = 1; index <= 10; index++) { lastPageNumber = index; - // console.log("widouth number") - // console.log("index: " + index ) - // console.log(this.olxUrl.url+index) - indexers.push(new Indexer({url :this.olxUrl.url+index})) + const newOlxUrl = { + url: this.olxUrl.url + index, + email: this.olxUrl.email, + uuid: this.olxUrl.uuid + } + indexers.push(new Indexer(newOlxUrl)); } } - return {indexers : indexers, lastPageNumber : lastPageNumber}; + return { + indexers: indexers, + lastPageNumber: lastPageNumber + }; + } + + prepareHrefIndexers(results) { + const indexers = [] + + if (!Array.isArray(results)) { + results.hrefs.forEach(href => { + const newOlxUrl = { + url: href, + email: results.olxUrl.email, + uuid: results.olxUrl.uuid + } + + indexers.push(new Indexer(newOlxUrl)); + }); + + } else { + + + results.forEach(result => { + + if (result !== null && result.hasOwnProperty('hrefs')) { + result.hrefs.forEach(href => { + // console.log(href); + const newOlxUrl = { + url: href, + email: result.olxUrl.email, + uuid: result.olxUrl.uuid + } + + indexers.push(new Indexer(newOlxUrl)); + }) + } + + }); + } + + return indexers; } async indexPage(pageNumber) { - console.log("Page number in index page") + console.log("Page number in index page, max page number :") console.log(pageNumber); try { - //TODO fix paging - // console.log('Starting to index page: ' + pageNr); - // const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; + console.log("Indexing page: " + this.olxUrl.url); const res = await fetch(this.olxUrl.url); const body = await res.text(); const $ = cheerio.load(body); const hrefs = []; - const results = []; let hasResults = false $('#rezultatipretrage').find('.listitem').each((i, elem) => { @@ -408,154 +245,108 @@ class Indexer { hrefs.push(href); }); - // let actualNoOfResults = (hrefs.length <= maxResults) ? hrefs.length : maxResults; - - // for (let i = 0; i < hrefs.length; i++) { - // console.log(`indexing: ${hrefs[i]}`); - - // const singleData = await this.indexSingle(hrefs[i], this.olxUrl.email, this.olxUrl.uuid); - - // if (singleData) { - // results.push(singleData); - // } - // } - - // await this.sleep(this.olxUrl); - // console.log('Finished indexing PAGE'); - // const singleIndex = [new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl), new Indexer(this.olxUrl)] - // return Promise.map(singleIndex, function (indexer) { - // return indexer.indexSingle(); - // }).then(async (results) => { - - // return results - // }) console.log("this is hrefs for olxUrl" + this.olxUrl.url); - console.log("NUMBER OF HREFS " + hrefs.length); - // console.log("HREFS " + hrefs); - console.log("HAS NO MORE RESULTS " + hasResults); - return {hrefs : hrefs, hasResults : hasResults, pageNumber : pageNumber} - - // return results; + return { + hrefs: hrefs, + hasResults: hasResults, + pageNumber: pageNumber, + olxUrl: this.olxUrl + } } catch (e) { console.error('Exception caught:' + e); } } - async indexAllHrefs () { - - } - async indexSingle() { - // try { - // const res = await fetch(url); - // const body = await res.text(); - // const $ = cheerio.load(body); + try { + console.log("Index single"); + console.log(this.olxUrl.url); - // //TODO figure out what to do with username - // const username = $('#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span').text(); + if (this.olxUrl.url === undefined) { + return {} + } + const res = await fetch(this.olxUrl.url); + const body = await res.text(); + const $ = cheerio.load(body); - // // if (IGNORED_USERNAMES.includes((username || '').toLowerCase())) { - // // return null; - // // } + const title = $('#naslovartikla').text().trim(); + const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); - // //TODO remove properties that are not needed, and add some if they are missing - // const title = $('#naslovartikla').text(); - // const realEstateType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span').text(); + const price = $('#pc > p:nth-child(2)').text(); + const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); + const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); + const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); + const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); + const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); - // const price = $('#pc > p:nth-child(2)').text(); - // const size = $('#dodatnapolja1 > div:nth-child(1) > div.df2').text(); - // const rooms = $('#dodatnapolja1 > div:nth-child(2) > div.df2').text(); - // const address = $('#dodatnapolja1 > div:nth-child(5) > div.df2').text(); - // const gardenSize = $('#dodatnapolja1 > div:nth-child(6) > div.df2').text(); - // const location = $('#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija').attr('data-content'); + const time = $('time').attr('datetime'); + const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); - // const adType = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2').text(); - // const time = $('time').attr('datetime'); - // const olxId = $('#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2').text(); + const descriptions = $('.artikal_detaljniopis_tekst'); + const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; + const imgRe = /href":("[^"]*")/g; + const matches = latLngRe.exec(body); + let lng = '', + lat = ''; + const parsePrice = (price) => parseFloat(price.replace(".", "")) - // const descriptions = $('.artikal_detaljniopis_tekst'); - // // const floor = $('#dodatnapolja1').find(':contains(Sprat)').last().nextAll().text(); - // const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; - // const imgRe = /href":("[^"]*")/g; - // const matches = latLngRe.exec(body); - // let lng = '', - // lat = ''; + if (matches && matches.length >= 3) { + lat = matches[1]; + lng = matches[2]; + } + const parsedPrice = parsePrice(price); - // const parseRooms = (rooms) => parseInt([...rooms].filter(c => !isNaN(c)).filter(c => c.trim()).join()) - // const parsePrice = (price) => parseFloat(price.replace(".", "")) + const locationArray = location.split(","); + const region = locationArray[0]; + const municipality = locationArray[1]; + const data = { + realEstateType: this.getCategoryId(realEstateType), + email: this.olxUrl.email, + uuid: this.olxUrl.uuid, + olxId: olxId, + url: this.olxUrl.url, + title, + price: isNaN(parsedPrice) ? 0 : parsedPrice, + size: parseFloat(size), + gardenSize: isNaN(parseFloat(gardenSize)) ? 0 : parseFloat(gardenSize), + address, + region, + municipality, + time, + shortDescription: descriptions.first().text(), + longDescription: descriptions.last().text(), + lat, + lng, + loc: [parseFloat(lat), parseFloat(lng)], + }; - // // TODO we dont save images ?? + return data; + } catch (e) { + console.error('Exception caught: ' + e.message); + } - // // const images = []; - // // const imgMatches = body.match(imgRe); - - // // for (let i = 0; imgMatches && i < imgMatches.length; i++) { - // // let img = imgMatches[i].replace("href\":", "") - // // img = img.replace("\"", ""); - // // img = img.replace("\"", ""); - // // images.push(img); - // // } - - // // const uploadPromises = images.map(img => { - // // const imgFixed = eval(`'${img}'`); - // // return cloudinary.uploader.upload(eval(`'${img}'`)); - // // }); - - // // const uploadResults = await Promise.all(uploadPromises); - // // const cloudinaryImages = uploadResults.map(ur => ur.url); - - // if (matches && matches.length >= 3) { - // lat = matches[1]; - // lng = matches[2]; - // } - - // const parsedPrice = parsePrice(price); - - // const locationArray = location.split(","); - // const region = locationArray[0]; - // const municipality = locationArray[1]; - - // const data = { - // realEstateType: this.getCategoryId(realEstateType), - // email : email, - // olxId: olxId, - // // category: category, - // url, - // title, - // price: isNaN(parsedPrice) ? price : parsedPrice, - // size: parseFloat(size), - // gardenSize: parseFloat(gardenSize), - // address, - // region, - // municipality, - // // adType: AD_TYPE_SALE, - // time, - // shortDescription: descriptions.first().text(), - // longDescription: descriptions.last().text(), - // lat, - // lng, - // loc: [parseFloat(lat), parseFloat(lng)], - // // images: cloudinaryImages - // }; - - // return data; - // } catch (e) { - // console.error('Exception caught: ' + e.message); - // } - - // return null; - await this.sleep(this.olxUrl); - console.log("Finished indexing single page"); - return {}; + return null; } - async sleep(ms) { - console.log(ms); - return new Promise(resolve => setTimeout(resolve, ms)); - } + getCategoryId(category) { + + switch (category) { + case 'Stanovi': + return 'stan'; + + case 'Vikendice': + return 'vikendica' + + case 'Kuće': + return 'kuca'; + + default: + return ''; + } + } }