From 76a989fa37d487442e99da31c09cafc5b08762ef Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Mon, 16 Sep 2019 15:59:53 +0200 Subject: [PATCH] replace old crawler, without specific crawler and saver implementation --- app/common/enums.js | 28 ++ app/crawler/crawl.js | 55 ++++ app/crawler/savers/postgres.js | 28 ++ app/crawler/specific/olx.js | 192 +++++++++++++ app/helpers/crawlers/olxClawler.js | 435 ----------------------------- app/services/crawlerService.js | 77 ----- package.json | 3 +- 7 files changed, 305 insertions(+), 513 deletions(-) create mode 100644 app/common/enums.js create mode 100644 app/crawler/crawl.js create mode 100644 app/crawler/savers/postgres.js create mode 100644 app/crawler/specific/olx.js delete mode 100644 app/helpers/crawlers/olxClawler.js delete mode 100644 app/services/crawlerService.js diff --git a/app/common/enums.js b/app/common/enums.js new file mode 100644 index 0000000..9174d00 --- /dev/null +++ b/app/common/enums.js @@ -0,0 +1,28 @@ +const AD_TYPE = { + AD_TYPE_SALE: 1, + AD_TYPE_RENT: 2 +}; + +const AD_CATEGORY = { + CATEGORY_FLAT: 0, + CATEGORY_HOUSE: 1, + CATEGORY_OFFICE: 2, + CATEGORY_LAND: 3, + CATEGORY_APARTMENT: 4, + CATEGORY_GARAGE: 5 +}; + +const IGNORED_USERNAMES = []; + +const REAL_ESTATE_STATUS = { + STATUS_NORMAL: 0, + STATUS_RESERVED: 1, + STATUS_SOLD: 2 +}; + +module.exports = { + AD_TYPE, + IGNORED_USERNAMES, + AD_CATEGORY, + REAL_ESTATE_STATUS +}; diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js new file mode 100644 index 0000000..dcdf094 --- /dev/null +++ b/app/crawler/crawl.js @@ -0,0 +1,55 @@ +"use strict"; +/* + Entry point for crawling functionality + All communication between crawlers and savers is here + All environment specific configuration is read here and + passed to the crawlers and savers. 
/**
 * Runs every crawler and hands each crawler's results to every saver.
 *
 * A failing crawler or saver is logged and skipped so the remaining ones
 * still run. Each saver is connected lazily, at most once, right before its
 * first save; a saver whose connect() failed is retried on the next crawler.
 * All savers are closed in a `finally`, and close() is awaited, so
 * connections are released even when something unexpected throws.
 *
 * @param {Array<{crawl: Function}>} [crawlerList] crawlers to run; defaults
 *   to the module-level `crawlers` list.
 * @param {Array<{connect: Function, save: Function, close: Function}>} [saverList]
 *   savers to feed; defaults to the module-level `savers` list.
 * @returns {Promise<void>} resolves once every saver has been closed.
 */
async function crawlAll(crawlerList = crawlers, saverList = savers) {
  // Savers whose connect() already succeeded; avoids reconnecting per crawler.
  const connected = new Set();

  try {
    for (const crawler of crawlerList) {
      let crawlerResults;
      try {
        crawlerResults = await crawler.crawl();
      } catch (e) {
        console.log("Error crawling. Trying next crawler! ", e);
        continue;
      }

      for (const saver of saverList) {
        try {
          if (!connected.has(saver)) {
            await saver.connect();
            connected.add(saver);
          }
          await saver.save(crawlerResults);
        } catch (e) {
          console.log("Error saving. Trying next saver! ", e);
        }
      }
    }
  } finally {
    // Close every saver (as the original did), but never let one failing
    // close() prevent the others from being closed.
    for (const saver of saverList) {
      try {
        await saver.close();
      } catch (e) {
        console.log("Error closing saver! ", e);
      }
    }
  }
}
/**
 * Crawler for real-estate sale listings on olx.ba.
 *
 * Walks the search result pages `fromPage..toPage`, opens every listing it
 * finds and scrapes it into a plain object. Relies on the module-level
 * `fetch`, `cheerio`, `AD_TYPE`, `AD_CATEGORY` and `IGNORED_USERNAMES`.
 */
class OlxCrawler {
  /**
   * @param {number|string} [fromPage=0] first result page to index.
   * @param {number|string} [toPage=10] last result page to index (inclusive).
   * @param {number|string} [maxResults=1000] upper bound on indexed listings.
   *   Numeric strings (e.g. values read from process.env) are accepted.
   */
  constructor(fromPage = 0, toPage = 10, maxResults = 1000) {
    // Coerce to numbers: two strings would be compared lexicographically
    // in the page loop ("2" <= "10" is false) and nothing would be crawled.
    this.fromPage = OlxCrawler.toInteger(fromPage, 0);
    this.toPage = OlxCrawler.toInteger(toPage, 10);
    this.maxResults = OlxCrawler.toInteger(maxResults, 1000);
  }

  /**
   * Parses a base-10 integer, falling back when the value is not numeric.
   * @param {*} value
   * @param {number} fallback
   * @returns {number}
   */
  static toInteger(value, fallback) {
    const parsed = parseInt(value, 10);
    return isNaN(parsed) ? fallback : parsed;
  }

  /**
   * Extracts the room count from the listing's "rooms" field.
   * "Garsonjera" maps to 0; otherwise the first run of digits is used,
   * so "12" yields 12 and "Dvosoban (2)" yields 2.
   * @param {string} text
   * @returns {number} the room count, or NaN when no digits are present.
   */
  static parseRooms(text) {
    const label = (text || "").trim();
    if (label === "Garsonjera") {
      return 0;
    }
    const digits = /\d+/.exec(label);
    return digits ? parseInt(digits[0], 10) : NaN;
  }

  /**
   * Parses a price such as "1.250.000 KM".
   * Assumes "." is only ever a thousands separator, as the original parser did;
   * every dot is removed, not just the first one.
   * @param {string} text
   * @returns {number} the price, or NaN when the text is not numeric.
   */
  static parsePrice(text) {
    return parseFloat((text || "").replace(/\./g, ""));
  }

  /**
   * Finds the first `LatLng(<lat>, <lng>)` call in the page source.
   * @param {string} body raw HTML of the listing page.
   * @returns {{lat: string, lng: string}} coordinates as strings, or empty
   *   strings when the page has none.
   */
  static parseLatLng(body) {
    const found = /LatLng\((-?[0-9]+\.[0-9]+),\s*(-?[0-9]+\.[0-9]+)\)/.exec(
      body || ""
    );
    return found ? { lat: found[1], lng: found[2] } : { lat: "", lng: "" };
  }

  /**
   * Downloads and scrapes a single listing.
   * @param {string} url listing URL.
   * @returns {Promise<Object|null>} the scraped listing, or null when the
   *   seller is ignored or the page could not be fetched/parsed.
   */
  async indexSingle(url) {
    try {
      const res = await fetch(url);
      if (!res.ok) {
        // Do not scrape error pages into empty listings.
        throw new Error(`HTTP ${res.status} for ${url}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);

      const username = $(
        "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
      ).text();

      if (IGNORED_USERNAMES.includes((username || "").trim().toLowerCase())) {
        return null;
      }

      const title = $("#naslovartikla").text();
      const category = $(
        "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
      ).text();

      const price = $("#pc > p:nth-child(2)").text();
      const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text();
      const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text();
      const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text();
      const location = $(
        "#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija"
      ).attr("data-content");
      const time = $("time").attr("datetime");

      const descriptions = $(".artikal_detaljniopis_tekst");
      const floor = $("#dodatnapolja1")
        .find(":contains(Sprat)")
        .last()
        .nextAll()
        .text();

      const { lat, lng } = OlxCrawler.parseLatLng(body);
      const parsedPrice = OlxCrawler.parsePrice(price);

      return {
        category: this.getCategoryId(category),
        url,
        title,
        // Keep the raw text (e.g. a "price on request" label) when not numeric.
        price: isNaN(parsedPrice) ? price : parsedPrice,
        size: parseFloat(size),
        rooms: OlxCrawler.parseRooms(rooms),
        floor: parseInt(floor, 10),
        address,
        location,
        // The search URL used by indexPage only lists sale ads.
        adType: AD_TYPE.AD_TYPE_SALE,
        time,
        shortDescription: descriptions.first().text(),
        longDescription: descriptions.last().text(),
        lat,
        lng,
        loc: [parseFloat(lat), parseFloat(lng)]
      };
    } catch (e) {
      console.error("Exception caught: " + e.message);
    }

    return null;
  }

  /**
   * Indexes one search result page.
   * @param {number} pageNr result page number.
   * @param {number|string} [maxResults=1000] maximum number of listings to
   *   open from this page.
   * @returns {Promise<Object>} map of listing URL to scraped listing; empty
   *   when the page could not be loaded.
   */
  async indexPage(pageNr, maxResults = 1000) {
    const results = {};

    try {
      console.log("Starting to index page: " + pageNr);
      const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`;

      const res = await fetch(url);
      if (!res.ok) {
        throw new Error(`HTTP ${res.status} for ${url}`);
      }
      const body = await res.text();
      const $ = cheerio.load(body);
      const hrefs = [];

      $("#rezultatipretrage")
        .find(".listitem")
        .each((i, elem) => {
          const href = $(elem)
            .find("a")
            .first()
            .attr("href");
          // Skip list items without a link instead of fetching `undefined`.
          if (href) {
            hrefs.push(href);
          }
        });

      const limit = Math.max(0, OlxCrawler.toInteger(maxResults, 1000));

      for (const href of hrefs.slice(0, limit)) {
        console.log(`indexing: ${href}`);

        const singleData = await this.indexSingle(href);

        if (singleData) {
          results[href] = singleData;
        }
        // Throttle requests to the site.
        await this.sleep(500);
      }
    } catch (e) {
      console.error("Exception caught:" + e);
    }

    return results;
  }

  /**
   * Maps the category label shown on a listing to an AD_CATEGORY id.
   * @param {string} category
   * @returns {number|undefined} the id, or undefined for unmapped labels.
   */
  getCategoryId(category) {
    const categoryIds = new Map([
      ["Stanovi", AD_CATEGORY.CATEGORY_FLAT],
      ["Zemljišta", AD_CATEGORY.CATEGORY_LAND],
      ["Kuće", AD_CATEGORY.CATEGORY_HOUSE],
      ["Poslovni prostori", AD_CATEGORY.CATEGORY_OFFICE]
    ]);
    return categoryIds.get((category || "").trim());
  }

  /**
   * @param {number} ms delay in milliseconds.
   * @returns {Promise<void>} resolves after the delay.
   */
  sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Indexes the result pages `start..end` (inclusive) and merges them.
   * Stops early once `maxResults` listings have been collected.
   * @param {number|string} start first page.
   * @param {number|string} end last page.
   * @param {number|string} [maxResults=1000] cap on the total number of listings.
   * @returns {Promise<Object>} map of listing URL to scraped listing.
   */
  async indexPages(start, end, maxResults = 1000) {
    const first = OlxCrawler.toInteger(start, 0);
    const last = OlxCrawler.toInteger(end, first);
    const cap = Math.max(0, OlxCrawler.toInteger(maxResults, 1000));
    const results = {};

    for (let page = first; page <= last; page++) {
      const remaining = cap - Object.keys(results).length;
      if (remaining <= 0) {
        break;
      }
      Object.assign(results, await this.indexPage(page, remaining));
      // Pause between pages; pointless after the last one.
      if (page < last) {
        await this.sleep(5000);
      }
    }
    return results;
  }

  /**
   * Crawls the page range configured in the constructor.
   * @returns {Promise<Object>} map of listing URL to scraped listing.
   */
  async crawl() {
    return this.indexPages(this.fromPage, this.toPage, this.maxResults);
  }
}
function(indexer) { - return indexer.indexWithPagination(); - }).then(async results => { - return results; - }); - } - - async crawl() { - const filteredResults = []; - const realEstateRequests = await allRERequest(); - const urls = this.createRequestUrls(realEstateRequests); - let results = await this.indexPages( - urls, - this.fromPage, - this.toPage, - this.maxResults - ); - const flatResults = results.flat(); - if (flatResults) { - for (const finalResult of flatResults) { - if (null !== finalResult) { - if ( - finalResult.lat !== undefined && - finalResult.lat !== null && - finalResult.lat !== "" - ) { - const pointInsideBoundingBox = await findPointInsideBoundingBox( - [finalResult.lng, finalResult.lat], - finalResult.email, - finalResult.uuid - ); - - if (pointInsideBoundingBox[0].length !== 0) { - finalResult.hasLocation = true; - filteredResults.push(finalResult); - } else { - finalResult.hasLocation = false; - filteredResults.push(finalResult); - } - } - } - } - return filteredResults; - } - return []; - } - - createRequestUrls(realEstateRequests) { - const urls = []; - - for (const request of realEstateRequests) { - const { - realEstateType, - region, - sizeMin, - sizeMax, - priceMin, - priceMax - } = request; - - const urlRealEstateParams = [ - { - paramName: "kanton", - paramValue: region, - useParam: false - }, - { - paramName: "kategorija", - paramValue: getRealEstateTypeEnum(realEstateType).olxid, - useParam: true - }, - { - paramName: "kvadrata_min", - paramValue: sizeMin, - useParam: true - }, - { - paramName: "kvadrata_max", - paramValue: sizeMax, - useParam: true - }, - { - paramName: "od", - paramValue: priceMin, - useParam: true - }, - { - paramName: "do", - paramValue: priceMax, - useParam: true - } - ]; - const urlResultsParams = [ - { - paramName: "vrstapregleda", - paramValue: "tabela", - useParam: true - }, - { - paramName: "sort_order", - paramValue: "desc", - useParam: true - }, - { - paramName: "vrsta", - paramValue: "samoprodaja", - 
useParam: true - }, - { - paramName: "stranica", - paramValue: "0", - useParam: true - } - ]; - - const paramsReduceFunction = (accumulatedValue, currentParam) => { - const { paramName, paramValue, useParam } = currentParam; - if (useParam) { - return `${accumulatedValue}&${paramName}=${paramValue}`; - } else { - return accumulatedValue; - } - }; - - const reducedRealEstateParams = urlRealEstateParams.reduce( - paramsReduceFunction, - "" - ); - const reducedResultsParams = urlResultsParams.reduce( - paramsReduceFunction, - "" - ); - - const olxUrl = { - url: `https://www.olx.ba/pretraga?${reducedRealEstateParams}${reducedResultsParams}`, - email: request.email, - uuid: request.uniqueId, - hrefs: this.hrefs - }; - urls.push(olxUrl); - } - - return urls; - } -}; - -class Indexer { - /** - * - * @param {String|Array} olxUrl single or array of objects containing url email and uuid - * @param {Array} hrefResutls array contaning urls from crawler results - */ - - constructor(olxUrl, hrefResutls) { - this.olxUrl = olxUrl; - this.hrefResutls = hrefResutls; - } - - async indexWithPagination(pageNumber = 1) { - const pageNr = this.olxUrl.url.match(/\d+$/); - const indexers = this.prepareIndexers(pageNumber ? 
[pageNumber] : pageNr); - - try { - return Promise.map(indexers.indexers, function(indexer) { - return indexer.indexPage(pageNumber); - }).then(async results => { - let hasResults = false; - - results.forEach(result => { - if (!hasResults) { - hasResults = result.hasResults; - } - }); - - if (!hasResults) { - const singlePageIndexers = this.prepareHrefIndexers(results); - if (singlePageIndexers.length === 0) { - return []; - } - - return Promise.map(singlePageIndexers, function(indexer) { - return indexer.indexSingle(); - }).then(async results => { - return results; - }); - } else { - const newResults = await this.indexWithPagination( - results[0].pageNumber + 5 - ); - const singlePageIndexers = this.prepareHrefIndexers(results); - - const newerResults = await Promise.map(singlePageIndexers, function( - indexer - ) { - return indexer.indexSingle(); - }).then(async results => { - return results; - }); - - Array.prototype.push.apply(newResults, newerResults); - return newResults; - } - }); - } catch (e) { - console.error("Error has accured", e); - } - } - - prepareIndexers(pageNr) { - const indexers = []; - let lastPageNumber; - if (pageNr) { - for ( - let index = Number(pageNr[0]); - index <= Number(pageNr[0]) + 5; - index++ - ) { - lastPageNumber = index; - const newOlxUrl = { - url: this.olxUrl.url.replace(/\d+$/, "") + index, - email: this.olxUrl.email, - uuid: this.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - indexers.push(new Indexer(newOlxUrl)); - } - } else { - for (let index = 1; index <= 5; index++) { - lastPageNumber = index; - const newOlxUrl = { - url: this.olxUrl.url + index, - email: this.olxUrl.email, - uuid: this.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - indexers.push(new Indexer(newOlxUrl)); - } - } - return { - indexers: indexers, - lastPageNumber: lastPageNumber - }; - } - - prepareHrefIndexers(results) { - const indexers = []; - - if (!Array.isArray(results)) { - results.hrefs.forEach(href => { - const newOlxUrl = { - url: href, - email: 
results.olxUrl.email, - uuid: results.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - - indexers.push(new Indexer(newOlxUrl)); - }); - } else { - results.forEach(result => { - if (result !== null && result.hasOwnProperty("hrefs")) { - result.hrefs.forEach(href => { - const newOlxUrl = { - url: href, - email: result.olxUrl.email, - uuid: result.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - - indexers.push(new Indexer(newOlxUrl)); - }); - } - }); - } - - return indexers; - } - - async indexPage(pageNumber) { - try { - const res = await fetch(this.olxUrl.url); - const body = await res.text(); - const $ = cheerio.load(body); - const hrefs = []; - let hasResults = false; - - $("#rezultatipretrage") - .find(".listitem") - .each((i, elem) => { - hasResults = true; - const href = $(elem) - .find("a") - .first() - .attr("href"); - hrefs.push(href); - }); - return { - hrefs: hrefs, - hasResults: hasResults, - pageNumber: pageNumber, - olxUrl: this.olxUrl - }; - } catch (e) { - console.error("Exception caught:" + e); - } - } - - async indexSingle() { - try { - if (this.olxUrl.url === undefined) { - return {}; - } - - // if (global.hrefs) { - - if ( - this.olxUrl.hrefs[this.olxUrl.uuid] && - this.olxUrl.hrefs[this.olxUrl.uuid].includes(this.olxUrl.url) - ) { - return null; - } - // } - - const res = await fetch(this.olxUrl.url); - const body = await res.text(); - const $ = cheerio.load(body); - - const title = $("#naslovartikla") - .text() - .trim(); - const realEstateType = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" - ).text(); - - const price = $("#pc > p:nth-child(2)").text(); - const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text(); - const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text(); - const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text(); - const gardenSize = $( - "#dodatnapolja1 > div:nth-child(6) > div.df2" - ).text(); - const location = $( - 
"#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija" - ).attr("data-content"); - - const time = $("time").attr("datetime"); - const olxId = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2" - ).text(); - - const descriptions = $(".artikal_detaljniopis_tekst"); - const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; - const imgRe = /href":("[^"]*")/g; - const matches = latLngRe.exec(body); - let lng = "", - lat = ""; - const parsePrice = price => parseFloat(price.replace(".", "")); - - if (matches && matches.length >= 3) { - lat = matches[1]; - lng = matches[2]; - } - - const parsedPrice = parsePrice(price); - - const locationArray = - location && location.length > 0 ? location.split(",") : []; - const region = locationArray.length > 0 ? locationArray[0] : ""; - const municipality = locationArray.length > 1 ? locationArray[1] : ""; - - const data = { - realEstateType: this.getCategoryId(realEstateType), - email: this.olxUrl.email, - uuid: this.olxUrl.uuid, - olxId: olxId, - url: this.olxUrl.url, - title, - price: isNaN(parsedPrice) ? 0 : parsedPrice, - size: parseFloat(size), - gardenSize: isNaN(parseFloat(gardenSize)) ? 
0 : parseFloat(gardenSize), - address, - region, - municipality, - time, - shortDescription: descriptions.first().text(), - longDescription: descriptions.last().text(), - lat, - lng, - loc: [parseFloat(lat), parseFloat(lng)] - }; - - return data; - } catch (e) { - console.error("Exception caught: " + e.message); - } - - return null; - } - - getCategoryId(category) { - switch (category) { - case "Stanovi": - return "stan"; - - case "Vikendice": - return "vikendica"; - - case "Kuće": - return "kuca"; - - default: - return ""; - } - } -} diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js deleted file mode 100644 index 1de43b8..0000000 --- a/app/services/crawlerService.js +++ /dev/null @@ -1,77 +0,0 @@ -const Promise = require("bluebird"); -const OlxCrawler = require("../helpers/crawlers/olxClawler"); -const db = require("../models/index"); -const { allMarketAlerts } = require("../helpers/db/dbHelper"); - -async function crawlAll() { - try { - const marketAlertsFromDb = await allMarketAlerts(true); - const hrefs = []; - - marketAlertsFromDb.map(marketAlert => { - if (hrefs[marketAlert.request] === undefined) { - hrefs[marketAlert.request] = []; - } - - hrefs[marketAlert.request].push(marketAlert.url); - }); - - const olxCrawler = new OlxCrawler(hrefs); - - const crawlers = [olxCrawler]; - - return Promise.map(crawlers, function(crawler) { - return crawler.crawl(); - }).then(async results => { - try { - const marketAlertsFromDb = await allMarketAlerts(false, true); - - const marketAlerts = []; - const mergedResults = [].concat.apply([], results); - - for (const result of mergedResults) { - marketAlerts.push({ - url: result.url, - realestateOrigin: "OLX", - originId: 1, - size: result.size, - price: result.price, - email: result.email, - request: result.uuid, - municipality: result.municipality, - region: result.region, - gardenSize: isNaN(result.gardenSize) ? 
0 : result.gardenSize, - realEstateType: result.realEstateType, - title: result.title, - notified: false, - hasLocation: result.hasLocation - }); - } - - try { - const filteredMarketAlerts = marketAlerts.filter( - elem => - !marketAlertsFromDb.find(({ url, request }) => { - return elem.url === url && elem.request === request; - }) - ); - - await db.MarketAlert.bulkCreate(filteredMarketAlerts); - } catch (e) { - console.log( - "CRAWLER SERVICE: Could not bulkCreate marketalers reason: ", - e - ); - } - } catch (e) { - console.log( - "CRAWLER SERVICE: Error crawling. Trying next crawler! ", - e - ); - } - }); - } catch (e) { - console.error("CRAWLER SERVICE:could not fetch marketalerts ", e); - } -} -module.exports = crawlAll; diff --git a/package.json b/package.json index dc49bf6..8056c01 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "migrate": "cd app && npx sequelize db:migrate", "setup": "docker build -t marketalerts . && docker run -e POSTGRES_USER=docker -e POSTGRES_PASSWORD=docker -e POSTGRES_DB=marketalerts --name pg_marketalerts -d -p 5432:5432 marketalerts && sleep 4 && npm run migrate", "docker-start": "docker start pg_marketalerts", - "docker-stop": "docker stop pg_marketalerts" + "docker-stop": "docker stop pg_marketalerts", + "crawl": "cd app/crawler && node crawl.js" }, "repository": { "type": "git",