diff --git a/app/common/enums.js b/app/common/enums.js
new file mode 100644
index 0000000..d047240
--- /dev/null
+++ b/app/common/enums.js
@@ -0,0 +1,52 @@
+// Kind of offer an ad represents.
+const AD_TYPE = {
+  AD_TYPE_SALE: "SALE",
+  AD_TYPE_RENT: "RENT"
+};
+
+// Real-estate categories an ad can belong to.
+const AD_CATEGORY = {
+  CATEGORY_FLAT: "FLAT",
+  CATEGORY_HOUSE: "HOUSE",
+  CATEGORY_OFFICE: "OFFICE",
+  CATEGORY_LAND: "LAND",
+  CATEGORY_APARTMENT: "APARTMENT",
+  CATEGORY_GARAGE: "GARAGE"
+};
+
+// Sellers whose ads are skipped. The OLX crawler lower-cases the scraped
+// username before the lookup, so entries must be lower case.
+const IGNORED_USERNAMES = [];
+
+// State of an ad as detected by a crawler.
+const AD_STATUS = {
+  STATUS_NORMAL: 1,
+  STATUS_RESERVED: 2,
+  STATUS_SOLD: 3,
+  STATUS_DELETED: 4,
+  STATUS_URGENT: 5,
+  STATUS_DISCOUNTED: 6
+};
+
+// Sites the ads originate from.
+const AD_AGENCY = {
+  OLX: "OLX"
+};
+
+// Which ad types a crawler should fetch. NONE is 0 (falsy), which makes the
+// OLX crawler skip crawling altogether.
+const CRAWLER_AD_TYPE = {
+  NONE: 0,
+  ALL: 1,
+  ONLY_SELL: 2,
+  ONLY_RENT: 3
+};
+
+module.exports = {
+  AD_TYPE,
+  IGNORED_USERNAMES,
+  AD_CATEGORY,
+  AD_STATUS,
+  AD_AGENCY,
+  CRAWLER_AD_TYPE
+};
diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js
new file mode 100644
index 0000000..0f9dcc9
--- /dev/null
+++ b/app/crawler/crawl.js
@@ -0,0 +1,35 @@
+"use strict";
+/*
+  Entry point for crawling functionality
+  All communication between crawlers and savers is here
+  All environment specific configuration is read here and
+  passed to the crawlers and savers.
+*/
+
+require("dotenv").config();
+const OlxCrawler = require("./specific/olx");
+const { OLX_CONFIG } = require("./crawlerConfig");
+const PostgresSaver = require("./savers/postgres");
+
+const crawlers = [
+  new OlxCrawler(
+    OLX_CONFIG.OLX_START_PAGE,
+    OLX_CONFIG.OLX_END_PAGE,
+    OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE,
+    [new PostgresSaver()],
+    OLX_CONFIG.OLX_CRAWLER_AD_TYPE,
+    OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES
+  )
+];
+
+async function crawlAll() {
+  for (let crawler of crawlers) {
+    try {
+      await crawler.crawl();
+    } catch (e) {
+      console.log("Error crawling. Trying next crawler! ", e);
+    }
+  }
+}
+
+crawlAll();
diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js
new file mode 100644
index 0000000..d524fce
--- /dev/null
+++ b/app/crawler/crawlerConfig.js
@@ -0,0 +1,44 @@
+"use strict";
+const path = require("path");
+
+// Load the repository-level .env. The path is resolved against this file's
+// directory because dotenv resolves a relative `path` against the process
+// working directory, which made the lookup depend on where node was started.
+require("dotenv").config({ path: path.join(__dirname, "../../.env") });
+const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums");
+
+// OLX_CRAWLER_AD_TYPE holds the name of a CRAWLER_AD_TYPE member, e.g. "ALL".
+// Unset or unknown names fall back to CRAWLER_AD_TYPE.NONE further down.
+const crawlerAdType =
+  process.env.OLX_CRAWLER_AD_TYPE !== undefined
+    ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE]
+    : null;
+
+// OLX_CRAWLER_AD_CATEGORIES is a comma separated list of AD_CATEGORY member
+// names; flats and houses are crawled when it is unset.
+const parsedCrawlerAdCategories =
+  process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined
+    ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category =>
+        category.trim()
+      )
+    : ["CATEGORY_FLAT", "CATEGORY_HOUSE"];
+
+// Translate member names to enum values and drop names that are not known.
+const transformedCrawlerAdCategories = parsedCrawlerAdCategories
+  .map(categoryName => AD_CATEGORY[categoryName])
+  .filter(category => !!category);
+
+// Numeric settings are parsed as base-10 integers; a missing, non-numeric or
+// zero value falls back to the default on the right.
+const OLX_CONFIG = {
+  OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE, 10) || 1,
+  OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE, 10) || 10,
+  OLX_MAX_RESULTS_PER_PAGE:
+    parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE, 10) || 50,
+  OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE,
+  OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories
+};
+
+module.exports = {
+  OLX_CONFIG
+};
diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js
new file mode 100644
index 0000000..5aa0c3a
--- /dev/null
+++ b/app/crawler/savers/postgres.js
@@ -0,0 +1,21 @@
+const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate");
+
+class PostgresSaver {
+  connect() {
+    //TODO: It seems we never worry about open/close connection with Sequelize ?
+    //TODO: Check if postgres is ready
+    return true;
+  }
+
+  async save(results) {
+    console.log("[POSTGRES] Saving...");
+    await bulkUpsertRealEstates(results);
+  }
+
+  close() {
+    //TODO: It seems we never worry about open/close connection with Sequelize ?
+    return true;
+  }
+}
+
+module.exports = PostgresSaver;
diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js
new file mode 100644
index 0000000..0bf0a35
--- /dev/null
+++ b/app/crawler/specific/olx.js
@@ -0,0 +1,445 @@
+"use strict";
+
+const fetch = require("node-fetch");
+const cheerio = require("cheerio");
+
+const {
+  AD_TYPE,
+  AD_CATEGORY,
+  IGNORED_USERNAMES,
+  AD_AGENCY,
+  AD_STATUS,
+  CRAWLER_AD_TYPE
+} = require("../../common/enums");
+
+// OLX-specific search URL fragments, keyed by the shared enum values, plus
+// the upper bound on detail fields inspected per ad.
+const OLX_ENUMS = {
+  OLX_AD_TYPE: {},
+  OLX_AD_CATEGORY: {},
+  MAX_DETAIL_FIELDS: 30
+};
+
+OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = "";
+OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja";
+OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje";
+
+OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23";
+OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24";
+OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29";
+OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25";
+OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27";
+OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30";
+
+/**
+ * Crawls OLX search result pages, scrapes every ad they link to and hands
+ * the scraped records to the configured savers.
+ */
+class OlxCrawler {
+  /**
+   * @param {number} fromPage first result page to crawl
+   * @param {number} toPage last result page to crawl (inclusive)
+   * @param {number} maxResults maximum number of ads scraped per result page
+   * @param {Array} savers objects exposing an async save(results) method
+   * @param {number} crawlerAdTypes a CRAWLER_AD_TYPE value
+   * @param {string[]} crawlerAdCategories AD_CATEGORY values to crawl
+   */
+  constructor(
+    fromPage = 1,
+    toPage = 10,
+    maxResults = 1000,
+    savers = [],
+    crawlerAdTypes = CRAWLER_AD_TYPE.ALL,
+    crawlerAdCategories = [
+      AD_CATEGORY.CATEGORY_FLAT,
+      AD_CATEGORY.CATEGORY_HOUSE
+    ]
+  ) {
+    this.fromPage = fromPage;
+    this.toPage = toPage;
+    this.maxResults = maxResults;
+    this.savers = savers;
+    this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
+    this.crawlerAdTypes = crawlerAdTypes;
+    this.crawlerAdCategories = crawlerAdCategories;
+  }
+
+  /**
+   * Crawls every configured category concurrently. Does nothing when the ad
+   * type is CRAWLER_AD_TYPE.NONE (0), because that value is falsy.
+   * @returns {Promise<void>}
+   */
+  async crawl() {
+    console.log("[OLX] Crawler started");
+    const crawlAdTypes = this.crawlerAdTypes;
+    const crawlAdCategories = this.crawlerAdCategories;
+
+    const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`;
+
+    if (crawlAdCategories && crawlAdTypes) {
+      const asyncPagesIndexingByCategory = [];
+      for (const adCategory of crawlAdCategories) {
+        asyncPagesIndexingByCategory.push(
+          this.indexPages(
+            `${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}`
+          )
+        );
+      }
+
+      await Promise.all(asyncPagesIndexingByCategory);
+    }
+    console.log("[OLX] Crawler finished");
+  }
+
+  /**
+   * Walks the result pages fromPage..toPage of one search URL, saving each
+   * page's ads before pausing 5 seconds and moving on to the next page.
+   * @param {string} url search URL without the page-number parameter
+   */
+  async indexPages(url) {
+    const startPage = this.fromPage;
+    const endPage = this.toPage;
+    const maxResultsPerPage = this.maxResults;
+
+    for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
+      const singlePageResults = await this.indexSinglePage(
+        url,
+        pageNumber,
+        maxResultsPerPage
+      );
+      await this.saveCrawledResults(singlePageResults);
+      await this.sleep(5000);
+    }
+  }
+
+  /**
+   * Loads one search result page and scrapes the ads it links to, with a
+   * 500 ms pause after each ad.
+   * @param {string} urlWithoutPageNumber search URL without page parameter
+   * @param {number} pageNumber result page to load
+   * @param {number} maxResultsPerPage maximum number of ads to scrape
+   * @returns {Promise<Object[]>} scraped ads; empty when the page fails
+   */
+  async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) {
+    try {
+      const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`;
+
+      const res = await fetch(url);
+      const body = await res.text();
+      const $ = cheerio.load(body);
+      const hrefs = [];
+      const singlePageResults = [];
+
+      $("#rezultatipretrage")
+        .find(".listitem")
+        .each((i, elem) => {
+          const href = $(elem)
+            .find("a")
+            .first()
+            .attr("href");
+          if (href) {
+            hrefs.push(href);
+          }
+        });
+
+      const actualNoOfResults = Math.min(hrefs.length, maxResultsPerPage);
+
+      for (let i = 0; i < actualNoOfResults; i++) {
+        console.log(`Scraping : ${hrefs[i]}`);
+
+        const adData = await this.scrapeAd(hrefs[i]);
+
+        if (adData) {
+          singlePageResults.push(adData);
+        }
+        await this.sleep(500);
+      }
+
+      return singlePageResults;
+    } catch (e) {
+      console.error("Exception caught:" + e);
+      // A failed page yields no ads; returning an empty list keeps the savers
+      // from being called with undefined.
+      return [];
+    }
+  }
+
+  /**
+   * Scrapes a single ad page into a real-estate record.
+   * @param {string} url absolute URL of the ad page
+   * @returns {Promise<Object|null>} the record, or null when the seller is
+   *   ignored or a required field (price, ad type, OLX ID, category) cannot
+   *   be determined; such failures are logged instead of thrown
+   */
+  async scrapeAd(url) {
+    try {
+      const adPageSource = await fetch(url);
+      const body = await adPageSource.text();
+      const $ = cheerio.load(body);
+      let status = AD_STATUS.STATUS_NORMAL;
+
+      const username = $(
+        "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span"
+      ).text();
+
+      if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) {
+        return null;
+      }
+
+      const title = $("#naslovartikla").text();
+      const descriptions = $(".artikal_detaljniopis_tekst");
+      const category = $(
+        "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span"
+      ).text();
+
+      //====== PRICE DETECTION AND EXTRACTION =====
+      let price = null;
+      const normalPriceValue = $("#pc > p:nth-child(2)").text();
+      const urgentPriceValue = $(
+        "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
+      ).text();
+
+      if (normalPriceValue && normalPriceValue.length > 0) {
+        price = normalPriceValue;
+        if (
+          $("#pc > p.n")
+            .text()
+            .indexOf("Hitna") !== -1
+        ) {
+          status = AD_STATUS.STATUS_URGENT;
+        } else {
+          status = AD_STATUS.STATUS_NORMAL;
+        }
+      } else if (urgentPriceValue && urgentPriceValue.length > 0) {
+        const priceValues = urgentPriceValue.split("KM");
+        //priceValues will contain values like ["100000", "90000", ...], second element is urgent price
+        if (priceValues.length > 1) {
+          price = priceValues[1].trim();
+          status = AD_STATUS.STATUS_DISCOUNTED;
+        } else {
+          throw new Error("Can't find urgent price");
+        }
+      } else {
+        throw new Error(
+          "Can't find price (it is not normal nor urgent price ?)"
+        );
+      }
+
+      //====== OTHER AD INFORMATION ===============
+      let adType = null;
+      let olxId = null;
+
+      let otherInformationDivId;
+      //We need to locate DIV ID where other information are stored
+      for (let possibleId = 10; possibleId <= 20; possibleId++) {
+        const adTypeFieldTitle = $(
+          `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1`
+        )
+          .text()
+          .trim();
+
+        if (adTypeFieldTitle === "Vrsta oglasa") {
+          otherInformationDivId = possibleId;
+          break;
+        }
+      }
+
+      if (!otherInformationDivId) {
+        throw new Error("Other information DIV could not be found");
+      }
+
+      const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`;
+
+      adType = $(
+        `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2`
+      )
+        .text()
+        .trim();
+      const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`)
+        .text()
+        .trim();
+      olxId = $(`${olxIdFieldSelector} > div.df2`)
+        .text()
+        .trim();
+
+      if (olxIdFieldTitle !== "OLX ID") {
+        throw new Error("Cannot find correct OLX ID");
+      }
+      //===========================================
+
+      //====== DETAIL INFORMATION FIELDS ==========
+      let area = null;
+      let gardenSize = null;
+
+      let fieldIndex = 1;
+      do {
+        const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`;
+        const fieldTitleSelector = `${fieldSelector} > div.df1`;
+        const fieldValueSelector = `${fieldSelector} > div.df2`;
+
+        const fieldTitle = $(fieldTitleSelector)
+          .text()
+          .trim();
+        const fieldValue = $(fieldValueSelector)
+          .text()
+          .trim();
+
+        switch (fieldTitle) {
+          case "Kvadrata":
+            area = fieldValue;
+            break;
+          case "Okućnica (kvadratura)":
+            gardenSize = fieldValue;
+            break;
+        }
+
+        if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") {
+          break;
+        }
+      } while (true);
+      //===========================================
+
+      //====== UNUSED FIELDS FOR NOW ==============
+      const time = $("time").attr("datetime");
+      const numberOfViews = $(
+        "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2"
+      ).text();
+      //===========================================
+
+      //=========================================
+      const parsedCategory = this.getAdCategoryId(category);
+      if (!parsedCategory) {
+        throw new Error("Unknown ad category");
+      }
+
+      const parsedAdType = this.getAdTypeId(adType);
+      if (!parsedAdType) {
+        throw new Error("Unknown ad type");
+      }
+
+      const parsedArea = this.parseArea(area) || null;
+      const parsedGardenSize = this.parseArea(gardenSize) || null;
+      const parsedPrice = this.parsePrice(price) || null;
+
+      const latLngRegex = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g;
+      const locationLatLngMatches = latLngRegex.exec(body);
+
+      let locationLat = null;
+      let locationLong = null;
+      if (locationLatLngMatches && locationLatLngMatches.length >= 3) {
+        locationLat = parseFloat(locationLatLngMatches[1]) || null;
+        locationLong = parseFloat(locationLatLngMatches[2]) || null;
+      }
+
+      const data = {
+        url,
+        agencyObjectId: olxId,
+        originAgencyName: AD_AGENCY.OLX,
+        realEstateType: parsedCategory,
+        adType: parsedAdType,
+        title,
+        price: parsedPrice,
+        area: parsedArea,
+        gardenSize: parsedGardenSize,
+        shortDescription: descriptions.first().text(),
+        longDescription: descriptions.last().text(),
+        streetNumber: 0,
+        streetName: "",
+        locality: "",
+        municipality: "",
+        city: "",
+        region: "",
+        entity: "",
+        country: "",
+        locationLat,
+        locationLong,
+        adStatus: status
+      };
+
+      return data;
+    } catch (e) {
+      console.error("Exception caught: " + e.message, "\r\nURL:", url);
+    }
+    return null;
+  }
+
+  //======= HELPER FUNCTIONS =============
+
+  /**
+   * Maps the category label shown on an ad page to an AD_CATEGORY value.
+   * Only flats, land, houses and offices are mapped; any other label yields
+   * undefined, which scrapeAd reports as "Unknown ad category".
+   */
+  getAdCategoryId(categoryText) {
+    switch (categoryText) {
+      case "Stanovi":
+        return AD_CATEGORY.CATEGORY_FLAT;
+      case "Zemljišta":
+        return AD_CATEGORY.CATEGORY_LAND;
+      case "Kuće":
+        return AD_CATEGORY.CATEGORY_HOUSE;
+      case "Poslovni prostori":
+        return AD_CATEGORY.CATEGORY_OFFICE;
+      default:
+        return undefined;
+    }
+  }
+
+  // Maps the ad type label shown on an ad page to an AD_TYPE value.
+  getAdTypeId(adTypeText) {
+    switch (adTypeText) {
+      case "Prodaja":
+        return AD_TYPE.AD_TYPE_SALE;
+      case "Izdavanje":
+        return AD_TYPE.AD_TYPE_RENT;
+      default:
+        return undefined;
+    }
+  }
+
+  /**
+   * Parses an area such as "1.234,5" into a number: the first comma becomes
+   * the decimal point and every dot except the last one is dropped.
+   * @returns {number} NaN when the text is empty or not numeric
+   */
+  parseArea(areaText) {
+    if (!areaText) {
+      return NaN;
+    }
+    const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g;
+    const textWithOnlyOneDecimalDot = areaText
+      .replace(",", ".")
+      .replace(removeDotsExceptLastOneRegex, "");
+
+    return parseFloat(textWithOnlyOneDecimalDot);
+  }
+
+  /**
+   * Parses a price such as "1.250.000,50 KM" into a number: every dot is a
+   * thousands separator and the first comma is the decimal point.
+   * @returns {number} NaN when the text is empty or not numeric
+   */
+  parsePrice(priceText) {
+    if (!priceText) {
+      return NaN;
+    }
+    const formattedPriceText = priceText.replace(/\./g, "").replace(",", ".");
+    return parseFloat(formattedPriceText);
+  }
+
+  // Resolves after ms milliseconds.
+  async sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+  }
+
+  // Passes the results to every saver, one after another.
+  async saveCrawledResults(results) {
+    const savers = this.savers;
+
+    for (const saver of savers) {
+      await saver.save(results);
+    }
+  }
+}
+
+module.exports = OlxCrawler;
diff --git a/app/helpers/crawlers/olxClawler.js b/app/helpers/crawlers/olxClawler.js
deleted file mode 100644
index d7007b7..0000000
--- a/app/helpers/crawlers/olxClawler.js
+++ /dev/null
@@ -1,435 +0,0 @@
-const fetch = require("node-fetch");
-const cheerio = require("cheerio");
-const { allRERequest, findPointInsideBoundingBox } = require("../db/dbHelper");
-const { getRealEstateTypeEnum } = require("../enums");
-const { getRegion, getMunicipality } = require("../codes");
-const Promise = require("bluebird");
-
-module.exports = class OlxCrawler {
-  //TODO figure best way to handle paging
-  constructor(hrefs = []) {
-    this.hrefs = hrefs;
-  }
-
-  async indexPages(urls) {
-    const indexers = [];
-
-    urls.forEach(url => {
-      indexers.push(new Indexer(url));
-    });
-
-    return Promise.map(indexers, function(indexer) {
-      return indexer.indexWithPagination();
-    }).then(async results => {
-      return results;
-    });
-  }
-
-  async crawl() {
-    const filteredResults = [];
-    const 
realEstateRequests = await allRERequest(); - const urls = this.createRequestUrls(realEstateRequests); - let results = await this.indexPages( - urls, - this.fromPage, - this.toPage, - this.maxResults - ); - const flatResults = results.flat(); - if (flatResults) { - for (const finalResult of flatResults) { - if (null !== finalResult) { - if ( - finalResult.lat !== undefined && - finalResult.lat !== null && - finalResult.lat !== "" - ) { - const pointInsideBoundingBox = await findPointInsideBoundingBox( - [finalResult.lng, finalResult.lat], - finalResult.email, - finalResult.uuid - ); - - if (pointInsideBoundingBox[0].length !== 0) { - finalResult.hasLocation = true; - filteredResults.push(finalResult); - } else { - finalResult.hasLocation = false; - filteredResults.push(finalResult); - } - } - } - } - return filteredResults; - } - return []; - } - - createRequestUrls(realEstateRequests) { - const urls = []; - - for (const request of realEstateRequests) { - const { - realEstateType, - region, - sizeMin, - sizeMax, - priceMin, - priceMax - } = request; - - const urlRealEstateParams = [ - { - paramName: "kanton", - paramValue: region, - useParam: false - }, - { - paramName: "kategorija", - paramValue: getRealEstateTypeEnum(realEstateType).olxid, - useParam: true - }, - { - paramName: "kvadrata_min", - paramValue: sizeMin, - useParam: true - }, - { - paramName: "kvadrata_max", - paramValue: sizeMax, - useParam: true - }, - { - paramName: "od", - paramValue: priceMin, - useParam: true - }, - { - paramName: "do", - paramValue: priceMax, - useParam: true - } - ]; - const urlResultsParams = [ - { - paramName: "vrstapregleda", - paramValue: "tabela", - useParam: true - }, - { - paramName: "sort_order", - paramValue: "desc", - useParam: true - }, - { - paramName: "vrsta", - paramValue: "samoprodaja", - useParam: true - }, - { - paramName: "stranica", - paramValue: "0", - useParam: true - } - ]; - - const paramsReduceFunction = (accumulatedValue, currentParam) => { - const { 
paramName, paramValue, useParam } = currentParam; - if (useParam) { - return `${accumulatedValue}&${paramName}=${paramValue}`; - } else { - return accumulatedValue; - } - }; - - const reducedRealEstateParams = urlRealEstateParams.reduce( - paramsReduceFunction, - "" - ); - const reducedResultsParams = urlResultsParams.reduce( - paramsReduceFunction, - "" - ); - - const olxUrl = { - url: `https://www.olx.ba/pretraga?${reducedRealEstateParams}${reducedResultsParams}`, - email: request.email, - uuid: request.uniqueId, - hrefs: this.hrefs - }; - urls.push(olxUrl); - } - - return urls; - } -}; - -class Indexer { - /** - * - * @param {String|Array} olxUrl single or array of objects containing url email and uuid - * @param {Array} hrefResutls array contaning urls from crawler results - */ - - constructor(olxUrl, hrefResutls) { - this.olxUrl = olxUrl; - this.hrefResutls = hrefResutls; - } - - async indexWithPagination(pageNumber = 1) { - const pageNr = this.olxUrl.url.match(/\d+$/); - const indexers = this.prepareIndexers(pageNumber ? 
[pageNumber] : pageNr); - - try { - return Promise.map(indexers.indexers, function(indexer) { - return indexer.indexPage(pageNumber); - }).then(async results => { - let hasResults = false; - - results.forEach(result => { - if (!hasResults) { - hasResults = result.hasResults; - } - }); - - if (!hasResults) { - const singlePageIndexers = this.prepareHrefIndexers(results); - if (singlePageIndexers.length === 0) { - return []; - } - - return Promise.map(singlePageIndexers, function(indexer) { - return indexer.indexSingle(); - }).then(async results => { - return results; - }); - } else { - const newResults = await this.indexWithPagination( - results[0].pageNumber + 5 - ); - const singlePageIndexers = this.prepareHrefIndexers(results); - - const newerResults = await Promise.map(singlePageIndexers, function( - indexer - ) { - return indexer.indexSingle(); - }).then(async results => { - return results; - }); - - Array.prototype.push.apply(newResults, newerResults); - return newResults; - } - }); - } catch (e) { - console.error("Error has accured", e); - } - } - - prepareIndexers(pageNr) { - const indexers = []; - let lastPageNumber; - if (pageNr) { - for ( - let index = Number(pageNr[0]); - index <= Number(pageNr[0]) + 5; - index++ - ) { - lastPageNumber = index; - const newOlxUrl = { - url: this.olxUrl.url.replace(/\d+$/, "") + index, - email: this.olxUrl.email, - uuid: this.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - indexers.push(new Indexer(newOlxUrl)); - } - } else { - for (let index = 1; index <= 5; index++) { - lastPageNumber = index; - const newOlxUrl = { - url: this.olxUrl.url + index, - email: this.olxUrl.email, - uuid: this.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - indexers.push(new Indexer(newOlxUrl)); - } - } - return { - indexers: indexers, - lastPageNumber: lastPageNumber - }; - } - - prepareHrefIndexers(results) { - const indexers = []; - - if (!Array.isArray(results)) { - results.hrefs.forEach(href => { - const newOlxUrl = { - url: href, - email: 
results.olxUrl.email, - uuid: results.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - - indexers.push(new Indexer(newOlxUrl)); - }); - } else { - results.forEach(result => { - if (result !== null && result.hasOwnProperty("hrefs")) { - result.hrefs.forEach(href => { - const newOlxUrl = { - url: href, - email: result.olxUrl.email, - uuid: result.olxUrl.uuid, - hrefs: this.olxUrl.hrefs - }; - - indexers.push(new Indexer(newOlxUrl)); - }); - } - }); - } - - return indexers; - } - - async indexPage(pageNumber) { - try { - const res = await fetch(this.olxUrl.url); - const body = await res.text(); - const $ = cheerio.load(body); - const hrefs = []; - let hasResults = false; - - $("#rezultatipretrage") - .find(".listitem") - .each((i, elem) => { - hasResults = true; - const href = $(elem) - .find("a") - .first() - .attr("href"); - hrefs.push(href); - }); - return { - hrefs: hrefs, - hasResults: hasResults, - pageNumber: pageNumber, - olxUrl: this.olxUrl - }; - } catch (e) { - console.error("Exception caught:" + e); - } - } - - async indexSingle() { - try { - if (this.olxUrl.url === undefined) { - return {}; - } - - // if (global.hrefs) { - - if ( - this.olxUrl.hrefs[this.olxUrl.uuid] && - this.olxUrl.hrefs[this.olxUrl.uuid].includes(this.olxUrl.url) - ) { - return null; - } - // } - - const res = await fetch(this.olxUrl.url); - const body = await res.text(); - const $ = cheerio.load(body); - - const title = $("#naslovartikla") - .text() - .trim(); - const realEstateType = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" - ).text(); - - const price = $("#pc > p:nth-child(2)").text(); - const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text(); - const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text(); - const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text(); - const gardenSize = $( - "#dodatnapolja1 > div:nth-child(6) > div.df2" - ).text(); - const location = $( - 
"#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija" - ).attr("data-content"); - - const time = $("time").attr("datetime"); - const olxId = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2" - ).text(); - - const descriptions = $(".artikal_detaljniopis_tekst"); - const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; - const imgRe = /href":("[^"]*")/g; - const matches = latLngRe.exec(body); - let lng = "", - lat = ""; - const parsePrice = price => parseFloat(price.replace(".", "")); - - if (matches && matches.length >= 3) { - lat = matches[1]; - lng = matches[2]; - } - - const parsedPrice = parsePrice(price); - - const locationArray = - location && location.length > 0 ? location.split(",") : []; - const region = locationArray.length > 0 ? locationArray[0] : ""; - const municipality = locationArray.length > 1 ? locationArray[1] : ""; - - const data = { - realEstateType: this.getCategoryId(realEstateType), - email: this.olxUrl.email, - uuid: this.olxUrl.uuid, - olxId: olxId, - url: this.olxUrl.url, - title, - price: isNaN(parsedPrice) ? 0 : parsedPrice, - size: parseFloat(size), - gardenSize: isNaN(parseFloat(gardenSize)) ? 
0 : parseFloat(gardenSize),
-        address,
-        region,
-        municipality,
-        time,
-        shortDescription: descriptions.first().text(),
-        longDescription: descriptions.last().text(),
-        lat,
-        lng,
-        loc: [parseFloat(lat), parseFloat(lng)]
-      };
-
-      return data;
-    } catch (e) {
-      console.error("Exception caught: " + e.message);
-    }
-
-    return null;
-  }
-
-  getCategoryId(category) {
-    switch (category) {
-      case "Stanovi":
-        return "stan";
-
-      case "Vikendice":
-        return "vikendica";
-
-      case "Kuće":
-        return "kuca";
-
-      default:
-        return "";
-    }
-  }
-}
diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js
new file mode 100644
index 0000000..3f32b7c
--- /dev/null
+++ b/app/helpers/db/realEstate.js
@@ -0,0 +1,54 @@
+"use strict";
+const db = require("../../models/index");
+
+// Columns overwritten when a crawled ad already exists in RealEstates.
+const FIELDS_TO_UPDATE_IF_DUPLICATE = [
+  "realEstateType",
+  "adType",
+  "price",
+  "area",
+  "streetNumber",
+  "streetName",
+  "locality",
+  "municipality",
+  "city",
+  "region",
+  "entity",
+  "country",
+  "locationLat",
+  "locationLong",
+  "title",
+  "shortDescription",
+  "longDescription",
+  "gardenSize",
+  "adStatus",
+  "updatedAt"
+];
+
+/**
+ * Inserts the crawled real estates, updating the columns listed above for
+ * rows that already exist.
+ * @param {Object[]} realEstateData records to upsert
+ * @returns {Promise<Object[]|undefined>} the created instances, an empty
+ *   array when there is nothing to save, or undefined when the database
+ *   call failed (the error is logged, not rethrown)
+ */
+const bulkUpsertRealEstates = async realEstateData => {
+  // A crawler may hand over nothing (e.g. a result page that failed to load);
+  // skip the database call instead of passing a non-array to bulkCreate.
+  if (!Array.isArray(realEstateData) || realEstateData.length === 0) {
+    return [];
+  }
+
+  try {
+    return await db.RealEstate.bulkCreate(realEstateData, {
+      updateOnDuplicate: FIELDS_TO_UPDATE_IF_DUPLICATE
+    });
+  } catch (e) {
+    console.log("Error bulk upserting realEstates : ", e);
+  }
+};
+
+module.exports = {
+  bulkUpsertRealEstates
+};
diff --git a/app/migrations/20190916160603-add-title-column-to-realEstates.js b/app/migrations/20190916160603-add-title-column-to-realEstates.js
new file mode 100644
index 0000000..eafdeba
--- /dev/null
+++ b/app/migrations/20190916160603-add-title-column-to-realEstates.js
@@ -0,0 +1,13 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return queryInterface.addColumn("RealEstates", "title", {
+      type: Sequelize.STRING
+    });
+  },
+
+  down: (queryInterface, Sequelize) => {
+    return queryInterface.removeColumn("RealEstates", 
"title");
+  }
+};
diff --git a/app/migrations/20190916230229-add-description-columns-to-realEstates.js b/app/migrations/20190916230229-add-description-columns-to-realEstates.js
new file mode 100644
index 0000000..00c1d5b
--- /dev/null
+++ b/app/migrations/20190916230229-add-description-columns-to-realEstates.js
@@ -0,0 +1,21 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return Promise.all([
+      queryInterface.addColumn("RealEstates", "shortDescription", {
+        type: Sequelize.STRING
+      }),
+      queryInterface.addColumn("RealEstates", "longDescription", {
+        type: Sequelize.STRING
+      })
+    ]);
+  },
+
+  down: (queryInterface, Sequelize) => {
+    return Promise.all([
+      queryInterface.removeColumn("RealEstates", "shortDescription"),
+      queryInterface.removeColumn("RealEstates", "longDescription")
+    ]);
+  }
+};
diff --git a/app/migrations/20190916231241-add-status-column-to-realEstates.js b/app/migrations/20190916231241-add-status-column-to-realEstates.js
new file mode 100644
index 0000000..9de0caa
--- /dev/null
+++ b/app/migrations/20190916231241-add-status-column-to-realEstates.js
@@ -0,0 +1,13 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return queryInterface.addColumn("RealEstates", "adStatus", {
+      type: Sequelize.INTEGER
+    });
+  },
+
+  down: (queryInterface, Sequelize) => {
+    return queryInterface.removeColumn("RealEstates", "adStatus");
+  }
+};
diff --git a/app/migrations/20190917090318-add-composite-key-to-realEstates.js b/app/migrations/20190917090318-add-composite-key-to-realEstates.js
new file mode 100644
index 0000000..feb84b2
--- /dev/null
+++ b/app/migrations/20190917090318-add-composite-key-to-realEstates.js
@@ -0,0 +1,21 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return queryInterface.addConstraint(
+      "RealEstates",
+      ["originAgencyName", "agencyObjectId"],
+      {
+        type: "unique",
+        name: "agencyNameObjectIdUniqueKey"
+      }
+    );
+  },
+
+  down: (queryInterface, Sequelize) => {
+    return queryInterface.removeConstraint(
+      "RealEstates",
+      "agencyNameObjectIdUniqueKey"
+    );
+  }
+};
diff --git a/app/migrations/20190917101052-remove-lastTimeCrawled-column-from-realEstates.js b/app/migrations/20190917101052-remove-lastTimeCrawled-column-from-realEstates.js
new file mode 100644
index 0000000..c9b49c2
--- /dev/null
+++ b/app/migrations/20190917101052-remove-lastTimeCrawled-column-from-realEstates.js
@@ -0,0 +1,18 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return queryInterface.removeColumn("RealEstates", "lastTimeCrawled");
+  },
+
+  down: (queryInterface, Sequelize) => {
+    // `notNull` is not a column option and was silently ignored; the
+    // constraint is spelled `allowNull: false`. Existing rows need a value,
+    // hence the default.
+    return queryInterface.addColumn("RealEstates", "lastTimeCrawled", {
+      type: Sequelize.DATE,
+      allowNull: false,
+      defaultValue: Sequelize.fn("NOW")
+    });
+  }
+};
diff --git a/app/migrations/20190917102437-remove-deleted-sold-columns-from-realEstates.js b/app/migrations/20190917102437-remove-deleted-sold-columns-from-realEstates.js
new file mode 100644
index 0000000..03f43ff
--- /dev/null
+++ b/app/migrations/20190917102437-remove-deleted-sold-columns-from-realEstates.js
@@ -0,0 +1,27 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return Promise.all([
+      queryInterface.removeColumn("RealEstates", "deleted"),
+      queryInterface.removeColumn("RealEstates", "sold")
+    ]);
+  },
+
+  down: (queryInterface, Sequelize) => {
+    // `allowNull: false` replaces the ignored `notNull` option; the default
+    // gives pre-existing rows a value so the constraint can be added.
+    return Promise.all([
+      queryInterface.addColumn("RealEstates", "deleted", {
+        type: Sequelize.BOOLEAN,
+        allowNull: false,
+        defaultValue: false
+      }),
+      queryInterface.addColumn("RealEstates", "sold", {
+        type: Sequelize.BOOLEAN,
+        allowNull: false,
+        defaultValue: false
+      })
+    ]);
+  }
+};
diff --git a/app/migrations/20190917115804-change-title-short-long-description-columns-in-realEstates.js b/app/migrations/20190917115804-change-title-short-long-description-columns-in-realEstates.js
new file mode 100644
index 0000000..9ed3b9e
--- /dev/null
+++ b/app/migrations/20190917115804-change-title-short-long-description-columns-in-realEstates.js
@@ -0,0 +1,21 @@
+"use strict";
+
+module.exports = {
+  up: (queryInterface, Sequelize) => {
+    return Promise.all([
+      queryInterface.changeColumn("RealEstates", "shortDescription", {
+        type: Sequelize.TEXT
+      }),
+      queryInterface.changeColumn("RealEstates", "longDescription", {
+        type: Sequelize.TEXT
+      }),
+      queryInterface.changeColumn("RealEstates", "title", {
+        type: Sequelize.TEXT
+      })
+    ]);
+  },
+
+  down: (queryInterface, Sequelize) => {
+    return Promise.all([]);
+  }
+};
diff --git a/app/models/realEstate.js b/app/models/realEstate.js
index 66b7a92..72b76ec 100644
--- a/app/models/realEstate.js
+++ b/app/models/realEstate.js
@@ -12,13 +12,17 @@ module.exports = (sequelize, DataTypes) => {
       type: DataTypes.TEXT,
       allowNull: false
     },
-    agencyObjectId: {
-      type: DataTypes.TEXT,
-      allowNull: false
-    },
+    // originAgencyName + agencyObjectId form one composite unique key (same
+    // name as the constraint added by the migration), not two separate keys.
     originAgencyName: {
       type: DataTypes.TEXT,
-      allowNull: false
+      allowNull: false,
+      unique: "agencyNameObjectIdUniqueKey"
+    },
+    agencyObjectId: {
+      type: DataTypes.TEXT,
+      allowNull: false,
+      unique: "agencyNameObjectIdUniqueKey"
     },
     realEstateType: {
       type: DataTypes.TEXT,
@@ -45,14 +49,10 @@ module.exports = (sequelize, DataTypes) => {
       type: DataTypes.DATE,
       allowNull: false
     },
-    deleted: {
-      type: DataTypes.BOOLEAN,
-      allowNull: false
-    },
-    sold: {
-      type: DataTypes.BOOLEAN,
-      allowNull: false
-    }
+    title: DataTypes.TEXT,
+    shortDescription: DataTypes.TEXT,
+    longDescription: DataTypes.TEXT,
+    adStatus: DataTypes.INTEGER
   });
 
   RealEstate.associate = models => {
diff --git a/app/services/crawlerService.js b/app/services/crawlerService.js
deleted file mode 100644
index 1de43b8..0000000
--- a/app/services/crawlerService.js
+++ /dev/null
@@ -1,77 +0,0 @@
-const Promise = require("bluebird");
-const OlxCrawler = require("../helpers/crawlers/olxClawler");
-const db = require("../models/index");
-const { allMarketAlerts } = require("../helpers/db/dbHelper");
-
-async function crawlAll() {
-  try {
-    const marketAlertsFromDb = await allMarketAlerts(true);
-    const hrefs = [];
-
-    marketAlertsFromDb.map(marketAlert => {
-      if (hrefs[marketAlert.request] === 
undefined) { - hrefs[marketAlert.request] = []; - } - - hrefs[marketAlert.request].push(marketAlert.url); - }); - - const olxCrawler = new OlxCrawler(hrefs); - - const crawlers = [olxCrawler]; - - return Promise.map(crawlers, function(crawler) { - return crawler.crawl(); - }).then(async results => { - try { - const marketAlertsFromDb = await allMarketAlerts(false, true); - - const marketAlerts = []; - const mergedResults = [].concat.apply([], results); - - for (const result of mergedResults) { - marketAlerts.push({ - url: result.url, - realestateOrigin: "OLX", - originId: 1, - size: result.size, - price: result.price, - email: result.email, - request: result.uuid, - municipality: result.municipality, - region: result.region, - gardenSize: isNaN(result.gardenSize) ? 0 : result.gardenSize, - realEstateType: result.realEstateType, - title: result.title, - notified: false, - hasLocation: result.hasLocation - }); - } - - try { - const filteredMarketAlerts = marketAlerts.filter( - elem => - !marketAlertsFromDb.find(({ url, request }) => { - return elem.url === url && elem.request === request; - }) - ); - - await db.MarketAlert.bulkCreate(filteredMarketAlerts); - } catch (e) { - console.log( - "CRAWLER SERVICE: Could not bulkCreate marketalers reason: ", - e - ); - } - } catch (e) { - console.log( - "CRAWLER SERVICE: Error crawling. Trying next crawler! 
", - e - ); - } - }); - } catch (e) { - console.error("CRAWLER SERVICE:could not fetch marketalerts ", e); - } -} -module.exports = crawlAll; diff --git a/development.env b/development.env index 47d75c3..f998e9d 100644 --- a/development.env +++ b/development.env @@ -13,3 +13,11 @@ AMAZON_SECRET_ACCESS_KEY=(your-key-here) AMAZON_REGION=eu-west-1 APP_URL=http://localhost:3001 SOURCE_EMAIL=info@saburly.com + +#=============== CRAWLER SETTINGS===============# +#==OLX== +OLX_START_PAGE=Crawler starts from this page +OLX_END_PAGE=Crawler ends with this page (including this page) +OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved +OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values +OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values diff --git a/package-lock.json b/package-lock.json index 8516da0..cac150b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -53,11 +53,6 @@ "@types/node": "*" } }, - "@types/geojson": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/@types/geojson/-/geojson-1.0.6.tgz", - "integrity": "sha512-Xqg/lIZMrUd0VRmSRbCAewtwGZiAk3mEUDvV4op1tGl+LvyPcb/MIOSxTl9z+9+J+R4/vpjiCAT4xeKzH9ji1w==" - }, "@types/node": { "version": "11.11.1", "resolved": "https://registry.npmjs.org/@types/node/-/node-11.11.1.tgz", @@ -126,6 +121,11 @@ "color-convert": "^1.9.0" } }, + "any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha1-q8av7tzqUugJzcA3au0845Y10X8=" + }, "anymatch": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", @@ -1924,11 +1924,6 @@ } } }, - "generic-pool": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.5.0.tgz", - "integrity": 
"sha512-dEkxmX+egB2o4NR80c/q+xzLLzLX+k68/K8xv81XprD+Sk7ZtP14VugeCz+fUwv5FzpWq40pPtAkzPRqT8ka9w==" - }, "get-caller-file": { "version": "2.0.5", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", @@ -3419,12 +3414,11 @@ "dev": true }, "retry-as-promised": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/retry-as-promised/-/retry-as-promised-2.3.2.tgz", - "integrity": "sha1-zZdO5P2bX+A8vzGHHuSCIcB3N7c=", + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/retry-as-promised/-/retry-as-promised-3.2.0.tgz", + "integrity": "sha512-CybGs60B7oYU/qSQ6kuaFmRd9sTZ6oXSc0toqePvV74Ac6/IFZSI1ReFQmtCN+uvW1Mtqdwpvt/LGOiCBAY2Mg==", "requires": { - "bluebird": "^3.4.6", - "debug": "^2.6.9" + "any-promise": "^1.3.0" } }, "safe-buffer": { @@ -3486,41 +3480,44 @@ } }, "sequelize": { - "version": "4.43.2", - "resolved": "https://registry.npmjs.org/sequelize/-/sequelize-4.43.2.tgz", - "integrity": "sha512-EA3V1AsxVjf2EtGbdEoa9Fe5rSAqy5g4OsX0VwtU6iMezTjIYTCXV8o6mG7i6u3lu4Zc7JWZ6XwhS0k79pT/EQ==", + "version": "5.18.4", + "resolved": "https://registry.npmjs.org/sequelize/-/sequelize-5.18.4.tgz", + "integrity": "sha512-bBmJqpO1H8Z7L0xzITqVo5KHXFI7GmKfGl/5SIPDKsuUMbuZT98s+gyGeaLXpOWGH1ZUO79hvJ8z74vNcxBWHg==", "requires": { "bluebird": "^3.5.0", "cls-bluebird": "^2.1.0", - "debug": "^3.1.0", - "depd": "^1.1.0", + "debug": "^4.1.1", "dottie": "^2.0.0", - "generic-pool": "3.5.0", "inflection": "1.12.0", - "lodash": "^4.17.1", - "moment": "^2.20.0", - "moment-timezone": "^0.5.14", - "retry-as-promised": "^2.3.2", - "semver": "^5.5.0", - "terraformer-wkt-parser": "^1.1.2", + "lodash": "^4.17.11", + "moment": "^2.24.0", + "moment-timezone": "^0.5.21", + "retry-as-promised": "^3.1.0", + "semver": "^6.1.1", + "sequelize-pool": "^2.3.0", "toposort-class": "^1.0.1", "uuid": "^3.2.1", - "validator": "^10.4.0", - "wkx": "^0.4.1" + "validator": "^10.11.0", + "wkx": "^0.4.6" }, "dependencies": { "debug": { - "version": "3.2.6", - "resolved": 
"https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", - "integrity": "sha512-mel+jf7nrtEl5Pn1Qx46zARXKDpBbvzezse7p7LqINmdoIk8PYP5SySaxEmYv6TZ0JyEKA1hsCId6DIhgITtWQ==", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.1.1.tgz", + "integrity": "sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw==", "requires": { "ms": "^2.1.1" } }, "ms": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", - "integrity": "sha512-tgp+dl5cGk28utYktBsrFqA7HKgrhgPsg6Z/EfhWI4gl1Hwq8B/GmY/0oXZ6nF8hDVesS/FpnYaD/kOWhYQvyg==" + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + }, + "semver": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", + "integrity": "sha512-b39TBaTSfV6yBrapU89p5fKekE2m/NwnDocOVruQFS1/veMgdzuPcnOM34M6CwxW8jH/lxEa5rBoDeUwu5HHTw==" } } }, @@ -3539,6 +3536,11 @@ "yargs": "^13.1.0" } }, + "sequelize-pool": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/sequelize-pool/-/sequelize-pool-2.3.0.tgz", + "integrity": "sha512-Ibz08vnXvkZ8LJTiUOxRcj1Ckdn7qafNZ2t59jYHMX1VIebTAOYefWdRYFt6z6+hy52WGthAHAoLc9hvk3onqA==" + }, "serve-static": { "version": "1.13.2", "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz", @@ -3904,23 +3906,6 @@ } } }, - "terraformer": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/terraformer/-/terraformer-1.0.9.tgz", - "integrity": "sha512-YlmQ1fsMWTkKGDGibCRWgmLzrpDRUr63Q025LJ/taYQ6j1Yb8q9McKF7NBi6ACAyUXO6F/bl9w6v4MY307y5Ag==", - "requires": { - "@types/geojson": "^1.0.0" - } - }, - "terraformer-wkt-parser": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/terraformer-wkt-parser/-/terraformer-wkt-parser-1.2.0.tgz", - "integrity": 
"sha512-QU3iA54St5lF8Za1jg1oj4NYc8sn5tCZ08aNSWDeGzrsaV48eZk1iAVWasxhNspYBoCqdHuoot1pUTUrE1AJ4w==", - "requires": { - "@types/geojson": "^1.0.0", - "terraformer": "~1.0.5" - } - }, "through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -4296,9 +4281,9 @@ } }, "wkx": { - "version": "0.4.6", - "resolved": "https://registry.npmjs.org/wkx/-/wkx-0.4.6.tgz", - "integrity": "sha512-LHxXlzRCYQXA9ZHgs8r7Gafh0gVOE8o3QmudM1PIkOdkXXjW7Thcl+gb2P2dRuKgW8cqkitCRZkkjtmWzpHi7A==", + "version": "0.4.8", + "resolved": "https://registry.npmjs.org/wkx/-/wkx-0.4.8.tgz", + "integrity": "sha512-ikPXMM9IR/gy/LwiOSqWlSL3X/J5uk9EO2hHNRXS41eTLXaUFEVw9fn/593jW/tE5tedNg8YjT5HkCa4FqQZyQ==", "requires": { "@types/node": "*" } diff --git a/package.json b/package.json index dc49bf6..a1447c3 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,8 @@ "migrate": "cd app && npx sequelize db:migrate", "setup": "docker build -t marketalerts . && docker run -e POSTGRES_USER=docker -e POSTGRES_PASSWORD=docker -e POSTGRES_DB=marketalerts --name pg_marketalerts -d -p 5432:5432 marketalerts && sleep 4 && npm run migrate", "docker-start": "docker start pg_marketalerts", - "docker-stop": "docker stop pg_marketalerts" + "docker-stop": "docker stop pg_marketalerts", + "crawl": "cd app/crawler && node crawl.js" }, "repository": { "type": "git", @@ -37,7 +38,7 @@ "node-schedule": "^1.3.2", "pg": "^7.10.0", "react-step-wizard": "^5.1.0", - "sequelize": "^4.43.2", + "sequelize": "^5.18.4", "sequelize-cli": "^5.5.0" }, "devDependencies": {