From 3d46c82d3df6b1757d015b40eb126d66de4d3fb9 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Wed, 18 Sep 2019 15:32:48 +0200 Subject: [PATCH] create new crawler and Postgres saver --- app/crawler/crawl.js | 42 +--- app/crawler/crawlerConfig.js | 32 +++ app/crawler/savers/postgres.js | 15 +- app/crawler/specific/olx.js | 438 ++++++++++++++++++++++++--------- 4 files changed, 363 insertions(+), 164 deletions(-) create mode 100644 app/crawler/crawlerConfig.js diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index dcdf094..0f9dcc9 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -8,48 +8,28 @@ require("dotenv").config(); const OlxCrawler = require("./specific/olx"); +const { OLX_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); -let crawlers = [ - // new OlxCrawler( - // process.env.OLX_FROM_PAGE, - // process.env.OLX_TO_PAGE, - // process.env.OLX_MAX_RESULTS - // ) - // new ProstorCrawler( - // parseInt(process.env.PROSTOR_FROM_PAGE), - // parseInt(process.env.PROSTOR_TO_PAGE), - // parseInt(process.env.PROSTOR_MAX_RESULTS) - // ), - // new RentalCrawler( - // parseInt(process.env.RENTAL_FROM_PAGE), - // parseInt(process.env.RENTAL_TO_PAGE), - // parseInt(process.env.RENTAL_MAX_RESULTS) - // ) +const crawlers = [ + new OlxCrawler( + OLX_CONFIG.OLX_START_PAGE, + OLX_CONFIG.OLX_END_PAGE, + OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, + [new PostgresSaver()], + OLX_CONFIG.OLX_CRAWLER_AD_TYPE, + OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES + ) ]; -let savers = [new PostgresSaver(process.env.MONGO_URL)]; - async function crawlAll() { for (let crawler of crawlers) { try { - const crawlerResults = await crawler.crawl(); - for (let saver of savers) { - try { - await saver.connect(); - await saver.save(crawlerResults); - } catch (e) { - console.log("Error saving. Trying next saver! ", e); - } - } + await crawler.crawl(); } catch (e) { console.log("Error crawling. Trying next crawler! ", e); } } - - for (let saver of savers) { - saver.close(); - } } crawlAll(); diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js new file mode 100644 index 0000000..d524fce --- /dev/null +++ b/app/crawler/crawlerConfig.js @@ -0,0 +1,32 @@ +"use strict"; +require("dotenv").config({ path: "../../.env" }); +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); + +const crawlerAdType = + process.env.OLX_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] + : null; + +const parsedCrawlerAdCategories = + process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["CATEGORY_FLAT", "CATEGORY_HOUSE"]; + +const transformedCrawlerAdCategories = parsedCrawlerAdCategories + .map(categoryName => AD_CATEGORY[categoryName]) + .filter(category => !!category); + +const OLX_CONFIG = { + OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1, + OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10, + OLX_MAX_RESULTS_PER_PAGE: + parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, + OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE, + OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories +}; + +module.exports = { + OLX_CONFIG +}; diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js index d67dbd4..5aa0c3a 100644 --- a/app/crawler/savers/postgres.js +++ b/app/crawler/savers/postgres.js @@ -1,8 +1,6 @@ -class PostgresSaver { - constructor(url) { - this.url = url; - } +const { bulkUpsertRealEstates } = require("../../helpers/db/realEstate"); +class PostgresSaver { connect() { //TODO: It seems we never worry about open/close connection with Sequelize ? //TODO: Check if postgres is ready @@ -10,13 +8,8 @@ class PostgresSaver { } async save(results) { - let resultsForMongo = Object.keys(results).map(key => { - return results[key]; - }); - - for (const doc of resultsForMongo) { - this.collection.update({ url: doc.url }, doc, { upsert: true }); - } + console.log("[POSTGRES] Saving..."); + await bulkUpsertRealEstates(results); } close() { diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index d5c9432..0bf0a35 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -6,21 +6,136 @@ let cheerio = require("cheerio"); const { AD_TYPE, AD_CATEGORY, - IGNORED_USERNAMES + IGNORED_USERNAMES, + AD_AGENCY, + AD_STATUS, + CRAWLER_AD_TYPE } = require("../../common/enums"); +const OLX_ENUMS = { + OLX_AD_TYPE: {}, + OLX_AD_CATEGORY: {}, + MAX_DETAIL_FIELDS: 30 +}; + +OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = ""; +OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja"; +OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje"; + +OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23"; +OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24"; +OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29"; +OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25"; +OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27"; +OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30"; + class OlxCrawler { - constructor(fromPage = 0, toPage = 10, maxResults = 1000) { + constructor( + fromPage = 1, + toPage = 10, + maxResults = 1000, + savers = [], + crawlerAdTypes = CRAWLER_AD_TYPE.ALL, + crawlerAdCategories = [ + AD_CATEGORY.CATEGORY_FLAT, + AD_CATEGORY.CATEGORY_HOUSE + ] + ) { this.fromPage = fromPage; this.toPage = toPage; this.maxResults = maxResults; + this.savers = savers; + this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; + this.crawlerAdTypes = crawlerAdTypes; + this.crawlerAdCategories = crawlerAdCategories; } - async indexSingle(url) { + async crawl() { + console.log("[OLX] Crawler started"); + const crawlAdTypes = this.crawlerAdTypes; + const crawlAdCategories = this.crawlerAdCategories; + + const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`; + + if (crawlAdCategories && crawlAdTypes) { + const asyncPagesIndexingByCategory = []; + for (const adCategory of crawlAdCategories) { + asyncPagesIndexingByCategory.push( + this.indexPages( + `${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}` + ) + ); + } + + await Promise.all(asyncPagesIndexingByCategory); + } + console.log("[OLX] Crawler finished"); + } + + async indexPages(url) { + const startPage = this.fromPage; + const endPage = this.toPage; + const maxResultsPerPage = this.maxResults; + + for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { + const singlePageResults = await this.indexSinglePage( + url, + pageNumber, + maxResultsPerPage + ); + await this.saveCrawledResults(singlePageResults); + await this.sleep(5000); + } + } + + async indexSinglePage(urlWithoutPageNumber, pageNumber, maxResultsPerPage) { try { + const url = `${urlWithoutPageNumber}&stranica=${pageNumber}`; + const res = await fetch(url); const body = await res.text(); const $ = cheerio.load(body); + let hrefs = []; + const singlePageResults = []; + + $("#rezultatipretrage") + .find(".listitem") + .each((i, elem) => { + const href = $(elem) + .find("a") + .first() + .attr("href"); + if (href) { + hrefs.push(href); + } + }); + + let actualNoOfResults = + hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; + + for (let i = 0; i < actualNoOfResults; i++) { + console.log(`Scraping : ${hrefs[i]}`); + + const adData = await this.scrapeAd(hrefs[i]); + + if (adData) { + singlePageResults.push(adData); + } + await this.sleep(500); + } + + return singlePageResults; + } catch (e) { + console.error("Exception caught:" + e); + } + } + + async scrapeAd(url) { + try { + const adPageSource = await fetch(url); + const body = await adPageSource.text(); + const $ = cheerio.load(body); + let status = AD_STATUS.STATUS_NORMAL; const username = $( "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" @@ -31,161 +146,240 @@ class OlxCrawler { } const title = $("#naslovartikla").text(); + const descriptions = $(".artikal_detaljniopis_tekst"); const category = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" ).text(); - const price = $("#pc > p:nth-child(2)").text(); - const size = $("#dodatnapolja1 > div:nth-child(1) > div.df2").text(); - const rooms = $("#dodatnapolja1 > div:nth-child(2) > div.df2").text(); - const address = $("#dodatnapolja1 > div:nth-child(5) > div.df2").text(); - const location = $( - "#artikal_glavni_div > div.artikal_lijevo > div.op.pop.mobile-lokacija" - ).attr("data-content"); - - const adType = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(2) > div.df2" - ).text(); - const time = $("time").attr("datetime"); - const olxId = $( - "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(15) > div:nth-child(4) > div.df2" + //====== PRICE DETECTION AND EXTRACTION ===== + let price = null; + const normalPriceValue = $("#pc > p:nth-child(2)").text(); + const urgentPriceValue = $( + "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" ).text(); - const descriptions = $(".artikal_detaljniopis_tekst"); - const floor = $("#dodatnapolja1") - .find(":contains(Sprat)") - .last() - .nextAll() - .text(); - const latLngRe = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; - const matches = latLngRe.exec(body); - let lng = "", - lat = ""; - - const parseRooms = rooms => - parseInt( - [...rooms] - .filter(c => !isNaN(c)) - .filter(c => c.trim()) - .join() - ); - const parsePrice = price => parseFloat(price.replace(".", "")); - - if (matches && matches.length >= 3) { - lat = matches[1]; - lng = matches[2]; + if (normalPriceValue && normalPriceValue.length > 0) { + price = normalPriceValue; + if ( + $("#pc > p.n") + .text() + .indexOf("Hitna") !== -1 + ) { + status = AD_STATUS.STATUS_URGENT; + } else { + status = AD_STATUS.STATUS_NORMAL; + } + } else if (urgentPriceValue && urgentPriceValue.length > 0) { + const priceValues = urgentPriceValue.split("KM"); + //priceValues will contain values like ["100000", "90000", ...], second element is urgent price + if (priceValues.length > 1) { + price = priceValues[1].trim(); + status = AD_STATUS.STATUS_DISCOUNTED; + } else { + throw { message: "Can't find urgent price" }; + } + } else { + throw { + message: "Can't find price (it is not normal nor urgent price ?)" + }; } - const parsedPrice = parsePrice(price); - let parsedRooms; + //====== OTHER AD INFORMATION =============== + let adType = null; + let olxId = null; - if (rooms === "Garsonjera") { - parsedRooms = 0; - } else { - parsedRooms = parseRooms(rooms); + let otherInformationDivId; + //We need to locate DIV ID where other information are stored + for (let possibleId = 10; possibleId <= 20; possibleId++) { + const adTypeFieldTitle = $( + `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${possibleId}) > div:nth-child(2) > div.df1` + ) + .text() + .trim(); + + if (adTypeFieldTitle === "Vrsta oglasa") { + otherInformationDivId = possibleId; + break; + } + } + + if (!otherInformationDivId) { + throw { message: "Other information DIV could not be found" }; + } + + const olxIdFieldSelector = `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(4)`; + + adType = $( + `#artikal_glavni_div > div.artikal_lijevo > div:nth-child(${otherInformationDivId}) > div:nth-child(2) > div.df2` + ) + .text() + .trim(); + const olxIdFieldTitle = $(`${olxIdFieldSelector} > div.df1`) + .text() + .trim(); + olxId = $(`${olxIdFieldSelector} > div.df2`) + .text() + .trim(); + + if (olxIdFieldTitle !== "OLX ID") { + throw { message: "Cannot find correct OLX ID" }; + } + //=========================================== + + //====== DETAIL INFORMATION FIELDS ========== + let area = null; + let gardenSize = null; + + let fieldIndex = 1; + do { + const fieldSelector = `#dodatnapolja1 > div:nth-child(${fieldIndex})`; + const fieldTitleSelector = `${fieldSelector} > div.df1`; + const fieldValueSelector = `${fieldSelector} > div.df2`; + + const fieldTitle = $(fieldTitleSelector) + .text() + .trim(); + const fieldValue = $(fieldValueSelector) + .text() + .trim(); + + switch (fieldTitle) { + case "Kvadrata": + area = fieldValue; + break; + case "Okućnica (kvadratura)": + gardenSize = fieldValue; + break; + } + + if (++fieldIndex === OLX_ENUMS.MAX_DETAIL_FIELDS || fieldTitle === "") { + break; + } + } while (true); + //=========================================== + + //====== UNUSED FIELDS FOR NOW ============== + const time = $("time").attr("datetime"); + const numberOfViews = $( + "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" + ).text(); + //=========================================== + + //========================================= + const parsedCategory = this.getAdCategoryId(category); + if (!parsedCategory) { + throw { message: "Unknown ad category" }; + } + + const parsedAdType = this.getAdTypeId(adType); + if (!parsedAdType) { + throw { message: "Unknown ad type" }; + } + + const parsedArea = this.parseArea(area) || null; + const parsedGardenSize = this.parseArea(gardenSize) || null; + const parsedPrice = this.parsePrice(price) || null; + + const latLngRegex = /LatLng\(([0-9]+\.[0-9]+)\,\s+([0-9]+\.[0-9]+)\)/g; + const locationLatLngMatches = latLngRegex.exec(body); + + let locationLat = null; + let locationLong = null; + if (locationLatLngMatches && locationLatLngMatches.length >= 3) { + locationLat = parseFloat(locationLatLngMatches[1]) || null; + locationLong = parseFloat(locationLatLngMatches[2]) || null; } const data = { - category: this.getCategoryId(category), url, + agencyObjectId: olxId, + originAgencyName: AD_AGENCY.OLX, + realEstateType: this.getAdCategoryId(category), + adType: parsedAdType, title, - price: isNaN(parsedPrice) ? price : parsedPrice, - size: parseFloat(size), - rooms: parsedRooms, - floor: parseInt(floor), - address, - location, - adType: AD_TYPE.AD_TYPE_SALE, - time, + price: parsedPrice, + area: parsedArea, + gardenSize: parsedGardenSize, shortDescription: descriptions.first().text(), longDescription: descriptions.last().text(), - lat, - lng, - loc: [parseFloat(lat), parseFloat(lng)] + streetNumber: 0, + streetName: "", + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat, + locationLong, + adStatus: status }; return data; } catch (e) { - console.error("Exception caught: " + e.message); + console.error("Exception caught: " + e.message, "\r\nURL:", url); } - return null; } - async indexPage(pageNr, maxResults = 1000) { - try { - console.log("Starting to index page: " + pageNr); - const url = `http://www.olx.ba/pretraga?vrsta=samoprodaja&sort_order=desc&kategorija=23&sort_po=datum&kanton=9&stranica=${pageNr}`; + //======= HELPER FUNCTIONS ============= - const res = await fetch(url); - const body = await res.text(); - const $ = cheerio.load(body); - const hrefs = []; - const results = {}; - - $("#rezultatipretrage") - .find(".listitem") - .each((i, elem) => { - const href = $(elem) - .find("a") - .first() - .attr("href"); - hrefs.push(href); - }); - - let actualNoOfResults = - hrefs.length <= maxResults ? hrefs.length : maxResults; - - for (let i = 0; i < hrefs.length; i++) { - console.log(`indexing: ${hrefs[i]}`); - - const singleData = await this.indexSingle(hrefs[i]); - - if (singleData) { - results[hrefs[i]] = singleData; - } - await this.sleep(500); - } - - return results; - } catch (e) { - console.error("Exception caught:" + e); + getAdCategoryId(categoryText) { + switch (categoryText) { + case "Stanovi": + return AD_CATEGORY.CATEGORY_FLAT; + case "Zemljišta": + return AD_CATEGORY.CATEGORY_LAND; + case "Kuće": + return AD_CATEGORY.CATEGORY_HOUSE; + case "Poslovni prostori": + return AD_CATEGORY.CATEGORY_OFFICE; + default: + return undefined; } } - getCategoryId(category) { - if (category === "Stanovi") { - return AD_CATEGORY.CATEGORY_FLAT; - } else if (category === "Zemljišta") { - return AD_CATEGORY.CATEGORY_LAND; - } else if (category === "Kuće") { - return AD_CATEGORY.CATEGORY_HOUSE; - } else if (category === "Poslovni prostori") { - return AD_CATEGORY.CATEGORY_OFFICE; + getAdTypeId(adTypeText) { + switch (adTypeText) { + case "Prodaja": + return AD_TYPE.AD_TYPE_SALE; + case "Izdavanje": + return AD_TYPE.AD_TYPE_RENT; + default: + return undefined; } } + parseArea(areaText) { + if (!areaText) { + return NaN; + } + const removeDotsExceptLastOneRegex = /[.](?=.*[.])/g; + const textWithOnlyOneDecimalDot = areaText + .replace(",", ".") + .replace(removeDotsExceptLastOneRegex, ""); + + return parseFloat(textWithOnlyOneDecimalDot); + } + + parsePrice(priceText) { + if (!priceText) { + return NaN; + } + const formattedPriceText = priceText.replace(".", "").replace(",", "."); + return parseFloat(formattedPriceText); + } + async sleep(ms) { return new Promise(resolve => setTimeout(resolve, ms)); } - async indexPages(start, end, maxResults = 1000) { - let results = {}; - for (let i = start; i <= end; i++) { - let result = await this.indexPage(i, maxResults); - Object.assign(results, result); - await this.sleep(5000); - } - return results; - } + async saveCrawledResults(results) { + const savers = this.savers; - async crawl() { - let results = await this.indexPages( - this.fromPage, - this.toPage, - this.maxResults - ); - return results; + for (const saver of savers) { + await saver.save(results); + } } }