diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index 0f9dcc9..2ff5118 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -13,12 +13,12 @@ const PostgresSaver = require("./savers/postgres"); const crawlers = [ new OlxCrawler( - OLX_CONFIG.OLX_START_PAGE, - OLX_CONFIG.OLX_END_PAGE, - OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, [new PostgresSaver()], OLX_CONFIG.OLX_CRAWLER_AD_TYPE, - OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES + OLX_CONFIG.OLX_CRAWLER_AD_CATEGORIES, + OLX_CONFIG.OLX_MAX_PAGES, + OLX_CONFIG.OLX_MAX_RESULTS_PER_PAGE, + OLX_CONFIG.OLX_MAX_AGE ) ]; diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index d524fce..a602ad8 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -2,29 +2,29 @@ require("dotenv").config({ path: "../../.env" }); const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); -const crawlerAdType = +const olxCrawlerAdType = process.env.OLX_CRAWLER_AD_TYPE !== undefined ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] : null; -const parsedCrawlerAdCategories = +const olxParsedCrawlerAdCategories = process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined ? 
process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => category.trim() ) : ["CATEGORY_FLAT", "CATEGORY_HOUSE"]; -const transformedCrawlerAdCategories = parsedCrawlerAdCategories +const transformedCrawlerAdCategories = olxParsedCrawlerAdCategories .map(categoryName => AD_CATEGORY[categoryName]) .filter(category => !!category); const OLX_CONFIG = { - OLX_START_PAGE: parseInt(process.env.OLX_START_PAGE) || 1, - OLX_END_PAGE: parseInt(process.env.OLX_END_PAGE) || 10, + OLX_MAX_PAGES: parseInt(process.env.MAX_PAGES) || 500, OLX_MAX_RESULTS_PER_PAGE: parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, - OLX_CRAWLER_AD_TYPE: crawlerAdType || CRAWLER_AD_TYPE.NONE, - OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories + OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, + OLX_CRAWLER_AD_CATEGORIES: transformedCrawlerAdCategories, + OLX_MAX_AGE: parseInt(process.env.OLX_MAX_AGE) || 30 }; module.exports = { diff --git a/app/crawler/savers/postgres.js b/app/crawler/savers/postgres.js index 5aa0c3a..4664be7 100644 --- a/app/crawler/savers/postgres.js +++ b/app/crawler/savers/postgres.js @@ -7,9 +7,9 @@ class PostgresSaver { return true; } - async save(results) { + async save(results, maxAge) { console.log("[POSTGRES] Saving..."); - await bulkUpsertRealEstates(results); + await bulkUpsertRealEstates(results, maxAge); } close() { diff --git a/app/crawler/specific/olx.js b/app/crawler/specific/olx.js index 9723c74..37af9e6 100644 --- a/app/crawler/specific/olx.js +++ b/app/crawler/specific/olx.js @@ -1,7 +1,8 @@ "use strict"; -let fetch = require("node-fetch"); -let cheerio = require("cheerio"); +const fetch = require("node-fetch"); +const cheerio = require("cheerio"); +const Promise = require("bluebird"); const { AD_TYPE, @@ -13,78 +14,113 @@ const { } = require("../../common/enums"); const OLX_ENUMS = { - OLX_AD_TYPE: {}, - OLX_AD_CATEGORY: {}, + OLX_AD_TYPE: { + [CRAWLER_AD_TYPE.ALL]: "", + [CRAWLER_AD_TYPE.ONLY_SELL]: "&vrsta=samoprodaja", + 
[CRAWLER_AD_TYPE.ONLY_RENT]: "&vrsta=samoizdavanje" + }, + OLX_AD_CATEGORY: { + [AD_CATEGORY.CATEGORY_FLAT]: "&kategorija=23", + [AD_CATEGORY.CATEGORY_HOUSE]: "&kategorija=24", + [AD_CATEGORY.CATEGORY_LAND]: "&kategorija=29", + [AD_CATEGORY.CATEGORY_OFFICE]: "&kategorija=25", + [AD_CATEGORY.CATEGORY_APARTMENT]: "&kategorija=27", + [AD_CATEGORY.CATEGORY_GARAGE]: "&kategorija=30" + }, MAX_DETAIL_FIELDS: 30 }; -OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ALL] = ""; -OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_SELL] = "&vrsta=samoprodaja"; -OLX_ENUMS.OLX_AD_TYPE[CRAWLER_AD_TYPE.ONLY_RENT] = "&vrsta=samoizdavanje"; - -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_FLAT] = "&kategorija=23"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_HOUSE] = "&kategorija=24"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_LAND] = "&kategorija=29"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_OFFICE] = "&kategorija=25"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_APARTMENT] = "&kategorija=27"; -OLX_ENUMS.OLX_AD_CATEGORY[AD_CATEGORY.CATEGORY_GARAGE] = "&kategorija=30"; - class OlxCrawler { constructor( - fromPage = 1, - toPage = 10, - maxResults = 1000, savers = [], crawlerAdTypes = CRAWLER_AD_TYPE.ALL, crawlerAdCategories = [ AD_CATEGORY.CATEGORY_FLAT, AD_CATEGORY.CATEGORY_HOUSE - ] + ], + maxPages = 1000, + maxResultsPerPage = 100, + maxAge = 30 ) { - this.fromPage = fromPage; - this.toPage = toPage; - this.maxResults = maxResults; this.savers = savers; this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum"; this.crawlerAdTypes = crawlerAdTypes; this.crawlerAdCategories = crawlerAdCategories; + this.maxPages = maxPages; + this.maxResultsPerPage = maxResultsPerPage; + this.maxAge = maxAge; } async crawl() { console.log("[OLX] Crawler started"); - const crawlAdTypes = this.crawlerAdTypes; const crawlAdCategories = this.crawlerAdCategories; - const urlWithAdTypeFilter = `${this.baseUrl}${OLX_ENUMS.OLX_AD_TYPE[crawlAdTypes]}`; - - if (crawlAdCategories && 
crawlAdTypes) { - const asyncPagesIndexingByCategory = []; + if (crawlAdCategories) { + const indexGenerators = []; for (const adCategory of crawlAdCategories) { - asyncPagesIndexingByCategory.push( - this.indexPages( - `${urlWithAdTypeFilter}${OLX_ENUMS.OLX_AD_CATEGORY[adCategory]}` - ) - ); + indexGenerators.push(this.categoryIndexer(adCategory)); } - await Promise.all(asyncPagesIndexingByCategory); + let done = false; + while (!done) { + const categoryIndexerPromises = []; + for (const indexGenerator of indexGenerators) { + categoryIndexerPromises.push(indexGenerator.next()); + } + + Promise.all(categoryIndexerPromises).then(singlePageResults => { + const entries = singlePageResults.entries(); + for (const [index, { value: singlePageResult }] of entries) { + if (singlePageResult) { + this.saveCrawledResults(singlePageResult, this.maxAge) + .then(numberOfSaved => {}) + .catch(error => + console.log("[POSTGRES Saver] Error saving results : ", error) + ); + } else { + //Generator returned undefined, no more pages + indexGenerators.splice(index, 1); + if (indexGenerators.length === 0) { + done = true; + } + } + } + }); + + await this.sleep(500); + } } + console.log("[OLX] Crawler finished"); } - async indexPages(url) { - const startPage = this.fromPage; - const endPage = this.toPage; - const maxResultsPerPage = this.maxResults; + async *categoryIndexer(adCategory) { + let pageToIndex = 1; - for (let pageNumber = startPage; pageNumber <= endPage; pageNumber++) { - const pageUrl = `${url}&stranica=${pageNumber}`; - const singlePageResults = await this.indexSinglePage( - pageUrl, - maxResultsPerPage - ); - await this.saveCrawledResults(singlePageResults); - await this.sleep(5000); + const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory]; + if (urlAdTypePart && urlCategoryPart) { + while (true) { + const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`; 
+ const singlePageResults = await this.indexSinglePage( + urlPageToCrawl, + this.maxResultsPerPage + ); + console.log("indexing ", adCategory, " page : ", pageToIndex); + + if (Array.isArray(singlePageResults) && singlePageResults.length > 0) { + yield singlePageResults; + } else { + return undefined; + } + + ++pageToIndex; + if (pageToIndex === this.maxPages) { + return undefined; + } + } + } else { + return undefined; } } @@ -111,18 +147,16 @@ class OlxCrawler { let actualNoOfResults = hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage; + const asyncScraping = []; for (let i = 0; i < actualNoOfResults; i++) { - const adData = await this.scrapeAd(hrefs[i]); - - if (adData) { - singlePageResults.push(adData); - } - await this.sleep(500); + asyncScraping.push(this.scrapeAd(hrefs[i])); } - return singlePageResults; + const scrapedData = await Promise.all(asyncScraping); + return scrapedData; } catch (e) { console.error("Exception caught:" + e); + return []; } } @@ -135,24 +169,32 @@ class OlxCrawler { const username = $( "#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span" - ).text(); + ) + .text() + .trim(); if (IGNORED_USERNAMES.includes((username || "").toLowerCase())) { return null; } - const title = $("#naslovartikla").text(); + const title = $("#naslovartikla") + .text() + .trim(); const descriptions = $(".artikal_detaljniopis_tekst"); const category = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(3) > div > span:nth-child(3) > a > span" - ).text(); + ) + .text() + .trim(); //====== PRICE DETECTION AND EXTRACTION ===== let price = null; const normalPriceValue = $("#pc > p:nth-child(2)").text(); const urgentPriceValue = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p" - ).text(); + ) + .text() + .trim(); if (normalPriceValue && normalPriceValue.length > 0) { price = normalPriceValue; @@ -258,7 +300,9 @@ class OlxCrawler { const time = $("time").attr("datetime"); const 
numberOfViews = $( "#artikal_glavni_div > div.artikal_lijevo > div:nth-child(18) > div:nth-child(6) > div.df2" - ).text(); + ) + .text() + .trim(); //=========================================== //========================================= @@ -296,8 +340,14 @@ class OlxCrawler { price: parsedPrice, area: parsedArea, gardenSize: parsedGardenSize, - shortDescription: descriptions.first().text(), - longDescription: descriptions.last().text(), + shortDescription: descriptions + .first() + .text() + .trim(), + longDescription: descriptions + .last() + .text() + .trim(), streetNumber: 0, streetName: "", locality: "", @@ -370,11 +420,11 @@ class OlxCrawler { return new Promise(resolve => setTimeout(resolve, ms)); } - async saveCrawledResults(results) { + async saveCrawledResults(results, maxAge) { const savers = this.savers; for (const saver of savers) { - await saver.save(results); + await saver.save(results, maxAge); } } } diff --git a/app/helpers/db/realEstate.js b/app/helpers/db/realEstate.js index 3f32b7c..aea9d29 100644 --- a/app/helpers/db/realEstate.js +++ b/app/helpers/db/realEstate.js @@ -1,7 +1,7 @@ "use strict"; const db = require("../../models/index"); -const bulkUpsertRealEstates = async realEstateData => { +const bulkUpsertRealEstates = async (realEstateData, maxAge) => { try { const fieldsToUpdateIfDuplicate = [ "realEstateType", diff --git a/development.env b/development.env index f998e9d..e96b363 100644 --- a/development.env +++ b/development.env @@ -16,8 +16,8 @@ SOURCE_EMAIL=info@saburly.com #=============== CRAWLER SETTINGS===============# #==OLX== -OLX_START_PAGE=Crawler starts from this page -OLX_END_PAGE=Crawler ends with this page (including this page) +OLX_MAX_PAGES=Restrict crawler to this number of pages OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved OLX_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values OLX_CRAWLER_AD_CATEGORIES=comma 
separated list of enum names of categories to be included, check common/enums.js file for valid values +OLX_MAX_AGE=[in days] if an ad was last crawled more than this number of days ago, it will be re-crawled