From 935ae60ae1cb592d03e063ecec5d3c45d76e2147 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 24 Oct 2019 16:57:23 +0200 Subject: [PATCH 1/6] move specific crawler config to the separated files --- app/crawler/crawl.js | 4 +- app/crawler/crawlerConfig.js | 68 +------------------ app/crawler/specificConfigs/olx.js | 37 ++++++++++ app/crawler/specificConfigs/rental.js | 33 +++++++++ .../{specific => specificCrawlers}/olx.js | 0 .../{specific => specificCrawlers}/rental.js | 0 test/olxScrapeTest.js | 2 +- 7 files changed, 75 insertions(+), 69 deletions(-) create mode 100644 app/crawler/specificConfigs/olx.js create mode 100644 app/crawler/specificConfigs/rental.js rename app/crawler/{specific => specificCrawlers}/olx.js (100%) rename app/crawler/{specific => specificCrawlers}/rental.js (100%) diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index b219e01..ac4825d 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -5,8 +5,8 @@ All environment specific configuration is read here and passed to the crawlers and savers. */ -const OlxCrawler = require("./specific/olx"); -const RentalCrawler = require("./specific/rental"); +const OlxCrawler = require("./specificCrawlers/olx"); +const RentalCrawler = require("./specificCrawlers/rental"); const { OLX_CONFIG, RENTAL_CONFIG } = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index a58575d..29c68f2 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -1,72 +1,8 @@ "use strict"; require("dotenv").config({ path: __dirname + "/./../../.env" }); -const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../common/enums"); -const olxCrawlerAdType = - process.env.OLX_CRAWLER_AD_TYPE !== undefined - ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] - : null; - -const rentalCrawlerAdType = - process.env.RENTAL_CRAWLER_AD_TYPE !== undefined - ? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE] - : null; - -const olxParsedCrawlerAdCategories = - process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined - ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => - category.trim() - ) - : ["FLAT", "HOUSE"]; - -const rentalParsedCrawlerAdCategories = - process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined - ? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category => - category.trim() - ) - : ["FLAT", "HOUSE"]; - -const olxIgnoredUsernames = - process.env.OLX_IGNORED_USERNAMES !== undefined - ? process.env.OLX_IGNORED_USERNAMES.split(",").map(username => - username.trim() - ) - : []; - -const rentalIgnoredUsernames = []; - -const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories - .map(categoryName => - AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined - ) - .filter(category => !!category); - -const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories - .map(categoryName => - AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined - ) - .filter(category => !!category); - -const OLX_CONFIG = { - OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500, - OLX_MAX_RESULTS_PER_PAGE: - parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, - OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, - OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, - OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], - OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 -}; - -const RENTAL_CONFIG = { - RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500, - RENTAL_MAX_RESULTS_PER_PAGE: - parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50, - RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE, - RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, - RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], - RENTAL_DELAY_BETWEEN_PAGES: - parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 -}; +const OLX_CONFIG = require("./specificConfigs/olx"); +const RENTAL_CONFIG = require("./specificConfigs/rental"); module.exports = { OLX_CONFIG, diff --git a/app/crawler/specificConfigs/olx.js b/app/crawler/specificConfigs/olx.js new file mode 100644 index 0000000..53ca727 --- /dev/null +++ b/app/crawler/specificConfigs/olx.js @@ -0,0 +1,37 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const olxCrawlerAdType = + process.env.OLX_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.OLX_CRAWLER_AD_TYPE] + : null; + +const olxParsedCrawlerAdCategories = + process.env.OLX_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.OLX_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const olxIgnoredUsernames = + process.env.OLX_IGNORED_USERNAMES !== undefined + ? process.env.OLX_IGNORED_USERNAMES.split(",").map(username => + username.trim() + ) + : []; + +const transformedOlxCrawlerAdCategories = olxParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + OLX_MAX_PAGES: parseInt(process.env.OLX_MAX_PAGES) || 500, + OLX_MAX_RESULTS_PER_PAGE: + parseInt(process.env.OLX_MAX_RESULTS_PER_PAGE) || 50, + OLX_CRAWLER_AD_TYPE: olxCrawlerAdType || CRAWLER_AD_TYPE.NONE, + OLX_CRAWLER_AD_CATEGORIES: transformedOlxCrawlerAdCategories, + OLX_IGNORED_USERNAMES: olxIgnoredUsernames || [], + OLX_DELAY_BETWEEN_PAGES: parseInt(process.env.OLX_DELAY_BETWEEN_PAGES) || 1000 +}; diff --git a/app/crawler/specificConfigs/rental.js b/app/crawler/specificConfigs/rental.js new file mode 100644 index 0000000..8930d64 --- /dev/null +++ b/app/crawler/specificConfigs/rental.js @@ -0,0 +1,33 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const rentalCrawlerAdType = + process.env.RENTAL_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.RENTAL_CRAWLER_AD_TYPE] + : null; + +const rentalParsedCrawlerAdCategories = + process.env.RENTAL_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.RENTAL_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const rentalIgnoredUsernames = []; + +const transformedRentalCrawlerAdCategories = rentalParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + RENTAL_MAX_PAGES: parseInt(process.env.RENTAL_MAX_PAGES) || 500, + RENTAL_MAX_RESULTS_PER_PAGE: + parseInt(process.env.RENTAL_MAX_RESULTS_PER_PAGE) || 50, + RENTAL_CRAWLER_AD_TYPE: rentalCrawlerAdType || CRAWLER_AD_TYPE.NONE, + RENTAL_CRAWLER_AD_CATEGORIES: transformedRentalCrawlerAdCategories, + RENTAL_IGNORED_USERNAMES: rentalIgnoredUsernames || [], + RENTAL_DELAY_BETWEEN_PAGES: + parseInt(process.env.RENTAL_DELAY_BETWEEN_PAGES) || 1000 +}; diff --git a/app/crawler/specific/olx.js b/app/crawler/specificCrawlers/olx.js similarity index 100% rename from app/crawler/specific/olx.js rename to app/crawler/specificCrawlers/olx.js diff --git a/app/crawler/specific/rental.js b/app/crawler/specificCrawlers/rental.js similarity index 100% rename from app/crawler/specific/rental.js rename to app/crawler/specificCrawlers/rental.js diff --git a/test/olxScrapeTest.js b/test/olxScrapeTest.js index d95dde6..745dbcb 100644 --- a/test/olxScrapeTest.js +++ b/test/olxScrapeTest.js @@ -1,6 +1,6 @@ "use strict"; -const olxCrawler = require("../app/crawler/specific/olx"); +const olxCrawler = require("../app/crawler/specificCrawlers/olx"); const urlToScrape = process.argv[2] || undefined; -- 2.47.3 From 6fc4218e39db29ca11aa4acc794bbb50904b8206 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 24 Oct 2019 17:11:12 +0200 Subject: [PATCH 2/6] add config files for Prostor agency --- app/crawler/crawlerConfig.js | 4 +++- app/crawler/specificConfigs/prostor.js | 33 ++++++++++++++++++++++++++ development.env | 7 ++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 app/crawler/specificConfigs/prostor.js diff --git a/app/crawler/crawlerConfig.js b/app/crawler/crawlerConfig.js index 29c68f2..1818ccb 100644 --- a/app/crawler/crawlerConfig.js +++ b/app/crawler/crawlerConfig.js @@ -3,8 +3,10 @@ require("dotenv").config({ path: __dirname + "/./../../.env" }); const OLX_CONFIG = require("./specificConfigs/olx"); const RENTAL_CONFIG = require("./specificConfigs/rental"); +const PROSTOR_CONFIG = require("./specificConfigs/prostor"); module.exports = { OLX_CONFIG, - RENTAL_CONFIG + RENTAL_CONFIG, + PROSTOR_CONFIG }; diff --git a/app/crawler/specificConfigs/prostor.js b/app/crawler/specificConfigs/prostor.js new file mode 100644 index 0000000..cde72d0 --- /dev/null +++ b/app/crawler/specificConfigs/prostor.js @@ -0,0 +1,33 @@ +"use strict"; +const { CRAWLER_AD_TYPE, AD_CATEGORY } = require("../../common/enums"); + +const prostorCrawlerAdType = + process.env.PROSTOR_CRAWLER_AD_TYPE !== undefined + ? CRAWLER_AD_TYPE[process.env.PROSTOR_CRAWLER_AD_TYPE] + : null; + +const prostorParsedCrawlerAdCategories = + process.env.PROSTOR_CRAWLER_AD_CATEGORIES !== undefined + ? process.env.PROSTOR_CRAWLER_AD_CATEGORIES.split(",").map(category => + category.trim() + ) + : ["FLAT", "HOUSE"]; + +const prostorIgnoredUsernames = []; + +const transformedProstorCrawlerAdCategories = prostorParsedCrawlerAdCategories + .map(categoryName => + AD_CATEGORY[categoryName] ? AD_CATEGORY[categoryName].id : undefined + ) + .filter(category => !!category); + +module.exports = { + PROSTOR_MAX_PAGES: parseInt(process.env.PROSTOR_MAX_PAGES) || 100, + PROSTOR_MAX_RESULTS_PER_PAGE: + parseInt(process.env.PROSTOR_MAX_RESULTS_PER_PAGE) || 50, + PROSTOR_CRAWLER_AD_TYPE: prostorCrawlerAdType || CRAWLER_AD_TYPE.NONE, + PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories, + PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [], + PROSTOR_DELAY_BETWEEN_PAGES: + parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000 +}; diff --git a/development.env b/development.env index 656ed15..fd6dd30 100644 --- a/development.env +++ b/development.env @@ -37,3 +37,10 @@ RENTAL_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check co RENTAL_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values RENTAL_IGNORED_USERNAMES=!!! This is not used for rental crawler !!! RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page +#==PROSTOR== +PROSTOR_MAX_PAGES=Restrict crawler to this number of pages +PROSTOR_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved +PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values +PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values +PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! +PROSTOR_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page -- 2.47.3 From 5098b08b3f9bd92de123867736d120212b11719b Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Thu, 24 Oct 2019 17:43:14 +0200 Subject: [PATCH 3/6] add ALL option to crawler cat, exclude from real estate types list --- app/common/enums.js | 3 +++ app/controllers/realEstateTypes.js | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/app/common/enums.js b/app/common/enums.js index 1e46f03..1df898e 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -70,6 +70,9 @@ const AD_TYPE = { }; const AD_CATEGORY = { + ALL: { + id: "ALL" + }, FLAT: { id: "FLAT", title: "Stan", diff --git a/app/controllers/realEstateTypes.js b/app/controllers/realEstateTypes.js index 3b5f864..f021f04 100644 --- a/app/controllers/realEstateTypes.js +++ b/app/controllers/realEstateTypes.js @@ -5,9 +5,10 @@ const { AD_CATEGORY } = require("../common/enums"); const getRealEstateTypes = (req, res) => { const title = "Koju nekretninu tražite?"; - const realEstateTypes = Object.keys(AD_CATEGORY).map( - category => AD_CATEGORY[category] - ); + const realEstateTypes = Object.keys(AD_CATEGORY) + .map(category => AD_CATEGORY[category]) + .filter(category => category.title); + res.render("realEstateType", { realEstateTypes, title }); }; -- 2.47.3 From 05fad652c40e76023a81982c1897f48b15790b08 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 25 Oct 2019 10:53:44 +0200 Subject: [PATCH 4/6] add PROSTOR agency enum; update ENV template --- app/common/enums.js | 3 ++- development.env | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/app/common/enums.js b/app/common/enums.js index 1df898e..9a59359 100644 --- a/app/common/enums.js +++ b/app/common/enums.js @@ -137,7 +137,8 @@ const AD_STATUS = { const AD_AGENCY = { OLX: "OLX", - RENTAL: "RENTAL" + RENTAL: "RENTAL", + PROSTOR: "PROSTOR" }; const CRAWLER_AD_TYPE = { diff --git a/development.env b/development.env index fd6dd30..6713fcd 100644 --- a/development.env +++ b/development.env @@ -38,9 +38,9 @@ RENTAL_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to RENTAL_IGNORED_USERNAMES=!!! This is not used for rental crawler !!! RENTAL_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page #==PROSTOR== -PROSTOR_MAX_PAGES=Restrict crawler to this number of pages -PROSTOR_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved +PROSTOR_MAX_PAGES=!!! This is not used for prostor crawler !!! +PROSTOR_MAX_RESULTS_PER_PAGE=For Prostor crawler, this represents MAX RESULTS in total PROSTOR_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values PROSTOR_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values PROSTOR_IGNORED_USERNAMES=!!! This is not used for prostor crawler !!! -PROSTOR_DELAY_BETWEEN_PAGES=time in miliseconds to wait before indexing next page +PROSTOR_DELAY_BETWEEN_PAGES=!!! This is not used for prostor crawler !!! -- 2.47.3 From 7e3b0bfcd589cf62d4097841ef51a2adfa34145b Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 25 Oct 2019 10:54:08 +0200 Subject: [PATCH 5/6] implement crawler for Prostor agency --- app/crawler/crawl.js | 16 +- app/crawler/specificConfigs/prostor.js | 2 +- app/crawler/specificCrawlers/prostor.js | 248 ++++++++++++++++++++++++ 3 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 app/crawler/specificCrawlers/prostor.js diff --git a/app/crawler/crawl.js b/app/crawler/crawl.js index ac4825d..002e17a 100644 --- a/app/crawler/crawl.js +++ b/app/crawler/crawl.js @@ -7,8 +7,13 @@ */ const OlxCrawler = require("./specificCrawlers/olx"); const RentalCrawler = require("./specificCrawlers/rental"); +const ProstorCrawler = require("./specificCrawlers/prostor"); -const { OLX_CONFIG, RENTAL_CONFIG } = require("./crawlerConfig"); +const { + OLX_CONFIG, + RENTAL_CONFIG, + PROSTOR_CONFIG +} = require("./crawlerConfig"); const PostgresSaver = require("./savers/postgres"); async function crawlAll() { @@ -32,6 +37,15 @@ async function crawlAll() { RENTAL_CONFIG.RENTAL_MAX_RESULTS_PER_PAGE, RENTAL_CONFIG.RENTAL_IGNORED_USERNAMES, RENTAL_CONFIG.RENTAL_DELAY_BETWEEN_PAGES + ), + new ProstorCrawler( + [postgresSaver], + PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_TYPE, + PROSTOR_CONFIG.PROSTOR_CRAWLER_AD_CATEGORIES, + PROSTOR_CONFIG.PROSTOR_MAX_PAGES, + PROSTOR_CONFIG.PROSTOR_MAX_RESULTS_PER_PAGE, + PROSTOR_CONFIG.PROSTOR_IGNORED_USERNAMES, + PROSTOR_CONFIG.PROSTOR_DELAY_BETWEEN_PAGES ) ]; diff --git a/app/crawler/specificConfigs/prostor.js b/app/crawler/specificConfigs/prostor.js index cde72d0..098fc95 100644 --- a/app/crawler/specificConfigs/prostor.js +++ b/app/crawler/specificConfigs/prostor.js @@ -24,7 +24,7 @@ const transformedProstorCrawlerAdCategories = prostorParsedCrawlerAdCategories module.exports = { PROSTOR_MAX_PAGES: parseInt(process.env.PROSTOR_MAX_PAGES) || 100, PROSTOR_MAX_RESULTS_PER_PAGE: - parseInt(process.env.PROSTOR_MAX_RESULTS_PER_PAGE) || 50, + parseInt(process.env.PROSTOR_MAX_RESULTS_PER_PAGE) || 5000, PROSTOR_CRAWLER_AD_TYPE: prostorCrawlerAdType || CRAWLER_AD_TYPE.NONE, PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories, PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [], diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js new file mode 100644 index 0000000..16bcce1 --- /dev/null +++ b/app/crawler/specificCrawlers/prostor.js @@ -0,0 +1,248 @@ +"use strict"; + +const fetch = require("node-fetch"); +const cheerio = require("cheerio"); + +const { + AD_TYPE, + AD_CATEGORY, + AD_AGENCY, + AD_STATUS, + CRAWLER_AD_TYPE +} = require("../../common/enums"); + +const PROSTOR_ENUMS = { + PROSTOR_AD_TYPE: { + [CRAWLER_AD_TYPE.ALL]: "&action=0", + [CRAWLER_AD_TYPE.ONLY_SELL]: "&action=1", + [CRAWLER_AD_TYPE.ONLY_RENT]: "&action=2" + }, + PROSTOR_AD_CATEGORY: { + [AD_CATEGORY.ALL.id]: "", + [AD_CATEGORY.FLAT.id]: "&type=7", + [AD_CATEGORY.HOUSE.id]: "&type=8", + [AD_CATEGORY.LAND.id]: "&type=10", + [AD_CATEGORY.OFFICE.id]: "&type=9", + [AD_CATEGORY.APARTMENT.id]: "&type=11", + [AD_CATEGORY.GARAGE.id]: "&type=14" + //[AD_CATEGORY.COTTAGE.id]: "" + }, + PROSTOR_PUBLISHED_DATE_FORMAT: "YYYY-MM-DD HH:mm:ss", + PROSTOR_RENEWED_DATE_FORMAT: "YYYY-MM-DD u HH:mm:ss" +}; + +class ProstorCrawler { + constructor( + savers = [], + crawlerAdTypes = CRAWLER_AD_TYPE.ALL, + crawlerAdCategories = [AD_CATEGORY.FLAT, AD_CATEGORY.HOUSE], + maxPages = 5000, + maxResultsPerPage = 5000, + ignoredUsernames = [], + delayBetweenPages = 1000 + ) { + this.savers = savers; + this.baseUrl = "https://prostor.ba/pretraga"; + this.crawlerAdTypes = crawlerAdTypes; + this.crawlerAdCategories = crawlerAdCategories; + this.maxResultsPerPage = maxResultsPerPage; + } + + async crawl() { + const crawlAdCategories = this.crawlerAdCategories; + const newRealEstates = []; + + if (crawlAdCategories) { + for (const adCategory of crawlAdCategories) { + const urlAdTypePart = + PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes]; + const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory]; + if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) { + const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`; + const singleCategoryResults = await this.extractRealEstates( + urlPageToCrawl + ); + + const resultsSubset = singleCategoryResults.slice( + 0, + this.maxResultsPerPage + ); + + const saveResults = await this.saveCrawledResults(resultsSubset); + const { newRecords } = saveResults; + newRealEstates.push(...newRecords); + } + } + } + return newRealEstates; + } + + async extractRealEstates(url) { + console.log("[PROSTOR] Index page : ", url); + + try { + const res = await fetch(url); + const body = await res.text(); + const $ = cheerio.load(body); + + const scriptElement = $( + "body > div > div.container-fluid > script:nth-child(7)" + ); + + if ( + scriptElement[0] && + scriptElement[0].children && + scriptElement[0].children[0] && + scriptElement[0].children[0].data + ) { + const scriptData = scriptElement[0].children[0].data; + + try { + // script element data contains JS code and we need to extract only data for realEstates + // data string starts with : var map; var markers = [{"r ... + // so we remove first 23 characters + // + // real estate JSON data ends with ...}, ]; map = new... + // so we need to find index of that substring to know where to stop + // we will NOT include trailing comma because it breaks JSON parse, so we have to close ] bracket manually + + const jsonEndIndex = scriptData.indexOf(", ]; map = new"); + if (jsonEndIndex > -1) { + const jsonData = scriptData.substring(23, jsonEndIndex) + "]"; + const realEstates = JSON.parse(jsonData); + + const transformedRealEstates = []; + + for (const realEstate of realEstates) { + const transformedRealEstate = ProstorCrawler.transformRealEstateData( + realEstate + ); + if (transformedRealEstate) { + transformedRealEstates.push(transformedRealEstate); + } + } + + return transformedRealEstates; + } else { + throw { + message: "Something is wrong with JSON data or data is moved" + }; + } + } catch (e) { + console.log(e); + throw { message: "Can't find ad data JSON" }; + } + } + } catch (e) { + console.error("[PROSTOR] Exception caught:", e.message); + return []; + } + } + + static transformRealEstateData(realEstateData) { + try { + const { lat, lng, property_name, price, size, link } = realEstateData; + + // link contains part of the URL in the format of : /prodaja/stan/stup/9556 + // general form is : /actionType/realEstateType/location/realEstateID + // linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID'] + + const linkParts = link.split("/"); + + const adType = ProstorCrawler.getAdTypeId(linkParts[1]); + const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]); + const prostorId = linkParts[4]; + const url = `https://prostor.ba${link}`; + + if (!adType || !realEstateType || !prostorId) { + return null; + } + + const adStatus = AD_STATUS.STATUS_NORMAL; + const parsedPrice = parseFloat(price.replace(/\./g, "")) || null; + const parsedArea = parseFloat(size); + + const data = { + url, + agencyObjectId: prostorId, + originAgencyName: AD_AGENCY.PROSTOR, + realEstateType, + adType, + title: property_name, + price: parsedPrice, + area: parsedArea, + gardenSize: null, + shortDescription: "", + longDescription: "", + streetNumber: 0, + streetName: "", + locality: "", + municipality: "", + city: "", + region: "", + entity: "", + country: "", + locationLat: lat, + locationLong: lng, + adStatus, + publishedDate: null, + renewedDate: null + }; + + return data; + } catch (e) { + console.error( + "[PROSTOR] Exception caught: " + e.message, + "\r\nURL:", + url + ); + return null; + } + } + + //======= HELPER FUNCTIONS ============= + + static getAdCategoryId(categoryText) { + switch (categoryText) { + case "stan": + return AD_CATEGORY.FLAT.id; + case "kuca": + return AD_CATEGORY.HOUSE.id; + case "apartman": + return AD_CATEGORY.APARTMENT.id; + case "poslovni-prostor": + return AD_CATEGORY.OFFICE.id; + case "garaza": + return AD_CATEGORY.GARAGE.id; + case "zemljiste": + return AD_CATEGORY.LAND.id; + default: + return undefined; + } + } + + static getAdTypeId(adTypeText) { + switch (adTypeText) { + case "prodaja": + return AD_TYPE.AD_TYPE_SALE; + case "najam": + return AD_TYPE.AD_TYPE_RENT; + default: + return undefined; + } + } + + async saveCrawledResults(results) { + const savers = this.savers; + + // for (const saver of savers) { + // await saver.save(results); + // } + + //For now, we use only Postgres saver, so ... + return await savers[0].save(results); + //so that we can use some sequelize options and information when data is inserted + } +} + +module.exports = ProstorCrawler; -- 2.47.3 From 747ebb88e5b267ad4a7102bf5ba19059742f3297 Mon Sep 17 00:00:00 2001 From: Bilal Catic Date: Fri, 25 Oct 2019 11:08:52 +0200 Subject: [PATCH 6/6] add debugging log switch for crawler process --- app/config/appConfig.js | 5 ++++- app/crawler/specificCrawlers/olx.js | 9 ++++++++- app/crawler/specificCrawlers/prostor.js | 6 +++++- app/crawler/specificCrawlers/rental.js | 9 +++++++-- development.env | 1 + 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/app/config/appConfig.js b/app/config/appConfig.js index 8a11650..0ed1e6b 100644 --- a/app/config/appConfig.js +++ b/app/config/appConfig.js @@ -28,6 +28,8 @@ const MAX_REAL_ESTATES_IN_EMAIL = const MAX_REAL_ESTATES_IN_FIRST_EMAIL = parseInt(process.env.MAX_REAL_ESTATES_IN_FIRST_EMAIL) || 5; +const PRINT_CRAWLER_DEBUG = process.env.PRINT_CRAWLER_DEBUG_INFO || 0; + module.exports = { APP_PORT, APP_URL, @@ -36,5 +38,6 @@ module.exports = { STOP_CRAWLER, AWS_EMAIL_CONFIG, MAX_REAL_ESTATES_IN_EMAIL, - MAX_REAL_ESTATES_IN_FIRST_EMAIL + MAX_REAL_ESTATES_IN_FIRST_EMAIL, + PRINT_CRAWLER_DEBUG }; diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 526db05..d7176d1 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -13,7 +13,10 @@ const { CRAWLER_AD_TYPE } = require("../../common/enums"); -const { DEFAULT_TIMEZONE } = require("../../config/appConfig"); +const { + DEFAULT_TIMEZONE, + PRINT_CRAWLER_DEBUG +} = require("../../config/appConfig"); const OLX_ENUMS = { OLX_AD_TYPE: { @@ -156,6 +159,10 @@ class OlxCrawler { } async indexSinglePage(url, maxResultsPerPage) { + if (PRINT_CRAWLER_DEBUG) { + console.log("[OLX] Index page : ", url); + } + try { const res = await fetch(url); const body = await res.text(); diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js index 16bcce1..bb3133c 100644 --- a/app/crawler/specificCrawlers/prostor.js +++ b/app/crawler/specificCrawlers/prostor.js @@ -11,6 +11,8 @@ const { CRAWLER_AD_TYPE } = require("../../common/enums"); +const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig"); + const PROSTOR_ENUMS = { PROSTOR_AD_TYPE: { [CRAWLER_AD_TYPE.ALL]: "&action=0", @@ -78,7 +80,9 @@ class ProstorCrawler { } async extractRealEstates(url) { - console.log("[PROSTOR] Index page : ", url); + if (PRINT_CRAWLER_DEBUG) { + console.log("[PROSTOR] Index page : ", url); + } try { const res = await fetch(url); diff --git a/app/crawler/specificCrawlers/rental.js b/app/crawler/specificCrawlers/rental.js index a29d772..45e572d 100644 --- a/app/crawler/specificCrawlers/rental.js +++ b/app/crawler/specificCrawlers/rental.js @@ -14,7 +14,10 @@ const { CRAWLER_AD_TYPE } = require("../../common/enums"); -const { DEFAULT_TIMEZONE } = require("../../config/appConfig"); +const { + DEFAULT_TIMEZONE, + PRINT_CRAWLER_DEBUG +} = require("../../config/appConfig"); const RENTAL_ENUMS = { RENTAL_AD_TYPE: { @@ -159,7 +162,9 @@ class RentalCrawler { } async indexSinglePage(url, maxResultsPerPage) { - // console.log("[RENTAL] Index page : ", url); + if (PRINT_CRAWLER_DEBUG) { + console.log("[RENTAL] Index page : ", url); + } try { const res = await fetch(url); diff --git a/development.env b/development.env index 6713fcd..0bd2939 100644 --- a/development.env +++ b/development.env @@ -23,6 +23,7 @@ SOURCE_EMAIL=info@saburly.com #=============== CRAWLER SETTINGS===============# CRAWLER_INTERVAL=Interval to run cralwer(s), in seconds STOP_CRAWLER=Non-zero value will skip crawler execution +PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console #==OLX== OLX_MAX_PAGES=Restrict crawler to this number of pages OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved -- 2.47.3