diff --git a/app/crawler/specificConfigs/prostor.js b/app/crawler/specificConfigs/prostor.js
index 098fc95..aebdb4d 100644
--- a/app/crawler/specificConfigs/prostor.js
+++ b/app/crawler/specificConfigs/prostor.js
@@ -29,5 +29,7 @@ module.exports = {
   PROSTOR_CRAWLER_AD_CATEGORIES: transformedProstorCrawlerAdCategories,
   PROSTOR_IGNORED_USERNAMES: prostorIgnoredUsernames || [],
   PROSTOR_DELAY_BETWEEN_PAGES:
-    parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES) || 1000
+    parseInt(process.env.PROSTOR_DELAY_BETWEEN_PAGES, 10) || 1000,
+  // Non-zero integer: keep crawling even when a page yields no new ads
+  PROSTOR_FORCE_CRAWL: !!parseInt(process.env.PROSTOR_FORCE_CRAWL, 10)
 };
diff --git a/app/crawler/specificCrawlers/prostor.js b/app/crawler/specificCrawlers/prostor.js
index 7b970e0..dbd1b7e 100644
--- a/app/crawler/specificCrawlers/prostor.js
+++ b/app/crawler/specificCrawlers/prostor.js
@@ -2,6 +2,7 @@
 
 const fetch = require("node-fetch");
 const cheerio = require("cheerio");
+const moment = require("moment-timezone");
 
 const {
   AD_TYPE,
@@ -11,7 +12,11 @@ const {
   CRAWLER_AD_TYPE
 } = require("../../common/enums");
 
-const { PRINT_CRAWLER_DEBUG } = require("../../config/appConfig");
+const {
+  PRINT_CRAWLER_DEBUG,
+  DEFAULT_TIMEZONE
+} = require("../../config/appConfig");
+const { PROSTOR_FORCE_CRAWL } = require("../specificConfigs/prostor");
 
 const PROSTOR_ENUMS = {
   PROSTOR_AD_TYPE: {
@@ -48,9 +53,11 @@ class ProstorCrawler {
     this.crawlerAdTypes = crawlerAdTypes;
     this.crawlerAdCategories = crawlerAdCategories;
     this.maxResultsPerPage = maxResultsPerPage;
+    this.delayBetweenPages = delayBetweenPages;
   }
 
-  async crawl() {
+  /** @deprecated Replaced by the generator-based crawl() defined below. */
+  async crawlOld() {
     const crawlAdCategories = this.crawlerAdCategories;
 
     const newRealEstates = [];
@@ -79,6 +86,272 @@ class ProstorCrawler {
     return newRealEstates;
   }
 
+  /**
+   * Crawls all configured ad categories page by page and saves each page.
+   *
+   * One page per category is scraped concurrently on each pass. A category
+   * is dropped once its indexer is exhausted, or when saving a page produced
+   * no new records (unless PROSTOR_FORCE_CRAWL is set).
+   *
+   * @returns {Promise<Array>} Records reported as new by the saver.
+   */
+  async crawl() {
+    const newRealEstates = [];
+    const crawlAdCategories = this.crawlerAdCategories;
+    if (!crawlAdCategories) {
+      return newRealEstates;
+    }
+
+    let indexGenerators = Array.from(crawlAdCategories, adCategory =>
+      this.categoryIndexer(adCategory)
+    );
+
+    while (indexGenerators.length > 0) {
+      const singlePageResults = await Promise.all(
+        indexGenerators.map(indexGenerator => indexGenerator.next())
+      );
+
+      const remainingGenerators = [];
+      for (const [index, { value }] of singlePageResults.entries()) {
+        if (!value) {
+          // Indexer finished without yielding: no more pages in this category
+          continue;
+        }
+
+        const saveResults = await this.saveCrawledResults(value);
+        const newRecords = saveResults ? saveResults.newRecords : undefined;
+        const hasRecordList = Array.isArray(newRecords);
+
+        // Check before spreading: spreading a non-iterable value would throw
+        if (hasRecordList) {
+          newRealEstates.push(...newRecords);
+        }
+
+        const nothingNew = hasRecordList && newRecords.length === 0;
+        if (nothingNew && !PROSTOR_FORCE_CRAWL) {
+          // The saver reported no new records, stop crawling this category
+          continue;
+        }
+
+        remainingGenerators.push(indexGenerators[index]);
+      }
+      indexGenerators = remainingGenerators;
+
+      // Only wait when there is another round of pages to fetch
+      if (indexGenerators.length > 0) {
+        await this.sleep(this.delayBetweenPages);
+      }
+    }
+
+    return newRealEstates;
+  }
+
+  /**
+   * Async generator that indexes a single ad category page by page.
+   *
+   * The listing is fetched once through extractRealEstates() and then cut
+   * into chunks of maxResultsPerPage ads; each chunk is scraped and yielded.
+   * The generator ends when the ad type or category has no URL mapping, when
+   * the listing is exhausted, or when no ad of a chunk could be scraped.
+   *
+   * @param {string} adCategory Key into PROSTOR_ENUMS.PROSTOR_AD_CATEGORY.
+   * @yields {Array<Object>} Successfully scraped ads of one page.
+   */
+  async *categoryIndexer(adCategory) {
+    const urlAdTypePart = PROSTOR_ENUMS.PROSTOR_AD_TYPE[this.crawlerAdTypes];
+    const urlCategoryPart = PROSTOR_ENUMS.PROSTOR_AD_CATEGORY[adCategory];
+    if (urlAdTypePart === undefined || urlCategoryPart === undefined) {
+      return;
+    }
+
+    const urlPageToCrawl = `${this.baseUrl}?remove_sold=1${urlAdTypePart}${urlCategoryPart}`;
+    const listOfAllRealEstates = await this.extractRealEstates(
+      urlPageToCrawl
+    );
+    if (!Array.isArray(listOfAllRealEstates)) {
+      return;
+    }
+
+    let elementToStartIndexFrom = 0;
+    while (true) {
+      const realEstatesForSinglePage = listOfAllRealEstates.slice(
+        elementToStartIndexFrom,
+        elementToStartIndexFrom + this.maxResultsPerPage
+      );
+      if (realEstatesForSinglePage.length === 0) {
+        return;
+      }
+      elementToStartIndexFrom += realEstatesForSinglePage.length;
+
+      const singlePageResults = await this.indexSinglePage(
+        realEstatesForSinglePage
+      );
+      // scrapeAd() resolves to null for ads it could not process
+      const scrapedAds = singlePageResults.filter(Boolean);
+      if (scrapedAds.length === 0) {
+        return;
+      }
+
+      yield scrapedAds;
+    }
+  }
+
+  /**
+   * Scrapes all ads of one page concurrently.
+   *
+   * @param {Array<Object>} realEstatesList Listing entries of one page.
+   * @returns {Promise<Array>} One item per ad (null for skipped ads), or an
+   *   empty array when scraping the page rejected.
+   */
+  async indexSinglePage(realEstatesList) {
+    try {
+      return await Promise.all(
+        realEstatesList.map(realEstate => this.scrapeAd(realEstate))
+      );
+    } catch (e) {
+      console.log(
+        "[PROSTOR] Error crawling ads : ",
+        (e && e.message) || "UNKNOWN ERROR"
+      );
+      return [];
+    }
+  }
+
+  /**
+   * Builds the record for a single ad from its listing entry.
+   *
+   * The ad page is downloaded, but no detail field is read from it yet, so
+   * all detail attributes are stored as null.
+   *
+   * @param {Object} realEstate Listing entry with lat, lng, property_name,
+   *   price, size and link (e.g. "/prodaja/stan/stup/9556").
+   * @returns {Promise<Object|null>} Record for the saver, or null when the
+   *   entry is malformed or the ad page could not be downloaded.
+   */
+  async scrapeAd(realEstate) {
+    if (!realEstate || typeof realEstate.link !== "string") {
+      // A malformed entry must not reject and discard the whole page
+      return null;
+    }
+
+    const { lat, lng, property_name, price, size, link } = realEstate;
+    const url = `https://prostor.ba${link}`;
+    if (PRINT_CRAWLER_DEBUG) {
+      console.log("[PROSTOR] Scraping : ", url);
+    }
+
+    try {
+      // A failed download rejects here and the ad is skipped in the catch.
+      // TODO: parse the body and fill the detail fields that are null below.
+      const adPageSource = await fetch(url);
+      await adPageSource.text();
+
+      // link has the form /actionType/realEstateType/location/realEstateID,
+      // e.g. /prodaja/stan/stup/9556, so linkParts[0] is an empty string
+      const linkParts = link.split("/");
+      const adType = ProstorCrawler.getAdTypeId(linkParts[1]);
+      const realEstateType = ProstorCrawler.getAdCategoryId(linkParts[2]);
+      const prostorId = linkParts[4];
+
+      if (!adType || !realEstateType || !prostorId) {
+        console.log(
+          "adType: ",
+          adType,
+          " reType: ",
+          realEstateType,
+          " prostorId: ",
+          prostorId,
+          "url: ",
+          url
+        );
+        return null;
+      }
+
+      // Dots are stripped before parsing (presumably thousands separators)
+      const parsedPrice =
+        price == null
+          ? null
+          : parseFloat(String(price).replace(/\./g, "")) || null;
+      const parsedArea = parseFloat(size);
+
+      // The crawl time is stored as both published and renewed date
+      const crawledAtMoment = moment.tz(DEFAULT_TIMEZONE);
+      if (!crawledAtMoment.isValid()) {
+        throw new Error("Invalid published date");
+      }
+      const crawledAt = crawledAtMoment.toISOString();
+
+      return {
+        url,
+        agencyObjectId: prostorId,
+        originAgencyName: AD_AGENCY.PROSTOR,
+        realEstateType,
+        adType,
+        title: property_name,
+        price: parsedPrice,
+        area: Number.isNaN(parsedArea) ? null : parsedArea,
+        gardenSize: null,
+        shortDescription: "",
+        longDescription: null,
+        streetNumber: 0,
+        streetName: "",
+        locality: "",
+        municipality: "",
+        city: "",
+        region: "",
+        entity: "",
+        country: "",
+        locationLat: lat,
+        locationLong: lng,
+        adStatus: AD_STATUS.STATUS_NORMAL,
+        publishedDate: crawledAt,
+        renewedDate: crawledAt,
+        numberOfRooms: null,
+        numberOfFloors: null,
+        floor: null,
+        accessRoadType: null,
+        heatingType: null,
+        furnishingType: null,
+        balcony: null,
+        newBuilding: null,
+        elevator: null,
+        water: null,
+        electricity: null,
+        drainageSystem: null,
+        registeredInZkBooks: null,
+        recentlyAdapted: null,
+        parking: null,
+        garage: null,
+        gas: null,
+        antiTheftDoor: null,
+        airCondition: null,
+        phoneConnection: null,
+        cableTV: null,
+        internet: null,
+        basementAttic: null,
+        storeRoom: null,
+        videoSurveillance: null,
+        alarm: null,
+        suitableForStudents: null,
+        includingBills: null,
+        animalsAllowed: null,
+        pool: null,
+        urbanPlanPermit: null,
+        buildingPermit: null,
+        utilityConnection: null,
+        distanceToRiver: null,
+        numberOfViewsAgency: null
+      };
+    } catch (e) {
+      console.error(
+        "[PROSTOR] Exception caught: " + e.message,
+        "\r\nURL:",
+        url
+      );
+      return null;
+    }
+  }
+
   async extractRealEstates(url) {
     if (PRINT_CRAWLER_DEBUG) {
       console.log("[PROSTOR] Index page : ", url);
@@ -115,18 +388,7 @@ class ProstorCrawler {
             const jsonData = scriptData.substring(23, jsonEndIndex) + "]";
             const realEstates = JSON.parse(jsonData);
 
-            const transformedRealEstates = [];
-
-            for (const realEstate of realEstates) {
-              const transformedRealEstate = ProstorCrawler.transformRealEstateData(
-                realEstate
-              );
-              if (transformedRealEstate) {
-                transformedRealEstates.push(transformedRealEstate);
-              }
-            }
-
-            return transformedRealEstates;
+            return realEstates;
           } else {
             throw {
               message: "Something is wrong with JSON data or data is moved"
@@ -134,11 +396,14 @@ class ProstorCrawler {
           }
         } catch (e) {
           console.log(e);
-          throw { message: "Can't find ad data JSON" };
+          throw e;
         }
       }
     } catch (e) {
-      console.error("[PROSTOR] Exception caught:", e.message);
+      console.error(
+        "[PROSTOR] Exception caught:",
+        e.message || "UNKNOWN MESSAGE"
+      );
       return [];
     }
   }
@@ -236,6 +501,16 @@ class ProstorCrawler {
     }
   }
 
+  /**
+   * Resolves after the given delay.
+   *
+   * @param {number} ms Delay in milliseconds.
+   * @returns {Promise<void>}
+   */
+  sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+  }
+
   async saveCrawledResults(results) {
     const savers = this.savers;
 
@@ -244,7 +519,7 @@ class ProstorCrawler {
     // }
 
     //For now, we use only Postgres saver, so ...
-    return await savers[0].save(results);
+    return savers[0].save(results);
     //so that we can use some sequelize options and information when data is inserted
   }
 }