Compare commits
9 Commits
master
...
after-scra
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
131536d9fb | ||
|
|
824414adad | ||
|
|
41c926b5bb | ||
|
|
b3708cf842 | ||
|
|
f5f8fa276c | ||
|
|
ccea5fe2aa | ||
|
|
e1651306eb | ||
|
|
97c09a6da1 | ||
|
|
034106d87a |
@@ -45,10 +45,10 @@ const USER_AGENT =
|
||||
process.env.USER_AGENT ||
|
||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
|
||||
|
||||
const USE_SCRAPER_API = process.env.USE_SCRAPER_API === undefined ? 1 : parseInt(process.env.USE_SCRAPER_API);
|
||||
const USE_SCRAPER_API = process.env.USE_SCRAPER_API || 1; //Default to use
|
||||
const SCRAPER_API_KEY = process.env.SCRAPER_API_KEY || "";
|
||||
const SCRAPER_API_BASE_URL = process.env.SCRAPER_API_BASE_URL || "";
|
||||
const NODE_FETCH_TIMEOUT_MS = parseInt(process.env.NODE_FETCH_TIMEOUT_MS) || 60000
|
||||
const NUMBER_OF_CONCURRENT_REQ_SCRAPER_API =
|
||||
parseInt(process.env.NUMBER_OF_CONCURRENT_REQ_SCRAPER_API) || 10;
|
||||
|
||||
module.exports = {
|
||||
APP_PORT,
|
||||
@@ -67,6 +67,5 @@ module.exports = {
|
||||
USER_AGENT,
|
||||
USE_SCRAPER_API,
|
||||
SCRAPER_API_KEY,
|
||||
SCRAPER_API_BASE_URL,
|
||||
NODE_FETCH_TIMEOUT_MS
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
};
|
||||
|
||||
@@ -10,7 +10,6 @@ const RentalCrawler = require("./specificCrawlers/rental");
|
||||
const ProstorCrawler = require("./specificCrawlers/prostor");
|
||||
const AktidoCrawler = require("./specificCrawlers/aktido");
|
||||
const SaljicCrawler = require("./specificCrawlers/saljic");
|
||||
const { logDebug } = require("../helpers/log");
|
||||
|
||||
const {
|
||||
OLX_CONFIG,
|
||||
@@ -76,9 +75,7 @@ async function crawlAll() {
|
||||
|
||||
for (const crawler of crawlers) {
|
||||
try {
|
||||
logDebug('Starting crawler: ', crawler);
|
||||
const newRealEstatesFromSingleCrawler = await crawler.crawl();
|
||||
logDebug('Crawler done: ', crawler);
|
||||
if (Array.isArray(newRealEstatesFromSingleCrawler)) {
|
||||
newRealEstates.push(...newRealEstatesFromSingleCrawler);
|
||||
}
|
||||
|
||||
@@ -159,7 +159,7 @@ class AktidoCrawler {
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(url, {}, false);
|
||||
const res = await fetch(url);
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
let hrefs = [];
|
||||
@@ -202,10 +202,6 @@ class AktidoCrawler {
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (body.indexOf('<html') === -1) {
|
||||
throw { message: 'Failed to fetch page !' }
|
||||
}
|
||||
|
||||
const mapElementParent = $(".box-map").parent();
|
||||
const scriptElement = $("script", mapElementParent);
|
||||
if (
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("../../helpers/fetchWrapper");
|
||||
const { logDebug } = require("../../helpers/log");
|
||||
const cheerio = require("cheerio");
|
||||
const Promise = require("bluebird");
|
||||
const moment = require("moment-timezone");
|
||||
@@ -19,7 +18,9 @@ const {
|
||||
|
||||
const {
|
||||
DEFAULT_TIMEZONE,
|
||||
PRINT_CRAWLER_DEBUG
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API,
|
||||
SCRAPER_API_KEY
|
||||
} = require("../../config/appConfig");
|
||||
|
||||
const OLX_ENUMS = {
|
||||
@@ -45,15 +46,7 @@ const OLX_ENUMS = {
|
||||
|
||||
const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx");
|
||||
|
||||
/**
 * Splits an array into consecutive slices of at most `size` elements.
 *
 * The input array is not modified; every slice is a shallow copy produced
 * by `Array.prototype.slice`. The last slice may be shorter than `size`.
 *
 * @param {Array} array - Items to split.
 * @param {number} [size=10] - Maximum number of items per slice; must be a
 *   positive integer.
 * @returns {Array<Array>} The slices in original order (empty for an empty input).
 * @throws {RangeError} If `size` is not a positive integer. Without this
 *   guard a size of 0, a negative size or NaN would never advance the loop
 *   index and the loop would not terminate.
 */
const chunk = (array, size = 10) => {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }

  const result = [];
  for (let start = 0; start < array.length; start += size) {
    result.push(array.slice(start, start + size));
  }
  return result;
};
|
||||
const scraperapiClient = require("scraperapi-sdk")(SCRAPER_API_KEY);
|
||||
|
||||
class OlxCrawler {
|
||||
constructor(
|
||||
@@ -63,7 +56,7 @@ class OlxCrawler {
|
||||
maxPages = 1000,
|
||||
maxResultsPerPage = 100,
|
||||
ignoredUsernames = [],
|
||||
delayBetweenPages = 500
|
||||
delayBetweenPages = 1000
|
||||
) {
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.olx.ba/pretraga?sort_order=desc&sort_po=datum";
|
||||
@@ -76,7 +69,6 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
logDebug("Starting OLX crawl");
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
|
||||
const newRealEstates = [];
|
||||
@@ -100,32 +92,14 @@ class OlxCrawler {
|
||||
const entries = singlePageResults.entries();
|
||||
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("================================");
|
||||
console.log("Category Indexer index : ", index);
|
||||
}
|
||||
|
||||
if (singlePageResult) {
|
||||
console.log("\tTotal entries : ", singlePageResult.length)
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
const { newRecords, existingRecords } = saveResults;
|
||||
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("--------------------------");
|
||||
console.log("\tNew record URLs [", newRecords.length, "] :");
|
||||
|
||||
for(const newRecord of newRecords) {
|
||||
console.log("\t\t",newRecord.url);
|
||||
}
|
||||
|
||||
console.log("\t-------------------------");
|
||||
console.log("\tExisting record URLs [", existingRecords.length, "] :");
|
||||
}
|
||||
|
||||
newRealEstates.push(...newRecords);
|
||||
|
||||
for (const existingRecord of existingRecords) {
|
||||
const { publishedDate, renewedDate, url } = existingRecord;
|
||||
const { publishedDate, renewedDate } = existingRecord;
|
||||
|
||||
const publishedDateMoment = moment.utc(publishedDate);
|
||||
const renewedDateMoment = moment.utc(renewedDate);
|
||||
@@ -135,25 +109,13 @@ class OlxCrawler {
|
||||
"minute"
|
||||
);
|
||||
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("\t\t", url);
|
||||
console.log("\t\t\tPublished date : ", publishedDate);
|
||||
console.log("\t\t\tRenewed date : ", renewedDate);
|
||||
console.log("\t\t\tIs same (up to minute) : ", stopCrawlingThisCategory);
|
||||
}
|
||||
|
||||
if (stopCrawlingThisCategory && !OLX_FORCE_CRAWL) {
|
||||
generatorsToRemove[index] = true;
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("\t\t\tStopping this category indexer");
|
||||
}
|
||||
// console.log("\tGenerator ", index + 1, "has no more new ads");
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (PRINT_CRAWLER_DEBUG) {
|
||||
console.log("\tNo more entries in this category, stopping!");
|
||||
}
|
||||
//Generator returned undefined, remove this generator from array
|
||||
generatorsToRemove[index] = true;
|
||||
// console.log("Generator ", index + 1, "has no more pages");
|
||||
@@ -178,36 +140,31 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
async *categoryIndexer(adCategory) {
|
||||
try {
|
||||
let pageToIndex = 1;
|
||||
let pageToIndex = 1;
|
||||
|
||||
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||
while (true) {
|
||||
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
urlPageToCrawl,
|
||||
this.maxResultsPerPage
|
||||
);
|
||||
const urlAdTypePart = OLX_ENUMS.OLX_AD_TYPE[this.crawlerAdTypes];
|
||||
const urlCategoryPart = OLX_ENUMS.OLX_AD_CATEGORY[adCategory];
|
||||
if (urlAdTypePart !== undefined && urlCategoryPart !== undefined) {
|
||||
while (true) {
|
||||
const urlPageToCrawl = `${this.baseUrl}${urlAdTypePart}${urlCategoryPart}&stranica=${pageToIndex}`;
|
||||
const singlePageResults = await this.indexSinglePage(
|
||||
urlPageToCrawl,
|
||||
this.maxResultsPerPage
|
||||
);
|
||||
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||
yield singlePageResults;
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
++pageToIndex;
|
||||
if (pageToIndex === this.maxPages) {
|
||||
return undefined;
|
||||
}
|
||||
if (Array.isArray(singlePageResults) && singlePageResults.length > 0) {
|
||||
yield singlePageResults;
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
++pageToIndex;
|
||||
if (pageToIndex === this.maxPages) {
|
||||
return undefined;
|
||||
}
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
} catch (e) {
|
||||
console.log('Error inside generator: ', e);
|
||||
} else {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -217,10 +174,8 @@ class OlxCrawler {
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(url, {}, false);
|
||||
logDebug("Got category results for: ", url);
|
||||
const res = await fetch(url);
|
||||
const body = await res.text();
|
||||
logDebug("Got category results text for: ", url);
|
||||
const $ = cheerio.load(body);
|
||||
let hrefs = [];
|
||||
|
||||
@@ -239,46 +194,57 @@ class OlxCrawler {
|
||||
let actualNoOfResults =
|
||||
hrefs.length <= maxResultsPerPage ? hrefs.length : maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push(hrefs[i]);
|
||||
const scrapedData = [];
|
||||
for (
|
||||
let i = 0;
|
||||
i <= actualNoOfResults;
|
||||
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentUrlsToScrape = hrefs.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
//Before sending n requests to ScraperAPI, a preflight request checks whether enough concurrent request slots are available
|
||||
//The "real" requests are not sent until that check is approved internally
|
||||
let availableConcurrentReqSlots = false;
|
||||
do {
|
||||
availableConcurrentReqSlots = await this.checkAvailableConcurrentReqSlots(
|
||||
concurrentUrlsToScrape.length
|
||||
);
|
||||
} while (availableConcurrentReqSlots !== true);
|
||||
//
|
||||
console.log(
|
||||
`OLX - Sending requests from ${i} to ${i +
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API}.`
|
||||
);
|
||||
console.log(`OLX - Urls sent to scrape: `, concurrentUrlsToScrape);
|
||||
//
|
||||
const concurrentReqScraperApi = concurrentUrlsToScrape.map(url =>
|
||||
this.scrapeAd(url)
|
||||
);
|
||||
|
||||
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
|
||||
|
||||
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
|
||||
}
|
||||
|
||||
const allChunks = chunk(asyncScraping, 2);
|
||||
const dataResults = []
|
||||
for (let i = 0; i < allChunks.length; i++) {
|
||||
const singleChunk = allChunks[i];
|
||||
const promises = singleChunk.map(c => this.scrapeAd(c))
|
||||
const chunkResults = await Promise.all(promises);
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
dataResults.push(...chunkResults);
|
||||
logDebug("Chunk results len:", chunkResults.length);
|
||||
}
|
||||
|
||||
|
||||
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
||||
logDebug("Filtered scraped data length: ", filteredScrapedData.length);
|
||||
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("Exception caught, index single page: " + e);
|
||||
console.error("Exception caught:" + e);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(url) {
|
||||
logDebug("Scraping : ", url);
|
||||
// console.log("Scraping : ", url);
|
||||
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
let status = AD_STATUS.STATUS_NORMAL;
|
||||
|
||||
if (body.indexOf('<html') === -1) {
|
||||
console.error("This is the body: ", body);
|
||||
throw { message: 'Failed to fetch page !' }
|
||||
}
|
||||
|
||||
const propertySelectors = {
|
||||
username:
|
||||
"#lg > div.desno2.profil > div:nth-child(2) > div.vrsta1.vrsta_desno > a > div.username > span",
|
||||
@@ -305,26 +271,65 @@ class OlxCrawler {
|
||||
|
||||
//====== PRICE DETECTION AND EXTRACTION =====
|
||||
let price = null;
|
||||
let normalPrice = null;
|
||||
let urgentPrice = null;
|
||||
const normalPriceValue = $("#pc > p:nth-child(2)")
|
||||
.text()
|
||||
.trim();
|
||||
const urgentPriceValue = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p"
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
//For cases where price is given in discount manner - different from default parsing
|
||||
const discountPriceValue = $(
|
||||
"#artikal_glavni_div > div.artikal_lijevo > div.op.pop > p"
|
||||
)
|
||||
.text()
|
||||
.trim();
|
||||
|
||||
const priceHeader = $("#pc > p.n").text().trim();
|
||||
const priceValue = $("#pc > p:nth-child(2)").text().trim();
|
||||
price = priceValue;
|
||||
|
||||
if (priceHeader.indexOf('Hitn') !== -1) {
|
||||
// Urgent price
|
||||
if (normalPriceValue && normalPriceValue.length > 0) {
|
||||
normalPrice = normalPriceValue
|
||||
.replace(/\r\n|\n|\r/gm, "")
|
||||
.replace("KM", "")
|
||||
.trim();
|
||||
if (
|
||||
$("#pc > p.n")
|
||||
.text()
|
||||
.indexOf("Hitna") !== -1
|
||||
) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
} else {
|
||||
status = AD_STATUS.STATUS_NORMAL;
|
||||
}
|
||||
} else if (discountPriceValue && discountPriceValue.length > 0) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
const priceValues = discountPriceValue.split("KM");
|
||||
normalPrice = priceValues[0].trim();
|
||||
} else {
|
||||
console.log("Body:", body);
|
||||
throw { message: "Can't find normal price" };
|
||||
}
|
||||
if (urgentPriceValue && urgentPriceValue.length > 0) {
|
||||
const priceValues = urgentPriceValue.replace("Cijena", "").split("KM");
|
||||
//priceValues will contain values like ["100000", "90000", ...], second element is urgent price
|
||||
if (priceValues.length > 0) {
|
||||
if (priceValues[0].trim().indexOf("Hitno") != -1) {
|
||||
urgentPrice = priceValues[0].replace("Hitno", "").trim();
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
} else {
|
||||
urgentPrice = priceValues[0].trim();
|
||||
}
|
||||
} else if (discountPriceValue && discountPriceValue.length > 0) {
|
||||
status = AD_STATUS.STATUS_URGENT;
|
||||
const priceValues = discountPriceValue.split("KM");
|
||||
urgentPrice = priceValues[1].trim();
|
||||
} else {
|
||||
throw { message: "Can't find urgent price" };
|
||||
}
|
||||
}
|
||||
|
||||
const discountPriceTag = $("#artikal_glavni_div > div.artikal_lijevo > p:nth-child(4)").text().trim();
|
||||
if (discountPriceTag.indexOf('Akcij') !== -1) {
|
||||
status = AD_STATUS.STATUS_DISCOUNTED;
|
||||
const discountPriceValues = $("#artikal_glavni_div > div.artikal_lijevo > div:nth-child(5) > p").text().trim();
|
||||
// discountPriceValues contain string like "10.000 KM 7.500 KM"
|
||||
// First price is regular, second is currently active (discounted) price
|
||||
const bothPrices = discountPriceValues.split('KM');
|
||||
// Now, currently active price is second element of bothPrices array
|
||||
price = bothPrices[1] ? bothPrices[1].trim() : null;
|
||||
}
|
||||
price = status === AD_STATUS.STATUS_URGENT ? urgentPrice : normalPrice;
|
||||
|
||||
//====== OTHER AD INFORMATION ===============
|
||||
let adType = null;
|
||||
@@ -705,13 +710,12 @@ class OlxCrawler {
|
||||
distanceToRiver,
|
||||
numberOfViewsAgency
|
||||
};
|
||||
//
|
||||
//console.log("Scraped data:", data);
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("Exception caught scrapeAd : " + e.message, "\r\nURL:", url);
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -927,8 +931,28 @@ class OlxCrawler {
|
||||
console.log("sprat = NEPOZNATO [", floorText, "]");
|
||||
return null;
|
||||
}
|
||||
async checkAvailableConcurrentReqSlots(numberOfNeededConcurrentReq) {
|
||||
try {
|
||||
const scraperApiAccountInfo = await scraperapiClient.account();
|
||||
const numberOfUsedConcurrentReq =
|
||||
scraperApiAccountInfo.concurrentRequests;
|
||||
const limitOfConcurrentReq = scraperApiAccountInfo.concurrencyLimit;
|
||||
//Buffer of requests to prevent errors with prefligh requests
|
||||
const bufferNumberOfReq = 3;
|
||||
const numberOfAvailableConcurrentReq =
|
||||
limitOfConcurrentReq - bufferNumberOfReq - numberOfUsedConcurrentReq;
|
||||
if (numberOfNeededConcurrentReq <= numberOfAvailableConcurrentReq) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} catch (err) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
// console.log("Sleep for:", ms);
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
|
||||
@@ -63,19 +63,13 @@ class ProstorCrawler {
|
||||
|
||||
async crawl() {
|
||||
const crawlAdCategories = this.crawlerAdCategories;
|
||||
const crawlAdTypes = this.crawlerAdTypes;
|
||||
if (!crawlAdCategories || !crawlAdTypes) {
|
||||
return []
|
||||
}
|
||||
|
||||
const newRealEstates = [];
|
||||
//We need session cookie to use login privileges
|
||||
const prostorCookie = await this.getCookies();
|
||||
//New tag to check if crawler logged in
|
||||
//New tag to check if crawler loged in
|
||||
const login = await this.loginForScraping(PROSTOR_LOGIN, prostorCookie);
|
||||
|
||||
const newRealEstates = [];
|
||||
//Crawl only if login was successful
|
||||
if (login) {
|
||||
if (crawlAdCategories && login) {
|
||||
const indexGenerators = [];
|
||||
for (const adCategory of crawlAdCategories) {
|
||||
indexGenerators.push(this.categoryIndexer(adCategory, prostorCookie));
|
||||
@@ -141,11 +135,6 @@ class ProstorCrawler {
|
||||
prostorCookie
|
||||
);
|
||||
|
||||
if (!Array.isArray(listOfAllRealEstates)){
|
||||
console.log('[PROSTOR] Could not find real estate JSON data, check selector !');
|
||||
return undefined;
|
||||
}
|
||||
|
||||
let elementToStartIndexFrom = 0;
|
||||
while (true) {
|
||||
const realEstatesForSinglePage = listOfAllRealEstates.slice(
|
||||
@@ -215,10 +204,6 @@ class ProstorCrawler {
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (body.indexOf('<html') === -1) {
|
||||
throw { message: 'Failed to fetch page !' }
|
||||
}
|
||||
|
||||
// link contains part of the URL in the format of : /prodaja/stan/stup/9556
|
||||
// general form is : /actionType/realEstateType/location/realEstateID
|
||||
// linkParts contains : ['', 'actionType', 'realEstateType', 'location', 'realEstateID']
|
||||
@@ -450,7 +435,7 @@ class ProstorCrawler {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const scriptElement = $(
|
||||
"body > div.content > div.container-fluid > script:nth-child(6)"
|
||||
"body > div > div.container-fluid > script:nth-child(7)"
|
||||
);
|
||||
|
||||
if (
|
||||
|
||||
@@ -159,7 +159,7 @@ class RentalCrawler {
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(url, {} , false);
|
||||
const res = await fetch(url);
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
let hrefs = [];
|
||||
@@ -202,10 +202,6 @@ class RentalCrawler {
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (body.indexOf('<html') === -1) {
|
||||
throw { message: 'Failed to fetch page !' }
|
||||
}
|
||||
|
||||
const mapElementParent = $(".box-map").parent();
|
||||
const scriptElement = $("script", mapElementParent);
|
||||
if (
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
"use strict";
|
||||
|
||||
const fetch = require("../../helpers/fetchWrapper");
|
||||
const { getUrlParams } = require("../../helpers/url");
|
||||
const cheerio = require("cheerio");
|
||||
const moment = require("moment-timezone");
|
||||
const PromisePool = require('@supercharge/promise-pool');
|
||||
|
||||
const {
|
||||
AD_TYPE,
|
||||
@@ -18,7 +16,8 @@ const {
|
||||
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG,
|
||||
DEFAULT_TIMEZONE
|
||||
DEFAULT_TIMEZONE,
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
} = require("../../config/appConfig");
|
||||
const { SALJIC_FORCE_CRAWL } = require("../specificConfigs/saljic");
|
||||
|
||||
@@ -48,13 +47,12 @@ class SaljicCrawler {
|
||||
maxPages = 5000,
|
||||
maxResultsPerPage = 5000,
|
||||
ignoredUsernames = [],
|
||||
delayBetweenPages = 500
|
||||
delayBetweenPages = 1000
|
||||
) {
|
||||
this.savers = savers;
|
||||
this.baseUrl = "https://www.saljicnekretnine.ba/v2/nekretnine_search";
|
||||
this.crawlerAdTypes = crawlerAdTypes;
|
||||
this.crawlerAdCategories = crawlerAdCategories;
|
||||
this.maxPages = maxPages
|
||||
this.maxResultsPerPage = maxResultsPerPage;
|
||||
this.delayBetweenPages = delayBetweenPages;
|
||||
}
|
||||
@@ -87,6 +85,7 @@ class SaljicCrawler {
|
||||
for (const [index, { value: singlePageResult }] of entries) {
|
||||
if (singlePageResult) {
|
||||
const saveResults = await this.saveCrawledResults(singlePageResult);
|
||||
|
||||
const { newRecords } = saveResults;
|
||||
|
||||
newRealEstates.push(...newRecords);
|
||||
@@ -160,7 +159,7 @@ class SaljicCrawler {
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await fetch(url, {}, false);
|
||||
const res = await fetch(url);
|
||||
const body = await res.text();
|
||||
const $ = cheerio.load(body);
|
||||
let hrefs = [];
|
||||
@@ -206,25 +205,32 @@ class SaljicCrawler {
|
||||
? hrefsAbs.length
|
||||
: maxResultsPerPage;
|
||||
|
||||
const asyncScraping = [];
|
||||
for (let i = 0; i < actualNoOfResults; i++) {
|
||||
asyncScraping.push([hrefsAbs[i], adTypes[i]]);
|
||||
const scrapedData = [];
|
||||
for (
|
||||
let i = 0;
|
||||
i <= actualNoOfResults;
|
||||
i = i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
) {
|
||||
const concurrentUrlsToScrape = hrefsAbs.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentAdTypesOfReq = adTypes.slice(
|
||||
i,
|
||||
i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API
|
||||
);
|
||||
|
||||
const concurrentReqScraperApi = concurrentUrlsToScrape.map(
|
||||
(url, index) => this.scrapeAd(url, concurrentAdTypesOfReq[index])
|
||||
);
|
||||
const concurrentReqData = await Promise.all(concurrentReqScraperApi);
|
||||
|
||||
concurrentReqData.forEach(reqData => scrapedData.push(reqData));
|
||||
}
|
||||
|
||||
|
||||
const filteredScrapedData = scrapedData.filter(adData => !!adData);
|
||||
|
||||
const dataResults = []
|
||||
const { scrapedData, errors } = await PromisePool
|
||||
.withConcurrency(2)
|
||||
.for(asyncScraping)
|
||||
.process(async data => {
|
||||
const result = await this.scrapeAd(...data)
|
||||
await this.sleep(this.delayBetweenPages);
|
||||
dataResults.push(result)
|
||||
return result; //TODO: this does not work, scrapedData is null, dataResults works
|
||||
})
|
||||
|
||||
const filteredScrapedData = dataResults.filter(adData => !!adData);
|
||||
return filteredScrapedData;
|
||||
} catch (e) {
|
||||
console.error("[SALJIC] Exception caught:" + e);
|
||||
@@ -232,17 +238,17 @@ class SaljicCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAd(url, adType) {
|
||||
// console.log("[SALJIC] Scraping : ", url);
|
||||
async scrapeAd(url, adTypeAttribute) {
|
||||
//console.log("[SALJIC] Scraping : ", url);
|
||||
try {
|
||||
const adPageSource = await fetch(url);
|
||||
const body = await adPageSource.text();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (body.indexOf('<html') === -1) {
|
||||
throw { message: 'Failed to fetch page !' }
|
||||
//Throws an error if the request to the Scraper API proxy wasn't successful and it responded with an error
|
||||
if (body.indexOf("<html>") === -1) {
|
||||
throw { message: "Scraper API server error." };
|
||||
}
|
||||
|
||||
// No information for status ex. PRODAN
|
||||
const status = AD_STATUS.STATUS_NORMAL;
|
||||
//Extracting agency ID from url
|
||||
@@ -250,23 +256,21 @@ class SaljicCrawler {
|
||||
? parseInt(url.substring(46, url.length))
|
||||
: null;
|
||||
|
||||
if (!agencyObjectId) {
|
||||
throw { message : 'No agency object ID - URL changed?'}
|
||||
}
|
||||
|
||||
//Extracting main properties
|
||||
const propertySelectors = {
|
||||
title:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-title > h2",
|
||||
"div.content-wrap > div.container.clearfix.wpc > div.col-md-8.nobottommargin > div.single-post.nobottommargin > div.entry.clearfix > div.entry-title > h2",
|
||||
price:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.topmargin-sm.single-product > div.product > div.product-price > ins",
|
||||
streetName:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > p",
|
||||
|
||||
descriptions:
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.toggle.toggle-bg > div.togglec >p:nth-child(1)",
|
||||
latAndLong:
|
||||
"iframe"
|
||||
"div.content-wrap > div.container > div.col-md-8.nobottommargin > div.single-post > div.entry > div.entry-content.topmargin > div.gmap.bottommargin > iframe"
|
||||
};
|
||||
|
||||
const title = $(propertySelectors.title)
|
||||
.text()
|
||||
.replace(/(\r\n|\n|\r)/gm, "")
|
||||
@@ -296,26 +300,15 @@ class SaljicCrawler {
|
||||
.trim();
|
||||
|
||||
const latAndLongSrc = $(propertySelectors.latAndLong).attr("src");
|
||||
let tmpLatLong;
|
||||
let latText;
|
||||
let longText;
|
||||
if (latAndLongSrc){
|
||||
const mapParams = getUrlParams(latAndLongSrc);
|
||||
if (mapParams) {
|
||||
if (mapParams['marker']){
|
||||
const marker = mapParams['marker'].split(',');
|
||||
latText = marker[0] ? marker[0] : undefined;
|
||||
longText = marker[1] ? marker[1] : undefined;
|
||||
}else{
|
||||
if (mapParams['mlat']) {
|
||||
latText = mapParams['mlat'];
|
||||
}
|
||||
if (mapParams['mlon']) {
|
||||
longText = mapParams['mlon'];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (latAndLongSrc && latAndLongSrc.indexOf("openstreetmap") !== -1) {
|
||||
tmpLatLong = latAndLongSrc.split("marker=")[1];
|
||||
latText = tmpLatLong.split("%2C")[0];
|
||||
longText = tmpLatLong.split("%2C")[1];
|
||||
}
|
||||
const locationLat = parseFloat(latText) || null;
|
||||
const locationLong = parseFloat(longText) || null;
|
||||
|
||||
@@ -364,6 +357,7 @@ class SaljicCrawler {
|
||||
let numberOfViewsKivi = null;
|
||||
let streetNumber = 0;
|
||||
let adStatus = status;
|
||||
let adType = adTypeAttribute;
|
||||
let shortDescription = descriptions
|
||||
? descriptions.substring(0, descriptions.indexOf("."))
|
||||
: "";
|
||||
@@ -402,7 +396,7 @@ class SaljicCrawler {
|
||||
numberOfRooms = parseInt(mainFieldValue);
|
||||
break;
|
||||
case "Broj spratova":
|
||||
numberOfFloors = this.parseNumberOfFloors(mainFieldValue);
|
||||
numberOfFloors = parseInt(mainFieldValue);
|
||||
break;
|
||||
case "Sprat":
|
||||
floor = parseInt(mainFieldValue);
|
||||
@@ -447,10 +441,8 @@ class SaljicCrawler {
|
||||
additionalField.length
|
||||
)
|
||||
.trim();
|
||||
|
||||
realEstateType = this.getAdCategoryId(categoryTmp);
|
||||
if (!realEstateType) {
|
||||
throw { message: 'No real estate type - page body not loaded correctly or page changed?' }
|
||||
}
|
||||
} else {
|
||||
switch (additionalField) {
|
||||
case "Internet":
|
||||
@@ -540,6 +532,11 @@ class SaljicCrawler {
|
||||
const region = "";
|
||||
const entity = "";
|
||||
const country = "";
|
||||
//Throws error if realEstateType is null - not read. Still dont know why?
|
||||
if (realEstateType === null) {
|
||||
console.log("Body:", body);
|
||||
throw { message: "Couldn't read real estate type." };
|
||||
}
|
||||
|
||||
const data = {
|
||||
url,
|
||||
@@ -607,8 +604,9 @@ class SaljicCrawler {
|
||||
|
||||
return data;
|
||||
} catch (e) {
|
||||
console.error("[SALJIC] Exception caught: " + e.message, "\r\nURL:", url);
|
||||
console.error("Exception caught: " + e.message, "\r\nURL:", url);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -652,21 +650,6 @@ class SaljicCrawler {
|
||||
}
|
||||
}
|
||||
|
||||
parseNumberOfFloors(numberOfFloorsText) {
|
||||
const tryNumericalValue = parseInt(numberOfFloorsText);
|
||||
if (!isNaN(tryNumericalValue)){
|
||||
return tryNumericalValue;
|
||||
}
|
||||
|
||||
// Guess number of floors based on number of + sign concatenations
|
||||
// e.g. P+S+Pt -> 3 floors
|
||||
if (typeof numberOfFloorsText === 'string' && numberOfFloorsText.indexOf('+') > 0) {
|
||||
return numberOfFloorsText.split('+').length + 1
|
||||
}
|
||||
|
||||
return null
|
||||
}
|
||||
|
||||
async sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ const { AD_CATEGORY, AD_TYPE, EMAIL_FREQUENCY } = require("../common/enums");
|
||||
|
||||
//Tag to recognize staging from development
|
||||
const stagingTag = STAGING ? "[STAGING] " : "";
|
||||
const wordOfMouthRequest = `Molimo vas <strong>recite svojim prijateljima</strong> za Kivi - što više korisnika budemo imali, moći ćemo više agencija uključiti i više nekretnina imati u bazi. Hvala!`
|
||||
|
||||
const generateEmailFooter = (searchRequestId, emailFrequencyTitle) => {
|
||||
return ` <div>Trenutno ste prijavljeni da obavještenja o novim nekretninama primate <strong>${emailFrequencyTitle.toLowerCase()} </strong>.</div>
|
||||
@@ -70,9 +69,6 @@ const generateNotificationEmail = (
|
||||
${moreRealEstates}
|
||||
</div>
|
||||
<br/>
|
||||
${wordOfMouthRequest}
|
||||
<br/>
|
||||
<br/>
|
||||
${emailFooter}`;
|
||||
};
|
||||
|
||||
@@ -136,10 +132,6 @@ const generateNewSearchRequestEmail = (searchRequest, matchingRealEstates) => {
|
||||
</div>
|
||||
${matchingRealEstates.length > 0 ? instantRealEstatesText : ""}
|
||||
<br/>
|
||||
<br/>
|
||||
${wordOfMouthRequest}
|
||||
<br/>
|
||||
<br/>
|
||||
${emailFooter}`;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,58 +1,24 @@
|
||||
const nodeFetch = require("node-fetch");
|
||||
const AbortController = require('abort-controller');
|
||||
const FetchCache = require('@sozialhelden/fetch-cache').default;
|
||||
|
||||
console.log("Fc ", FetchCache)
|
||||
|
||||
const {
|
||||
USER_AGENT,
|
||||
USE_SCRAPER_API,
|
||||
SCRAPER_API_KEY,
|
||||
SCRAPER_API_BASE_URL,
|
||||
NODE_FETCH_TIMEOUT_MS
|
||||
SCRAPER_API_KEY
|
||||
} = require("../config/appConfig");
|
||||
|
||||
/**
 * Builds a promise that resolves after a delay.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<undefined>} Promise that resolves once the timer fires.
 */
const timeout = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
|
||||
|
||||
const fetchCache = new FetchCache({
|
||||
fetch: nodeFetch,
|
||||
cacheOptions: {
|
||||
// Don't save more than 10000 responses in the cache. Allows infinite responses by default
|
||||
maximalItemCount: 10000,
|
||||
// When should the cache evict responses when its full?
|
||||
evictExceedingItemsBy: 'age', // Valid values: 'lru' or 'age'
|
||||
defaultTTL: 6 * 60 * 60 * 1000 // 6 hours
|
||||
// ...see https://github.com/sozialhelden/hamster-cache for all possible options
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
/**
 * Downloads `url`, routing the request through the scraper API proxy when
 * USE_SCRAPER_API is truthy.
 *
 * The request is aborted through an AbortController after
 * NODE_FETCH_TIMEOUT_MS milliseconds.
 *
 * @param {string} url - Target URL to download.
 * @param {Object} [options={}] - Options forwarded to the underlying fetch
 *   call. The object is shallow-copied; `headers["User-Agent"]` and `signal`
 *   are always overwritten.
 * @param {boolean} [useCache=true] - When true the request goes through
 *   `fetchCache`, otherwise straight to `nodeFetch`.
 * @returns {Promise<*>} Whatever the underlying fetch call resolves to.
 */
const fetch = async (url, options = {}, useCache = true) => {
  const controller = new AbortController();

  const newOptions = Object.assign({}, options);
  if (!newOptions["headers"]) {
    newOptions["headers"] = {};
  }
  newOptions["headers"]["User-Agent"] = USER_AGENT;
  newOptions.signal = controller.signal;

  let urlAdaptedForScraping = url;
  if (USE_SCRAPER_API) {
    // Fall back to the public Scraper API endpoint when no base URL is configured.
    const apiBaseUrl = SCRAPER_API_BASE_URL || "http://api.scraperapi.com/";
    // Hosts containing "scraperapi" get the plain URL; any other proxy gets it base64-encoded.
    const urlToFetchThroughAPI = apiBaseUrl.includes("scraperapi")
      ? url
      : Buffer.from(url).toString("base64");
    // Percent-encode the value: a raw target URL carries its own "?" and "&",
    // and base64 may contain "+", "/" and "=", all of which would otherwise
    // be misread as part of the proxy's own query string.
    // NOTE(review): assumes a self-hosted proxy percent-decodes `url` before
    // base64-decoding it - confirm against that service.
    urlAdaptedForScraping = `${apiBaseUrl}?api_key=${SCRAPER_API_KEY}&url=${encodeURIComponent(
      urlToFetchThroughAPI
    )}`;
  }

  // Arm the abort timer. It is left running after a successful response so
  // that reading the body stays bounded by the same deadline.
  const timeoutId = setTimeout(() => controller.abort(), NODE_FETCH_TIMEOUT_MS);

  try {
    return await (useCache
      ? fetchCache.fetch(urlAdaptedForScraping, newOptions)
      : nodeFetch(urlAdaptedForScraping, newOptions));
  } catch (err) {
    // Nothing is left to abort once the request itself has failed.
    clearTimeout(timeoutId);
    throw err;
  }
};
|
||||
|
||||
module.exports = fetch;
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
const {
|
||||
PRINT_CRAWLER_DEBUG
|
||||
} = require("../config/appConfig");
|
||||
|
||||
/**
 * Debug logger: forwards every argument to console.log, but only while
 * PRINT_CRAWLER_DEBUG is truthy. Otherwise it does nothing.
 *
 * @param {...*} messageParts - Values passed through to console.log unchanged.
 */
const logDebug = (...messageParts) => {
  if (!PRINT_CRAWLER_DEBUG) {
    return;
  }
  console.log(...messageParts);
};
|
||||
|
||||
module.exports = {
|
||||
logDebug
|
||||
};
|
||||
@@ -7,26 +7,6 @@ const currentSearchRequest = async req => {
|
||||
|
||||
return await getSearchRequest(searchRequestId);
|
||||
};
|
||||
|
||||
/**
 * Percent-decodes one query-string component, returning the raw text when it
 * contains a malformed escape sequence instead of throwing URIError.
 */
const decodeQueryComponent = function (raw) {
  try {
    return decodeURIComponent(raw);
  } catch (err) {
    return raw;
  }
};

/**
 * Extracts the query-string parameters of `url` into a plain object.
 *
 * Keys are kept exactly as written; values are percent-decoded. A parameter
 * without "=" maps to the empty string, only the first "=" separates key
 * from value, empty segments are skipped, and a trailing "#fragment" is not
 * part of the query.
 *
 * @param {string} url - URL (or any string) that may contain a "?" query part.
 * @returns {Object<string, string>|undefined} Parameter map, or undefined
 *   when `url` is not a non-empty string or contains no "?".
 */
const getUrlParams = function (url) {
  if (typeof url !== 'string' || url.length === 0) {
    return undefined;
  }

  const questionMarkIndex = url.indexOf('?');
  if (questionMarkIndex === -1) {
    return undefined;
  }

  // Only a "#" that follows the "?" ends the query; one before it is left alone.
  const hashIndex = url.indexOf('#', questionMarkIndex);
  const query =
    hashIndex === -1
      ? url.substring(questionMarkIndex + 1)
      : url.substring(questionMarkIndex + 1, hashIndex);

  const params = {};
  for (const segment of query.split('&')) {
    if (segment === '') {
      continue;
    }
    const separatorIndex = segment.indexOf('=');
    const key = separatorIndex === -1 ? segment : segment.substring(0, separatorIndex);
    const rawValue = separatorIndex === -1 ? '' : segment.substring(separatorIndex + 1);
    params[key] = decodeQueryComponent(rawValue);
  }
  return params;
};
|
||||
|
||||
module.exports = {
|
||||
currentSearchRequest,
|
||||
getUrlParams
|
||||
currentSearchRequest
|
||||
};
|
||||
|
||||
@@ -16,7 +16,7 @@ config.logging = parseInt(process.env.SEQUELIZE_LOGGING) ? console.log : false;
|
||||
|
||||
let sequelize;
|
||||
if (config.use_env_variable) {
|
||||
sequelize = new Sequelize(process.env[config.use_env_variable] + "?ssl=true", config);
|
||||
sequelize = new Sequelize(process.env[config.use_env_variable], config);
|
||||
} else {
|
||||
sequelize = new Sequelize(
|
||||
config.database,
|
||||
|
||||
@@ -24,8 +24,8 @@ API_MAP_KEY=(your-key-here)
|
||||
|
||||
#=============== SCRAPER API SUPPORT =============#
|
||||
USE_SCRAPER_API= To turn it on (1) or off (0)
|
||||
SCRAPER_API_KEY= Key for Scraper api
|
||||
SCRAPER_API_BASE_URL= Base url without question mark (example: http://sabur.kivi.ba:1337)
|
||||
SCRAPER_API_KEY= Key for Scraper api
|
||||
NUMBER_OF_CONCURRENT_REQ_SCRAPER_API= Number of requests to send concurrently to Scraper API proxy
|
||||
|
||||
#=============== AWS SDK EMAIL SETTINGS =======#
|
||||
AWS_KEY_ID=(your-key-here)
|
||||
@@ -37,6 +37,7 @@ SOURCE_EMAIL=info@saburly.com
|
||||
CRAWLER_INTERVAL=Interval to run crawler(s), in seconds
|
||||
STOP_CRAWLER=Non-zero value will skip crawler execution
|
||||
PRINT_CRAWLER_DEBUG_INFO=Non-zero value will print crawler debugging info to the server console
|
||||
|
||||
#==OLX==
|
||||
OLX_MAX_PAGES=Restrict crawler to this number of pages
|
||||
OLX_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||
@@ -45,6 +46,7 @@ OLX_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be
|
||||
OLX_IGNORED_USERNAMES=comma separated list of usernames to ignore
|
||||
OLX_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||
OLX_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||
|
||||
#==RENTAL==
|
||||
RENTAL_MAX_PAGES=Restrict crawler to this number of pages
|
||||
RENTAL_MAX_RESULTS_PER_PAGE=Only this number or less results from one page will be scraped and saved
|
||||
@@ -72,7 +74,6 @@ AKTIDO_IGNORED_USERNAMES=!!! This is not used for aktido crawler !!!
|
||||
AKTIDO_DELAY_BETWEEN_PAGES=time in milliseconds to wait before indexing next page
|
||||
AKTIDO_FORCE_CRAWL=Non-zero value will force crawler to crawl all pages without stopping when known real estate is found
|
||||
#==SALJIC NEKRETNINE==
|
||||
SALJIC_MAX_PAGES=Restrict crawler to this number of pages
|
||||
SALJIC_MAX_RESULTS_PER_PAGE=For Saljic crawler, this represents how many ads are crawled at once
|
||||
SALJIC_CRAWLER_AD_TYPE=enum name of what type of ads should be crawled, check common/enums.js file for valid values
|
||||
SALJIC_CRAWLER_AD_CATEGORIES=comma separated list of enum names of categories to be included, check common/enums.js file for valid values
|
||||
|
||||
9
index.js
9
index.js
@@ -4,7 +4,6 @@ const bodyParser = require("body-parser");
|
||||
const layout = require("express-layout");
|
||||
const compression = require("compression");
|
||||
const forceSSL = require("./app/helpers/forceSSL");
|
||||
const { logDebug } = require("./app/helpers/log");
|
||||
|
||||
const {
|
||||
APP_PORT,
|
||||
@@ -39,17 +38,11 @@ app.listen(APP_PORT, () =>
|
||||
|
||||
let crawlerRunning = STOP_CRAWLER;
|
||||
/**
 * Starts one crawl cycle unless `crawlerRunning` is already truthy.
 *
 * `crawlerRunning` is set while `crawlAll()` is in flight and is always
 * reset afterwards, whether the crawl succeeded or failed. On success the
 * crawl result is handed to `notifyForNewRealEstates`; on failure the error
 * is written to console.error and no notification is sent.
 */
const crawl = () => {
  logDebug("Crawl start. crawlerRunning: ", crawlerRunning);
  if (crawlerRunning) {
    return;
  }

  crawlerRunning = true;
  crawlAll()
    .then(newRealEstates => {
      logDebug("crawlAll done, new real estate len: ", newRealEstates.length);
      notifyForNewRealEstates(newRealEstates);
    })
    .catch(e => {
      console.error('Error happened: ', e);
    })
    .finally(() => {
      // The crawl result is not in scope here, so this callback only
      // releases the guard; notification happens in the success handler.
      crawlerRunning = false;
      logDebug('Finally done!');
    });
};
|
||||
|
||||
226
package-lock.json
generated
226
package-lock.json
generated
@@ -40,32 +40,6 @@
|
||||
"@sendgrid/helpers": "^6.3.0"
|
||||
}
|
||||
},
|
||||
"@sozialhelden/fetch-cache": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@sozialhelden/fetch-cache/-/fetch-cache-2.0.1.tgz",
|
||||
"integrity": "sha512-vMlsdT5JQCGjx1fcFxmMNh7ZKppjjsfUAeZEhhNwhEL7GaqbZXsD1OXEyx2IcRa25ZuZtvJSV6Q3rE77VRdLvg==",
|
||||
"requires": {
|
||||
"@sozialhelden/hamster-cache": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"@sozialhelden/hamster-cache": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/@sozialhelden/hamster-cache/-/hamster-cache-1.0.0.tgz",
|
||||
"integrity": "sha512-/TEGA8mdMawZp4Yq/GrkL+72YL5EGuSeVXC3pKW12YY1t3C+zCN/HZ0HRp4zWF/e67svXcxuz/B0AEQxEdvi7A=="
|
||||
},
|
||||
"@supercharge/goodies": {
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/@supercharge/goodies/-/goodies-1.4.0.tgz",
|
||||
"integrity": "sha512-Np6u2qjRwiA3wTgzz4n2yduydIjSXqtJWP5cOnNqjdlCR/EUAK86LAOhEcU+YW211D1ksugns3GqpARJDoXQ7g=="
|
||||
},
|
||||
"@supercharge/promise-pool": {
|
||||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/@supercharge/promise-pool/-/promise-pool-1.3.0.tgz",
|
||||
"integrity": "sha512-9/EVrJevSPEqI4i/gRH8Dt7C+FQT65wRRYuu0MDaGmSLZ2aTel0jOGu8Ae84fPiQ+Ah0B80RPFUxk+K+Cz48DA==",
|
||||
"requires": {
|
||||
"@supercharge/goodies": "~1.4.0"
|
||||
}
|
||||
},
|
||||
"@types/caseless": {
|
||||
"version": "0.12.2",
|
||||
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.2.tgz",
|
||||
@@ -105,14 +79,6 @@
|
||||
"resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz",
|
||||
"integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q=="
|
||||
},
|
||||
"abort-controller": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
|
||||
"integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
|
||||
"requires": {
|
||||
"event-target-shim": "^5.0.0"
|
||||
}
|
||||
},
|
||||
"accepts": {
|
||||
"version": "1.3.5",
|
||||
"resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
|
||||
@@ -181,6 +147,14 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"argparse": {
|
||||
"version": "1.0.10",
|
||||
"resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz",
|
||||
"integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==",
|
||||
"requires": {
|
||||
"sprintf-js": "~1.0.2"
|
||||
}
|
||||
},
|
||||
"arr-diff": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz",
|
||||
@@ -229,6 +203,21 @@
|
||||
"integrity": "sha1-WWZ/QfrdTyDMvCu5a41Pf3jsA2c=",
|
||||
"dev": true
|
||||
},
|
||||
"async": {
|
||||
"version": "2.6.3",
|
||||
"resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
|
||||
"integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
|
||||
"requires": {
|
||||
"lodash": "^4.17.14"
|
||||
},
|
||||
"dependencies": {
|
||||
"lodash": {
|
||||
"version": "4.17.15",
|
||||
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz",
|
||||
"integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A=="
|
||||
}
|
||||
}
|
||||
},
|
||||
"async-each": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/async-each/-/async-each-1.0.3.tgz",
|
||||
@@ -659,6 +648,11 @@
|
||||
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
|
||||
"integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU="
|
||||
},
|
||||
"colors": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/colors/-/colors-1.0.3.tgz",
|
||||
"integrity": "sha1-BDP0TYCWgP3rYO0mDxsMJi6CpAs="
|
||||
},
|
||||
"combined-stream": {
|
||||
"version": "1.0.7",
|
||||
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.7.tgz",
|
||||
@@ -764,6 +758,25 @@
|
||||
"resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
|
||||
"integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
|
||||
},
|
||||
"coveralls": {
|
||||
"version": "3.0.9",
|
||||
"resolved": "https://registry.npmjs.org/coveralls/-/coveralls-3.0.9.tgz",
|
||||
"integrity": "sha512-nNBg3B1+4iDox5A5zqHKzUTiwl2ey4k2o0NEcVZYvl+GOSJdKBj4AJGKLv6h3SvWch7tABHePAQOSZWM9E2hMg==",
|
||||
"requires": {
|
||||
"js-yaml": "^3.13.1",
|
||||
"lcov-parse": "^1.0.0",
|
||||
"log-driver": "^1.2.7",
|
||||
"minimist": "^1.2.0",
|
||||
"request": "^2.88.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"minimist": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz",
|
||||
"integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ="
|
||||
}
|
||||
}
|
||||
},
|
||||
"create-error-class": {
|
||||
"version": "3.0.2",
|
||||
"resolved": "https://registry.npmjs.org/create-error-class/-/create-error-class-3.0.2.tgz",
|
||||
@@ -816,6 +829,11 @@
|
||||
"resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz",
|
||||
"integrity": "sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg=="
|
||||
},
|
||||
"cycle": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/cycle/-/cycle-1.0.3.tgz",
|
||||
"integrity": "sha1-IegLK+hYD5i0aPN5QwZisEbDStI="
|
||||
},
|
||||
"d": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/d/-/d-1.0.1.tgz",
|
||||
@@ -1094,6 +1112,11 @@
|
||||
"prettier-linter-helpers": "^1.0.0"
|
||||
}
|
||||
},
|
||||
"esprima": {
|
||||
"version": "4.0.1",
|
||||
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
|
||||
"integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A=="
|
||||
},
|
||||
"etag": {
|
||||
"version": "1.8.1",
|
||||
"resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
|
||||
@@ -1108,11 +1131,6 @@
|
||||
"es5-ext": "~0.10.14"
|
||||
}
|
||||
},
|
||||
"event-target-shim": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
|
||||
"integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ=="
|
||||
},
|
||||
"events": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz",
|
||||
@@ -1313,6 +1331,11 @@
|
||||
"resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz",
|
||||
"integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU="
|
||||
},
|
||||
"eyes": {
|
||||
"version": "0.1.8",
|
||||
"resolved": "https://registry.npmjs.org/eyes/-/eyes-0.1.8.tgz",
|
||||
"integrity": "sha1-Ys8SAjTGg3hdkCNIqADvPgzCC8A="
|
||||
},
|
||||
"fast-deep-equal": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz",
|
||||
@@ -2514,6 +2537,15 @@
|
||||
"nopt": "~4.0.1"
|
||||
}
|
||||
},
|
||||
"js-yaml": {
|
||||
"version": "3.13.1",
|
||||
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz",
|
||||
"integrity": "sha512-YfbcO7jXDdyj0DGxYVSlSeQNHbD7XPWvrVWeVUujrQEoZzWJIRrCPoyk6kL6IAjAG2IolMK4T0hNUe0HOUs5Jw==",
|
||||
"requires": {
|
||||
"argparse": "^1.0.7",
|
||||
"esprima": "^4.0.0"
|
||||
}
|
||||
},
|
||||
"jsbn": {
|
||||
"version": "0.1.1",
|
||||
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
|
||||
@@ -2576,6 +2608,11 @@
|
||||
"invert-kv": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"lcov-parse": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/lcov-parse/-/lcov-parse-1.0.0.tgz",
|
||||
"integrity": "sha1-6w1GtUER68VhrLTECO+TY73I9+A="
|
||||
},
|
||||
"locate-path": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz",
|
||||
@@ -2590,6 +2627,11 @@
|
||||
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz",
|
||||
"integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg=="
|
||||
},
|
||||
"log-driver": {
|
||||
"version": "1.2.7",
|
||||
"resolved": "https://registry.npmjs.org/log-driver/-/log-driver-1.2.7.tgz",
|
||||
"integrity": "sha512-U7KCmLdqsGHBLeWqYlFA0V0Sl6P08EE1ZrmA9cxjUE0WVqT9qnyVDPz1kzpFEP0jdJuFnasWIfSd7fsaNXkpbg=="
|
||||
},
|
||||
"long-timeout": {
|
||||
"version": "0.1.1",
|
||||
"resolved": "https://registry.npmjs.org/long-timeout/-/long-timeout-0.1.1.tgz",
|
||||
@@ -3260,6 +3302,20 @@
|
||||
"integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==",
|
||||
"dev": true
|
||||
},
|
||||
"promise-request-retry": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/promise-request-retry/-/promise-request-retry-1.0.2.tgz",
|
||||
"integrity": "sha512-zZmu19chRtC6TYeAZaELF8s+Zotl48M6bRnIVjcUrObEjpI4wk+2VpGVRaRgCG6isOqsK4c5IMY7t59Ff2ia0A==",
|
||||
"requires": {
|
||||
"async": "^2.6.0",
|
||||
"bluebird": "^3.5.1",
|
||||
"coveralls": "^3.0.0",
|
||||
"req-cwd": "^2.0.0",
|
||||
"request": "^2.85.0",
|
||||
"request-promise": "^4.2.2",
|
||||
"winston": "^2.4.0"
|
||||
}
|
||||
},
|
||||
"proto-list": {
|
||||
"version": "1.2.4",
|
||||
"resolved": "https://registry.npmjs.org/proto-list/-/proto-list-1.2.4.tgz",
|
||||
@@ -3454,6 +3510,22 @@
|
||||
"integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=",
|
||||
"dev": true
|
||||
},
|
||||
"req-cwd": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/req-cwd/-/req-cwd-2.0.0.tgz",
|
||||
"integrity": "sha1-1AgrTURZgDZkD7c93qAe1T20nrw=",
|
||||
"requires": {
|
||||
"req-from": "^2.0.0"
|
||||
}
|
||||
},
|
||||
"req-from": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/req-from/-/req-from-2.0.0.tgz",
|
||||
"integrity": "sha1-10GI5H+TeW9Kpx327jWuaJ8+DnA=",
|
||||
"requires": {
|
||||
"resolve-from": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"request": {
|
||||
"version": "2.88.0",
|
||||
"resolved": "https://registry.npmjs.org/request/-/request-2.88.0.tgz",
|
||||
@@ -3493,6 +3565,32 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"request-promise": {
|
||||
"version": "4.2.5",
|
||||
"resolved": "https://registry.npmjs.org/request-promise/-/request-promise-4.2.5.tgz",
|
||||
"integrity": "sha512-ZgnepCykFdmpq86fKGwqntyTiUrHycALuGggpyCZwMvGaZWgxW6yagT0FHkgo5LzYvOaCNvxYwWYIjevSH1EDg==",
|
||||
"requires": {
|
||||
"bluebird": "^3.5.0",
|
||||
"request-promise-core": "1.1.3",
|
||||
"stealthy-require": "^1.1.1",
|
||||
"tough-cookie": "^2.3.3"
|
||||
}
|
||||
},
|
||||
"request-promise-core": {
|
||||
"version": "1.1.3",
|
||||
"resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.3.tgz",
|
||||
"integrity": "sha512-QIs2+ArIGQVp5ZYbWD5ZLCY29D5CfWizP8eWnm8FoGD1TX61veauETVQbrV60662V0oFBkrDOuaBI8XgtuyYAQ==",
|
||||
"requires": {
|
||||
"lodash": "^4.17.15"
|
||||
},
|
||||
"dependencies": {
|
||||
"lodash": {
|
||||
"version": "4.17.15",
|
||||
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz",
|
||||
"integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A=="
|
||||
}
|
||||
}
|
||||
},
|
||||
"require-directory": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
|
||||
@@ -3511,6 +3609,11 @@
|
||||
"path-parse": "^1.0.6"
|
||||
}
|
||||
},
|
||||
"resolve-from": {
|
||||
"version": "3.0.0",
|
||||
"resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-3.0.0.tgz",
|
||||
"integrity": "sha1-six699nWiBvItuZTM17rywoYh0g="
|
||||
},
|
||||
"resolve-url": {
|
||||
"version": "0.2.1",
|
||||
"resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz",
|
||||
@@ -3555,6 +3658,16 @@
|
||||
"resolved": "https://registry.npmjs.org/sax/-/sax-1.2.1.tgz",
|
||||
"integrity": "sha1-e45lYZCyKOgaZq6nSEgNgozS03o="
|
||||
},
|
||||
"scraperapi-sdk": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/scraperapi-sdk/-/scraperapi-sdk-1.0.3.tgz",
|
||||
"integrity": "sha512-wFzdVptJHAA13HWMxR6DxsesA95cx0eBvylh2CHH9UmzBYor7N54jxgL473IW1VZEferSCNpwlW2R/B3zTPDsQ==",
|
||||
"requires": {
|
||||
"promise-request-retry": "^1.0.2",
|
||||
"request": "^2.88.0",
|
||||
"request-promise": "^4.2.5"
|
||||
}
|
||||
},
|
||||
"semver": {
|
||||
"version": "5.6.0",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-5.6.0.tgz",
|
||||
@@ -3877,6 +3990,11 @@
|
||||
"extend-shallow": "^3.0.0"
|
||||
}
|
||||
},
|
||||
"sprintf-js": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
|
||||
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw="
|
||||
},
|
||||
"sshpk": {
|
||||
"version": "1.16.1",
|
||||
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz",
|
||||
@@ -3893,6 +4011,11 @@
|
||||
"tweetnacl": "~0.14.0"
|
||||
}
|
||||
},
|
||||
"stack-trace": {
|
||||
"version": "0.0.10",
|
||||
"resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
|
||||
"integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA="
|
||||
},
|
||||
"static-extend": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz",
|
||||
@@ -3919,6 +4042,11 @@
|
||||
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz",
|
||||
"integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew=="
|
||||
},
|
||||
"stealthy-require": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/stealthy-require/-/stealthy-require-1.1.1.tgz",
|
||||
"integrity": "sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks="
|
||||
},
|
||||
"string-width": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
|
||||
@@ -4390,6 +4518,26 @@
|
||||
"string-width": "^2.1.1"
|
||||
}
|
||||
},
|
||||
"winston": {
|
||||
"version": "2.4.4",
|
||||
"resolved": "https://registry.npmjs.org/winston/-/winston-2.4.4.tgz",
|
||||
"integrity": "sha512-NBo2Pepn4hK4V01UfcWcDlmiVTs7VTB1h7bgnB0rgP146bYhMxX0ypCz3lBOfNxCO4Zuek7yeT+y/zM1OfMw4Q==",
|
||||
"requires": {
|
||||
"async": "~1.0.0",
|
||||
"colors": "1.0.x",
|
||||
"cycle": "1.0.x",
|
||||
"eyes": "0.1.x",
|
||||
"isstream": "0.1.x",
|
||||
"stack-trace": "0.0.x"
|
||||
},
|
||||
"dependencies": {
|
||||
"async": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/async/-/async-1.0.0.tgz",
|
||||
"integrity": "sha1-+PwEyjoTeErenhZBr5hXjPvWR6k="
|
||||
}
|
||||
}
|
||||
},
|
||||
"wkx": {
|
||||
"version": "0.4.8",
|
||||
"resolved": "https://registry.npmjs.org/wkx/-/wkx-0.4.8.tgz",
|
||||
|
||||
@@ -17,8 +17,9 @@
|
||||
"checkup-notify": "cd app/npmScripts && node npmCheckUpNotify.js",
|
||||
"test-search": "cd test && node searchTest.js",
|
||||
"test-olx-scraper": "cd test && node olxScrapeTest.js",
|
||||
"test-saljic-scraper": "cd test && node saljicScrapeTest.js",
|
||||
"test-rental-scraper": "cd test && node rentalScrapeTest.js",
|
||||
"test-saljic-scraper": "cd test && node saljicScrapeTest.js"
|
||||
"test-scraper-api": "cd test && node scraperAPITest.js"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
@@ -32,9 +33,6 @@
|
||||
"dependencies": {
|
||||
"2checkout-node": "0.0.1",
|
||||
"@sendgrid/mail": "^6.3.1",
|
||||
"@sozialhelden/fetch-cache": "^2.0.1",
|
||||
"@supercharge/promise-pool": "^1.3.0",
|
||||
"abort-controller": "^3.0.0",
|
||||
"aws-sdk": "^2.422.0",
|
||||
"bluebird": "^3.5.5",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
@@ -54,6 +52,7 @@
|
||||
"pg": "^7.10.0",
|
||||
"prettier": "^1.19.1",
|
||||
"react-step-wizard": "^5.1.0",
|
||||
"scraperapi-sdk": "^1.0.3",
|
||||
"sequelize": "^5.18.4",
|
||||
"sequelize-cli": "^5.5.0"
|
||||
},
|
||||
|
||||
19
test/scraperAPITest.js
Normal file
19
test/scraperAPITest.js
Normal file
@@ -0,0 +1,19 @@
|
||||
const { SCRAPER_API_KEY } = require("../app/config/appConfig");
|
||||
|
||||
const scraperapiClient = require("scraperapi-sdk")(SCRAPER_API_KEY);
|
||||
|
||||
/**
 * Requests account details from the Scraper API client and prints the
 * `concurrentRequests` field of the reply together with a local timestamp.
 * Any error raised along the way is reported by logging its message.
 */
async function logUsedConcurrentReq() {
  try {
    const accountInfo = await scraperapiClient.account();
    const timestamp = new Date().toLocaleString();
    console.log(timestamp, " Number of concurrent requests: ", accountInfo.concurrentRequests);
  } catch (failure) {
    console.log(failure.message);
  }
}
|
||||
|
||||
setInterval(logUsedConcurrentReq, 1000);
|
||||
Reference in New Issue
Block a user