From 131536d9fbb46ce0e4d376e420296c72f6e635c2 Mon Sep 17 00:00:00 2001 From: Naida Vatric Date: Fri, 6 Mar 2020 14:25:02 +0100 Subject: [PATCH] Olx added preflight check of available concurrent req. --- app/crawler/specificCrawlers/olx.js | 52 ++++---- package-lock.json | 187 ++++++++++++++++++++++++++++ package.json | 4 +- test/scraperAPITest.js | 19 +++ 4 files changed, 239 insertions(+), 23 deletions(-) create mode 100644 test/scraperAPITest.js diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 75293f2..2cf6d5e 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -19,7 +19,8 @@ const { const { DEFAULT_TIMEZONE, PRINT_CRAWLER_DEBUG, - NUMBER_OF_CONCURRENT_REQ_SCRAPER_API + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API, + SCRAPER_API_KEY } = require("../../config/appConfig"); const OLX_ENUMS = { @@ -45,6 +46,8 @@ const OLX_ENUMS = { const { OLX_FORCE_CRAWL } = require("../specificConfigs/olx"); +const scraperapiClient = require("scraperapi-sdk")(SCRAPER_API_KEY); + class OlxCrawler { constructor( savers = [], @@ -201,6 +204,14 @@ class OlxCrawler { i, i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API ); + //Before it send n req to scraperAPI it send preflight request to check if we have enough concurrent req availabe + //It does not send "real" req until approven internaly + let availableConcurrentReqSlots = false; + do { + availableConcurrentReqSlots = await this.checkAvailableConcurrentReqSlots( + concurrentUrlsToScrape.length + ); + } while (availableConcurrentReqSlots !== true); // console.log( `OLX - Sending requests from ${i} to ${i + @@ -920,28 +931,25 @@ class OlxCrawler { console.log("sprat = NEPOZNATO [", floorText, "]"); return null; } - /* - async consecutiveRequestSending(requestsToScraperApi) { - let dataFromAllRequests = []; - - for ( - const i = 0; - i <= requestsToScraperApi.length; - i + NUMBER_OF_CONCURRENT_REQ_SCRAPER_API - ) { - const concurrentRequestsToScraperApi = requestsToScraperApi.slice( - i, - NUMBER_OF_CONCURRENT_REQ_SCRAPER_API - ); - const dataFromConcurrentRequest = await Promise.all( - concurrentRequestsToScraperApi - ); - dataFromAllRequests.push(dataFromConcurrentRequest); - this.sleep(DELAY_BETWEEN_REQ_SCRAPER_API); + async checkAvailableConcurrentReqSlots(numberOfNeededConcurrentReq) { + try { + const scraperApiAccountInfo = await scraperapiClient.account(); + const numberOfUsedConcurrentReq = + scraperApiAccountInfo.concurrentRequests; + const limitOfConcurrentReq = scraperApiAccountInfo.concurrencyLimit; + //Buffer of requests to prevent errors with prefligh requests + const bufferNumberOfReq = 3; + const numberOfAvailableConcurrentReq = + limitOfConcurrentReq - bufferNumberOfReq - numberOfUsedConcurrentReq; + if (numberOfNeededConcurrentReq <= numberOfAvailableConcurrentReq) { + return true; + } else { + return false; + } + } catch (err) { + return false; } - - return dataFromAllRequests; - }*/ + } async sleep(ms) { // console.log("Sleep for:", ms); diff --git a/package-lock.json b/package-lock.json index 4626180..5304e72 100644 --- a/package-lock.json +++ b/package-lock.json @@ -147,6 +147,14 @@ } } }, + "argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "requires": { + "sprintf-js": "~1.0.2" + } + }, "arr-diff": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", @@ -195,6 +203,21 @@ "integrity": "sha1-WWZ/QfrdTyDMvCu5a41Pf3jsA2c=", "dev": true }, + "async": { + "version": "2.6.3", + "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz", + "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==", + "requires": { + "lodash": "^4.17.14" + }, + "dependencies": { + "lodash": { + "version": "4.17.15", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz", + "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==" + } + } + }, "async-each": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/async-each/-/async-each-1.0.3.tgz", @@ -625,6 +648,11 @@ "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=" }, + "colors": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.0.3.tgz", + "integrity": "sha1-BDP0TYCWgP3rYO0mDxsMJi6CpAs=" + }, "combined-stream": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.7.tgz", @@ -730,6 +758,25 @@ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" }, + "coveralls": { + "version": "3.0.9", + "resolved": "https://registry.npmjs.org/coveralls/-/coveralls-3.0.9.tgz", + "integrity": "sha512-nNBg3B1+4iDox5A5zqHKzUTiwl2ey4k2o0NEcVZYvl+GOSJdKBj4AJGKLv6h3SvWch7tABHePAQOSZWM9E2hMg==", + "requires": { + "js-yaml": "^3.13.1", + "lcov-parse": "^1.0.0", + "log-driver": "^1.2.7", + "minimist": "^1.2.0", + "request": "^2.88.0" + }, + "dependencies": { + "minimist": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", + "integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=" + } + } + }, "create-error-class": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/create-error-class/-/create-error-class-3.0.2.tgz", @@ -782,6 +829,11 @@ "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.3.tgz", "integrity": "sha512-a+EPoD+uZiNfh+5fxw2nO9QwFa6nJe2Or35fGY6Ipw1R3R4AGz1d1TEZrCegvw2YTmZ0jXirGYlzxxpYSHwpEg==" }, + "cycle": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/cycle/-/cycle-1.0.3.tgz", + "integrity": "sha1-IegLK+hYD5i0aPN5QwZisEbDStI=" + }, "d": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/d/-/d-1.0.1.tgz", @@ -1060,6 +1112,11 @@ "prettier-linter-helpers": "^1.0.0" } }, + "esprima": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", + "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==" + }, "etag": { "version": "1.8.1", "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", @@ -1274,6 +1331,11 @@ "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=" }, + "eyes": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/eyes/-/eyes-0.1.8.tgz", + "integrity": "sha1-Ys8SAjTGg3hdkCNIqADvPgzCC8A=" + }, "fast-deep-equal": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", @@ -2475,6 +2537,15 @@ "nopt": "~4.0.1" } }, + "js-yaml": { + "version": "3.13.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz", + "integrity": "sha512-YfbcO7jXDdyj0DGxYVSlSeQNHbD7XPWvrVWeVUujrQEoZzWJIRrCPoyk6kL6IAjAG2IolMK4T0hNUe0HOUs5Jw==", + "requires": { + "argparse": "^1.0.7", + "esprima": "^4.0.0" + } + }, "jsbn": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", @@ -2537,6 +2608,11 @@ "invert-kv": "^2.0.0" } }, + "lcov-parse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/lcov-parse/-/lcov-parse-1.0.0.tgz", + "integrity": "sha1-6w1GtUER68VhrLTECO+TY73I9+A=" + }, "locate-path": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", @@ -2551,6 +2627,11 @@ "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz", "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg==" }, + "log-driver": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/log-driver/-/log-driver-1.2.7.tgz", + "integrity": "sha512-U7KCmLdqsGHBLeWqYlFA0V0Sl6P08EE1ZrmA9cxjUE0WVqT9qnyVDPz1kzpFEP0jdJuFnasWIfSd7fsaNXkpbg==" + }, "long-timeout": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/long-timeout/-/long-timeout-0.1.1.tgz", @@ -3221,6 +3302,20 @@ "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", "dev": true }, + "promise-request-retry": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/promise-request-retry/-/promise-request-retry-1.0.2.tgz", + "integrity": "sha512-zZmu19chRtC6TYeAZaELF8s+Zotl48M6bRnIVjcUrObEjpI4wk+2VpGVRaRgCG6isOqsK4c5IMY7t59Ff2ia0A==", + "requires": { + "async": "^2.6.0", + "bluebird": "^3.5.1", + "coveralls": "^3.0.0", + "req-cwd": "^2.0.0", + "request": "^2.85.0", + "request-promise": "^4.2.2", + "winston": "^2.4.0" + } + }, "proto-list": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/proto-list/-/proto-list-1.2.4.tgz", @@ -3415,6 +3510,22 @@ "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=", "dev": true }, + "req-cwd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/req-cwd/-/req-cwd-2.0.0.tgz", + "integrity": "sha1-1AgrTURZgDZkD7c93qAe1T20nrw=", + "requires": { + "req-from": "^2.0.0" + } + }, + "req-from": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/req-from/-/req-from-2.0.0.tgz", + "integrity": "sha1-10GI5H+TeW9Kpx327jWuaJ8+DnA=", + "requires": { + "resolve-from": "^3.0.0" + } + }, "request": { "version": "2.88.0", "resolved": "https://registry.npmjs.org/request/-/request-2.88.0.tgz", @@ -3454,6 +3565,32 @@ } } }, + "request-promise": { + "version": "4.2.5", + "resolved": "https://registry.npmjs.org/request-promise/-/request-promise-4.2.5.tgz", + "integrity": "sha512-ZgnepCykFdmpq86fKGwqntyTiUrHycALuGggpyCZwMvGaZWgxW6yagT0FHkgo5LzYvOaCNvxYwWYIjevSH1EDg==", + "requires": { + "bluebird": "^3.5.0", + "request-promise-core": "1.1.3", + "stealthy-require": "^1.1.1", + "tough-cookie": "^2.3.3" + } + }, + "request-promise-core": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/request-promise-core/-/request-promise-core-1.1.3.tgz", + "integrity": "sha512-QIs2+ArIGQVp5ZYbWD5ZLCY29D5CfWizP8eWnm8FoGD1TX61veauETVQbrV60662V0oFBkrDOuaBI8XgtuyYAQ==", + "requires": { + "lodash": "^4.17.15" + }, + "dependencies": { + "lodash": { + "version": "4.17.15", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz", + "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A==" + } + } + }, "require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -3472,6 +3609,11 @@ "path-parse": "^1.0.6" } }, + "resolve-from": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-3.0.0.tgz", + "integrity": "sha1-six699nWiBvItuZTM17rywoYh0g=" + }, "resolve-url": { "version": "0.2.1", "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz", @@ -3516,6 +3658,16 @@ "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.1.tgz", "integrity": "sha1-e45lYZCyKOgaZq6nSEgNgozS03o=" }, + "scraperapi-sdk": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/scraperapi-sdk/-/scraperapi-sdk-1.0.3.tgz", + "integrity": "sha512-wFzdVptJHAA13HWMxR6DxsesA95cx0eBvylh2CHH9UmzBYor7N54jxgL473IW1VZEferSCNpwlW2R/B3zTPDsQ==", + "requires": { + "promise-request-retry": "^1.0.2", + "request": "^2.88.0", + "request-promise": "^4.2.5" + } + }, "semver": { "version": "5.6.0", "resolved": "https://registry.npmjs.org/semver/-/semver-5.6.0.tgz", @@ -3838,6 +3990,11 @@ "extend-shallow": "^3.0.0" } }, + "sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=" + }, "sshpk": { "version": "1.16.1", "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz", @@ -3854,6 +4011,11 @@ "tweetnacl": "~0.14.0" } }, + "stack-trace": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", + "integrity": "sha1-VHxws0fo0ytOEI6hoqFZ5f3eGcA=" + }, "static-extend": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz", @@ -3880,6 +4042,11 @@ "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz", "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew==" }, + "stealthy-require": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/stealthy-require/-/stealthy-require-1.1.1.tgz", + "integrity": "sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks=" + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", @@ -4351,6 +4518,26 @@ "string-width": "^2.1.1" } }, + "winston": { + "version": "2.4.4", + "resolved": "https://registry.npmjs.org/winston/-/winston-2.4.4.tgz", + "integrity": "sha512-NBo2Pepn4hK4V01UfcWcDlmiVTs7VTB1h7bgnB0rgP146bYhMxX0ypCz3lBOfNxCO4Zuek7yeT+y/zM1OfMw4Q==", + "requires": { + "async": "~1.0.0", + "colors": "1.0.x", + "cycle": "1.0.x", + "eyes": "0.1.x", + "isstream": "0.1.x", + "stack-trace": "0.0.x" + }, + "dependencies": { + "async": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/async/-/async-1.0.0.tgz", + "integrity": "sha1-+PwEyjoTeErenhZBr5hXjPvWR6k=" + } + } + }, "wkx": { "version": "0.4.8", "resolved": "https://registry.npmjs.org/wkx/-/wkx-0.4.8.tgz", diff --git a/package.json b/package.json index 96d5274..c4772f2 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,8 @@ "test-search": "cd test && node searchTest.js", "test-olx-scraper": "cd test && node olxScrapeTest.js", "test-saljic-scraper": "cd test && node saljicScrapeTest.js", - "test-rental-scraper": "cd test && node rentalScrapeTest.js" + "test-rental-scraper": "cd test && node rentalScrapeTest.js", + "test-scraper-api": "cd test && node scraperAPITest.js" }, "repository": { "type": "git", @@ -51,6 +52,7 @@ "pg": "^7.10.0", "prettier": "^1.19.1", "react-step-wizard": "^5.1.0", + "scraperapi-sdk": "^1.0.3", "sequelize": "^5.18.4", "sequelize-cli": "^5.5.0" }, diff --git a/test/scraperAPITest.js b/test/scraperAPITest.js new file mode 100644 index 0000000..44026a5 --- /dev/null +++ b/test/scraperAPITest.js @@ -0,0 +1,19 @@ +const { SCRAPER_API_KEY } = require("../app/config/appConfig"); + +const scraperapiClient = require("scraperapi-sdk")(SCRAPER_API_KEY); + +async function logUsedConcurrentReq() { + try { + const response = await scraperapiClient.account(); + const dateOfLog = new Date().toLocaleString(); + console.log( + dateOfLog, + " Number of concurrent requests: ", + response.concurrentRequests + ); + } catch (err) { + console.log(err.message); + } +} + +setInterval(logUsedConcurrentReq, 1000);