From 698acb010a818aee4d7750972f93b09f11717e7f Mon Sep 17 00:00:00 2001 From: = Date: Tue, 15 Sep 2020 01:27:20 -0700 Subject: [PATCH] Add timeout to fetch wrapper --- app/crawler/specificCrawlers/olx.js | 5 +++-- app/helpers/fetchWrapper.js | 10 +++++++++- package-lock.json | 13 +++++++++++++ package.json | 5 +++-- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/app/crawler/specificCrawlers/olx.js b/app/crawler/specificCrawlers/olx.js index 31004fe..6ee1963 100644 --- a/app/crawler/specificCrawlers/olx.js +++ b/app/crawler/specificCrawlers/olx.js @@ -220,6 +220,7 @@ class OlxCrawler { const res = await fetch(url); logDebug("Got category results for: ", url); const body = await res.text(); + logDebug("Got category results text for: ", url); const $ = cheerio.load(body); let hrefs = []; @@ -260,7 +261,7 @@ class OlxCrawler { return filteredScrapedData; } catch (e) { - console.error("Exception caught:" + e); + console.error("Exception caught, index single page: " + e); return []; } } @@ -709,7 +710,7 @@ class OlxCrawler { return data; } catch (e) { - console.error("Exception caught: " + e.message, "\r\nURL:", url); + console.error("Exception caught scrapeAd : " + e.message, "\r\nURL:", url); } return null; } diff --git a/app/helpers/fetchWrapper.js b/app/helpers/fetchWrapper.js index f439a47..0c0e439 100644 --- a/app/helpers/fetchWrapper.js +++ b/app/helpers/fetchWrapper.js @@ -1,4 +1,5 @@ const nodeFetch = require("node-fetch"); +const AbortController = require('abort-controller'); const { USER_AGENT, USE_SCRAPER_API, @@ -11,10 +12,15 @@ const timeout = (ms) => { } const fetch = async (url, options = {}) => { + const controller = new AbortController(); + const newOptions = Object.assign({}, options); if (!newOptions["headers"]) { newOptions["headers"] = {}; } + + newOptions.signal = controller.signal; + // newOptions["headers"]["User-Agent"] = USER_AGENT; let urlToFetchThroughAPI = Buffer.from(url).toString('base64'); @@ -25,7 +31,9 @@ const fetch = async (url, options = {}) => { const urlAdaptedForScraping = USE_SCRAPER_API ? `${SCRAPER_API_BASE_URL}?api_key=${SCRAPER_API_KEY}&url=${urlToFetchThroughAPI}` : url; - return nodeFetch(urlAdaptedForScraping, newOptions); + const result = nodeFetch(urlAdaptedForScraping, newOptions); + const timeoutId = setTimeout(() => controller.abort(), 5000); + return result; }; module.exports = fetch; diff --git a/package-lock.json b/package-lock.json index 7bee37e..2baa071 100644 --- a/package-lock.json +++ b/package-lock.json @@ -92,6 +92,14 @@ "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", "integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==" }, + "abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "requires": { + "event-target-shim": "^5.0.0" + } + }, "accepts": { "version": "1.3.5", "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz", @@ -1087,6 +1095,11 @@ "es5-ext": "~0.10.14" } }, + "event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==" + }, "events": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz", diff --git a/package.json b/package.json index eb716a4..cd8ca94 100644 --- a/package.json +++ b/package.json @@ -32,6 +32,8 @@ "dependencies": { "2checkout-node": "0.0.1", "@sendgrid/mail": "^6.3.1", + "@supercharge/promise-pool": "^1.3.0", + "abort-controller": "^3.0.0", "aws-sdk": "^2.422.0", "bluebird": "^3.5.5", "cheerio": "^1.0.0-rc.2", @@ -52,8 +54,7 @@ "prettier": "^1.19.1", "react-step-wizard": "^5.1.0", "sequelize": "^5.18.4", - "sequelize-cli": "^5.5.0", - "@supercharge/promise-pool": "^1.3.0" + "sequelize-cli": "^5.5.0" }, "devDependencies": { "nodemon": "^1.19.0"