From bd5bbd8c5c6f0ce4ebd6001bc6625ed5e447a4f0 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 12 Feb 2024 14:18:15 +0100 Subject: [PATCH] Add time limit to puppeteer fetch --- .../content_fetchers/exceptions/__init__.py | 7 +++++ .../content_fetchers/puppeteer.py | 29 ++++++++++++------- changedetectionio/update_worker.py | 6 ++++ 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/changedetectionio/content_fetchers/exceptions/__init__.py b/changedetectionio/content_fetchers/exceptions/__init__.py index fa9f3df7..00752a3e 100644 --- a/changedetectionio/content_fetchers/exceptions/__init__.py +++ b/changedetectionio/content_fetchers/exceptions/__init__.py @@ -36,6 +36,13 @@ class BrowserConnectError(Exception): logger.error(f"Browser connection error {msg}") return +class BrowserFetchTimedOut(Exception): + msg = '' + def __init__(self, msg): + self.msg = msg + logger.error(f"Browser processing took too long - {msg}") + return + class BrowserStepsStepException(Exception): def __init__(self, step_n, original_e): self.step_n = step_n diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index e8058ab0..64d06ee7 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -6,7 +6,7 @@ from urllib.parse import urlparse from loguru import logger from changedetectionio.content_fetchers.base import Fetcher -from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, BrowserConnectError +from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError class fetcher(Fetcher): @@ -221,14 +221,21 @@ class fetcher(Fetcher): def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False, current_include_filters=None, is_binary=False): + #@todo make 
update_worker async which could run any of these content_fetchers within memory and time constraints + max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)) + # This will work in 3.10 but not >= 3.11 because 3.11 wants tasks only - asyncio.run(self.main( - url=url, - timeout=timeout, - request_headers=request_headers, - request_body=request_body, - request_method=request_method, - ignore_status_codes=ignore_status_codes, - current_include_filters=current_include_filters, - is_binary=is_binary - )) + try: + asyncio.run(asyncio.wait_for(self.main( + url=url, + timeout=timeout, + request_headers=request_headers, + request_body=request_body, + request_method=request_method, + ignore_status_codes=ignore_status_codes, + current_include_filters=current_include_filters, + is_binary=is_binary + ), timeout=max_time)) + except asyncio.TimeoutError: + raise BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.") + diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 819ef4b6..4f0bda8a 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -369,6 +369,12 @@ class update_worker(threading.Thread): } ) process_changedetection_results = False + except content_fetchers.exceptions.BrowserFetchTimedOut as e: + self.datastore.update_watch(uuid=uuid, + update_obj={'last_error': e.msg + } + ) + process_changedetection_results = False except content_fetchers.exceptions.BrowserStepsStepException as e: if not self.datastore.data['watching'].get(uuid):