From bd5bbd8c5c6f0ce4ebd6001bc6625ed5e447a4f0 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 12 Feb 2024 14:18:15 +0100 Subject: [PATCH] Add time limit to puppeteer fetch --- .../content_fetchers/exceptions/__init__.py | 7 +++++ .../content_fetchers/puppeteer.py | 29 ++++++++++++------- changedetectionio/update_worker.py | 6 ++++ 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/changedetectionio/content_fetchers/exceptions/__init__.py b/changedetectionio/content_fetchers/exceptions/__init__.py index fa9f3df7..00752a3e 100644 --- a/changedetectionio/content_fetchers/exceptions/__init__.py +++ b/changedetectionio/content_fetchers/exceptions/__init__.py @@ -36,6 +36,13 @@ class BrowserConnectError(Exception): logger.error(f"Browser connection error {msg}") return +class BrowserFetchTimedOut(Exception): + msg = '' + def __init__(self, msg): + self.msg = msg + logger.error(f"Browser processing took too long - {msg}") + return + class BrowserStepsStepException(Exception): def __init__(self, step_n, original_e): self.step_n = step_n diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index e8058ab0..64d06ee7 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -6,7 +6,7 @@ from urllib.parse import urlparse from loguru import logger from changedetectionio.content_fetchers.base import Fetcher -from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, BrowserConnectError +from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError class fetcher(Fetcher): @@ -221,14 +221,21 @@ class fetcher(Fetcher): def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False, current_include_filters=None, is_binary=False): + #@todo make 
update_worker async which could run any of these content_fetchers within memory and time constraints + max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180)) + # This will work in 3.10 but not >= 3.11 because 3.11 wants tasks only - asyncio.run(self.main( - url=url, - timeout=timeout, - request_headers=request_headers, - request_body=request_body, - request_method=request_method, - ignore_status_codes=ignore_status_codes, - current_include_filters=current_include_filters, - is_binary=is_binary - )) + try: + asyncio.run(asyncio.wait_for(self.main( + url=url, + timeout=timeout, + request_headers=request_headers, + request_body=request_body, + request_method=request_method, + ignore_status_codes=ignore_status_codes, + current_include_filters=current_include_filters, + is_binary=is_binary + ), timeout=max_time)) + except asyncio.TimeoutError: + raise BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.") + diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 819ef4b6..4f0bda8a 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -369,6 +369,12 @@ class update_worker(threading.Thread): } ) process_changedetection_results = False + except content_fetchers.exceptions.BrowserFetchTimedOut as e: + self.datastore.update_watch(uuid=uuid, + update_obj={'last_error': e.msg + } + ) + process_changedetection_results = False except content_fetchers.exceptions.BrowserStepsStepException as e: if not self.datastore.data['watching'].get(uuid):