From 358a365303232894929bc98abf06605621ce502c Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Mon, 13 Jun 2022 23:39:43 +0200
Subject: [PATCH] Tweaks to playwright fetch code - better timeout handling

---
 changedetectionio/content_fetcher.py | 43 +++++++++++++++++-----------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 6062e263..f4b16cbb 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -287,7 +287,8 @@ class base_html_playwright(Fetcher):
 
             # Seemed to cause a connection Exception even tho I can see it connect
             # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
-            browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000)
+            # 60,000 connection timeout only
+            browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000)
 
             # Set user agent to prevent Cloudflare from blocking the browser
             # Use the default one configured in the App.py model that's passed from fetch_site_status.py
@@ -302,19 +303,24 @@ class base_html_playwright(Fetcher):
 
             page = context.new_page()
             try:
+                page.set_default_navigation_timeout(90000)
+                page.set_default_timeout(90000)
+
                # Bug - never set viewport size BEFORE page.goto
-                response = page.goto(url, timeout=timeout * 1000, wait_until='commit')
-                # Wait_until = commit
-                # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
-                # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
-                # This seemed to solve nearly all 'TimeoutErrors'
-                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
-                page.wait_for_timeout(extra_wait * 1000)
+
+                # Waits for the next navigation. Using Python context manager
+                # prevents a race condition between clicking and waiting for a navigation.
+                with page.expect_navigation():
+                    response = page.goto(url, wait_until='load')
+
             except playwright._impl._api_types.TimeoutError as e:
                 context.close()
                 browser.close()
-                raise EmptyReply(url=url, status_code=None)
+                # This can be ok, we will try to grab what we could retrieve
+                pass
             except Exception as e:
+                print ("other exception when page.goto")
+                print (str(e))
                 context.close()
                 browser.close()
                 raise PageUnloadable(url=url, status_code=None)
@@ -322,18 +328,23 @@ class base_html_playwright(Fetcher):
             if response is None:
                 context.close()
                 browser.close()
+                print ("response object was none")
+                print (str(e))
                 raise EmptyReply(url=url, status_code=None)
 
-            if len(page.content().strip()) == 0:
+            # Bug 2(?) Set the viewport size AFTER loading the page
+            page.set_viewport_size({"width": 1280, "height": 1024})            
+            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
+            time.sleep(extra_wait)
+            self.content = page.content()
+            self.status_code = response.status
+
+            if len(self.content.strip()) == 0:
                 context.close()
                 browser.close()
+                print ("Content was empty")
+                print (str(e))
                 raise EmptyReply(url=url, status_code=None)
-
-            # Bug 2(?) Set the viewport size AFTER loading the page
-            page.set_viewport_size({"width": 1280, "height": 1024})
-
-            self.status_code = response.status
-            self.content = page.content()
             self.headers = response.all_headers()
 
             if current_css_filter is not None: