From 358a365303232894929bc98abf06605621ce502c Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 13 Jun 2022 23:39:43 +0200 Subject: [PATCH] Tweaks to playwright fetch code - better timeout handling --- changedetectionio/content_fetcher.py | 43 +++++++++++++++++----------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 6062e263..f4b16cbb 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -287,7 +287,8 @@ class base_html_playwright(Fetcher): # Seemed to cause a connection Exception even tho I can see it connect # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) - browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000) + # 60,000 connection timeout only + browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000) # Set user agent to prevent Cloudflare from blocking the browser # Use the default one configured in the App.py model that's passed from fetch_site_status.py @@ -302,19 +303,24 @@ class base_html_playwright(Fetcher): page = context.new_page() try: + page.set_default_navigation_timeout(90000) + page.set_default_timeout(90000) + # Bug - never set viewport size BEFORE page.goto - response = page.goto(url, timeout=timeout * 1000, wait_until='commit') - # Wait_until = commit - # - `'commit'` - consider operation to be finished when network response is received and the document started loading. - # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds - # This seemed to solve nearly all 'TimeoutErrors' - extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay - page.wait_for_timeout(extra_wait * 1000) + + # Waits for the next navigation. Using Python context manager + # prevents a race condition between clicking and waiting for a navigation. + with page.expect_navigation(): + response = page.goto(url, wait_until='load') + except playwright._impl._api_types.TimeoutError as e: context.close() browser.close() - raise EmptyReply(url=url, status_code=None) + # This can be ok, we will try to grab what we could retrieve + pass except Exception as e: + print ("other exception when page.goto") + print (str(e)) context.close() browser.close() raise PageUnloadable(url=url, status_code=None) @@ -322,18 +328,23 @@ class base_html_playwright(Fetcher): if response is None: context.close() browser.close() + print ("response object was none") + print (str(e)) raise EmptyReply(url=url, status_code=None) - if len(page.content().strip()) == 0: + # Bug 2(?) Set the viewport size AFTER loading the page + page.set_viewport_size({"width": 1280, "height": 1024}) + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay + time.sleep(extra_wait) + self.content = page.content() + self.status_code = response.status + + if len(self.content.strip()) == 0: context.close() browser.close() + print ("Content was empty") + print (str(e)) raise EmptyReply(url=url, status_code=None) - - # Bug 2(?) Set the viewport size AFTER loading the page - page.set_viewport_size({"width": 1280, "height": 1024}) - - self.status_code = response.status - self.content = page.content() self.headers = response.all_headers() if current_css_filter is not None: