diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index a0119e91..19b58d39 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -42,7 +42,7 @@ class BrowserStepsStepTimout(Exception): class PageUnloadable(Exception): - def __init__(self, status_code, url, screenshot=False, message=False): + def __init__(self, status_code, url, message, screenshot=False): # Set this so we can use it in other parts of the app self.status_code = status_code self.url = url @@ -299,23 +299,34 @@ class base_html_playwright(Fetcher): if len(request_headers): context.set_extra_http_headers(request_headers) - try: self.page.set_default_navigation_timeout(90000) self.page.set_default_timeout(90000) # Listen for all console events and handle errors self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) - # Bug - never set viewport size BEFORE page.goto - - - # Waits for the next navigation. Using Python context manager - # prevents a race condition between clicking and waiting for a navigation. - response = self.page.goto(url, wait_until='commit') + # Goto page + try: # Wait_until = commit # - `'commit'` - consider operation to be finished when network response is received and the document started loading. # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds # This seemed to solve nearly all 'TimeoutErrors' + response = self.page.goto(url, wait_until='commit') + except playwright._impl._api_types.Error as e: + # Retry once - https://github.com/browserless/chrome/issues/2485 + # Sometimes errors related to invalid cert's and other can be random + print ("Content Fetcher > retrying request got error - ", str(e)) + time.sleep(1) + response = self.page.goto(url, wait_until='commit') + + except Exception as e: + print ("Content Fetcher > Other exception when page.goto", str(e)) + context.close() + browser.close() + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + # Execute any browser steps + try: extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay self.page.wait_for_timeout(extra_wait * 1000) @@ -328,17 +339,15 @@ class base_html_playwright(Fetcher): # This can be ok, we will try to grab what we could retrieve pass except Exception as e: - print ("other exception when page.goto") - print (str(e)) + print ("Content Fetcher > Other exception when executing custom JS code", str(e)) context.close() browser.close() - raise PageUnloadable(url=url, status_code=None) - + raise PageUnloadable(url=url, status_code=None, message=str(e)) if response is None: context.close() browser.close() - print ("response object was none") + print ("Content Fetcher > Response object was none") raise EmptyReply(url=url, status_code=None) # Bug 2(?) Set the viewport size AFTER loading the page @@ -357,7 +366,7 @@ class base_html_playwright(Fetcher): if len(self.page.content().strip()) == 0: context.close() browser.close() - print ("Content was empty") + print ("Content Fetcher > Content was empty") raise EmptyReply(url=url, status_code=None) # Bug 2(?) Set the viewport size AFTER loading the page @@ -502,7 +511,7 @@ class base_html_webdriver(Fetcher): try: self.driver.quit() except Exception as e: - print("Exception in chrome shutdown/quit" + str(e)) + print("Content Fetcher > Exception in chrome shutdown/quit" + str(e)) # "html_requests" is listed as the default fetcher in store.py!