Playwright - Better error reporting and re-try fetch on fail once (#1238)

pull/1242/head
dgtlmoon 2 years ago committed by GitHub
parent b7a2501d64
commit 0c380c170f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -42,7 +42,7 @@ class BrowserStepsStepTimout(Exception):
class PageUnloadable(Exception): class PageUnloadable(Exception):
def __init__(self, status_code, url, screenshot=False, message=False): def __init__(self, status_code, url, message, screenshot=False):
# Set this so we can use it in other parts of the app # Set this so we can use it in other parts of the app
self.status_code = status_code self.status_code = status_code
self.url = url self.url = url
@ -299,23 +299,34 @@ class base_html_playwright(Fetcher):
if len(request_headers): if len(request_headers):
context.set_extra_http_headers(request_headers) context.set_extra_http_headers(request_headers)
try:
self.page.set_default_navigation_timeout(90000) self.page.set_default_navigation_timeout(90000)
self.page.set_default_timeout(90000) self.page.set_default_timeout(90000)
# Listen for all console events and handle errors # Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
# Bug - never set viewport size BEFORE page.goto # Goto page
try:
# Waits for the next navigation. Using Python context manager
# prevents a race condition between clicking and waiting for a navigation.
response = self.page.goto(url, wait_until='commit')
# Wait_until = commit # Wait_until = commit
# - `'commit'` - consider operation to be finished when network response is received and the document started loading. # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors' # This seemed to solve nearly all 'TimeoutErrors'
response = self.page.goto(url, wait_until='commit')
except playwright._impl._api_types.Error as e:
# Retry once - https://github.com/browserless/chrome/issues/2485
# Sometimes errors related to invalid cert's and other can be random
print ("Content Fetcher > retrying request got error - ", str(e))
time.sleep(1)
response = self.page.goto(url, wait_until='commit')
except Exception as e:
print ("Content Fetcher > Other exception when page.goto", str(e))
context.close()
browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e))
# Execute any browser steps
try:
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
self.page.wait_for_timeout(extra_wait * 1000) self.page.wait_for_timeout(extra_wait * 1000)
@ -328,17 +339,15 @@ class base_html_playwright(Fetcher):
# This can be ok, we will try to grab what we could retrieve # This can be ok, we will try to grab what we could retrieve
pass pass
except Exception as e: except Exception as e:
print ("other exception when page.goto") print ("Content Fetcher > Other exception when executing custom JS code", str(e))
print (str(e))
context.close() context.close()
browser.close() browser.close()
raise PageUnloadable(url=url, status_code=None) raise PageUnloadable(url=url, status_code=None, message=str(e))
if response is None: if response is None:
context.close() context.close()
browser.close() browser.close()
print ("response object was none") print ("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)
# Bug 2(?) Set the viewport size AFTER loading the page # Bug 2(?) Set the viewport size AFTER loading the page
@ -357,7 +366,7 @@ class base_html_playwright(Fetcher):
if len(self.page.content().strip()) == 0: if len(self.page.content().strip()) == 0:
context.close() context.close()
browser.close() browser.close()
print ("Content was empty") print ("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)
# Bug 2(?) Set the viewport size AFTER loading the page # Bug 2(?) Set the viewport size AFTER loading the page
@ -502,7 +511,7 @@ class base_html_webdriver(Fetcher):
try: try:
self.driver.quit() self.driver.quit()
except Exception as e: except Exception as e:
print("Exception in chrome shutdown/quit" + str(e)) print("Content Fetcher > Exception in chrome shutdown/quit" + str(e))
# "html_requests" is listed as the default fetcher in store.py! # "html_requests" is listed as the default fetcher in store.py!

Loading…
Cancel
Save