Playwright fetcher - more reliable by just waiting an arbitrary number of seconds after the last network IO

pull/588/head
dgtlmoon 3 years ago
parent dc6f76ba64
commit d4b5237103

@ -118,6 +118,7 @@ class base_html_playwright(Fetcher):
ignore_status_codes=False): ignore_status_codes=False):
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from playwright._impl._api_types import Error, TimeoutError
with sync_playwright() as p: with sync_playwright() as p:
browser_type = getattr(p, self.browser_type) browser_type = getattr(p, self.browser_type)
@ -134,10 +135,16 @@ class base_html_playwright(Fetcher):
) )
page = context.new_page() page = context.new_page()
page.set_viewport_size({"width": 1280, "height": 1024}) page.set_viewport_size({"width": 1280, "height": 1024})
response = page.goto(url, timeout=timeout * 1000) try:
response = page.goto(url, timeout=timeout * 1000, wait_until='commit')
# wait_until='commit':
# - `'commit'` - consider the operation finished when the network response is received and the document has started loading.
# Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
# This seemed to solve nearly all 'TimeoutErrors'
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
page.wait_for_timeout(extra_wait * 1000) page.wait_for_timeout(extra_wait * 1000)
except playwright._impl._api_types.TimeoutError as e:
raise EmptyReply(url=url, status_code=None)
if response is None: if response is None:
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)

Loading…
Cancel
Save