From cbb70ada943d8f846229146bb2f42a95d2937860 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 23 Oct 2023 23:14:45 +0200 Subject: [PATCH] Going to a page was doing some different logic to BrowserSteps - make them use the same methods --- .../blueprint/browser_steps/browser_steps.py | 15 ++++---- changedetectionio/content_fetcher.py | 34 +++++-------------- 2 files changed, 17 insertions(+), 32 deletions(-) diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py index 8ef1ac19..e8ed0326 100644 --- a/changedetectionio/blueprint/browser_steps/browser_steps.py +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -77,13 +77,13 @@ class steppable_browser_interface(): def action_goto_url(self, selector=None, value=None): # self.page.set_viewport_size({"width": 1280, "height": 5000}) now = time.time() - response = self.page.goto(value, timeout=0, wait_until='commit') - - # Wait_until = commit - # - `'commit'` - consider operation to be finished when network response is received and the document started loading. - # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds - # This seemed to solve nearly all 'TimeoutErrors' + response = self.page.goto(value, timeout=0, wait_until='load') + # Should be the same as the puppeteer_fetch.js methods, means, load with no timeout set (skip timeout) + #and also wait for seconds ? + #await page.waitForTimeout(1000); + #await page.waitForTimeout(extra_wait_ms); print("Time to goto URL ", time.time() - now) + return response def action_click_element_containing_text(self, selector=None, value=''): if not len(value.strip()): @@ -99,7 +99,8 @@ class steppable_browser_interface(): self.page.fill(selector, value, timeout=10 * 1000) def action_execute_js(self, selector, value): - self.page.evaluate(value) + response = self.page.evaluate(value) + return response def action_click_element(self, selector, value): print("Clicking element") diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 97b4e0b1..afba0918 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -464,38 +464,22 @@ class base_html_playwright(Fetcher): if len(request_headers): context.set_extra_http_headers(request_headers) - self.page.set_default_navigation_timeout(90000) - self.page.set_default_timeout(90000) + # Listen for all console events and handle errors + self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) - # Listen for all console events and handle errors - self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) - - # Goto page - try: - # Wait_until = commit - # - `'commit'` - consider operation to be finished when network response is received and the document started loading. - # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds - # This seemed to solve nearly all 'TimeoutErrors' - response = self.page.goto(url, wait_until='commit') - except playwright._impl._api_types.Error as e: - # Retry once - https://github.com/browserless/chrome/issues/2485 - # Sometimes errors related to invalid cert's and other can be random - print("Content Fetcher > retrying request got error - ", str(e)) - time.sleep(1) - response = self.page.goto(url, wait_until='commit') - except Exception as e: - print("Content Fetcher > Other exception when page.goto", str(e)) - context.close() - browser.close() - raise PageUnloadable(url=url, status_code=None, message=str(e)) + # Re-use as much code from browser steps as possible so its the same + from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface + browsersteps_interface = steppable_browser_interface() + browsersteps_interface.page = self.page # Execute any browser steps try: extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay self.page.wait_for_timeout(extra_wait * 1000) + response = browsersteps_interface.action_goto_url(value=url) if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): - self.page.evaluate(self.webdriver_js_execute_code) + response = browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code) except playwright._impl._api_types.TimeoutError as e: context.close() @@ -518,7 +502,7 @@ class base_html_playwright(Fetcher): self.iterate_browser_steps() extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay - time.sleep(extra_wait) + self.page.wait_for_timeout(extra_wait * 1000) self.content = self.page.content() self.status_code = response.status