Fetching/BrowserSteps - Going to a page was using slightly logic to the main way - make them use the same methods (#1890)

1 year ago · 349111eb35
parent 71e50569a0
commit 349111eb35
2 changed files with 20 additions and 35 deletions
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@ -77,13 +77,13 @@ class steppable_browser_interface():
    def action_goto_url(self, selector=None, value=None):
        # self.page.set_viewport_size({"width": 1280, "height": 5000})
        now = time.time()
-        response = self.page.goto(value, timeout=0, wait_until='commit')
-
-        # Wait_until = commit
-        # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
-        # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
-        # This seemed to solve nearly all 'TimeoutErrors'
+        response = self.page.goto(value, timeout=0, wait_until='load')
+        # Should be the same as the puppeteer_fetch.js methods, means, load with no timeout set (skip timeout)
+        #and also wait for seconds ?
+        #await page.waitForTimeout(1000);
+        #await page.waitForTimeout(extra_wait_ms);
        print("Time to goto URL ", time.time() - now)
+        return response

    def action_click_element_containing_text(self, selector=None, value=''):
        if not len(value.strip()):
@ -99,7 +99,8 @@ class steppable_browser_interface():
        self.page.fill(selector, value, timeout=10 * 1000)

    def action_execute_js(self, selector, value):
-        self.page.evaluate(value)
+        response = self.page.evaluate(value)
+        return response

    def action_click_element(self, selector, value):
        print("Clicking element")
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -464,38 +464,19 @@ class base_html_playwright(Fetcher):
            if len(request_headers):
                context.set_extra_http_headers(request_headers)

-                self.page.set_default_navigation_timeout(90000)
-                self.page.set_default_timeout(90000)
+            # Listen for all console events and handle errors
+            self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))

-                # Listen for all console events and handle errors
-                self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
+            # Re-use as much code from browser steps as possible so its the same
+            from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
+            browsersteps_interface = steppable_browser_interface()
+            browsersteps_interface.page = self.page

-            # Goto page
            try:
-                # Wait_until = commit
-                # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
-                # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
-                # This seemed to solve nearly all 'TimeoutErrors'
-                response = self.page.goto(url, wait_until='commit')
-            except playwright._impl._api_types.Error as e:
-                # Retry once - https://github.com/browserless/chrome/issues/2485
-                # Sometimes errors related to invalid cert's and other can be random
-                print("Content Fetcher > retrying request got error - ", str(e))
-                time.sleep(1)
-                response = self.page.goto(url, wait_until='commit')
-            except Exception as e:
-                print("Content Fetcher > Other exception when page.goto", str(e))
-                context.close()
-                browser.close()
-                raise PageUnloadable(url=url, status_code=None, message=str(e))
-
-            # Execute any browser steps
-            try:
-                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
-                self.page.wait_for_timeout(extra_wait * 1000)
+                response = browsersteps_interface.action_goto_url(value=url)

                if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
-                    self.page.evaluate(self.webdriver_js_execute_code)
+                    browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)

            except playwright._impl._api_types.TimeoutError as e:
                context.close()
@ -514,11 +495,14 @@ class base_html_playwright(Fetcher):
                print("Content Fetcher > Response object was none")
                raise EmptyReply(url=url, status_code=None)

+            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
+            self.page.wait_for_timeout(extra_wait * 1000)
+
            # Run Browser Steps here
            self.iterate_browser_steps()

            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
-            time.sleep(extra_wait)
+            self.page.wait_for_timeout(extra_wait * 1000)

            self.content = self.page.content()
            self.status_code = response.status