Fetching - Custom browser on experimental/puppeteer fetcher - Don't switch to custom puppeteer mode if external browser URL is active (#2068)

12 months ago · 273bd45ad7
parent 3d1e1025d2
commit 273bd45ad7
2 changed files with 23 additions and 16 deletions
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -91,19 +91,20 @@ class ReplyWithContentButNoText(Exception):


 class Fetcher():
+    browser_connection_is_custom = None
+    browser_connection_url = None
    browser_steps = None
    browser_steps_screenshot_path = None
    content = None
    error = None
    fetcher_description = "No description"
-    browser_connection_url = None
    headers = {}
+    instock_data = None
+    instock_data_js = ""
    status_code = None
    webdriver_js_execute_code = None
    xpath_data = None
    xpath_element_js = ""
-    instock_data = None
-    instock_data_js = ""

    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
@ -252,16 +253,19 @@ class base_html_playwright(Fetcher):

    proxy = None

-    def __init__(self, proxy_override=None, browser_connection_url=None):
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()

        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')

-        # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
-        if not browser_connection_url:
-            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
+        if custom_browser_connection_url:
+            self.browser_connection_is_custom = True
+            self.browser_connection_url = custom_browser_connection_url
        else:
-            self.browser_connection_url = browser_connection_url
+            # Fallback to fetching from system
+            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value in quotes
+            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
+

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
@ -421,8 +425,10 @@ class base_html_playwright(Fetcher):
            current_include_filters=None,
            is_binary=False):

+
        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
-        if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+        # browser_connection_is_custom doesn't work with the puppeteer-style fetch (use native playwright in this case too)
+        if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
            if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                # Temporary backup solution until we rewrite the playwright code
                return self.run_fetch_browserless_puppeteer(
@ -569,15 +575,16 @@ class base_html_webdriver(Fetcher):
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None

-    def __init__(self, proxy_override=None, browser_connection_url=None):
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
-        if not browser_connection_url:
+        if not custom_browser_connection_url:
            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
        else:
-            self.browser_connection_url = browser_connection_url
+            self.browser_connection_is_custom = True
+            self.browser_connection_url = custom_browser_connection_url

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
@ -674,7 +681,7 @@ class base_html_webdriver(Fetcher):
 class html_requests(Fetcher):
    fetcher_description = "Basic fast Plaintext/HTTP Client"

-    def __init__(self, proxy_override=None, browser_connection_url=None):
+    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()
        self.proxy_override = proxy_override
        # browser_connection_url is none because its always 'launched locally'
--- a/changedetectionio/processors/init.py
+++ b/changedetectionio/processors/init.py
@ -43,14 +43,14 @@ class difference_detection_processor():

        # In the case that the preferred fetcher was a browser config with custom connection URL..
        # @todo - on save watch, if its extra_browser_ then it should be obvious it will use playwright (like if its requests now..)
-        browser_connection_url = None
+        custom_browser_connection_url = None
        if prefer_fetch_backend.startswith('extra_browser_'):
            (t, key) = prefer_fetch_backend.split('extra_browser_')
            connection = list(
                filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
            if connection:
                prefer_fetch_backend = 'base_html_playwright'
-                browser_connection_url = connection[0].get('browser_connection_url')
+                custom_browser_connection_url = connection[0].get('browser_connection_url')

        # PDF should be html_requests because playwright will serve it up (so far) in a embedded page
        # @todo https://github.com/dgtlmoon/changedetection.io/issues/2019
@ -74,7 +74,7 @@ class difference_detection_processor():
        # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
        # When browser_connection_url is None, it method should default to working out whats the best defaults (os env vars etc)
        self.fetcher = fetcher_obj(proxy_override=proxy_url,
-                                   browser_connection_url=browser_connection_url
+                                   custom_browser_connection_url=custom_browser_connection_url
                                   )

        if self.watch.has_browser_steps: