Removing experimental puppeteer fetching browser

11 months ago · 04b7d98e6c
parent 5faa84474c
commit 04b7d98e6c
2 changed files with 0 additions and 138 deletions
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -86,10 +86,6 @@ jobs:
        run: |
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'

-      - name: Specific tests in built container with Playwright as Puppeteer experimental fetcher
-        run: |                  
-          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'          
-
      - name: Test built container restock detection via Playwright
        run: |                            
          # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -311,125 +311,6 @@ class base_html_playwright(Fetcher):
        with open(destination, 'w') as f:
            f.write(content)

-    def run_fetch_browserless_puppeteer(self,
-            url,
-            timeout,
-            request_headers,
-            request_body,
-            request_method,
-            ignore_status_codes=False,
-            current_include_filters=None,
-            is_binary=False):
-
-        from pkg_resources import resource_string
-
-        extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
-
-        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
-        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
-        # In the future inject this is a proper JS package
-        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
-        code = code.replace('%instock_scrape_code%', self.instock_data_js)
-
-        from requests.exceptions import ConnectTimeout, ReadTimeout
-        wait_browser_seconds = 240
-
-        browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
-        from urllib.parse import urlparse
-        if not browserless_function_url:
-            # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
-            o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
-            browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()
-
-
-        # Append proxy connect string
-        if self.proxy:
-            # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
-            # Actual authentication handled by Puppeteer/node
-            o = urlparse(self.proxy.get('server'))
-            proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
-            browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}"
-
-        try:
-            amp = '&' if '?' in browserless_function_url else '?'
-            response = requests.request(
-                method="POST",
-                json={
-                    "code": code,
-                    "context": {
-                        # Very primitive disk cache - USE WITH EXTREME CAUTION
-                        # Run browserless container  with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
-                        'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
-                        'execute_js': self.webdriver_js_execute_code,
-                        'extra_wait_ms': extra_wait_ms,
-                        'include_filters': current_include_filters,
-                        'req_headers': request_headers,
-                        'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
-                        'url': url,
-                        'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
-                        'proxy_username': self.proxy.get('username', '') if self.proxy else False,
-                        'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
-                        'no_cache_list': [
-                            'twitter',
-                            '.pdf'
-                        ],
-                        # Could use https://github.com/easylist/easylist here, or install a plugin
-                        'block_url_list': [
-                            'adnxs.com',
-                            'analytics.twitter.com',
-                            'doubleclick.net',
-                            'google-analytics.com',
-                            'googletagmanager',
-                            'trustpilot.com'
-                        ]
-                    }
-                },
-                # @todo /function needs adding ws:// to http:// rebuild this
-                url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
-                timeout=wait_browser_seconds)
-
-        except ReadTimeout:
-            raise PageUnloadable(url=url, status_code=None, message=f"No response from browser in {wait_browser_seconds}s")
-        except ConnectTimeout:
-            raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browser, retrying..")
-        else:
-            # 200 Here means that the communication to the browser worked only, not the page state
-            try:
-                x = response.json()
-            except Exception as e:
-                raise PageUnloadable(url=url, message="Error reading JSON response from browserless")
-
-            try:
-                self.status_code = response.status_code
-            except Exception as e:
-                raise PageUnloadable(url=url, message="Error reading status_code code response from browserless")
-
-            self.headers = x.get('headers')
-
-            if self.status_code != 200 and not ignore_status_codes:
-                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))
-
-            if self.status_code == 200:
-                import base64
-
-                if not x.get('screenshot'):
-                    # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
-                    # https://github.com/puppeteer/puppeteer/issues/1834
-                    # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
-                    # Check your memory is shared and big enough
-                    raise ScreenshotUnavailable(url=url, status_code=None)
-
-                if not x.get('content', '').strip():
-                    raise EmptyReply(url=url, status_code=None)
-
-                self.content = x.get('content')
-                self.instock_data = x.get('instock_data')
-                self.screenshot = base64.b64decode(x.get('screenshot'))
-                self.xpath_data = x.get('xpath_data')
-            else:
-                # Some other error from browserless
-                raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
-
    def run(self,
            url,
            timeout,
@@ -441,21 +322,6 @@ class base_html_playwright(Fetcher):
            is_binary=False):


-        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
-        # browser_connection_is_custom doesnt work with puppeteer style fetch (use playwright native too in this case)
-        if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
-            if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
-                # Temporary backup solution until we rewrite the playwright code
-                return self.run_fetch_browserless_puppeteer(
-                    url,
-                    timeout,
-                    request_headers,
-                    request_body,
-                    request_method,
-                    ignore_status_codes,
-                    current_include_filters,
-                    is_binary)
-
        from playwright.sync_api import sync_playwright
        import playwright._impl._errors