Removing experimental puppeteer fetching browser

9 months ago · 04b7d98e6c
parent 5faa84474c
commit 04b7d98e6c
2 changed files with 0 additions and 138 deletions
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@ -86,10 +86,6 @@ jobs:
        run: |
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'
      - name: Specific tests in built container with Playwright as Puppeteer experimental fetcher
        run: |                  
          docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py'          
      - name: Test built container restock detection via Playwright
        run: |                            
          # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -311,125 +311,6 @@ class base_html_playwright(Fetcher):
        with open(destination, 'w') as f:
            f.write(content)
    def run_fetch_browserless_puppeteer(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Fetch ``url`` by POSTing a Puppeteer script to a browserless-style ``/function`` endpoint.

        The script is loaded from ``res/puppeteer_fetch.js`` and has the xpath
        and in-stock scraping snippets substituted into it before being sent.
        On success the JSON reply populates ``self.status_code``,
        ``self.headers``, ``self.content``, ``self.instock_data``,
        ``self.screenshot`` (base64-decoded) and ``self.xpath_data``.

        Environment variables read:
            WEBDRIVER_DELAY_BEFORE_CONTENT_READY: seconds to wait before the
                content is read (default 5); ``self.render_extract_delay`` is added.
            BROWSERLESS_FUNCTION_URL: endpoint to POST to. When unset it is
                derived from PLAYWRIGHT_DRIVER_URL (scheme forced to ``http``,
                path forced to ``function``).
            PUPPETEER_DISK_CACHE: passed through as ``disk_cache_dir``.
            PLAYWRIGHT_SCREENSHOT_QUALITY: passed through as
                ``screenshot_quality`` (default 72).

        Args:
            url: Page to fetch.
            timeout: Unused by this implementation.
            request_headers: Mapping of headers forwarded to the script; the
                ``user-agent`` entry (case-insensitive) is also passed separately.
            request_body: Unused by this implementation.
            request_method: Unused by this implementation.
            ignore_status_codes: When true, a non-200 reply does not raise
                ``Non200ErrorCodeReceived`` (it raises ``PageUnloadable`` instead).
            current_include_filters: Forwarded to the script as ``include_filters``.
            is_binary: Unused by this implementation.

        Raises:
            PageUnloadable: Timeout talking to the browser, unparsable JSON
                reply, or a non-200 reply while ``ignore_status_codes`` is set.
            Non200ErrorCodeReceived: Non-200 reply and ``ignore_status_codes`` is false.
            ScreenshotUnavailable: The reply carried no screenshot.
            EmptyReply: The reply carried no (or only whitespace) content.
        """
        import base64
        from urllib.parse import quote, urlparse

        from pkg_resources import resource_string
        from requests.exceptions import ConnectTimeout, ReadTimeout

        def _append_query(base_url, params):
            # Start a query string if there is none yet, otherwise extend it
            separator = '&' if '?' in base_url else '?'
            return f"{base_url}{separator}{params}"

        extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000

        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
        # In the future inject this as a proper JS package
        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
        code = code.replace('%instock_scrape_code%', self.instock_data_js)

        wait_browser_seconds = 240

        browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
        if not browserless_function_url:
            # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
            driver_url = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
            browserless_function_url = driver_url._replace(scheme="http")._replace(path="function").geturl()

        # Append proxy connect string
        if self.proxy:
            # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
            # Actual authentication handled by Puppeteer/node
            proxy_parts = urlparse(self.proxy.get('server'))
            proxy_netloc = proxy_parts.hostname or ''
            # Only add the port when one was given, otherwise the netloc would read "host:None"
            if proxy_parts.port is not None:
                proxy_netloc = f"{proxy_netloc}:{proxy_parts.port}"
            proxy_url = quote(proxy_parts._replace(netloc=proxy_netloc).geturl())
            browserless_function_url = _append_query(browserless_function_url, f"--proxy-server={proxy_url}")

        # @todo /function needs adding ws:// to http:// rebuild this
        request_url = _append_query(
            browserless_function_url,
            "--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts")

        context = {
            # Very primitive disk cache - USE WITH EXTREME CAUTION
            # Run browserless container  with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
            'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
            'execute_js': self.webdriver_js_execute_code,
            'extra_wait_ms': extra_wait_ms,
            'include_filters': current_include_filters,
            'req_headers': request_headers,
            'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
            'url': url,
            'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
            'proxy_username': self.proxy.get('username', '') if self.proxy else False,
            'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
            'no_cache_list': [
                'twitter',
                '.pdf'
            ],
            # Could use https://github.com/easylist/easylist here, or install a plugin
            'block_url_list': [
                'adnxs.com',
                'analytics.twitter.com',
                'doubleclick.net',
                'google-analytics.com',
                'googletagmanager',
                'trustpilot.com'
            ]
        }

        try:
            response = requests.request(
                method="POST",
                json={"code": code, "context": context},
                url=request_url,
                timeout=wait_browser_seconds)
        except ReadTimeout as e:
            raise PageUnloadable(url=url, status_code=None, message=f"No response from browser in {wait_browser_seconds}s") from e
        except ConnectTimeout as e:
            raise PageUnloadable(url=url, status_code=None, message="Timed out connecting to browser, retrying..") from e

        # 200 Here means that the communication to the browser worked only, not the page state
        try:
            x = response.json()
        except Exception as e:
            raise PageUnloadable(url=url, message="Error reading JSON response from browserless") from e

        self.status_code = response.status_code
        self.headers = x.get('headers')

        if self.status_code != 200:
            if not ignore_status_codes:
                raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content',''))
            # Some other error from browserless
            raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))

        if not x.get('screenshot'):
            # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
            # https://github.com/puppeteer/puppeteer/issues/1834
            # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
            # Check your memory is shared and big enough
            raise ScreenshotUnavailable(url=url, status_code=None)

        # `or ''` also covers a reply that carries an explicit null for content
        if not (x.get('content') or '').strip():
            raise EmptyReply(url=url, status_code=None)

        self.content = x.get('content')
        self.instock_data = x.get('instock_data')
        self.screenshot = base64.b64decode(x.get('screenshot'))
        self.xpath_data = x.get('xpath_data')
    def run(self,
            url,
            timeout,
@ -441,21 +322,6 @@ class base_html_playwright(Fetcher):
            is_binary=False):
        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps
        # browser_connection_is_custom doesn't work with the puppeteer-style fetch (use native playwright in this case too)
        if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
            if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                # Temporary backup solution until we rewrite the playwright code
                return self.run_fetch_browserless_puppeteer(
                    url,
                    timeout,
                    request_headers,
                    request_body,
                    request_method,
                    ignore_status_codes,
                    current_include_filters,
                    is_binary)
        from playwright.sync_api import sync_playwright
        import playwright._impl._errors