From 68db20168ec471444c736cd2953a979f83b9f02b Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 2 May 2022 21:40:40 +0200 Subject: [PATCH] Add new fetch method: Playwright Chromium (Selenium/WebDriver alternative) (#489) --- Dockerfile | 5 + changedetectionio/changedetection.py | 2 +- changedetectionio/content_fetcher.py | 167 +++++++++++++++++++------ changedetectionio/fetch_site_status.py | 8 +- docker-compose.yml | 14 +++ requirements.txt | 1 + 6 files changed, 149 insertions(+), 48 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2e8131f7..23a3f2c4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,11 @@ COPY requirements.txt /requirements.txt RUN pip install --target=/dependencies -r /requirements.txt +# Playwright is an alternative to Selenium +# Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing +RUN pip install --target=/dependencies playwright~=1.20 \ + || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled." + # Final image stage FROM python:3.8-slim diff --git a/changedetectionio/changedetection.py b/changedetectionio/changedetection.py index a959bc59..6708e8e1 100755 --- a/changedetectionio/changedetection.py +++ b/changedetectionio/changedetection.py @@ -8,7 +8,7 @@ import sys import eventlet import eventlet.wsgi -from . import store, changedetection_app +from . import store, changedetection_app, content_fetcher from . import __version__ def main(): diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 1f40911e..7c6457b3 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -1,13 +1,10 @@ from abc import ABC, abstractmethod import chardet import os -from selenium import webdriver -from selenium.webdriver.common.desired_capabilities import DesiredCapabilities -from selenium.webdriver.common.proxy import Proxy as SeleniumProxy -from selenium.common.exceptions import WebDriverException import requests import time import urllib3.exceptions +import sys class EmptyReply(Exception): @@ -19,13 +16,15 @@ class EmptyReply(Exception): pass + class Fetcher(): error = None status_code = None content = None headers = None - - fetcher_description ="No description" + # Will be needed in the future by the VisualSelector, always get this where possible. + screenshot = False + fetcher_description = "No description" @abstractmethod def get_error(self): @@ -46,10 +45,6 @@ class Fetcher(): def quit(self): return - @abstractmethod - def screenshot(self): - return - @abstractmethod def get_last_status_code(self): return self.status_code @@ -59,29 +54,109 @@ class Fetcher(): def is_ready(self): return True + # Maybe for the future, each fetcher provides its own diff output, could be used for text, image # the current one would return javascript output (as we use JS to generate the diff) # -# Returns tuple(mime_type, stream) -# @abstractmethod -# def return_diff(self, stream_a, stream_b): -# return - def available_fetchers(): - import inspect - from changedetectionio import content_fetcher - p=[] - for name, obj in inspect.getmembers(content_fetcher): - if inspect.isclass(obj): - # @todo html_ is maybe better as fetcher_ or something - # In this case, make sure to edit the default one in store.py and fetch_site_status.py - if "html_" in name: - t=tuple([name,obj.fetcher_description]) - p.append(t) - - return p - -class html_webdriver(Fetcher): + # See the if statement at the bottom of this file for how we switch between playwright and webdriver + import inspect + p = [] + for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): + if inspect.isclass(obj): + # @todo html_ is maybe better as fetcher_ or something + # In this case, make sure to edit the default one in store.py and fetch_site_status.py + if name.startswith('html_'): + t = tuple([name, obj.fetcher_description]) + p.append(t) + + return p + + +class base_html_playwright(Fetcher): + fetcher_description = "Playwright {}/Javascript".format( + os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() + ) + if os.getenv("PLAYWRIGHT_DRIVER_URL"): + fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) + +# try: +# from playwright.sync_api import sync_playwright +# except ModuleNotFoundError: +# fetcher_enabled = False + + browser_type = '' + command_executor = '' + + # Configs for Proxy setup + # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" + playwright_proxy_settings_mappings = ['server', 'bypass', 'username', 'password'] + + proxy = None + + def __init__(self): + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value + self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') + self.command_executor = os.getenv( + "PLAYWRIGHT_DRIVER_URL", + 'ws://playwright-chrome:3000/playwright' + ).strip('"') + + # If any proxy settings are enabled, then we should setup the proxy object + proxy_args = {} + for k in self.playwright_proxy_settings_mappings: + v = os.getenv('playwright_proxy_' + k, False) + if v: + proxy_args[k] = v.strip('"') + + if proxy_args: + self.proxy = proxy_args + + def run(self, + url, + timeout, + request_headers, + request_body, + request_method, + ignore_status_codes=False): + + from playwright.sync_api import sync_playwright + + with sync_playwright() as p: + browser_type = getattr(p, self.browser_type) + + # Seemed to cause a connection Exception even tho I can see it connect + # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) + browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000) + + # Set user agent to prevent Cloudflare from blocking the browser + context = browser.new_context( + user_agent="Mozilla/5.0", + proxy=self.proxy + ) + page = context.new_page() + page.set_viewport_size({"width": 1280, "height": 1024}) + response = page.goto(url, timeout=timeout * 1000) + + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + page.wait_for_timeout(extra_wait * 1000) + + if response is None: + raise EmptyReply(url=url, status_code=None) + + self.status_code = response.status + self.content = page.content() + self.headers = response.all_headers() + + # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it + # JPEG is better here because the screenshots can be very very large + page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}) + self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=90) + context.close() + browser.close() + + +class base_html_webdriver(Fetcher): if os.getenv("WEBDRIVER_URL"): fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) else: @@ -94,12 +169,11 @@ class html_webdriver(Fetcher): selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', 'proxyAutoconfigUrl', 'sslProxy', 'autodetect', 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] - - - - proxy=None + proxy = None def __init__(self): + from selenium.webdriver.common.proxy import Proxy as SeleniumProxy + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') @@ -121,6 +195,9 @@ class html_webdriver(Fetcher): request_method, ignore_status_codes=False): + from selenium import webdriver + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities + from selenium.common.exceptions import WebDriverException # request_body, request_method unused for now, until some magic in the future happens. # check env for WEBDRIVER_URL @@ -145,9 +222,8 @@ class html_webdriver(Fetcher): time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) self.content = self.driver.page_source self.headers = {} - - def screenshot(self): - return self.driver.get_screenshot_as_png() + self.screenshot = self.driver.get_screenshot_as_png() + self.quit() # Does the connection to the webdriver work? run a test connection. def is_ready(self): @@ -170,6 +246,7 @@ class html_webdriver(Fetcher): except Exception as e: print("Exception in chrome shutdown/quit" + str(e)) + # "html_requests" is listed as the default fetcher in store.py! class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" @@ -183,11 +260,11 @@ class html_requests(Fetcher): ignore_status_codes=False): r = requests.request(method=request_method, - data=request_body, - url=url, - headers=request_headers, - timeout=timeout, - verify=False) + data=request_body, + url=url, + headers=request_headers, + timeout=timeout, + verify=False) # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks. # For example - some sites don't tell us it's utf-8, but return utf-8 content @@ -207,3 +284,11 @@ class html_requests(Fetcher): self.content = r.text self.headers = r.headers + +# Decide which is the 'real' HTML webdriver, this is more a system wide config +# rather than site-specific. +use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) +if use_playwright_as_chrome_fetcher: + html_webdriver = base_html_playwright +else: + html_webdriver = base_html_webdriver diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 55bf49dc..71415dc1 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -68,6 +68,7 @@ class perform_site_check(): fetcher = klass() fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code) + # Fetching complete, now filters # @todo move to class / maybe inside of fetcher abstract base? @@ -192,9 +193,4 @@ class perform_site_check(): if not watch['title'] or not len(watch['title']): update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) - if self.datastore.data['settings']['application'].get('real_browser_save_screenshot', True): - screenshot = fetcher.screenshot() - - fetcher.quit() - - return changed_detected, update_obj, text_content_before_ignored_filter, screenshot \ No newline at end of file + return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 2761031f..88ee8a76 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,13 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # + # Alternative Playwright URL, do not use "'s or 's! + # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/playwright + # + # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password + # + # https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-option-proxy + # # Plain requsts - proxy support example. # - HTTP_PROXY=socks5h://10.10.1.10:1080 # - HTTPS_PROXY=socks5h://10.10.1.10:1080 @@ -58,6 +65,13 @@ services: # # Workaround to avoid the browser crashing inside a docker container # # See https://github.com/SeleniumHQ/docker-selenium#quick-start # - /dev/shm:/dev/shm +# restart: unless-stopped + + # Used for fetching pages via Playwright+Chrome where you need Javascript support. + +# playwright-chrome: +# hostname: playwright-chrome +# image: browserless/chrome # restart: unless-stopped volumes: diff --git a/requirements.txt b/requirements.txt index 8042181c..468dca88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,3 +40,4 @@ selenium ~= 4.1.0 # need to revisit flask login versions werkzeug ~= 2.0.0 +# playwright is installed at Dockerfile build time because it's not available on all platforms