diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 3d036774..518b8b09 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -1,10 +1,6 @@ from abc import ABC, abstractmethod import chardet import os -from selenium import webdriver -from selenium.webdriver.common.desired_capabilities import DesiredCapabilities -from selenium.webdriver.common.proxy import Proxy as SeleniumProxy -from selenium.common.exceptions import WebDriverException import requests import time import urllib3.exceptions @@ -26,6 +22,7 @@ class Fetcher(): headers = None fetcher_description ="No description" + fetcher_list_order = 0 @abstractmethod def get_error(self): @@ -68,16 +65,88 @@ def available_fetchers(): # @todo html_ is maybe better as fetcher_ or something # In this case, make sure to edit the default one in store.py and fetch_site_status.py if "html_" in name: - t=tuple([name,obj.fetcher_description]) + t = (name, obj.fetcher_description, obj.fetcher_list_order) p.append(t) + # Sort ascending by fetcher_list_order (the third tuple member) + p.sort(key=lambda x: x[2]) + # Drop fetcher_list_order again so each entry is (name, description) as before + p = [x[:2] for x in p] return p +class html_playwright(Fetcher): + fetcher_description = "Playwright {}/Javascript".format( + os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() + ) + if os.getenv("PLAYWRIGHT_DRIVER_URL"): + fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) + fetcher_list_order = 3 + + browser_type = '' + command_executor = '' + + # Configs for Proxy setup + # In the ENV vars, each is prefixed with "playwright_proxy_", for example "playwright_proxy_server" + playwright_proxy_settings_mappings = ['server', 'bypass', 'username', 'password'] + + proxy=None + + def __init__(self): + # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value + self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 
'chromium').strip('"') + self.command_executor = os.getenv( + "PLAYWRIGHT_DRIVER_URL", + 'ws://playwright-server:4444/playwright' + ).strip('"') + + # If any proxy settings are enabled, then we should set up the proxy object + proxy_args = {} + for k in self.playwright_proxy_settings_mappings: + v = os.getenv('playwright_proxy_' + k, False) + if v: + proxy_args[k] = v.strip('"') + + if proxy_args: + self.proxy = proxy_args + + def run(self, + url, + timeout, + request_headers, + request_body, + request_method, + ignore_status_codes=False): + + from playwright.sync_api import sync_playwright + + with sync_playwright() as p: + browser_type = getattr(p, self.browser_type) + browser = browser_type.connect(self.command_executor, timeout=timeout*1000) + # Set user agent to prevent Cloudflare from blocking the browser + context = browser.new_context( + user_agent="Mozilla/5.0", + proxy=self.proxy + ) + page = context.new_page() + response = page.goto(url, timeout=timeout*1000) + # Fail fast: skip the fixed 5 second wait below when there is no response at all + if response is None: + raise EmptyReply(url=url, status_code=None) + + page.wait_for_timeout(5000) + self.status_code = response.status + self.content = page.content() + self.headers = response.all_headers() + + context.close() + browser.close() + class html_webdriver(Fetcher): if os.getenv("WEBDRIVER_URL"): fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) else: fetcher_description = "WebDriver Chrome/Javascript" + fetcher_list_order = 2 command_executor = '' @@ -92,9 +161,12 @@ class html_webdriver(Fetcher): proxy=None def __init__(self): + from selenium.webdriver.common.proxy import Proxy as SeleniumProxy + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') + # If any proxy settings are enabled, then we should setup the proxy object proxy_args = {} for k in self.selenium_proxy_settings_mappings: @@ -113,6 
+185,9 @@ class html_webdriver(Fetcher): request_method, ignore_status_codes=False): + from selenium import webdriver + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities + from selenium.common.exceptions import WebDriverException # request_body, request_method unused for now, until some magic in the future happens. # check env for WEBDRIVER_URL @@ -158,6 +233,7 @@ class html_webdriver(Fetcher): # "html_requests" is listed as the default fetcher in store.py! class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" + fetcher_list_order = 1 def run(self, url, diff --git a/changedetectionio/static/images/Playwright-icon.png b/changedetectionio/static/images/Playwright-icon.png new file mode 100644 index 00000000..75db893b Binary files /dev/null and b/changedetectionio/static/images/Playwright-icon.png differ diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 313c1bf5..506cb950 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -52,6 +52,7 @@ {{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}} {%if watch.fetch_backend == "html_webdriver" %}{% endif %} + {%if watch.fetch_backend == "html_playwright" %}{% endif %} {% if watch.last_error is defined and watch.last_error != False %}
{{ watch.last_error }}
diff --git a/docker-compose.yml b/docker-compose.yml index 2761031f..0914ed6f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,17 @@ services: # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # + # Alternative Playwright URL, do not use "'s or 's! + # - PLAYWRIGHT_DRIVER_URL=ws://playwright-server:4444/playwright + # + # Alternative Playwright Browser Type, must match PLAYWRIGHT_BROWSER_TYPE in the playwright-server service + # See https://playwright.dev/docs/browsers + # - PLAYWRIGHT_BROWSER_TYPE=chromium + # + # Playwright proxy settings: playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password + # + # https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-option-proxy + # # Plain requsts - proxy support example. # - HTTP_PROXY=socks5h://10.10.1.10:1080 # - HTTPS_PROXY=socks5h://10.10.1.10:1080 @@ -60,6 +71,19 @@ services: # - /dev/shm:/dev/shm # restart: unless-stopped +# playwright-server: +# hostname: playwright-server +# build: ./playwright +# environment: +# - PLAYWRIGHT_PORT=4444 +# # Must match PLAYWRIGHT_BROWSER_TYPE in the changedetection service +# - PLAYWRIGHT_BROWSER_TYPE=chromium +# ipc: host +# user: pwuser +# security_opt: +# - seccomp:./playwright/seccomp_profile.json +# restart: unless-stopped + volumes: changedetection-data: diff --git a/playwright/Dockerfile b/playwright/Dockerfile new file mode 100644 index 00000000..8dcd659e --- /dev/null +++ b/playwright/Dockerfile @@ -0,0 +1,13 @@ +FROM mcr.microsoft.com/playwright:v1.20.0-focal + +WORKDIR /server +RUN npm install playwright@1.20.0 +COPY server.js . 
+ +ENV PLAYWRIGHT_PORT=4444 +ENV PLAYWRIGHT_BROWSER_TYPE=chromium +ENV PLAYWRIGHT_HEADLESS=true + +EXPOSE ${PLAYWRIGHT_PORT} + +CMD [ "node", "server.js" ] diff --git a/playwright/seccomp_profile.json b/playwright/seccomp_profile.json new file mode 100644 index 00000000..bfeea36c --- /dev/null +++ b/playwright/seccomp_profile.json @@ -0,0 +1,12 @@ +{ + "comment": "Allow create user namespaces", + "names": [ + "clone", + "setns", + "unshare" + ], + "action": "SCMP_ACT_ALLOW", + "args": [], + "includes": {}, + "excludes": {} +} diff --git a/playwright/server.js b/playwright/server.js new file mode 100644 index 00000000..9a730305 --- /dev/null +++ b/playwright/server.js @@ -0,0 +1,11 @@ +const playwright = require('playwright'); + +const port = parseInt(process.env.PLAYWRIGHT_PORT, 10) || 4444; +const browserType = process.env.PLAYWRIGHT_BROWSER_TYPE?.toLowerCase() || 'chromium'; +// Headless by default; only an explicit PLAYWRIGHT_HEADLESS=false turns it off (the old "=== 'true' || true" was always true) +const headless = process.env.PLAYWRIGHT_HEADLESS?.toLowerCase() !== 'false'; +const wsPath = 'playwright'; +console.log('using port:', port, 'browser:', browserType, 'headless:', headless, 'wspath:', wsPath); + +const serverPromise = playwright[browserType].launchServer({ headless: headless, port: port, wsPath: wsPath }); +serverPromise.then(bs => console.log(bs.wsEndpoint())); diff --git a/requirements.txt b/requirements.txt index feef375b..2a8be8e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,5 @@ lxml # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0 selenium ~= 4.1.0 +# An alternative to Selenium +playwright ~= 1.20.0