From 5229094e44601a9bd5c5ea502393f940fc1e8491 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 13 Nov 2023 16:39:11 +0100 Subject: [PATCH] New functionanlity - Selectable browser / ability to add extra browser connections (good for using "scraping browsers"/ etc) (#1943) --- .github/workflows/test-only.yml | 11 ++- changedetectionio/__init__.py | 8 +- changedetectionio/content_fetcher.py | 37 +++++--- changedetectionio/forms.py | 9 ++ changedetectionio/model/App.py | 1 + changedetectionio/processors/__init__.py | 18 +++- .../run_custom_browser_url_tests.sh | 44 +++++++++ .../styles/scss/parts/_extra_browsers.scss | 24 +++++ .../styles/scss/parts/_extra_proxies.scss | 7 ++ .../static/styles/scss/styles.scss | 1 + changedetectionio/static/styles/styles.css | 21 +++++ changedetectionio/store.py | 12 +++ changedetectionio/templates/settings.html | 6 +- .../templates/watch-overview.html | 3 +- .../tests/custom_browser_url/__init__.py | 1 + .../test_custom_browser_url.py | 89 +++++++++++++++++++ 16 files changed, 270 insertions(+), 22 deletions(-) create mode 100755 changedetectionio/run_custom_browser_url_tests.sh create mode 100644 changedetectionio/static/styles/scss/parts/_extra_browsers.scss create mode 100644 changedetectionio/tests/custom_browser_url/__init__.py create mode 100644 changedetectionio/tests/custom_browser_url/test_custom_browser_url.py diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index 08a3f944..3064e97d 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -30,7 +30,10 @@ jobs: # Selenium+browserless docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4 - docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable + docker run --network 
changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable + + # For accessing custom browser tests + docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm --shm-size="2g" browserless/chrome:1.60-chrome-stable - name: Build changedetection.io container for testing run: | @@ -86,6 +89,12 @@ jobs: # And again with PLAYWRIGHT_DRIVER_URL=.. cd .. + - name: Test custom browser URL + run: | + cd changedetectionio + ./run_custom_browser_url_tests.sh + cd .. + - name: Test changedetection.io container starts+runs basically without error run: | docker run -p 5556:5000 -d test-changedetectionio diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 1d07d790..9edea3cc 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -614,6 +614,8 @@ def changedetection_app(config=None, datastore_o=None): # For the form widget tag uuid lookup form.tags.datastore = datastore # in _value + for p in datastore.extra_browsers: + form.fetch_backend.choices.append(p) form.fetch_backend.choices.append(("system", 'System settings default')) @@ -714,7 +716,7 @@ def changedetection_app(config=None, datastore_o=None): system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver' is_html_webdriver = False - if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver': + if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'): is_html_webdriver = True # Only works reliably with Playwright @@ -987,7 
+989,7 @@ def changedetection_app(config=None, datastore_o=None): system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver' is_html_webdriver = False - if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver': + if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'): is_html_webdriver = True password_enabled_and_share_is_off = False @@ -1041,7 +1043,7 @@ def changedetection_app(config=None, datastore_o=None): is_html_webdriver = False - if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver': + if (watch.get('fetch_backend') == 'system' and system_uses_webdriver) or watch.get('fetch_backend') == 'html_webdriver' or watch.get('fetch_backend', '').startswith('extra_browser_'): is_html_webdriver = True # Never requested successfully, but we detected a fetch error diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d9c14590..db5c7b99 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -96,6 +96,7 @@ class Fetcher(): content = None error = None fetcher_description = "No description" + browser_connection_url = None headers = {} status_code = None webdriver_js_execute_code = None @@ -251,14 +252,16 @@ class base_html_playwright(Fetcher): proxy = None - def __init__(self, proxy_override=None): + def __init__(self, proxy_override=None, browser_connection_url=None): super().__init__() - # .strip('"') is going to save someone a lot of time when they accidently wrap the env value + self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') - self.command_executor = os.getenv( - "PLAYWRIGHT_DRIVER_URL", - 'ws://playwright-chrome:3000' - ).strip('"') + + # .strip('"') is going 
to save someone a lot of time when they accidently wrap the env value + if not browser_connection_url: + self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') + else: + self.browser_connection_url = browser_connection_url # If any proxy settings are enabled, then we should setup the proxy object proxy_args = {} @@ -444,7 +447,7 @@ class base_html_playwright(Fetcher): # Seemed to cause a connection Exception even tho I can see it connect # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) # 60,000 connection timeout only - browser = browser_type.connect_over_cdp(self.command_executor, timeout=60000) + browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000) # SOCKS5 with authentication is not supported (yet) # https://github.com/microsoft/playwright/issues/10567 @@ -504,7 +507,11 @@ class base_html_playwright(Fetcher): self.status_code = response.status if self.status_code != 200 and not ignore_status_codes: - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code) + + screenshot=self.page.screenshot(type='jpeg', full_page=True, + quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) + + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) if len(self.page.content().strip()) == 0: context.close() @@ -555,8 +562,6 @@ class base_html_webdriver(Fetcher): else: fetcher_description = "WebDriver Chrome/Javascript" - command_executor = '' - # Configs for Proxy setup # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', @@ -564,12 +569,15 @@ class base_html_webdriver(Fetcher): 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] proxy = None - def __init__(self, proxy_override=None): + def __init__(self, proxy_override=None, browser_connection_url=None): super().__init__() 
from selenium.webdriver.common.proxy import Proxy as SeleniumProxy # .strip('"') is going to save someone a lot of time when they accidently wrap the env value - self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') + if not browser_connection_url: + self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') + else: + self.browser_connection_url = browser_connection_url # If any proxy settings are enabled, then we should setup the proxy object proxy_args = {} @@ -611,7 +619,7 @@ class base_html_webdriver(Fetcher): options.proxy = self.proxy self.driver = webdriver.Remote( - command_executor=self.command_executor, + command_executor=self.browser_connection_url, options=options) try: @@ -666,9 +674,10 @@ class base_html_webdriver(Fetcher): class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" - def __init__(self, proxy_override=None): + def __init__(self, proxy_override=None, browser_connection_url=None): super().__init__() self.proxy_override = proxy_override + # browser_connection_url is none because its always 'launched locally' def run(self, url, diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index e8c35cb8..b3de842b 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -168,7 +168,9 @@ class ValidateContentFetcherIsReady(object): def __call__(self, form, field): import urllib3.exceptions from changedetectionio import content_fetcher + return +# AttributeError: module 'changedetectionio.content_fetcher' has no attribute 'extra_browser_unlocked<>ASDF213r123r' # Better would be a radiohandler that keeps a reference to each class if field.data is not None and field.data != 'system': klass = getattr(content_fetcher, field.data) @@ -496,6 +498,12 @@ class SingleExtraProxy(Form): proxy_url = StringField('Proxy URL', [validators.Optional()], render_kw={"placeholder": "socks5:// or regular proxy 
http://user:pass@...:3128", "size":50}) # @todo do the validation here instead +class SingleExtraBrowser(Form): + browser_name = StringField('Name', [validators.Optional()], render_kw={"placeholder": "Name"}) + browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50}) + # @todo do the validation here instead + + # datastore.data['settings']['requests'].. class globalSettingsRequestForm(Form): time_between_check = FormField(TimeBetweenCheckForm) @@ -504,6 +512,7 @@ class globalSettingsRequestForm(Form): render_kw={"style": "width: 5em;"}, validators=[validators.NumberRange(min=0, message="Should contain zero or more seconds")]) extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5) + extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5) def validate_extra_proxies(self, extra_validators=None): for e in self.data['extra_proxies']: diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 697d0d00..1202d5db 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -16,6 +16,7 @@ class model(dict): }, 'requests': { 'extra_proxies': [], # Configurable extra proxies via the UI + 'extra_browsers': [], # Configurable extra browsers via the UI 'jitter_seconds': 0, 'proxy': None, # Preferred proxy connection 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index d2e5ee5c..10c9138c 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -8,11 +8,12 @@ from distutils.util import strtobool class difference_detection_processor(): + browser_steps = None datastore = None fetcher = None screenshot = None + watch = None xpath_data = None - browser_steps = None def __init__(self, *args, datastore, 
watch_uuid, **kwargs): super().__init__(*args, **kwargs) @@ -40,6 +41,18 @@ class difference_detection_processor(): if not prefer_fetch_backend or prefer_fetch_backend == 'system': prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') + # In the case that the preferred fetcher was a browser config with custom connection URL.. + # @todo - on save watch, if its extra_browser_ then it should be obvious it will use playwright (like if its requests now..) + browser_connection_url = None + if prefer_fetch_backend.startswith('extra_browser_'): + (t, key) = prefer_fetch_backend.split('extra_browser_') + connection = list( + filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', []))) + if connection: + prefer_fetch_backend = 'base_html_playwright' + browser_connection_url = connection[0].get('browser_connection_url') + + # Grab the right kind of 'fetcher', (playwright, requests, etc) if hasattr(content_fetcher, prefer_fetch_backend): fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) @@ -54,8 +67,9 @@ class difference_detection_processor(): print(f"Using proxy Key: {preferred_proxy_id} as Proxy URL {proxy_url}") # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. + # When browser_connection_url is None, the fetcher should fall back to its own defaults (OS env vars etc) self.fetcher = fetcher_obj(proxy_override=proxy_url, - #browser_url_extra/configurable browser url=... 
+ browser_connection_url=browser_connection_url ) if self.watch.has_browser_steps: diff --git a/changedetectionio/run_custom_browser_url_tests.sh b/changedetectionio/run_custom_browser_url_tests.sh new file mode 100755 index 00000000..10cea9c5 --- /dev/null +++ b/changedetectionio/run_custom_browser_url_tests.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# run some tests and look if the 'custom-browser-search-string=1' connect string appeared in the correct containers + +# enable debug +set -x + +# A extra browser is configured, but we never chose to use it, so it should NOT show in the logs +docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url' +docker logs browserless-custom-url &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? -ne 1 ] +then + echo "Saw a request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should not" + exit 1 +fi + +docker logs browserless &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? -ne 1 ] +then + echo "Saw a request in 'browser' container with 'custom-browser-search-string=1' when I should not" + exit 1 +fi + +# Special connect string should appear in the custom-url container, but not in the 'default' one +docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url' +docker logs browserless-custom-url &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? -ne 0 ] +then + echo "Did not see request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should" + exit 1 +fi + +docker logs browserless &>log.txt +grep 'custom-browser-search-string=1' log.txt +if [ $? 
-ne 1 ] +then + echo "Saw a request in 'browser' container with 'custom-browser-search-string=1' when I should not" + exit 1 +fi + + diff --git a/changedetectionio/static/styles/scss/parts/_extra_browsers.scss b/changedetectionio/static/styles/scss/parts/_extra_browsers.scss new file mode 100644 index 00000000..da0204ad --- /dev/null +++ b/changedetectionio/static/styles/scss/parts/_extra_browsers.scss @@ -0,0 +1,24 @@ +ul#requests-extra_browsers { + list-style: none; + /* tidy up the table to look more "inline" */ + li { + > label { + display: none; + } + + } + + /* each proxy entry is a `table` */ + table { + tr { + display: inline; + } + } +} + +#extra-browsers-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; +} \ No newline at end of file diff --git a/changedetectionio/static/styles/scss/parts/_extra_proxies.scss b/changedetectionio/static/styles/scss/parts/_extra_proxies.scss index 756dd9b9..ed6de397 100644 --- a/changedetectionio/static/styles/scss/parts/_extra_proxies.scss +++ b/changedetectionio/static/styles/scss/parts/_extra_proxies.scss @@ -60,3 +60,10 @@ body.proxy-check-active { padding-bottom: 1em; } + +#extra-proxies-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; +} diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss index 68b95337..ed98a1c6 100644 --- a/changedetectionio/static/styles/scss/styles.scss +++ b/changedetectionio/static/styles/scss/styles.scss @@ -5,6 +5,7 @@ @import "parts/_arrows"; @import "parts/_browser-steps"; @import "parts/_extra_proxies"; +@import "parts/_extra_browsers"; @import "parts/_pagination"; @import "parts/_spinners"; @import "parts/_variables"; diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index 3b7a87d8..c1865879 100644 --- a/changedetectionio/static/styles/styles.css +++ 
b/changedetectionio/static/styles/styles.css @@ -128,6 +128,27 @@ body.proxy-check-active #request .proxy-timing { border-radius: 4px; padding: 1em; } +#extra-proxies-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; } + +ul#requests-extra_browsers { + list-style: none; + /* tidy up the table to look more "inline" */ + /* each proxy entry is a `table` */ } + ul#requests-extra_browsers li > label { + display: none; } + ul#requests-extra_browsers table tr { + display: inline; } + +#extra-browsers-setting { + border: 1px solid var(--color-grey-800); + border-radius: 4px; + margin: 1em; + padding: 1em; } + .pagination-page-info { color: #fff; font-size: 0.85rem; diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 6306a391..c00018c4 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -633,6 +633,18 @@ class ChangeDetectionStore: return {} + @property + def extra_browsers(self): + res = [] + p = list(filter( + lambda s: (s.get('browser_name') and s.get('browser_connection_url')), + self.__data['settings']['requests'].get('extra_browsers', []))) + if p: + for i in p: + res.append(("extra_browser_"+i['browser_name'], i['browser_name'])) + + return res + def tag_exists_by_name(self, tag_name): return any(v.get('title', '').lower() == tag_name.lower() for k, v in self.__data['settings']['application']['tags'].items()) diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 273ac561..461208f0 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -230,11 +230,15 @@ nav

Tip: "Residential" and "Mobile" proxy type can be more successful than "Data Center" for blocked websites. -

+
{{ render_field(form.requests.form.extra_proxies) }} "Name" will be used for selecting the proxy in the Watch Edit settings
SOCKS5 proxies with authentication are only supported with 'plain requests' fetcher, for other fetchers you should whitelist the IP access instead
+
+ Extra Browsers allow changedetection.io to communicate with a different web-browser.
+ {{ render_field(form.requests.form.extra_browsers) }} +
diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 5e2cc090..4b04ead0 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -104,8 +104,9 @@ {% if watch.get_fetch_backend == "html_webdriver" or ( watch.get_fetch_backend == "system" and system_default_fetcher == 'html_webdriver' ) + or "extra_browser_" in watch.get_fetch_backend %} - + {% endif %} {%if watch.is_pdf %}{% endif %} diff --git a/changedetectionio/tests/custom_browser_url/__init__.py b/changedetectionio/tests/custom_browser_url/__init__.py new file mode 100644 index 00000000..f4572339 --- /dev/null +++ b/changedetectionio/tests/custom_browser_url/__init__.py @@ -0,0 +1 @@ +# placeholder \ No newline at end of file diff --git a/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py new file mode 100644 index 00000000..bfc2c95e --- /dev/null +++ b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py @@ -0,0 +1,89 @@ +# !/usr/bin/python3 +import os + +from flask import url_for +from ..util import live_server_setup, wait_for_all_checks + +def do_test(client, live_server, make_test_use_extra_browser=False): + + # Grep for this string in the logs? 
+ test_url = f"https://changedetection.io/ci-test.html" + custom_browser_name = 'custom browser URL' + + # needs to be set and something like 'ws://127.0.0.1:3000?stealth=1&--disable-web-security=true' + assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" + + ##################### + res = client.post( + url_for("settings_page"), + data={"application-empty_pages_are_a_change": "", + "requests-time_between_check-minutes": 180, + 'application-fetch_backend': "html_webdriver", + # browserless-custom-url is setup in .github/workflows/test-only.yml + # the test script run_custom_browser_url_tests.sh will look for 'custom-browser-search-string' in the container logs + 'requests-extra_browsers-0-browser_connection_url': 'ws://browserless-custom-url:3000?stealth=1&--disable-web-security=true&custom-browser-search-string=1', + 'requests-extra_browsers-0-browser_name': custom_browser_name + }, + follow_redirects=True + ) + + assert b"Settings updated." in res.data + + # Add our URL to the import page + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + if make_test_use_extra_browser: + + # So the name should appear in the edit page under "Request" > "Fetch Method" + res = client.get( + url_for("edit_page", uuid="first"), + follow_redirects=True + ) + assert b'custom browser URL' in res.data + + res = client.post( + url_for("edit_page", uuid="first"), + data={ + "url": test_url, + "tags": "", + "headers": "", + 'fetch_backend': f"extra_browser_{custom_browser_name}", + 'webdriver_js_execute_code': '' + }, + follow_redirects=True + ) + + assert b"Updated watch." in res.data + wait_for_all_checks(client) + + # Force recheck + res = client.get(url_for("form_watch_checknow"), follow_redirects=True) + assert b'1 watches queued for rechecking.' 
in res.data + + wait_for_all_checks(client) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + assert b'cool it works' in res.data + + +# Requires playwright to be installed +def test_request_via_custom_browser_url(client, live_server): + live_server_setup(live_server) + # We do this so we can grep the logs of the custom container and see if the request actually went through that container + do_test(client, live_server, make_test_use_extra_browser=True) + + +def test_request_not_via_custom_browser_url(client, live_server): + live_server_setup(live_server) + # We do this so we can grep the logs of the custom container and see if the request actually went through that container + do_test(client, live_server, make_test_use_extra_browser=False)