diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index 1c89f2fa..8fb89d62 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -72,7 +72,11 @@ jobs: run: | # Playwright via Sockpuppetbrowser fetch # tests/visualselector/test_fetch_data.py will do browser steps - docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py' + - name: Playwright and SocketPuppetBrowser - Headers and requests run: | @@ -87,8 +91,11 @@ jobs: # STRAIGHT TO CDP - name: Pyppeteer and SocketPuppetBrowser - Specific tests in built container run: | - # Playwright 
via Sockpuppetbrowser fetch - docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "FAST_PUPPETEER_CHROME_FETCHER=True" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py' + # Playwright via Sockpuppetbrowser fetch + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py' - name: Pyppeteer and SocketPuppetBrowser - Headers and requests checks run: | diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py index 22710e99..6bb58b38 100644 --- 
a/changedetectionio/blueprint/browser_steps/browser_steps.py +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -6,6 +6,8 @@ import re from random import randint from loguru import logger +from changedetectionio.content_fetchers.base import manage_user_agent + # Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end # 0- off, 1- on browser_step_ui_config = {'Choose one': '0 0', @@ -178,6 +180,7 @@ class browsersteps_live_ui(steppable_browser_interface): stale = False # bump and kill this if idle after X sec age_start = 0 + headers = {} # use a special driver, maybe locally etc command_executor = os.getenv( @@ -192,7 +195,8 @@ class browsersteps_live_ui(steppable_browser_interface): browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') - def __init__(self, playwright_browser, proxy=None): + def __init__(self, playwright_browser, proxy=None, headers=None): + self.headers = headers or {} self.age_start = time.time() self.playwright_browser = playwright_browser if self.context is None: @@ -206,16 +210,17 @@ class browsersteps_live_ui(steppable_browser_interface): # @todo handle multiple contexts, bind a unique id from the browser on each req? 
self.context = self.playwright_browser.new_context( - # @todo - # user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0', - # proxy=self.proxy, - # This is needed to enable JavaScript execution on GitHub and others - bypass_csp=True, - # Should never be needed - accept_downloads=False, - proxy=proxy + accept_downloads=False, # Should never be needed + bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others + extra_http_headers=self.headers, + ignore_https_errors=True, + proxy=proxy, + service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), + # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers + user_agent=manage_user_agent(headers=self.headers), ) + self.page = self.context.new_page() # self.page.set_default_navigation_timeout(keep_open) diff --git a/changedetectionio/blueprint/browser_steps/nonContext.py b/changedetectionio/blueprint/browser_steps/nonContext.py index 5345f306..93abe269 100644 --- a/changedetectionio/blueprint/browser_steps/nonContext.py +++ b/changedetectionio/blueprint/browser_steps/nonContext.py @@ -1,5 +1,4 @@ from playwright.sync_api import PlaywrightContextManager -import asyncio # So playwright wants to run as a context manager, but we do something horrible and hacky # we are holding the session open for as long as possible, then shutting it down, and opening a new one diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index d54b9bb2..d4481828 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -1,6 +1,6 @@ import sys from distutils.util import strtobool - +from loguru import logger from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException import os @@ -29,10 +29,15 @@ def available_fetchers(): # rather than site-specific. 
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) if use_playwright_as_chrome_fetcher: - if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')): + # @note - For now, browser steps always uses playwright + if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or False: + logger.debug('Using Playwright library as fetcher') from .playwright import fetcher as html_webdriver else: + logger.debug('Using direct Python Puppeteer library as fetcher') from .puppeteer import fetcher as html_webdriver else: + logger.debug("Falling back to selenium as fetcher") from .webdriver_selenium import fetcher as html_webdriver + diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py index 71500d61..756a9bef 100644 --- a/changedetectionio/content_fetchers/base.py +++ b/changedetectionio/content_fetchers/base.py @@ -5,6 +5,40 @@ from loguru import logger from changedetectionio.content_fetchers import BrowserStepsStepException +def manage_user_agent(headers, current_ua=''): + """ + Basic setting of user-agent + + NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques + THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!! + This does not take care of + - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc + - TCP/IP fingerprint JA3 etc + - Graphic rendering fingerprinting + - Your IP being obviously in a pool of bad actors + - Too many requests + - Scraping of Sec-CH-UA browser replies (thanks google!!) 
+ - Scraping of ServiceWorker, new window calls etc + + See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da + Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth + + :param headers: dict of custom request headers (may be empty or None) + :param current_ua: user-agent string currently reported by the browser, if known + :return: the custom user-agent if one is set, else current_ua with 'HeadlessChrome' masked as 'Chrome', else None + """ + # Prefer a user-agent from the custom headers (name matched case-insensitively); otherwise mask an obvious HeadlessChrome + ua_in_custom_headers = next((v for k, v in (headers or {}).items() if k.lower() == "user-agent"), None) + if ua_in_custom_headers: + return ua_in_custom_headers + + if current_ua: + current_ua = current_ua.replace('HeadlessChrome', 'Chrome') + return current_ua + + return None + + class Fetcher(): browser_connection_is_custom = None browser_connection_url = None diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py index 7faa2032..7950e033 100644 --- a/changedetectionio/content_fetchers/playwright.py +++ b/changedetectionio/content_fetchers/playwright.py @@ -3,7 +3,8 @@ import os from urllib.parse import urlparse from loguru import logger -from changedetectionio.content_fetchers.base import Fetcher + +from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable class fetcher(Fetcher): @@ -102,19 +103,16 @@ class fetcher(Fetcher): # Set user agent to prevent Cloudflare from blocking the browser # Use the default one configured in the App.py model that's passed from fetch_site_status.py context = browser.new_context( - user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None), + accept_downloads=False, # Should never be needed + bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others + extra_http_headers=request_headers, + ignore_https_errors=True, proxy=self.proxy, - # This is needed to enable JavaScript execution 
on GitHub and others - bypass_csp=True, - # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers - service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), - # Should never be needed - accept_downloads=False + service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers + user_agent=manage_user_agent(headers=request_headers), ) self.page = context.new_page() - if len(request_headers): - context.set_extra_http_headers(request_headers) # Listen for all console events and handle errors self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index 35f4b395..cad1b6b8 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -5,7 +5,8 @@ import websockets.exceptions from urllib.parse import urlparse from loguru import logger -from changedetectionio.content_fetchers.base import Fetcher + +from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError @@ -100,10 +101,11 @@ class fetcher(Fetcher): else: self.page = await browser.newPage() + await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent'))) + await self.page.setBypassCSP(True) if request_headers: await self.page.setExtraHTTPHeaders(request_headers) - # @todo check user-agent worked # SOCKS5 with authentication is not supported (yet) # https://github.com/microsoft/playwright/issues/10567 diff --git a/changedetectionio/tests/fetchers/test_custom_js_before_content.py 
b/changedetectionio/tests/fetchers/test_custom_js_before_content.py new file mode 100644 index 00000000..bec4334a --- /dev/null +++ b/changedetectionio/tests/fetchers/test_custom_js_before_content.py @@ -0,0 +1,56 @@ +import os +from flask import url_for +from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client + + +def test_execute_custom_js(client, live_server): + + live_server_setup(live_server) + assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" + + test_url = url_for('test_interactive_html_endpoint', _external=True) + test_url = test_url.replace('localhost.localdomain', 'cdio') + test_url = test_url.replace('localhost', 'cdio') + + res = client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'}, + follow_redirects=True + ) + + assert b"Watch added in Paused state, saving will unpause" in res.data + + res = client.post( + url_for("edit_page", uuid="first", unpause_on_save=1), + data={ + "url": test_url, + "tags": "", + 'fetch_backend': "html_webdriver", + 'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();', + 'headers': "testheader: yes\nuser-agent: MyCustomAgent", + }, + follow_redirects=True + ) + assert b"unpaused" in res.data + wait_for_all_checks(client) + + uuid = extract_UUID_from_client(client) + assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)" + + assert b"This text should be removed" not in res.data + + # Check HTML conversion detected and worked + res = client.get( + url_for("preview_page", uuid=uuid), + follow_redirects=True + ) + assert b"This text should be removed" not in res.data + assert b"I smell JavaScript because the button was pressed" in res.data + + assert b"testheader: yes" in res.data + assert b"user-agent: mycustomagent" in res.data + + client.get( + url_for("form_delete", 
uuid="all"), + follow_redirects=True + ) \ No newline at end of file diff --git a/changedetectionio/tests/test_pdf.py b/changedetectionio/tests/test_pdf.py index d8694a57..2d4fb6a9 100644 --- a/changedetectionio/tests/test_pdf.py +++ b/changedetectionio/tests/test_pdf.py @@ -29,7 +29,8 @@ def test_fetch_pdf(client, live_server): follow_redirects=True ) - assert b'PDF-1.5' not in res.data + # PDF header should not be there (it was converted to text) + assert b'PDF' not in res.data[:10] assert b'hello world' in res.data # So we know if the file changes in other ways diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 5974e47a..aab79163 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -242,5 +242,28 @@ def live_server_setup(live_server): resp.headers['Content-Type'] = 'application/pdf' return resp + @live_server.app.route('/test-interactive-html-endpoint') + def test_interactive_html_endpoint(): + header_text="" + for k,v in request.headers.items(): + header_text += f"{k}: {v}
" + + resp = make_response(f""" + + + Primitive JS check for
changedetectionio/tests/visualselector/test_fetch_data.py
+

This text should be removed

+
+ + +
+
+                {header_text.lower()}
+                
+ + """, 200) + resp.headers['Content-Type'] = 'text/html' + return resp + live_server.start() diff --git a/changedetectionio/tests/visualselector/test_fetch_data.py b/changedetectionio/tests/visualselector/test_fetch_data.py index 804a1017..2f460d7c 100644 --- a/changedetectionio/tests/visualselector/test_fetch_data.py +++ b/changedetectionio/tests/visualselector/test_fetch_data.py @@ -7,15 +7,19 @@ from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_cli def test_setup(client, live_server): live_server_setup(live_server) + # Add a site in paused mode, add an invalid filter, we should still have visual selector data ready def test_visual_selector_content_ready(client, live_server): + import os import json assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url - test_url = "https://changedetection.io/ci-test/test-runjs.html" + test_url = url_for('test_interactive_html_endpoint', _external=True) + test_url = test_url.replace('localhost.localdomain', 'cdio') + test_url = test_url.replace('localhost', 'cdio') res = client.post( url_for("form_quick_watch_add"), @@ -23,28 +27,31 @@ def test_visual_selector_content_ready(client, live_server): follow_redirects=True ) assert b"Watch added in Paused state, saving will unpause" in res.data - + uuid = extract_UUID_from_client(client) res = client.post( - url_for("edit_page", uuid="first", unpause_on_save=1), + url_for("edit_page", uuid=uuid, unpause_on_save=1), data={ - "url": test_url, - "tags": "", - "headers": "", - 'fetch_backend': "html_webdriver", - 'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();' + "url": test_url, + "tags": "", + # For now, cookies doesnt work in headers because it must be a full cookiejar object + 'headers': "testheader: yes\nuser-agent: MyCustomAgent", + 'fetch_backend': 
"html_webdriver", }, follow_redirects=True ) assert b"unpaused" in res.data wait_for_all_checks(client) - uuid = extract_UUID_from_client(client) - # Check the JS execute code before extract worked + + assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)" + res = client.get( - url_for("preview_page", uuid="first"), + url_for("preview_page", uuid=uuid), follow_redirects=True ) - assert b'I smell JavaScript' in res.data + assert b"testheader: yes" in res.data + assert b"user-agent: mycustomagent" in res.data + assert os.path.isfile(os.path.join('test-datastore', uuid, 'last-screenshot.png')), "last-screenshot.png should exist" assert os.path.isfile(os.path.join('test-datastore', uuid, 'elements.json')), "xpath elements.json data should exist" @@ -74,30 +81,33 @@ def test_visual_selector_content_ready(client, live_server): def test_basic_browserstep(client, live_server): - assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" #live_server_setup(live_server) + assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" - # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url - test_url = "https://changedetection.io/ci-test/test-runjs.html" + test_url = url_for('test_interactive_html_endpoint', _external=True) + test_url = test_url.replace('localhost.localdomain', 'cdio') + test_url = test_url.replace('localhost', 'cdio') res = client.post( url_for("form_quick_watch_add"), data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'}, follow_redirects=True ) + assert b"Watch added in Paused state, saving will unpause" in res.data res = client.post( url_for("edit_page", uuid="first", unpause_on_save=1), data={ - "url": test_url, - "tags": "", - "headers": "", - 'fetch_backend': "html_webdriver", - 'browser_steps-0-operation': 
'Goto site', - 'browser_steps-1-operation': 'Click element', - 'browser_steps-1-selector': 'button[name=test-button]', - 'browser_steps-1-optional_value': '' + "url": test_url, + "tags": "", + 'fetch_backend': "html_webdriver", + 'browser_steps-0-operation': 'Goto site', + 'browser_steps-1-operation': 'Click element', + 'browser_steps-1-selector': 'button[name=test-button]', + 'browser_steps-1-optional_value': '', + # For now, cookies doesnt work in headers because it must be a full cookiejar object + 'headers': "testheader: yes\nuser-agent: MyCustomAgent", }, follow_redirects=True ) @@ -105,6 +115,9 @@ def test_basic_browserstep(client, live_server): wait_for_all_checks(client) uuid = extract_UUID_from_client(client) + assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)" + + assert b"This text should be removed" not in res.data # Check HTML conversion detected and workd res = client.get( @@ -114,13 +127,19 @@ def test_basic_browserstep(client, live_server): assert b"This text should be removed" not in res.data assert b"I smell JavaScript because the button was pressed" in res.data + assert b"testheader: yes" in res.data + assert b"user-agent: mycustomagent" in res.data + + four_o_four_url = url_for('test_endpoint', status_code=404, _external=True) + four_o_four_url = four_o_four_url.replace('localhost.localdomain', 'cdio') + four_o_four_url = four_o_four_url.replace('localhost', 'cdio') + # now test for 404 errors res = client.post( url_for("edit_page", uuid=uuid, unpause_on_save=1), data={ - "url": "https://changedetection.io/404", + "url": four_o_four_url, "tags": "", - "headers": "", 'fetch_backend': "html_webdriver", 'browser_steps-0-operation': 'Goto site', 'browser_steps-1-operation': 'Click element',