diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml
index 1c89f2fa..8fb89d62 100644
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -72,7 +72,11 @@ jobs:
run: |
# Playwright via Sockpuppetbrowser fetch
# tests/visualselector/test_fetch_data.py will do browser steps
- docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py'
+
- name: Playwright and SocketPuppetBrowser - Headers and requests
run: |
@@ -87,8 +91,11 @@ jobs:
# STRAIGHT TO CDP
- name: Pyppeteer and SocketPuppetBrowser - Specific tests in built container
run: |
- # Playwright via Sockpuppetbrowser fetch
- docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "FAST_PUPPETEER_CHROME_FETCHER=True" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+ # Pyppeteer (straight to CDP, FAST_PUPPETEER_CHROME_FETCHER=True) via Sockpuppetbrowser fetch
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py'
- name: Pyppeteer and SocketPuppetBrowser - Headers and requests checks
run: |
diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
index 22710e99..6bb58b38 100644
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -6,6 +6,8 @@ import re
from random import randint
from loguru import logger
+from changedetectionio.content_fetchers.base import manage_user_agent
+
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
# 0- off, 1- on
browser_step_ui_config = {'Choose one': '0 0',
@@ -178,6 +180,7 @@ class browsersteps_live_ui(steppable_browser_interface):
stale = False
# bump and kill this if idle after X sec
age_start = 0
+ headers = {}
# use a special driver, maybe locally etc
command_executor = os.getenv(
@@ -192,7 +195,8 @@ class browsersteps_live_ui(steppable_browser_interface):
browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
- def __init__(self, playwright_browser, proxy=None):
+ def __init__(self, playwright_browser, proxy=None, headers=None):
+ self.headers = headers or {}
self.age_start = time.time()
self.playwright_browser = playwright_browser
if self.context is None:
@@ -206,16 +210,17 @@ class browsersteps_live_ui(steppable_browser_interface):
# @todo handle multiple contexts, bind a unique id from the browser on each req?
self.context = self.playwright_browser.new_context(
- # @todo
- # user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
- # proxy=self.proxy,
- # This is needed to enable JavaScript execution on GitHub and others
- bypass_csp=True,
- # Should never be needed
- accept_downloads=False,
- proxy=proxy
+ accept_downloads=False, # Should never be needed
+ bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ extra_http_headers=self.headers,
+ ignore_https_errors=True,
+ proxy=proxy,
+ service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
+ # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+ user_agent=manage_user_agent(headers=self.headers),
)
+
self.page = self.context.new_page()
# self.page.set_default_navigation_timeout(keep_open)
diff --git a/changedetectionio/blueprint/browser_steps/nonContext.py b/changedetectionio/blueprint/browser_steps/nonContext.py
index 5345f306..93abe269 100644
--- a/changedetectionio/blueprint/browser_steps/nonContext.py
+++ b/changedetectionio/blueprint/browser_steps/nonContext.py
@@ -1,5 +1,4 @@
from playwright.sync_api import PlaywrightContextManager
-import asyncio
# So playwright wants to run as a context manager, but we do something horrible and hacky
# we are holding the session open for as long as possible, then shutting it down, and opening a new one
diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py
index d54b9bb2..d4481828 100644
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -1,6 +1,6 @@
import sys
from distutils.util import strtobool
-
+from loguru import logger
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
import os
@@ -29,10 +29,15 @@ def available_fetchers():
# rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
- if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
+ # @note - For now, browser steps always uses playwright
+ if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or False:
+ logger.debug('Using Playwright library as fetcher')
from .playwright import fetcher as html_webdriver
else:
+ logger.debug('Using direct Python Puppeteer library as fetcher')
from .puppeteer import fetcher as html_webdriver
else:
+ logger.debug("Falling back to selenium as fetcher")
from .webdriver_selenium import fetcher as html_webdriver
+
diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index 71500d61..756a9bef 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -5,6 +5,40 @@ from loguru import logger
from changedetectionio.content_fetchers import BrowserStepsStepException
+def manage_user_agent(headers, current_ua=''):
+    """
+    Choose the User-Agent string to present to the remote site.
+
+    NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
+    THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
+    This does not take care of
+    - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
+    - TCP/IP fingerprint JA3 etc
+    - Graphic rendering fingerprinting
+    - Your IP being obviously in a pool of bad actors
+    - Too many requests
+    - Scraping of Sec-CH-UA browser replies (thanks google!!)
+    - Scraping of ServiceWorker, new window calls etc
+
+    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
+    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
+
+    :param headers: Mapping of custom request headers (may be None or empty); names are matched case-insensitively
+    :param current_ua: User-Agent the browser currently reports, or '' when it is not known
+    :return: The custom User-Agent if one is set, else current_ua with 'HeadlessChrome' replaced by 'Chrome', else None
+    """
+    # A User-Agent supplied in the custom headers always wins; tolerate headers=None from callers with no headers
+    ua_in_custom_headers = next((v for k, v in (headers or {}).items() if k.lower() == "user-agent"), None)
+    if ua_in_custom_headers:
+        return ua_in_custom_headers
+
+    # No custom value: hide the obvious 'HeadlessChrome' marker in whatever the browser reports
+    if current_ua:
+        return current_ua.replace('HeadlessChrome', 'Chrome')
+
+    return None
+
+
class Fetcher():
browser_connection_is_custom = None
browser_connection_url = None
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index 7faa2032..7950e033 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -3,7 +3,8 @@ import os
from urllib.parse import urlparse
from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
class fetcher(Fetcher):
@@ -102,19 +103,16 @@ class fetcher(Fetcher):
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
- user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
+ accept_downloads=False, # Should never be needed
+ bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ extra_http_headers=request_headers,
+ ignore_https_errors=True,
proxy=self.proxy,
- # This is needed to enable JavaScript execution on GitHub and others
- bypass_csp=True,
- # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
- service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
- # Should never be needed
- accept_downloads=False
+ service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+ user_agent=manage_user_agent(headers=request_headers),
)
self.page = context.new_page()
- if len(request_headers):
- context.set_extra_http_headers(request_headers)
# Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 35f4b395..cad1b6b8 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -5,7 +5,8 @@ import websockets.exceptions
from urllib.parse import urlparse
from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
@@ -100,10 +101,11 @@ class fetcher(Fetcher):
else:
self.page = await browser.newPage()
+ await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))
+
await self.page.setBypassCSP(True)
if request_headers:
await self.page.setExtraHTTPHeaders(request_headers)
- # @todo check user-agent worked
# SOCKS5 with authentication is not supported (yet)
# https://github.com/microsoft/playwright/issues/10567
diff --git a/changedetectionio/tests/fetchers/test_custom_js_before_content.py b/changedetectionio/tests/fetchers/test_custom_js_before_content.py
new file mode 100644
index 00000000..bec4334a
--- /dev/null
+++ b/changedetectionio/tests/fetchers/test_custom_js_before_content.py
@@ -0,0 +1,56 @@
+import os
+from flask import url_for
+from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
+
+
+def test_execute_custom_js(client, live_server):
+    """Run custom JS through the browser fetcher and check the JS result and the custom headers show in the preview."""
+    live_server_setup(live_server)
+    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
+    # The remote browser must reach this server by hostname, so swap localhost for 'cdio' (the CI container hostname)
+    test_url = url_for('test_interactive_html_endpoint', _external=True)
+    test_url = test_url.replace('localhost.localdomain', 'cdio')
+    test_url = test_url.replace('localhost', 'cdio')
+
+    res = client.post(
+        url_for("form_quick_watch_add"),
+        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
+        follow_redirects=True
+    )
+
+    assert b"Watch added in Paused state, saving will unpause" in res.data
+    # Separate the two custom headers with a newline; the previous '\b' was a backspace character, not a line break
+    res = client.post(
+        url_for("edit_page", uuid="first", unpause_on_save=1),
+        data={
+            "url": test_url,
+            "tags": "",
+            'fetch_backend': "html_webdriver",
+            'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();',
+            'headers': "testheader: yes\nuser-agent: MyCustomAgent",
+        },
+        follow_redirects=True
+    )
+    assert b"unpaused" in res.data
+    wait_for_all_checks(client)
+
+    uuid = extract_UUID_from_client(client)
+    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
+
+    assert b"This text should be removed" not in res.data
+
+    # Check the preview of the fetched page: the button click must have taken effect and the sent headers should appear
+    res = client.get(
+        url_for("preview_page", uuid=uuid),
+        follow_redirects=True
+    )
+    assert b"This text should be removed" not in res.data
+    assert b"I smell JavaScript because the button was pressed" in res.data
+
+    assert b"testheader: yes" in res.data
+    assert b"user-agent: mycustomagent" in res.data
+
+    client.get(
+        url_for("form_delete", uuid="all"),
+        follow_redirects=True
+    )
\ No newline at end of file
diff --git a/changedetectionio/tests/test_pdf.py b/changedetectionio/tests/test_pdf.py
index d8694a57..2d4fb6a9 100644
--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@@ -29,7 +29,8 @@ def test_fetch_pdf(client, live_server):
follow_redirects=True
)
- assert b'PDF-1.5' not in res.data
+ # PDF header should not be there (it was converted to text)
+ assert b'PDF' not in res.data[:10]
assert b'hello world' in res.data
# So we know if the file changes in other ways
diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py
index 5974e47a..aab79163 100644
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -242,5 +242,28 @@ def live_server_setup(live_server):
resp.headers['Content-Type'] = 'application/pdf'
return resp
+ @live_server.app.route('/test-interactive-html-endpoint')
+ def test_interactive_html_endpoint():
+ header_text=""
+ for k,v in request.headers.items():
+ header_text += f"{k}: {v}
"
+
+ resp = make_response(f"""
+
+
changedetectionio/tests/visualselector/test_fetch_data.py+
This text should be removed
+