diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
index bcac8ee9..6bb58b38 100644
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -6,6 +6,8 @@ import re
from random import randint
from loguru import logger
+from changedetectionio.content_fetchers.base import manage_user_agent
+
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
# 0- off, 1- on
browser_step_ui_config = {'Choose one': '0 0',
@@ -206,21 +208,18 @@ class browsersteps_live_ui(steppable_browser_interface):
keep_open = 1000 * 60 * 5
now = time.time()
- # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
- from changedetectionio.content_fetchers import manage_user_agent
- manage_user_agent(headers=self.headers)
-
# @todo handle multiple contexts, bind a unique id from the browser on each req?
self.context = self.playwright_browser.new_context(
- accept_downloads=False, # Should never be needed
- bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ accept_downloads=False, # Should never be needed
+ bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ extra_http_headers=self.headers,
+ ignore_https_errors=True,
proxy=proxy,
- service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
- user_agent=manage_user_agent(headers=self.headers)
+ service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
+ # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+ user_agent=manage_user_agent(headers=self.headers),
)
- if self.headers:
- self.context.set_extra_http_headers(self.headers)
self.page = self.context.new_page()
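
For reference, a minimal standalone sketch of the same pattern: the request headers and the user agent are now applied once when the context is created, instead of being patched in afterwards with set_extra_http_headers(). This uses a local chromium.launch() and made-up header values purely for illustration; the real code connects to an existing browser over CDP.

```python
import os
from playwright.sync_api import sync_playwright

from changedetectionio.content_fetchers.base import manage_user_agent

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)', 'X-Example': '1'}  # illustrative values

with sync_playwright() as p:
    browser = p.chromium.launch()
    context = browser.new_context(
        accept_downloads=False,
        bypass_csp=True,
        extra_http_headers=headers,    # previously applied later via context.set_extra_http_headers()
        ignore_https_errors=True,
        service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
        user_agent=manage_user_agent(headers=headers),
    )
    page = context.new_page()
    page.goto('https://example.com')
    browser.close()
```
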
diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py
index f3d22c17..d4481828 100644
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -29,7 +29,8 @@ def available_fetchers():
# rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
- if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
+ # @note - For now, browser steps always uses playwright
+ if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or False:
logger.debug('Using Playwright library as fetcher')
from .playwright import fetcher as html_webdriver
else:
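
For reference, the trailing `or False` leaves the branch outcome unchanged: whenever PLAYWRIGHT_DRIVER_URL is set, Playwright is chosen unless FAST_PUPPETEER_CHROME_FETCHER is explicitly truthy. A minimal sketch of the selection, where the driver URL is an illustrative placeholder and distutils' strtobool stands in for the project's own boolean helper:

```python
import os
from distutils.util import strtobool  # stand-in for the strtobool helper the project uses

os.environ['PLAYWRIGHT_DRIVER_URL'] = 'ws://browserless:3000'   # illustrative endpoint

if os.getenv('PLAYWRIGHT_DRIVER_URL', False):
    if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or False:
        print('Playwright library will be used as the fetcher')
    else:
        print('Puppeteer path will be used instead')
```
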
diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index bb42f66b..756a9bef 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -5,6 +5,40 @@ from loguru import logger
from changedetectionio.content_fetchers import BrowserStepsStepException
+def manage_user_agent(headers, current_ua=''):
+ """
+ Basic setting of user-agent
+
+ NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
+ THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
+ This does not take care of
+ - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
+ - TCP/IP fingerprint JA3 etc
+ - Graphic rendering fingerprinting
+ - Your IP being obviously in a pool of bad actors
+ - Too many requests
+    - Scraping of Sec-CH-UA browser replies (thanks google!!)
+ - Scraping of ServiceWorker, new window calls etc
+
+ See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
+ Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
+
+    :param headers:
+    :param current_ua:
+    :return:
+ """
+    # Ask it what the user agent is, if it's obviously HeadlessChrome, switch it to the default
+ ua_in_custom_headers = next((v for k, v in headers.items() if k.lower() == "user-agent"), None)
+ if ua_in_custom_headers:
+ return ua_in_custom_headers
+
+ if not ua_in_custom_headers and current_ua:
+ current_ua = current_ua.replace('HeadlessChrome', 'Chrome')
+ return current_ua
+
+ return None
+
+
class Fetcher():
browser_connection_is_custom = None
browser_connection_url = None
@@ -76,39 +110,6 @@ class Fetcher():
"""
return {k.lower(): v for k, v in self.headers.items()}
- def manage_user_agent(self, headers, current_ua=''):
- """
- Basic setting of user-agent
-
- NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
- THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
- This does not take care of
- - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
- - TCP/IP fingerprint JA3 etc
- - Graphic rendering fingerprinting
- - Your IP being obviously in a pool of bad actors
- - Too many requests
- - Scraping of SCH-UA browser replies (thanks google!!)
- - Scraping of ServiceWorker, new window calls etc
-
- See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
- Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
-
- :param page:
- :param headers:
- :return:
- """
- # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
- ua_in_custom_headers = next((v for k, v in headers.items() if k.lower() == "user-agent"), None)
- if ua_in_custom_headers:
- return ua_in_custom_headers
-
- if not ua_in_custom_headers and current_ua:
- current_ua = current_ua.replace('HeadlesssChrome', 'Chrome')
- return current_ua
-
- return None
-
def browser_steps_get_valid_steps(self):
if self.browser_steps is not None and len(self.browser_steps):
valid_steps = filter(
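
Since manage_user_agent() is now a plain module-level helper, its behaviour can be pinned down in isolation. A small assert-style sketch (UA strings are illustrative):

```python
from changedetectionio.content_fetchers.base import manage_user_agent

# A User-Agent supplied in the watch's custom headers always wins, whatever its case
assert manage_user_agent(headers={'User-AGENT': 'MyBot/1.0'}) == 'MyBot/1.0'

# Otherwise the browser's reported UA is reused, with the headless marker masked
ua = manage_user_agent(headers={}, current_ua='Mozilla/5.0 (X11) HeadlessChrome/120.0 Safari/537.36')
assert 'HeadlessChrome' not in ua and 'Chrome/120.0' in ua

# With no custom header and no current UA there is nothing sensible to return
assert manage_user_agent(headers={}) is None
```
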
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index b3a4c862..7950e033 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
class fetcher(Fetcher):
@@ -103,18 +103,17 @@ class fetcher(Fetcher):
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
- accept_downloads=False, # Should never be needed
+ accept_downloads=False, # Should never be needed
bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ extra_http_headers=request_headers,
+ ignore_https_errors=True,
proxy=self.proxy,
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
- user_agent = self.manage_user_agent(headers=request_headers),
+ user_agent=manage_user_agent(headers=request_headers),
)
self.page = context.new_page()
- if len(request_headers):
- context.set_extra_http_headers(request_headers)
-
# Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 3e4a14da..cad1b6b8 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -6,7 +6,7 @@ from urllib.parse import urlparse
from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
@@ -101,7 +101,7 @@ class fetcher(Fetcher):
else:
self.page = await browser.newPage()
- await self.page.setUserAgent(self.manage_user_agent(headers=request_headers, current_ua=self.page.evaluate('navigator.userAgent')))
+ await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))
await self.page.setBypassCSP(True)
if request_headers:
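
The added await matters because page.evaluate() is a coroutine here; without it, manage_user_agent() would be handed a coroutine object rather than the UA string, so the HeadlessChrome masking could never run. A hedged sketch of the corrected sequence, using a plain pyppeteer launch() in place of the fetcher's real browser connection:

```python
import asyncio
from pyppeteer import launch

from changedetectionio.content_fetchers.base import manage_user_agent

async def apply_user_agent(request_headers: dict):
    browser = await launch()
    page = await browser.newPage()
    # evaluate() must be awaited -- otherwise current_ua would be a coroutine, not a string
    current_ua = await page.evaluate('navigator.userAgent')
    await page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=current_ua))
    await browser.close()

asyncio.run(apply_user_agent({}))
```
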
diff --git a/changedetectionio/tests/test_pdf.py b/changedetectionio/tests/test_pdf.py
index d8694a57..2d4fb6a9 100644
--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@@ -29,7 +29,8 @@ def test_fetch_pdf(client, live_server):
follow_redirects=True
)
- assert b'PDF-1.5' not in res.data
+ # PDF header should not be there (it was converted to text)
+ assert b'PDF' not in res.data[:10]
assert b'hello world' in res.data
# So we know if the file changes in other ways
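
The relaxed assertion works because a raw, unconverted PDF response always begins with a %PDF-x.y magic marker, so checking only the first few bytes catches any PDF version rather than just 1.5. A tiny illustration with made-up payloads:

```python
raw_pdf = b'%PDF-1.7\n%\xe2\xe3\xcf\xd3 ...'         # what an unconverted PDF response starts with
converted_text = b'hello world, extracted PDF text'  # what the fetcher should have produced

assert b'PDF' in raw_pdf[:10]              # any PDF version would trip the check
assert b'PDF' not in converted_text[:10]   # converted output passes it
```
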
diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py
index 5974e47a..857e616f 100644
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -242,5 +242,29 @@ def live_server_setup(live_server):
resp.headers['Content-Type'] = 'application/pdf'
return resp
+ @live_server.app.route('/test-interactive-html-endpoint')
+ def test_interactive_html_endpoint():
+ import json
+ header_text=""
+ for k,v in request.headers.items():
+            header_text += f"{k}: {v}<br>"
+
+ resp = make_response(f"""
+        <html>
+          <body>
+            <!-- page served for changedetectionio/tests/visualselector/test_fetch_data.py -->
+            <p>This text should be removed</p>
+            {header_text}
+          </body>
+        </html>
+        """)
+        return resp
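
A hedged sketch of how a test could exercise the new endpoint directly; the markup above is partly reconstructed, so the header-echo assertion is an assumption, and the url_for(..., _external=True) pattern follows the other tests in this suite:

```python
import requests
from flask import url_for

from changedetectionio.tests.util import live_server_setup

def test_interactive_endpoint_echoes_headers(client, live_server):
    live_server_setup(live_server)
    res = requests.get(
        url_for('test_interactive_html_endpoint', _external=True),
        headers={'User-Agent': 'MyBot/1.0'},  # illustrative custom header
    )
    # Assumes the page interpolates {header_text}, echoing request headers back
    assert 'User-Agent: MyBot/1.0' in res.text
    assert 'This text should be removed' in res.text
```
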