diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
index bcac8ee9..6bb58b38 100644
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -6,6 +6,8 @@ import re
 from random import randint
 from loguru import logger
 
+from changedetectionio.content_fetchers.base import manage_user_agent
+
 # Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
 # 0- off, 1- on
 browser_step_ui_config = {'Choose one': '0 0',
@@ -206,21 +208,18 @@ class browsersteps_live_ui(steppable_browser_interface):
         keep_open = 1000 * 60 * 5
         now = time.time()
 
-        # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
-        from changedetectionio.content_fetchers import manage_user_agent
-        manage_user_agent(headers=self.headers)
-
         # @todo handle multiple contexts, bind a unique id from the browser on each req?
         self.context = self.playwright_browser.new_context(
-            accept_downloads=False,  # Should never be needed
-            bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
+            accept_downloads=False,  # Should never be needed
+            bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
+            extra_http_headers=self.headers,
+            ignore_https_errors=True,
             proxy=proxy,
-            service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),  # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
-            user_agent=manage_user_agent(headers=self.headers)
+            service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
+            # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+            user_agent=manage_user_agent(headers=self.headers),
         )
-        if self.headers:
-            self.context.set_extra_http_headers(self.headers)
 
         self.page = self.context.new_page()
 
diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py
index f3d22c17..d4481828 100644
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -29,7 +29,8 @@ def available_fetchers():
     # rather than site-specific.
     use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
     if use_playwright_as_chrome_fetcher:
-        if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
+        # @note - For now, browser steps always uses playwright
+        if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or False:
             logger.debug('Using Playwright library as fetcher')
             from .playwright import fetcher as html_webdriver
         else:
diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index bb42f66b..756a9bef 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -5,6 +5,40 @@ from loguru import logger
 
 from changedetectionio.content_fetchers import BrowserStepsStepException
 
+def manage_user_agent(headers, current_ua=''):
+    """
+    Basic setting of user-agent
+
+    NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
+    THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
+    This does not take care of
+    - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
+    - TCP/IP fingerprint JA3 etc
+    - Graphic rendering fingerprinting
+    - Your IP being obviously in a pool of bad actors
+    - Too many requests
+    - Scraping of SCH-UA browser replies (thanks google!!)
+    - Scraping of ServiceWorker, new window calls etc
+
+    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
+    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
+
+    :param page:
+    :param headers:
+    :return:
+    """
+    # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
+    ua_in_custom_headers = next((v for k, v in headers.items() if k.lower() == "user-agent"), None)
+    if ua_in_custom_headers:
+        return ua_in_custom_headers
+
+    if not ua_in_custom_headers and current_ua:
+        current_ua = current_ua.replace('HeadlessChrome', 'Chrome')
+        return current_ua
+
+    return None
+
+
 class Fetcher():
     browser_connection_is_custom = None
     browser_connection_url = None
@@ -76,39 +110,6 @@ class Fetcher():
         """
         return {k.lower(): v for k, v in self.headers.items()}
 
-    def manage_user_agent(self, headers, current_ua=''):
-        """
-        Basic setting of user-agent
-
-        NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
-        THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
-        This does not take care of
-        - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
-        - TCP/IP fingerprint JA3 etc
-        - Graphic rendering fingerprinting
-        - Your IP being obviously in a pool of bad actors
-        - Too many requests
-        - Scraping of SCH-UA browser replies (thanks google!!)
-        - Scraping of ServiceWorker, new window calls etc
-
-        See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
-        Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
-
-        :param page:
-        :param headers:
-        :return:
-        """
-        # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
-        ua_in_custom_headers = next((v for k, v in headers.items() if k.lower() == "user-agent"), None)
-        if ua_in_custom_headers:
-            return ua_in_custom_headers
-
-        if not ua_in_custom_headers and current_ua:
-            current_ua = current_ua.replace('HeadlesssChrome', 'Chrome')
-            return current_ua
-
-        return None
-
     def browser_steps_get_valid_steps(self):
         if self.browser_steps is not None and len(self.browser_steps):
             valid_steps = filter(
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index b3a4c862..7950e033 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 
 from loguru import logger
 
-from changedetectionio.content_fetchers.base import Fetcher
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
 
 class fetcher(Fetcher):
@@ -103,18 +103,17 @@ class fetcher(Fetcher):
 
             # Set user agent to prevent Cloudflare from blocking the browser
             # Use the default one configured in the App.py model that's passed from fetch_site_status.py
             context = browser.new_context(
-                accept_downloads=False,  # Should never be needed
+                accept_downloads=False,  # Should never be needed
                 bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
+                extra_http_headers=request_headers,
+                ignore_https_errors=True,
                 proxy=self.proxy,
                 service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),  # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
-                user_agent = self.manage_user_agent(headers=request_headers),
+                user_agent=manage_user_agent(headers=request_headers),
             )
 
             self.page = context.new_page()
 
-            if len(request_headers):
-                context.set_extra_http_headers(request_headers)
-
             # Listen for all console events and handle errors
             self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 3e4a14da..cad1b6b8 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -6,7 +6,7 @@ from urllib.parse import urlparse
 
 from loguru import logger
 
-from changedetectionio.content_fetchers.base import Fetcher
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
 
 
@@ -101,7 +101,7 @@ class fetcher(Fetcher):
         else:
             self.page = await browser.newPage()
 
-        await self.page.setUserAgent(self.manage_user_agent(headers=request_headers, current_ua=self.page.evaluate('navigator.userAgent')))
+        await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))
 
         await self.page.setBypassCSP(True)
         if request_headers:
diff --git a/changedetectionio/tests/test_pdf.py b/changedetectionio/tests/test_pdf.py
index d8694a57..2d4fb6a9 100644
--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@@ -29,7 +29,8 @@ def test_fetch_pdf(client, live_server):
         follow_redirects=True
     )
 
-    assert b'PDF-1.5' not in res.data
+    # PDF header should not be there (it was converted to text)
+    assert b'PDF' not in res.data[:10]
     assert b'hello world' in res.data
 
     # So we know if the file changes in other ways
diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py
index 5974e47a..857e616f 100644
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -242,5 +242,29 @@ def live_server_setup(live_server):
         resp.headers['Content-Type'] = 'application/pdf'
         return resp
 
+    @live_server.app.route('/test-interactive-html-endpoint')
+    def test_interactive_html_endpoint():
+        import json
+        header_text=""
+        for k,v in request.headers.items():
+            header_text += f"{k}: {v}<br>"
+
+        resp = make_response(f"""
+        <html>
+          <body>
+          Primitive JS check for
+          <pre>changedetectionio/tests/visualselector/test_fetch_data.py</pre>
+          <p id="remove">This text should be removed</p>
+          <button name="test-button" onclick="document.getElementById('remove').remove();document.body.innerHTML += 'I smell JavaScript because the button was pressed';">Click here</button>
+          <pre>
+                {header_text.lower()}
+          </pre>
+          </body>
+        </html>""", 200)
+        resp.headers['Content-Type'] = 'text/html'
+        return resp
+
     live_server.start()
diff --git a/changedetectionio/tests/visualselector/test_fetch_data.py b/changedetectionio/tests/visualselector/test_fetch_data.py
index 00db3cef..87f150fe 100644
--- a/changedetectionio/tests/visualselector/test_fetch_data.py
+++ b/changedetectionio/tests/visualselector/test_fetch_data.py
@@ -15,7 +15,9 @@ def test_visual_selector_content_ready(client, live_server):
     assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
 
     # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url
-    test_url = "https://changedetection.io/ci-test/test-runjs.html"
+    test_url = url_for('test_interactive_html_endpoint', _external=True)
+    test_url = test_url.replace('localhost.localdomain', 'cdio')
+    test_url = test_url.replace('localhost', 'cdio')
 
     res = client.post(
         url_for("form_quick_watch_add"),
@@ -37,7 +39,9 @@
     )
     assert b"unpaused" in res.data
     wait_for_all_checks(client)
+
     uuid = extract_UUID_from_client(client)
+    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
 
     # Check the JS execute code before extract worked
     res = client.get(
@@ -74,11 +78,13 @@
 
 
 def test_basic_browserstep(client, live_server):
-    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
     #live_server_setup(live_server)
 
-    # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url
-    test_url = "https://changedetection.io/ci-test/test-runjs.html"
+    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
+
+    test_url = url_for('test_interactive_html_endpoint', _external=True)
+    test_url = test_url.replace('localhost.localdomain', 'cdio')
+    test_url = test_url.replace('localhost', 'cdio')
 
     res = client.post(
         url_for("form_quick_watch_add"),
@@ -90,14 +96,15 @@
     res = client.post(
         url_for("edit_page", uuid="first", unpause_on_save=1),
         data={
-              "url": test_url,
-              "tags": "",
-              'fetch_backend': "html_webdriver",
-              'browser_steps-0-operation': 'Goto site',
-              'browser_steps-1-operation': 'Click element',
-              'browser_steps-1-selector': 'button[name=test-button]',
-              'browser_steps-1-optional_value': '',
-              'headers': "cOoKiE: notice-apa=1; test-value=1; "
+            "url": test_url,
+            "tags": "",
+            'fetch_backend': "html_webdriver",
+            'browser_steps-0-operation': 'Goto site',
+            'browser_steps-1-operation': 'Click element',
+            'browser_steps-1-selector': 'button[name=test-button]',
+            'browser_steps-1-optional_value': '',
+            # For now, cookies doesnt work in headers because it must be a full cookiejar object
+            'headers': "testheader: yes\buser-agent: MyCustomAgent",
         },
         follow_redirects=True
     )
@@ -105,6 +112,9 @@
     wait_for_all_checks(client)
 
     uuid = extract_UUID_from_client(client)
+    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
+
+    assert b"This text should be removed" not in res.data
 
     # Check HTML conversion detected and workd
     res = client.get(
         url_for("preview_page", uuid="first"),
         follow_redirects=True
     )
     assert b"This text should be removed" not in res.data
     assert b"I smell JavaScript because the button was pressed" in res.data
-    # The JS on the page will set this if the cookie (and thus headers) was handled
-    assert b"test-value in headers found" in res.data
+
+    assert b"testheader: yes" in res.data
+    assert b"user-agent: mycustomagent" in res.data
+
+    four_o_four_url = url_for('test_endpoint', status_code=404, _external=True)
+    four_o_four_url = four_o_four_url.replace('localhost.localdomain', 'cdio')
+    four_o_four_url = four_o_four_url.replace('localhost', 'cdio')
 
     # now test for 404 errors
     res = client.post(
         url_for("edit_page", uuid=uuid, unpause_on_save=1),
         data={
-              "url": "https://changedetection.io/404",
+              "url": four_o_four_url,
               "tags": "",
-              "headers": "",
               'fetch_backend': "html_webdriver",
               'browser_steps-0-operation': 'Goto site',
               'browser_steps-1-operation': 'Click element',
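
Review note: a minimal sketch of how the new module-level manage_user_agent() helper in content_fetchers/base.py behaves, per the diff above. The user-agent strings below are made-up examples for illustration, not values taken from this PR.

```python
from changedetectionio.content_fetchers.base import manage_user_agent

# A custom "User-Agent" request header always wins, regardless of key case.
assert manage_user_agent(headers={'User-Agent': 'MyCustomAgent'}) == 'MyCustomAgent'

# No custom header: the browser's reported UA is de-headless-ified (example UA string only).
assert manage_user_agent(headers={}, current_ua='Mozilla/5.0 HeadlessChrome/120.0') == 'Mozilla/5.0 Chrome/120.0'

# Nothing to go on: returns None, so Playwright/Puppeteer keep their default user-agent.
assert manage_user_agent(headers={}) is None
```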