more work on fixing headers

10 months ago · c95561edfb
parent c9a9ed2da8
commit c95561edfb
8 changed files with 108 additions and 69 deletions
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@ -6,6 +6,8 @@ import re
 from random import randint
 from loguru import logger
 from changedetectionio.content_fetchers.base import manage_user_agent
 # Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
 # 0- off, 1- on
 browser_step_ui_config = {'Choose one': '0 0',
@ -206,21 +208,18 @@ class browsersteps_live_ui(steppable_browser_interface):
        keep_open = 1000 * 60 * 5
        now = time.time()
        # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
        from changedetectionio.content_fetchers import manage_user_agent
        manage_user_agent(headers=self.headers)
        # @todo handle multiple contexts, bind a unique id from the browser on each req?
        self.context = self.playwright_browser.new_context(
-            accept_downloads=False, # Should never be needed
+            accept_downloads=False,  # Should never be needed
-            bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+            bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
            extra_http_headers=self.headers,
            ignore_https_errors=True,
            proxy=proxy,
-            service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+            service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
-            user_agent=manage_user_agent(headers=self.headers)
+            # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
            user_agent=manage_user_agent(headers=self.headers),
        )
        if self.headers:
            self.context.set_extra_http_headers(self.headers)
        self.page = self.context.new_page()
--- a/changedetectionio/content_fetchers/init.py
+++ b/changedetectionio/content_fetchers/init.py
@ -29,7 +29,8 @@ def available_fetchers():
 # rather than site-specific.
 use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
 if use_playwright_as_chrome_fetcher:
-    if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
+    # @note - For now, browser steps always uses playwright
    if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')) or False:
        logger.debug('Using Playwright library as fetcher')
        from .playwright import fetcher as html_webdriver
    else:
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@ -5,6 +5,40 @@ from loguru import logger
 from changedetectionio.content_fetchers import BrowserStepsStepException
 def manage_user_agent(headers, current_ua=''):
    """
    Choose the User-Agent string that should be applied to a browser session.

    A User-Agent supplied in the custom request headers always wins. Otherwise, if the
    browser's current User-Agent is known, its 'HeadlessChrome' marker is rewritten to 'Chrome'.

    NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
    THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
    This does not take care of
    - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
    - TCP/IP fingerprint JA3 etc
    - Graphic rendering fingerprinting
    - Your IP being obviously in a pool of bad actors
    - Too many requests
    - Scraping of Sec-CH-UA (client hints) browser replies (thanks google!!)
    - Scraping of ServiceWorker, new window calls etc
    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth

    :param headers: Mapping of custom request headers; may be None or empty. The 'user-agent' key is matched case-insensitively.
    :param current_ua: User-Agent string currently reported by the browser, or '' when unknown.
    :return: The custom User-Agent if one is set, else the de-headlessed current_ua, else None.
    """
    # Callers may hand over None when no custom headers are configured - treat it like an empty mapping
    custom_headers = headers or {}

    # An explicitly configured User-Agent header always takes priority
    ua_in_custom_headers = next((v for k, v in custom_headers.items() if k.lower() == "user-agent"), None)
    if ua_in_custom_headers:
        return ua_in_custom_headers

    # Otherwise, if its obviously HeadlessChrome, switch it to look like regular Chrome
    if current_ua:
        return current_ua.replace('HeadlessChrome', 'Chrome')

    # Nothing to apply, let the browser keep its own default
    return None
 class Fetcher():
    browser_connection_is_custom = None
    browser_connection_url = None
@ -76,39 +110,6 @@ class Fetcher():
        """
        return {k.lower(): v for k, v in self.headers.items()}
    def manage_user_agent(self, headers, current_ua=''):
        """
        Choose the User-Agent string that should be applied to a browser session.

        A User-Agent supplied in the custom request headers always wins. Otherwise, if the
        browser's current User-Agent is known, its 'HeadlessChrome' marker is rewritten to 'Chrome'.

        NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
        THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
        This does not take care of
        - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
        - TCP/IP fingerprint JA3 etc
        - Graphic rendering fingerprinting
        - Your IP being obviously in a pool of bad actors
        - Too many requests
        - Scraping of Sec-CH-UA (client hints) browser replies (thanks google!!)
        - Scraping of ServiceWorker, new window calls etc
        See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
        Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth

        :param headers: Mapping of custom request headers; may be None or empty. The 'user-agent' key is matched case-insensitively.
        :param current_ua: User-Agent string currently reported by the browser, or '' when unknown.
        :return: The custom User-Agent if one is set, else the de-headlessed current_ua, else None.
        """
        # Callers may hand over None when no custom headers are configured - treat it like an empty mapping
        custom_headers = headers or {}

        # An explicitly configured User-Agent header always takes priority
        ua_in_custom_headers = next((v for k, v in custom_headers.items() if k.lower() == "user-agent"), None)
        if ua_in_custom_headers:
            return ua_in_custom_headers

        # Otherwise, if its obviously HeadlessChrome, switch it to look like regular Chrome.
        # (The marker was previously misspelled 'HeadlesssChrome', so it never matched.)
        if current_ua:
            return current_ua.replace('HeadlessChrome', 'Chrome')

        # Nothing to apply, let the browser keep its own default
        return None
    def browser_steps_get_valid_steps(self):
        if self.browser_steps is not None and len(self.browser_steps):
            valid_steps = filter(
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
 class fetcher(Fetcher):
@ -103,18 +103,17 @@ class fetcher(Fetcher):
            # Set user agent to prevent Cloudflare from blocking the browser
            # Use the default one configured in the App.py model that's passed from fetch_site_status.py
            context = browser.new_context(
-                accept_downloads=False, # Should never be needed
+                accept_downloads=False,  # Should never be needed
                bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
                extra_http_headers=request_headers,
                ignore_https_errors=True,
                proxy=self.proxy,
                service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
-                user_agent = self.manage_user_agent(headers=request_headers),
+                user_agent=manage_user_agent(headers=request_headers),
            )
            self.page = context.new_page()
            if len(request_headers):
                context.set_extra_http_headers(request_headers)
            # Listen for all console events and handle errors
            self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@ -6,7 +6,7 @@ from urllib.parse import urlparse
 from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
@ -101,7 +101,7 @@ class fetcher(Fetcher):
        else:
            self.page = await browser.newPage()
-        await self.page.setUserAgent(self.manage_user_agent(headers=request_headers, current_ua=self.page.evaluate('navigator.userAgent')))
+        await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))
        await self.page.setBypassCSP(True)
        if request_headers:
--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@ -29,7 +29,8 @@ def test_fetch_pdf(client, live_server):
        follow_redirects=True
    )
-    assert b'PDF-1.5' not in res.data
+    # PDF header should not be there (it was converted to text)
    assert b'PDF' not in res.data[:10]
    assert b'hello world' in res.data
    # So we know if the file changes in other ways
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@ -242,5 +242,29 @@ def live_server_setup(live_server):
            resp.headers['Content-Type'] = 'application/pdf'
            return resp
    @live_server.app.route('/test-interactive-html-endpoint')
    def test_interactive_html_endpoint():
        """
        Serve a small interactive HTML page used by the browser-steps / visual selector tests.

        The page has a button that removes the '#remove' paragraph and writes a base64-decoded
        message into '#some-content', and it echoes every received request header (lower-cased)
        inside a <pre> so tests can assert which headers the fetcher actually sent.

        :return: A 200 'text/html' response
        """
        # One "name: value<br>" entry per request header received by this endpoint
        header_text = "".join(f"{k}: {v}<br>" for k, v in request.headers.items())

        resp = make_response(f"""
        <html>
          <body>
          Primitive JS check for <pre>changedetectionio/tests/visualselector/test_fetch_data.py</pre>
            <p id="remove">This text should be removed</p>
              <form onsubmit="event.preventDefault();">
            <!-- obfuscated text so that we dont accidentally get a false positive due to conversion of the source :) --->
                <button name="test-button" onclick="getElementById('remove').remove();getElementById('some-content').innerHTML = atob('SSBzbWVsbCBKYXZhU2NyaXB0IGJlY2F1c2UgdGhlIGJ1dHRvbiB3YXMgcHJlc3NlZCE=')">Click here</button>
                <div id=some-content></div>
                <pre>
                {header_text.lower()}
                </pre>
              </body>
         </html>""", 200)
        resp.headers['Content-Type'] = 'text/html'
        return resp
    live_server.start()
--- a/changedetectionio/tests/visualselector/test_fetch_data.py
+++ b/changedetectionio/tests/visualselector/test_fetch_data.py
@ -15,7 +15,9 @@ def test_visual_selector_content_ready(client, live_server):
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
    # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url
-    test_url = "https://changedetection.io/ci-test/test-runjs.html"
+    test_url = url_for('test_interactive_html_endpoint', _external=True)
    test_url = test_url.replace('localhost.localdomain', 'cdio')
    test_url = test_url.replace('localhost', 'cdio')
    res = client.post(
        url_for("form_quick_watch_add"),
@ -37,7 +39,9 @@ def test_visual_selector_content_ready(client, live_server):
    )
    assert b"unpaused" in res.data
    wait_for_all_checks(client)
    uuid = extract_UUID_from_client(client)
    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
    # Check the JS execute code before extract worked
    res = client.get(
@ -74,11 +78,13 @@ def test_visual_selector_content_ready(client, live_server):
 def test_basic_browserstep(client, live_server):
    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
    #live_server_setup(live_server)
-    # Add our URL to the import page, because the docker container (playwright/selenium) wont be able to connect to our usual test url
+    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
-    test_url = "https://changedetection.io/ci-test/test-runjs.html"
+
    test_url = url_for('test_interactive_html_endpoint', _external=True)
    test_url = test_url.replace('localhost.localdomain', 'cdio')
    test_url = test_url.replace('localhost', 'cdio')
    res = client.post(
        url_for("form_quick_watch_add"),
@ -90,14 +96,15 @@ def test_basic_browserstep(client, live_server):
    res = client.post(
        url_for("edit_page", uuid="first", unpause_on_save=1),
        data={
-              "url": test_url,
+            "url": test_url,
-              "tags": "",
+            "tags": "",
-              'fetch_backend': "html_webdriver",
+            'fetch_backend': "html_webdriver",
-              'browser_steps-0-operation': 'Goto site',
+            'browser_steps-0-operation': 'Goto site',
-              'browser_steps-1-operation': 'Click element',
+            'browser_steps-1-operation': 'Click element',
-              'browser_steps-1-selector': 'button[name=test-button]',
+            'browser_steps-1-selector': 'button[name=test-button]',
-              'browser_steps-1-optional_value': '',
+            'browser_steps-1-optional_value': '',
-              'headers': "cOoKiE: notice-apa=1; test-value=1; "
+            # For now, cookies don't work in headers because they must be passed as a full cookiejar object
            'headers': "testheader: yes\buser-agent: MyCustomAgent",
        },
        follow_redirects=True
    )
@ -105,6 +112,9 @@ def test_basic_browserstep(client, live_server):
    wait_for_all_checks(client)
    uuid = extract_UUID_from_client(client)
    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
    assert b"This text should be removed" not in res.data
    # Check HTML conversion was detected and worked
    res = client.get(
@ -113,16 +123,20 @@ def test_basic_browserstep(client, live_server):
    )
    assert b"This text should be removed" not in res.data
    assert b"I smell JavaScript because the button was pressed" in res.data
-    # The JS on the page will set this if the cookie (and thus headers) was handled
+
-    assert b"test-value in headers found" in res.data
+    assert b"testheader: yes" in res.data
    assert b"user-agent: mycustomagent" in res.data
    four_o_four_url =  url_for('test_endpoint', status_code=404, _external=True)
    four_o_four_url = four_o_four_url.replace('localhost.localdomain', 'cdio')
    four_o_four_url = four_o_four_url.replace('localhost', 'cdio')
    # now test for 404 errors
    res = client.post(
        url_for("edit_page", uuid=uuid, unpause_on_save=1),
        data={
-              "url": "https://changedetection.io/404",
+              "url": four_o_four_url,
              "tags": "",
              "headers": "",
              'fetch_backend': "html_webdriver",
              'browser_steps-0-operation': 'Goto site',
              'browser_steps-1-operation': 'Click element',