tidy-up

10 months ago · 903fc14960
parent 78b99aa2cd
commit 903fc14960
5 changed files with 52 additions and 51 deletions
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -206,24 +206,23 @@ class browsersteps_live_ui(steppable_browser_interface):
        keep_open = 1000 * 60 * 5
        now = time.time()

+        # Ask it what the user agent is; if it's obviously HeadlessChrome, switch it to the default
+        from changedetectionio.content_fetchers import manage_user_agent
+        manage_user_agent(headers=self.headers)
+
        # @todo handle multiple contexts, bind a unique id from the browser on each req?
        self.context = self.playwright_browser.new_context(
-            # This is needed to enable JavaScript execution on GitHub and others
-            bypass_csp=True,
-            # Should never be needed
-            accept_downloads=False,
-            proxy=proxy
+            accept_downloads=False, # Should never be needed
+            bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+            proxy=proxy,
+            service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+            user_agent=manage_user_agent(headers=self.headers)
        )

-        self.page = self.context.new_page()
-
-        # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
-        from changedetectionio.content_fetchers.playwright import manage_user_agent
-        manage_user_agent(page=self.page, headers=self.headers)
-
        if self.headers:
            self.context.set_extra_http_headers(self.headers)

+        self.page = self.context.new_page()

        # self.page.set_default_navigation_timeout(keep_open)
        self.page.set_default_timeout(keep_open)
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -39,3 +39,4 @@ if use_playwright_as_chrome_fetcher:
 else:
    logger.debug("Falling back to selenium as fetcher")
    from .webdriver_selenium import fetcher as html_webdriver
+
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -76,6 +76,39 @@ class Fetcher():
        """
        return {k.lower(): v for k, v in self.headers.items()}

+    def manage_user_agent(self, headers, current_ua=''):
+        """
+        Basic setting of user-agent
+
+        NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
+        THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
+        This does not take care of
+        - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
+        - TCP/IP fingerprint JA3 etc
+        - Graphic rendering fingerprinting
+        - Your IP being obviously in a pool of bad actors
+        - Too many requests
+        - Scraping of Sec-CH-UA browser replies (thanks google!!)
+        - Scraping of ServiceWorker, new window calls etc
+
+        See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
+        Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
+
+        :param headers: dict of custom request headers (may be None or empty); a non-empty user-agent set here always wins
+        :param current_ua: user-agent string reported by the browser, if known; 'HeadlessChrome' is rewritten to 'Chrome'
+        :return: the user-agent string to use, or None when there is nothing to set
+        """
+        # A user-agent in the custom headers wins - hand back its VALUE, not the name of the header
+        ua_in_custom_headers = next((v for k, v in (headers or {}).items() if k.lower() == "user-agent"), None)
+        if ua_in_custom_headers:
+            return ua_in_custom_headers
+
+        # Otherwise, if its obviously HeadlessChrome, switch it to look like the default Chrome
+        if current_ua:
+            return current_ua.replace('HeadlessChrome', 'Chrome')
+
+        return None
+
    def browser_steps_get_valid_steps(self):
        if self.browser_steps is not None and len(self.browser_steps):
            valid_steps = filter(
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -3,41 +3,10 @@ import os
 from urllib.parse import urlparse

 from loguru import logger
+
 from changedetectionio.content_fetchers.base import Fetcher
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable

-
-def manage_user_agent(page, headers):
-    """
-    Basic setting of user-agent
-
-    NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
-    THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
-    This does not take care of
-    - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
-    - TCP/IP fingerprint JA3 etc
-    - Graphic rendering fingerprinting
-    - Your IP being obviously in a pool of bad actors
-    - Too many requests
-    - Scraping of SCH-UA browser replies (thanks google!!)
-    - Scraping of ServiceWorker, new window calls etc
-
-    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
-    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
-
-    :param page:
-    :param headers:
-    :return:
-    """
-    # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
-    ua_in_custom_headers = next((k for k in headers.keys() if k.lower() == "user-agent"), None)
-    if not ua_in_custom_headers:
-        current_ua = page.evaluate('navigator.userAgent').replace('HeadlesssChrome', 'Chrome')
-        page.set_user_agent(current_ua)
-
-
-
-
 class fetcher(Fetcher):
    fetcher_description = "Playwright {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@@ -134,17 +103,14 @@ class fetcher(Fetcher):
            # Set user agent to prevent Cloudflare from blocking the browser
            # Use the default one configured in the App.py model that's passed from fetch_site_status.py
            context = browser.new_context(
+                accept_downloads=False, # Should never be needed
+                bypass_csp=True,  # This is needed to enable JavaScript execution on GitHub and others
                proxy=self.proxy,
-                # This is needed to enable JavaScript execution on GitHub and others
-                bypass_csp=True,
-                # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
-                service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
-                # Should never be needed
-                accept_downloads=False
+                service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+                user_agent = self.manage_user_agent(headers=request_headers),
            )

            self.page = context.new_page()
-            manage_user_agent(page=self.page, headers=request_headers)

            if len(request_headers):
                context.set_extra_http_headers(request_headers)
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -5,6 +5,7 @@ import websockets.exceptions
 from urllib.parse import urlparse

 from loguru import logger
+
 from changedetectionio.content_fetchers.base import Fetcher
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError

@@ -100,10 +101,11 @@ class fetcher(Fetcher):
        else:
            self.page = await browser.newPage()

+        await self.page.setUserAgent(self.manage_user_agent(headers=request_headers, current_ua=self.page.evaluate('navigator.userAgent')))
+
        await self.page.setBypassCSP(True)
        if request_headers:
            await self.page.setExtraHTTPHeaders(request_headers)
-            # @todo check user-agent worked

        # SOCKS5 with authentication is not supported (yet)
        # https://github.com/microsoft/playwright/issues/10567