From 78b99aa2cd0ef7d73e4251314e0ba9d2871da3a9 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Fri, 23 Feb 2024 14:20:16 +0100
Subject: [PATCH] Re #2197 fixing headers and user-agent

---
 .../blueprint/browser_steps/browser_steps.py  | 15 +++++---
 .../blueprint/browser_steps/nonContext.py     |  1 -
 .../content_fetchers/__init__.py              |  5 ++-
 .../content_fetchers/playwright.py            | 35 ++++++++++++++++++-
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
index 22710e99..e5b3f8b6 100644
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -178,6 +178,7 @@ class browsersteps_live_ui(steppable_browser_interface):
     stale = False
     # bump and kill this if idle after X sec
     age_start = 0
+    headers = {}
 
     # use a special driver, maybe locally etc
     command_executor = os.getenv(
@@ -192,7 +193,8 @@ class browsersteps_live_ui(steppable_browser_interface):
 
     browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
 
-    def __init__(self, playwright_browser, proxy=None):
+    def __init__(self, playwright_browser, proxy=None, headers=None):
+        self.headers = headers or {}
         self.age_start = time.time()
         self.playwright_browser = playwright_browser
         if self.context is None:
@@ -206,9 +208,6 @@ class browsersteps_live_ui(steppable_browser_interface):
 
             # @todo handle multiple contexts, bind a unique id from the browser on each req?
             self.context = self.playwright_browser.new_context(
-                # @todo
-                #                user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
-                #               proxy=self.proxy,
                 # This is needed to enable JavaScript execution on GitHub and others
                 bypass_csp=True,
                 # Should never be needed
@@ -218,6 +217,14 @@ class browsersteps_live_ui(steppable_browser_interface):
 
         self.page = self.context.new_page()
 
+        # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
+        from changedetectionio.content_fetchers.playwright import manage_user_agent
+        manage_user_agent(page=self.page, headers=self.headers)
+
+        if self.headers:
+            self.context.set_extra_http_headers(self.headers)
+
+
         # self.page.set_default_navigation_timeout(keep_open)
         self.page.set_default_timeout(keep_open)
         # @todo probably this doesnt work
diff --git a/changedetectionio/blueprint/browser_steps/nonContext.py b/changedetectionio/blueprint/browser_steps/nonContext.py
index 5345f306..93abe269 100644
--- a/changedetectionio/blueprint/browser_steps/nonContext.py
+++ b/changedetectionio/blueprint/browser_steps/nonContext.py
@@ -1,5 +1,4 @@
 from playwright.sync_api import PlaywrightContextManager
-import asyncio
 
 # So playwright wants to run as a context manager, but we do something horrible and hacky
 # we are holding the session open for as long as possible, then shutting it down, and opening a new one
diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py
index d54b9bb2..6fc9c53e 100644
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -1,6 +1,6 @@
 import sys
 from distutils.util import strtobool
-
+from loguru import logger
 from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
 import os
 
@@ -30,9 +30,12 @@ def available_fetchers():
 use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
 if use_playwright_as_chrome_fetcher:
     if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
+        logger.debug('Using Playwright library as fetcher')
         from .playwright import fetcher as html_webdriver
     else:
+        logger.debug('Using direct Python Puppeteer library as fetcher')
         from .puppeteer import fetcher as html_webdriver
 
 else:
+    logger.debug("Falling back to selenium as fetcher")
     from .webdriver_selenium import fetcher as html_webdriver
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index 7faa2032..7a8560d9 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -6,6 +6,38 @@ from loguru import logger
 from changedetectionio.content_fetchers.base import Fetcher
 from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
 
+
+def manage_user_agent(page, headers):
+    """
+    Basic setting of user-agent
+
+    NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
+    THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
+    This does not take care of
+    - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
+    - TCP/IP fingerprint JA3 etc
+    - Graphic rendering fingerprinting
+    - Your IP being obviously in a pool of bad actors
+    - Too many requests
+    - Scraping of Sec-CH-UA browser replies (thanks google!!)
+    - Scraping of ServiceWorker, new window calls etc
+
+    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
+    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
+
+    :param page: Playwright page; its navigator.userAgent is read and a cleaned User-Agent header is set on it
+    :param headers: dict of custom request headers; if it has a User-Agent key (any letter case) nothing is changed
+    :return: None (only the HTTP User-Agent header is overridden, navigator.userAgent itself is left as-is)
+    """
+    # No custom User-Agent supplied: take the browser's own UA, drop the 'Headless' marker and send that instead
+    ua_in_custom_headers = next((k for k in headers.keys() if k.lower() == "user-agent"), None)
+    if not ua_in_custom_headers:
+        current_ua = page.evaluate('navigator.userAgent').replace('HeadlessChrome', 'Chrome')
+        page.set_extra_http_headers({'User-Agent': current_ua})
+
+
+
+
 class fetcher(Fetcher):
     fetcher_description = "Playwright {}/Javascript".format(
         os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@@ -102,7 +134,6 @@ class fetcher(Fetcher):
             # Set user agent to prevent Cloudflare from blocking the browser
             # Use the default one configured in the App.py model that's passed from fetch_site_status.py
             context = browser.new_context(
-                user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
                 proxy=self.proxy,
                 # This is needed to enable JavaScript execution on GitHub and others
                 bypass_csp=True,
@@ -113,6 +144,8 @@ class fetcher(Fetcher):
             )
 
             self.page = context.new_page()
+            manage_user_agent(page=self.page, headers=request_headers)
+
             if len(request_headers):
                 context.set_extra_http_headers(request_headers)
 