Re #2197 fixing headers and user-agent

2197-browsersteps-headers
dgtlmoon 10 months ago
parent 3d390b6ea4
commit 78b99aa2cd

@ -178,6 +178,7 @@ class browsersteps_live_ui(steppable_browser_interface):
stale = False
# bump and kill this if idle after X sec
age_start = 0
headers = {}
# use a special driver, maybe locally etc
command_executor = os.getenv(
@ -192,7 +193,8 @@ class browsersteps_live_ui(steppable_browser_interface):
browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
def __init__(self, playwright_browser, proxy=None):
def __init__(self, playwright_browser, proxy=None, headers=None):
self.headers = headers or {}
self.age_start = time.time()
self.playwright_browser = playwright_browser
if self.context is None:
@ -206,9 +208,6 @@ class browsersteps_live_ui(steppable_browser_interface):
# @todo handle multiple contexts, bind a unique id from the browser on each req?
self.context = self.playwright_browser.new_context(
# @todo
# user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
# proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
# Should never be needed
@ -218,6 +217,14 @@ class browsersteps_live_ui(steppable_browser_interface):
self.page = self.context.new_page()
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
from changedetectionio.content_fetchers.playwright import manage_user_agent
manage_user_agent(page=self.page, headers=self.headers)
if self.headers:
self.context.set_extra_http_headers(self.headers)
# self.page.set_default_navigation_timeout(keep_open)
self.page.set_default_timeout(keep_open)
# @todo probably this doesnt work

@ -1,5 +1,4 @@
from playwright.sync_api import PlaywrightContextManager
import asyncio
# So playwright wants to run as a context manager, but we do something horrible and hacky
# we are holding the session open for as long as possible, then shutting it down, and opening a new one

@ -1,6 +1,6 @@
import sys
from distutils.util import strtobool
from loguru import logger
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
import os
@ -30,9 +30,12 @@ def available_fetchers():
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
logger.debug('Using Playwright library as fetcher')
from .playwright import fetcher as html_webdriver
else:
logger.debug('Using direct Python Puppeteer library as fetcher')
from .puppeteer import fetcher as html_webdriver
else:
logger.debug("Falling back to selenium as fetcher")
from .webdriver_selenium import fetcher as html_webdriver

@ -6,6 +6,38 @@ from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
def manage_user_agent(page, headers):
"""
Basic setting of user-agent
NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
This does not take care of
- Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
- TCP/IP fingerprint JA3 etc
- Graphic rendering fingerprinting
- Your IP being obviously in a pool of bad actors
- Too many requests
- Scraping of SCH-UA browser replies (thanks google!!)
- Scraping of ServiceWorker, new window calls etc
See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
:param page:
:param headers:
:return:
"""
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
ua_in_custom_headers = next((k for k in headers.keys() if k.lower() == "user-agent"), None)
if not ua_in_custom_headers:
current_ua = page.evaluate('navigator.userAgent').replace('HeadlesssChrome', 'Chrome')
page.set_user_agent(current_ua)
class fetcher(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@ -102,7 +134,6 @@ class fetcher(Fetcher):
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
@ -113,6 +144,8 @@ class fetcher(Fetcher):
)
self.page = context.new_page()
manage_user_agent(page=self.page, headers=request_headers)
if len(request_headers):
context.set_extra_http_headers(request_headers)

Loading…
Cancel
Save