pull/2212/head
dgtlmoon 8 months ago
parent 78b99aa2cd
commit 903fc14960

@ -206,24 +206,23 @@ class browsersteps_live_ui(steppable_browser_interface):
keep_open = 1000 * 60 * 5
now = time.time()
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
from changedetectionio.content_fetchers import manage_user_agent
manage_user_agent(headers=self.headers)
# @todo handle multiple contexts, bind a unique id from the browser on each req?
self.context = self.playwright_browser.new_context(
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
# Should never be needed
accept_downloads=False,
proxy=proxy
accept_downloads=False, # Should never be needed
bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
proxy=proxy,
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
user_agent=manage_user_agent(headers=self.headers)
)
self.page = self.context.new_page()
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
from changedetectionio.content_fetchers.playwright import manage_user_agent
manage_user_agent(page=self.page, headers=self.headers)
if self.headers:
self.context.set_extra_http_headers(self.headers)
self.page = self.context.new_page()
# self.page.set_default_navigation_timeout(keep_open)
self.page.set_default_timeout(keep_open)

@ -39,3 +39,4 @@ if use_playwright_as_chrome_fetcher:
else:
logger.debug("Falling back to selenium as fetcher")
from .webdriver_selenium import fetcher as html_webdriver

@ -76,6 +76,39 @@ class Fetcher():
"""
return {k.lower(): v for k, v in self.headers.items()}
def manage_user_agent(self, headers, current_ua=''):
"""
Basic setting of user-agent
NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
This does not take care of
- Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
- TCP/IP fingerprint JA3 etc
- Graphic rendering fingerprinting
- Your IP being obviously in a pool of bad actors
- Too many requests
- Scraping of SCH-UA browser replies (thanks google!!)
- Scraping of ServiceWorker, new window calls etc
See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
:param page:
:param headers:
:return:
"""
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
ua_in_custom_headers = next((k for k in headers.keys() if k.lower() == "user-agent"), None)
if ua_in_custom_headers:
return ua_in_custom_headers
if not ua_in_custom_headers and current_ua:
current_ua = current_ua.replace('HeadlesssChrome', 'Chrome')
return current_ua
return None
def browser_steps_get_valid_steps(self):
if self.browser_steps is not None and len(self.browser_steps):
valid_steps = filter(

@ -3,41 +3,10 @@ import os
from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
def manage_user_agent(page, headers):
"""
Basic setting of user-agent
NOTE!!!!!! The service that does the actual Chrome fetching should handle any anti-robot techniques
THERE ARE MANY WAYS THAT IT CAN BE DETECTED AS A ROBOT!!
This does not take care of
- Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
- TCP/IP fingerprint JA3 etc
- Graphic rendering fingerprinting
- Your IP being obviously in a pool of bad actors
- Too many requests
- Scraping of SCH-UA browser replies (thanks google!!)
- Scraping of ServiceWorker, new window calls etc
See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
:param page:
:param headers:
:return:
"""
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
ua_in_custom_headers = next((k for k in headers.keys() if k.lower() == "user-agent"), None)
if not ua_in_custom_headers:
current_ua = page.evaluate('navigator.userAgent').replace('HeadlesssChrome', 'Chrome')
page.set_user_agent(current_ua)
class fetcher(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@ -134,17 +103,14 @@ class fetcher(Fetcher):
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
accept_downloads=False, # Should never be needed
bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True,
# Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
# Should never be needed
accept_downloads=False
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
user_agent = self.manage_user_agent(headers=request_headers),
)
self.page = context.new_page()
manage_user_agent(page=self.page, headers=request_headers)
if len(request_headers):
context.set_extra_http_headers(request_headers)

@ -5,6 +5,7 @@ import websockets.exceptions
from urllib.parse import urlparse
from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
@ -100,10 +101,11 @@ class fetcher(Fetcher):
else:
self.page = await browser.newPage()
await self.page.setUserAgent(self.manage_user_agent(headers=request_headers, current_ua=self.page.evaluate('navigator.userAgent')))
await self.page.setBypassCSP(True)
if request_headers:
await self.page.setExtraHTTPHeaders(request_headers)
# @todo check user-agent worked
# SOCKS5 with authentication is not supported (yet)
# https://github.com/microsoft/playwright/issues/10567

Loading…
Cancel
Save