From 669fd3ae0b8ac11da9b2d4cc7461ee92bd728ec9 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 9 Oct 2022 18:25:36 +0200 Subject: [PATCH] Don't use default Requests `user-agent` and `accept` headers in playwright+selenium requests, breaks sites such as united.com. (#1004) --- changedetectionio/content_fetcher.py | 5 +++++ changedetectionio/model/App.py | 4 ---- changedetectionio/store.py | 8 ++++++++ requirements.txt | 5 ++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 6742f01c..416ed6df 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -575,6 +575,11 @@ class html_requests(Fetcher): ignore_status_codes=False, current_css_filter=None): + # Make requests use a more modern looking user-agent + if not 'User-Agent' in request_headers: + request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') + proxies = {} # Allows override the proxy on a per-request basis diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index c5f0e977..daedde1b 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -13,10 +13,6 @@ class model(dict): 'watching': {}, 'settings': { 'headers': { - 'User-Agent': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'Accept-Encoding': 'gzip, deflate', # No support for brolti in python requests yet. 
- 'Accept-Language': 'en-GB,en-US;q=0.9,en;' }, 'requests': { 'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds diff --git a/changedetectionio/store.py b/changedetectionio/store.py index a0326e41..bd86039a 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -575,3 +575,11 @@ class ChangeDetectionStore: continue return + + # We incorrectly used common header overrides that should only apply to Requests + # These are now handled in content_fetcher::html_requests and shouldn't be passed to Playwright/Selenium + def update_7(self): + # These were hard-coded in early versions + for v in ['User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language']: + if self.data['settings']['headers'].get(v): + del self.data['settings']['headers'][v] diff --git a/requirements.txt b/requirements.txt index 26c53131..68aabe9a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,10 @@ flask_restful pytz # Set these versions together to avoid a RequestsDependencyWarning -requests[socks] ~= 2.26 +# >= 2.26 also adds Brotli support if brotli is installed +brotli ~= 1.0 +requests[socks] ~= 2.28 + urllib3 > 1.26 chardet > 2.3.0