Fetching/Requests - Fixing user agent header overrides per-watch of global settings (#2409)

2394-rss-access-settings
dgtlmoon 5 months ago committed by GitHub
parent de48892243
commit c6ee6687b5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -28,7 +28,7 @@ def manage_user_agent(headers, current_ua=''):
:return: :return:
""" """
# Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default # Ask it what the user agent is, if its obviously ChromeHeadless, switch it to the default
ua_in_custom_headers = next((v for k, v in headers.items() if k.lower() == "user-agent"), None) ua_in_custom_headers = headers.get('User-Agent')
if ua_in_custom_headers: if ua_in_custom_headers:
return ua_in_custom_headers return ua_in_custom_headers

@ -115,12 +115,11 @@ class fetcher(Fetcher):
# This user agent is similar to what was used when tweaking the evasions in inject_evasions_into_page(..) # This user agent is similar to what was used when tweaking the evasions in inject_evasions_into_page(..)
user_agent = None user_agent = None
if request_headers: if request_headers and request_headers.get('User-Agent'):
user_agent = next((value for key, value in request_headers.items() if key.lower().strip() == 'user-agent'), None) # Request_headers should now be CaaseInsensitiveDict
if user_agent: # Remove it so it's not sent again with headers after
await self.page.setUserAgent(user_agent) user_agent = request_headers.pop('User-Agent').strip()
# Remove it so it's not sent again with headers after await self.page.setUserAgent(user_agent)
[request_headers.pop(key) for key in list(request_headers) if key.lower().strip() == 'user-agent'.lower().strip()]
if not user_agent: if not user_agent:
# Attempt to strip 'HeadlessChrome' etc # Attempt to strip 'HeadlessChrome' etc

@ -1,10 +1,10 @@
from abc import abstractmethod from abc import abstractmethod
import os
import hashlib
import re
from copy import deepcopy
from changedetectionio.strtobool import strtobool from changedetectionio.strtobool import strtobool
from copy import deepcopy
from loguru import logger from loguru import logger
import hashlib
import os
import re
class difference_detection_processor(): class difference_detection_processor():
@ -21,7 +21,7 @@ class difference_detection_processor():
self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
def call_browser(self): def call_browser(self):
from requests.structures import CaseInsensitiveDict
# Protect against file:// access # Protect against file:// access
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
@ -93,14 +93,16 @@ class difference_detection_processor():
self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid')) self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
# Tweak the base config with the per-watch ones # Tweak the base config with the per-watch ones
request_headers = self.watch.get('headers', []) request_headers = CaseInsensitiveDict()
request_headers.update(self.datastore.get_all_base_headers())
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))
ua = self.datastore.data['settings']['requests'].get('default_ua') ua = self.datastore.data['settings']['requests'].get('default_ua')
if ua and ua.get(prefer_fetch_backend): if ua and ua.get(prefer_fetch_backend):
request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)}) request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)})
request_headers.update(self.watch.get('headers', {}))
request_headers.update(self.datastore.get_all_base_headers())
request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))
# https://github.com/psf/requests/issues/4525 # https://github.com/psf/requests/issues/4525
# Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
# do this by accident. # do this by accident.

@ -253,6 +253,62 @@ def test_method_in_request(client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data assert b'Deleted' in res.data
# Re #2408 - user-agent override test, also should handle case-insensitive header deduplication
def test_ua_global_override(client, live_server):
# live_server_setup(live_server)
test_url = url_for('test_headers', _external=True)
res = client.post(
url_for("settings_page"),
data={
"application-fetch_backend": "html_requests",
"application-minutes_between_check": 180,
"requests-default_ua-html_requests": "html-requests-user-agent"
},
follow_redirects=True
)
assert b'Settings updated' in res.data
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b"html-requests-user-agent" in res.data
# default user-agent should have shown by now
# now add a custom one in the headers
# Add some headers to a request
res = client.post(
url_for("edit_page", uuid="first"),
data={
"url": test_url,
"tags": "testtag",
"fetch_backend": 'html_requests',
# Important - also test case-insensitive
"headers": "User-AGent: agent-from-watch"},
follow_redirects=True
)
assert b"Updated watch." in res.data
wait_for_all_checks(client)
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b"agent-from-watch" in res.data
assert b"html-requests-user-agent" not in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_headers_textfile_in_request(client, live_server): def test_headers_textfile_in_request(client, live_server):
#live_server_setup(live_server) #live_server_setup(live_server)
# Add our URL to the import page # Add our URL to the import page
@ -333,7 +389,7 @@ def test_headers_textfile_in_request(client, live_server):
# Not needed anymore # Not needed anymore
os.unlink('test-datastore/headers.txt') os.unlink('test-datastore/headers.txt')
os.unlink('test-datastore/headers-testtag.txt') os.unlink('test-datastore/headers-testtag.txt')
os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt')
# The service should echo back the request verb # The service should echo back the request verb
res = client.get( res = client.get(
url_for("preview_page", uuid="first"), url_for("preview_page", uuid="first"),

Loading…
Cancel
Save