From 9e954532d602c61a07b9f6d4d2453b997355cf57 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Mon, 22 May 2023 17:19:52 +0200
Subject: [PATCH] Fetcher - Ability to specify headers from a textfile per
 watch, global or per tag (
 https://github.com/dgtlmoon/changedetection.io/wiki/Adding-headers-from-an-external-file
 )

---
 changedetectionio/__init__.py                 |  2 +
 changedetectionio/model/App.py                | 12 ++++
 changedetectionio/model/Watch.py              | 34 +++++++++
 .../processors/text_json_diff.py              |  5 +-
 changedetectionio/store.py                    | 23 +++++-
 changedetectionio/templates/edit.html         |  9 +++
 changedetectionio/tests/conftest.py           | 17 +++--
 changedetectionio/tests/test_request.py       | 72 ++++++++++++++++++-
 8 files changed, 160 insertions(+), 14 deletions(-)

diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 8b3e8e39..67d2f0be 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -700,6 +700,7 @@ def changedetection_app(config=None, datastore_o=None):
                                      form=form,
                                      has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
                                      has_empty_checktime=using_default_check_time,
+                                     has_extra_headers_file=watch.has_extra_headers_file or datastore.has_extra_headers_file,
                                      is_html_webdriver=is_html_webdriver,
                                      jq_support=jq_support,
                                      playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False),
@@ -1444,6 +1445,7 @@ def check_for_new_version():
         # Check daily
         app.config.exit.wait(86400)
 
+
 def notification_runner():
     global notification_debug_log
     from datetime import datetime
diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py
index 7c7cac9f..54580b3d 100644
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -49,3 +49,15 @@ class model(dict):
     def __init__(self, *arg, **kw):
         super(model, self).__init__(*arg, **kw)
         self.update(self.base_config)
+
+
+def parse_headers_from_text_file(filepath):
+    """Return a dict of the 'Name: value' lines in filepath; '#' comment lines and lines without ':' are skipped."""
+    headers = {}
+    with open(filepath, 'r') as f:
+        for line in map(str.strip, f):
+            if not line.startswith('#') and ':' in line:
+                # Split on the first ':' only - values such as URLs or cookies may contain colons themselves
+                name, _, value = line.partition(':')
+                headers[name.strip()] = value.strip()
+    return headers
\ No newline at end of file
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index ca654d04..77c07497 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -473,6 +473,40 @@ class model(dict):
         # None is set
         return False
 
+    @property
+    def has_extra_headers_file(self):
+        """True when this watch has its own 'headers.txt' or any of its tags has a 'headers-<tag>.txt'."""
+        if os.path.isfile(os.path.join(self.watch_data_dir, 'headers.txt')):
+            return True
+        # Per-tag files are looked up in the datastore path; the tag name has non-word
+        # characters and underscores removed and is lowercased to form the filename
+        tag_files = (
+            os.path.join(self.__datastore_path, "headers-" + re.sub(r'[\W_]', '', tag).lower().strip() + ".txt")
+            for tag in self.all_tags
+        )
+        return any(os.path.isfile(candidate) for candidate in tag_files)
+
+    def get_all_headers(self):
+        """Return this watch's headers merged with any per-watch and per-tag headers files.
+
+        Later sources win: stored 'headers', then <watch dir>/headers.txt, then each headers-<tag>.txt.
+        """
+        from .App import parse_headers_from_text_file
+        merged = self.get('headers', {}).copy()
+
+        def candidate_files():
+            yield os.path.join(self.watch_data_dir, 'headers.txt')
+            for tag in self.all_tags:
+                yield os.path.join(self.__datastore_path, "headers-" + re.sub(r'[\W_]', '', tag).lower().strip() + ".txt")
+
+        for filepath in candidate_files():
+            try:
+                if os.path.isfile(filepath):
+                    merged.update(parse_headers_from_text_file(filepath))
+            except Exception as e:
+                # Best effort: a file that cannot be read or parsed is reported and skipped
+                print(f"ERROR reading headers.txt at {filepath}", str(e))
+        return merged
 
     def get_last_fetched_before_filters(self):
         import brotli
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index cf85522a..f767703b 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -70,10 +70,9 @@ class perform_site_check(difference_detection_processor):
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}
 
-        extra_headers = watch.get('headers', [])
-
         # Tweak the base config with the per-watch ones
-        request_headers = deepcopy(self.datastore.data['settings']['headers'])
+        extra_headers = watch.get_all_headers()
+        request_headers = self.datastore.get_all_headers()
         request_headers.update(extra_headers)
 
         # https://github.com/psf/requests/issues/4525
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index f69eb907..5e071ce5 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -3,7 +3,7 @@ from flask import (
 )
 
 from . model import App, Watch
-from copy import deepcopy
+from copy import deepcopy, copy
 from os import path, unlink
 from threading import Lock
 import json
@@ -474,8 +474,6 @@ class ChangeDetectionStore:
         return proxy_list if len(proxy_list) else None
 
 
-
-
     def get_preferred_proxy_for_watch(self, uuid):
         """
         Returns the preferred proxy by ID key
@@ -507,6 +505,25 @@ class ChangeDetectionStore:
 
         return None
 
+    @property
+    def has_extra_headers_file(self):
+        # True when a global 'headers.txt' exists directly inside the datastore path
+        return os.path.isfile(os.path.join(self.datastore_path, 'headers.txt'))
+
+    def get_all_headers(self):
+        """Return a copy of the global settings headers, overlaid with <datastore path>/headers.txt if present."""
+        from .model.App import parse_headers_from_text_file
+        merged = copy(self.data['settings'].get('headers', {}))
+        filepath = os.path.join(self.datastore_path, 'headers.txt')
+        try:
+            if os.path.isfile(filepath):
+                merged.update(parse_headers_from_text_file(filepath))
+        except Exception as e:
+            # Best effort: report the problem and return the settings headers unchanged
+            print(f"ERROR reading headers.txt at {filepath}", str(e))
+        return merged
+
+
     # Run all updates
     # IMPORTANT - Each update could be run even when they have a new install and the schema is correct
     #             So therefor - each `update_n` should be very careful about checking if it needs to actually run
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 805c79c7..40b1101f 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -152,6 +152,15 @@
 {{ render_field(form.headers, rows=5, placeholder="Example
 Cookie: foobar
 User-Agent: wonderbra 1.0") }}
+
+                        <div class="pure-form-message-inline">
+                            {% if has_extra_headers_file %}
+                                <strong>Alert! Extra headers file found and will be added to this watch!</strong>
+                            {% else %}
+                                Headers can also be read from a file in your data-directory <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Adding-headers-from-an-external-file">Read more here</a>
+                            {% endif %}
+                        </div>
+
                     </div>
                     <div class="pure-control-group" id="request-body">
                                         {{ render_field(form.body, rows=5, placeholder="Example
diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py
index 948c5792..754ec1fc 100644
--- a/changedetectionio/tests/conftest.py
+++ b/changedetectionio/tests/conftest.py
@@ -14,13 +14,16 @@ global app
 
 def cleanup(datastore_path):
     # Unlink test output files
-    files = ['output.txt',
-             'url-watches.json',
-             'secret.txt',
-             'notification.txt',
-             'count.txt',
-             'endpoint-content.txt'
-                 ]
+    files = [
+        'count.txt',
+        'endpoint-content.txt',
+        'headers.txt',
+        'headers-testtag.txt',
+        'notification.txt',
+        'secret.txt',
+        'url-watches.json',
+        'output.txt',
+    ]
     for file in files:
         try:
             os.unlink("{}/{}".format(datastore_path, file))
diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py
index b3205d15..06e518e1 100644
--- a/changedetectionio/tests/test_request.py
+++ b/changedetectionio/tests/test_request.py
@@ -1,7 +1,8 @@
 import json
+import os
 import time
 from flask import url_for
-from . util import set_original_response, set_modified_response, live_server_setup
+from . util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks, extract_UUID_from_client
 
 def test_setup(live_server):
     live_server_setup(live_server)
@@ -234,3 +235,72 @@ def test_method_in_request(client, live_server):
     # Should be only one with method set to PATCH
     assert watches_with_method == 1
 
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+def test_headers_textfile_in_request(client, live_server):
+    """Headers from the global, per-tag and per-watch headers files must all end up in the request."""
+    # live_server_setup(live_server) is not called here - presumably because test_setup() already did
+    test_url = url_for('test_headers', _external=True)
+
+    # Add the test URL once, via the import page
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(1)
+
+
+    # Give the watch a tag and some inline headers
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={
+              "url": test_url,
+              "tag": "testtag",
+              "fetch_backend": "html_requests",
+              "headers": "xxx:ooo\ncool:yeah\r\n"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+    wait_for_all_checks(client)
+
+    with open('test-datastore/headers-testtag.txt', 'w') as f:
+        f.write("tag-header: test")
+
+    with open('test-datastore/headers.txt', 'w') as f:
+        f.write("global-header: nice\r\nnext-global-header: nice")
+
+    with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f:
+        f.write("watch-header: nice")
+
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("edit_page", uuid="first"))
+    assert b"Extra headers file found and will be added to this watch" in res.data
+
+    # The re-check has finished, so the header files are not needed anymore
+    os.unlink('test-datastore/headers.txt')
+    os.unlink('test-datastore/headers-testtag.txt')
+    os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt')
+    # The preview should contain every header: global file, inline, per-watch file and per-tag file
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b"Global-Header:nice" in res.data
+    assert b"Next-Global-Header:nice" in res.data
+    assert b"Xxx:ooo" in res.data
+    assert b"Watch-Header:nice" in res.data
+    assert b"Tag-Header:test" in res.data
+
+
+    #unlink headers.txt on start/stop
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
\ No newline at end of file