From 3b80bb2f0e3a929392cf0e2aaa035a8fcd75f6e7 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Sun, 19 Mar 2023 21:12:22 +0100
Subject: [PATCH] Use brotli for reducing the size of the text snapshots
 (#1482)

---
 changedetectionio/__init__.py                 | 83 +++++++++----------
 changedetectionio/api/api_v1.py               |  4 +-
 changedetectionio/diff.py                     | 13 ++-
 changedetectionio/model/Watch.py              | 53 +++++++++---
 changedetectionio/run_basic_tests.sh          |  7 ++
 .../tests/test_jsonpath_jq_selector.py        |  4 +-
 .../tests/unit/test_notification_diff.py      | 24 ++++--
 changedetectionio/update_worker.py            | 13 ++-
 8 files changed, 119 insertions(+), 82 deletions(-)
diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 9f185b2b..23ea2dce 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -340,8 +340,6 @@ def changedetection_app(config=None, datastore_o=None):
             if len(dates) < 2:
                 continue
 
-            prev_fname = watch.history[dates[-2]]
-
             if not watch.viewed:
                 # Re #239 - GUID needs to be individual for each event
                 # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228)
@@ -362,9 +360,12 @@ def changedetection_app(config=None, datastore_o=None):
 
                 watch_title = watch.get('title') if watch.get('title') else watch.get('url')
                 fe.title(title=watch_title)
-                latest_fname = watch.history[dates[-1]]
 
-                html_diff = diff.render_diff(prev_fname, latest_fname, include_equal=False, line_feed_sep="<br>")
+                html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]),
+                                             newest_version_file_contents=watch.get_history_snapshot(dates[-1]),
+                                             include_equal=False,
+                                             line_feed_sep="<br>")
+
                 fe.content(content="<html><body><h4>{}</h4>{}</body></html>".format(watch_title, html_diff),
                            type='CDATA')
 
@@ -847,28 +848,22 @@ def changedetection_app(config=None, datastore_o=None):
         # Save the current newest history as the most recently viewed
         datastore.set_last_viewed(uuid, time.time())
 
-        newest_file = history[dates[-1]]
-
         # Read as binary and force decode as UTF-8
         # Windows may fail decode in python if we just use 'r' mode (chardet decode exception)
         try:
-            with open(newest_file, 'r', encoding='utf-8', errors='ignore') as f:
-                newest_version_file_contents = f.read()
+            newest_version_file_contents = watch.get_history_snapshot(dates[-1])
         except Exception as e:
-            newest_version_file_contents = "Unable to read {}.\n".format(newest_file)
+            newest_version_file_contents = "Unable to read {}.\n".format(dates[-1])
 
         previous_version = request.args.get('previous_version')
-        try:
-            previous_file = history[previous_version]
-        except KeyError:
-            # Not present, use a default value, the second one in the sorted list.
-            previous_file = history[dates[-2]]
+        previous_timestamp = dates[-2]
+        if previous_version:
+            previous_timestamp = previous_version
 
         try:
-            with open(previous_file, 'r', encoding='utf-8', errors='ignore') as f:
-                previous_version_file_contents = f.read()
+            previous_version_file_contents = watch.get_history_snapshot(previous_timestamp)
         except Exception as e:
-            previous_version_file_contents = "Unable to read {}.\n".format(previous_file)
+            previous_version_file_contents = "Unable to read {}.\n".format(previous_timestamp)
 
 
         screenshot_url = watch.get_screenshot()
@@ -948,37 +943,35 @@ def changedetection_app(config=None, datastore_o=None):
             return output
 
         timestamp = list(watch.history.keys())[-1]
-        filename = watch.history[timestamp]
         try:
-            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
-                tmp = f.readlines()
-
-                # Get what needs to be highlighted
-                ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text']
-
-                # .readlines will keep the \n, but we will parse it here again, in the future tidy this up
-                ignored_line_numbers = html_tools.strip_ignore_text(content="".join(tmp),
-                                                                    wordlist=ignore_rules,
-                                                                    mode='line numbers'
-                                                                    )
-
-                trigger_line_numbers = html_tools.strip_ignore_text(content="".join(tmp),
-                                                                    wordlist=watch['trigger_text'],
-                                                                    mode='line numbers'
-                                                                    )
-                # Prepare the classes and lines used in the template
-                i=0
-                for l in tmp:
-                    classes=[]
-                    i+=1
-                    if i in ignored_line_numbers:
-                        classes.append('ignored')
-                    if i in trigger_line_numbers:
-                        classes.append('triggered')
-                    content.append({'line': l, 'classes': ' '.join(classes)})
+            tmp = watch.get_history_snapshot(timestamp).splitlines()
+
+            # Get what needs to be highlighted
+            ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text']
+
+            # .splitlines() drops the line endings, so the lines are re-joined with "\n" below before being parsed again, in the future tidy this up
+            ignored_line_numbers = html_tools.strip_ignore_text(content="\n".join(tmp),
+                                                                wordlist=ignore_rules,
+                                                                mode='line numbers'
+                                                                )
+
+            trigger_line_numbers = html_tools.strip_ignore_text(content="\n".join(tmp),
+                                                                wordlist=watch['trigger_text'],
+                                                                mode='line numbers'
+                                                                )
+            # Prepare the classes and lines used in the template
+            i=0
+            for l in tmp:
+                classes=[]
+                i+=1
+                if i in ignored_line_numbers:
+                    classes.append('ignored')
+                if i in trigger_line_numbers:
+                    classes.append('triggered')
+                content.append({'line': l, 'classes': ' '.join(classes)})
 
         except Exception as e:
-            content.append({'line': "File doesnt exist or unable to read file {}".format(filename), 'classes': ''})
+            content.append({'line': f"File doesnt exist or unable to read timestamp {timestamp}", 'classes': ''})
 
         output = render_template("preview.html",
                                  content=content,
diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py
index e28972d6..b30c0d63 100644
--- a/changedetectionio/api/api_v1.py
+++ b/changedetectionio/api/api_v1.py
@@ -179,9 +179,7 @@ class WatchSingleHistory(Resource):
         if timestamp == 'latest':
             timestamp = list(watch.history.keys())[-1]
 
-        # @todo - Check for UTF-8 compatability
-        with open(watch.history[timestamp], 'r') as f:
-            content = f.read()
+        content = watch.get_history_snapshot(timestamp)
 
         response = make_response(content, 200)
         response.mimetype = "text/plain"
diff --git a/changedetectionio/diff.py b/changedetectionio/diff.py
index e21bfa69..2b566ffc 100644
--- a/changedetectionio/diff.py
+++ b/changedetectionio/diff.py
@@ -31,14 +31,11 @@ def customSequenceMatcher(before, after, include_equal=False, include_removed=Tr
 
 # only_differences - only return info about the differences, no context
 # line_feed_sep could be "<br>" or "<li>" or "\n" etc
-def render_diff(previous_file, newest_file, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"):
-    with open(newest_file, 'r') as f:
-        newest_version_file_contents = f.read()
-        newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()]
-
-    if previous_file:
-        with open(previous_file, 'r') as f:
-            previous_version_file_contents = f.read()
+def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"):
+
+    newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()]
+
+    if previous_version_file_contents:
             previous_version_file_contents = [line.rstrip() for line in previous_version_file_contents.splitlines()]
     else:
         previous_version_file_contents = ""
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index 403ac858..9af922ae 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -241,9 +241,32 @@ class model(dict):
         bump = self.history
         return self.__newest_history_key
 
+    def get_history_snapshot(self, timestamp):
+        """Return the text of the snapshot indexed under `timestamp` in self.history, brotli-decompressing '.br' files."""
+        import brotli
+        filepath = self.history[timestamp]
+
+        # See if a brotli version exists and switch to that
+        if not filepath.endswith('.br') and os.path.isfile(f"{filepath}.br"):
+            filepath = f"{filepath}.br"
+
+        # OR the .br does not exist but the plain one does; strip only the trailing suffix so a '.br' elsewhere in the path is kept
+        if filepath.endswith('.br') and not os.path.isfile(filepath) and os.path.isfile(filepath[:-3]):
+            filepath = filepath[:-3]
+
+        if filepath.endswith('.br'):
+            # Brotli doesn't have a file header to detect it, so we rely on filename
+            # https://www.rfc-editor.org/rfc/rfc7932
+            with open(filepath, 'rb') as f:
+                return brotli.decompress(f.read()).decode('utf-8', errors='ignore')
+
+        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+            return f.read()
+
     # Save some text file to the appropriate path and bump the history
     # result_obj from fetch_site_status.run()
     def save_history_text(self, contents, timestamp, snapshot_id):
+        import brotli
 
         self.ensure_data_dir_exists()
 
@@ -252,16 +275,21 @@ class model(dict):
         if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key):
             time.sleep(timestamp - self.__newest_history_key)
 
-        snapshot_fname = f"{snapshot_id}.txt"
-
-        # Only write if it does not exist, this is so that we dont bother re-saving the same data by checksum under different filenames.
-        dest = os.path.join(self.watch_data_dir, snapshot_fname)
-        if not os.path.exists(dest):
-            # in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading
-            # most sites are utf-8 and some are even broken utf-8
-            with open(dest, 'wb') as f:
-                f.write(contents)
-                f.close()
+        threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
+        skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False'))
+
+        if not skip_brotli and len(contents) > threshold:
+            snapshot_fname = f"{snapshot_id}.txt.br"
+            dest = os.path.join(self.watch_data_dir, snapshot_fname)
+            if not os.path.exists(dest):
+                with open(dest, 'wb') as f:
+                    f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
+        else:
+            snapshot_fname = f"{snapshot_id}.txt"
+            dest = os.path.join(self.watch_data_dir, snapshot_fname)
+            if not os.path.exists(dest):
+                with open(dest, 'wb') as f:
+                    f.write(contents)
 
         # Append to index
         # @todo check last char was \n
@@ -359,6 +387,7 @@ class model(dict):
             return fname
         return False
 
+
     def pause(self):
         self['paused'] = True
 
@@ -388,8 +417,8 @@ class model(dict):
         # self.history will be keyed with the full path
         for k, fname in self.history.items():
             if os.path.isfile(fname):
-                with open(fname, "r") as f:
-                    contents = f.read()
+                if True:
+                    contents = self.get_history_snapshot(k)
                     res = re.findall(regex, contents, re.MULTILINE)
                     if res:
                         if not csv_writer:
diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh
index 9803d78d..70184051 100755
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@@ -28,3 +28,10 @@ pytest tests/test_notification.py
 # Re-run with HIDE_REFERER set - could affect login
 export HIDE_REFERER=True
 pytest tests/test_access_control.py
+
+# Re-run a few tests that will trigger brotli based storage
+export SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD=5
+pytest tests/test_access_control.py
+pytest tests/test_notification.py
+pytest tests/test_backend.py
+pytest tests/test_rss.py
diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py
index 4ace6015..2a8fd52f 100644
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -198,8 +198,8 @@ def test_check_json_without_filter(client, live_server):
     )
 
     # Should still see '"html": "<b>"'
-    assert b'&#34;&lt;b&gt;' in res.data
-    assert res.data.count(b'{\n') >= 2
+    assert b'&#34;html&#34;: &#34;&lt;b&gt;&#34;' in res.data
+    assert res.data.count(b'{') >= 2
 
     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data
diff --git a/changedetectionio/tests/unit/test_notification_diff.py b/changedetectionio/tests/unit/test_notification_diff.py
index 80f383a1..a6b7067f 100755
--- a/changedetectionio/tests/unit/test_notification_diff.py
+++ b/changedetectionio/tests/unit/test_notification_diff.py
@@ -13,21 +13,33 @@ class TestDiffBuilder(unittest.TestCase):
 
     def test_expected_diff_output(self):
         base_dir = os.path.dirname(__file__)
-        output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after.txt")
+        with open(base_dir + "/test-content/before.txt", 'r') as f:
+            previous_version_file_contents = f.read()
+
+        with open(base_dir + "/test-content/after.txt", 'r') as f:
+            newest_version_file_contents = f.read()
+
+        output = diff.render_diff(previous_version_file_contents, newest_version_file_contents)
         output = output.split("\n")
         self.assertIn('(changed) ok', output)
         self.assertIn('(into) xok', output)
         self.assertIn('(into) next-x-ok', output)
         self.assertIn('(added) and something new', output)
 
-
-        output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after-2.txt")
+        with open(base_dir + "/test-content/after-2.txt", 'r') as f:
+            newest_version_file_contents = f.read()
+        output = diff.render_diff(previous_version_file_contents, newest_version_file_contents)
         output = output.split("\n")
         self.assertIn('(removed) for having learned computerese,', output)
         self.assertIn('(removed) I continue to examine bits, bytes and words', output)
         
         #diff_removed
-        output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after.txt", include_equal=False, include_removed=True, include_added=False)
+        with open(base_dir + "/test-content/before.txt", 'r') as f:
+            previous_version_file_contents = f.read()
+
+        with open(base_dir + "/test-content/after.txt", 'r') as f:
+            newest_version_file_contents = f.read()
+        output = diff.render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=False)
         output = output.split("\n")
         self.assertIn('(changed) ok', output)
         self.assertIn('(into) xok', output)
@@ -35,7 +47,9 @@ class TestDiffBuilder(unittest.TestCase):
         self.assertNotIn('(added) and something new', output)
         
         #diff_removed
-        output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after-2.txt", include_equal=False, include_removed=True, include_added=False)
+        with open(base_dir + "/test-content/after-2.txt", 'r') as f:
+            newest_version_file_contents = f.read()
+        output = diff.render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=False)
         output = output.split("\n")
         self.assertIn('(removed) for having learned computerese,', output)
         self.assertIn('(removed) I continue to examine bits, bytes and words', output)
diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index 2a7479dd..da570b58 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -69,18 +69,17 @@ class update_worker(threading.Thread):
             else:
                 line_feed_sep = "\n"
 
-            with open(watch_history[dates[-1]], 'rb') as f:
-                snapshot_contents = f.read()
+            snapshot_contents = watch.get_history_snapshot(dates[-1])
 
             n_object.update({
                 'watch_url': watch['url'],
                 'uuid': watch_uuid,
                 'screenshot': watch.get_screenshot() if watch.get('notification_screenshot') else None,
-                'current_snapshot': snapshot_contents.decode('utf-8'),
-                'diff': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], line_feed_sep=line_feed_sep),
-                'diff_added': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_removed=False, line_feed_sep=line_feed_sep),
-                'diff_removed': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_added=False, line_feed_sep=line_feed_sep),
-                'diff_full': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_equal=True, line_feed_sep=line_feed_sep)
+                'current_snapshot': snapshot_contents,
+                'diff': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), line_feed_sep=line_feed_sep),
+                'diff_added': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_removed=False, line_feed_sep=line_feed_sep),
+                'diff_removed': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_added=False, line_feed_sep=line_feed_sep),
+                'diff_full': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_equal=True, line_feed_sep=line_feed_sep)
             })
             logging.info (">> SENDING NOTIFICATION")
             self.notification_q.put(n_object)