From 3b80bb2f0e3a929392cf0e2aaa035a8fcd75f6e7 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 19 Mar 2023 21:12:22 +0100 Subject: [PATCH] Use brotli for reducing the size of the text snapshots (#1482) --- changedetectionio/__init__.py | 83 +++++++++---------- changedetectionio/api/api_v1.py | 4 +- changedetectionio/diff.py | 13 ++- changedetectionio/model/Watch.py | 53 +++++++++--- changedetectionio/run_basic_tests.sh | 7 ++ .../tests/test_jsonpath_jq_selector.py | 4 +- .../tests/unit/test_notification_diff.py | 24 ++++-- changedetectionio/update_worker.py | 13 ++- 8 files changed, 119 insertions(+), 82 deletions(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 9f185b2b..23ea2dce 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -340,8 +340,6 @@ def changedetection_app(config=None, datastore_o=None): if len(dates) < 2: continue - prev_fname = watch.history[dates[-2]] - if not watch.viewed: # Re #239 - GUID needs to be individual for each event # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228) @@ -362,9 +360,12 @@ def changedetection_app(config=None, datastore_o=None): watch_title = watch.get('title') if watch.get('title') else watch.get('url') fe.title(title=watch_title) - latest_fname = watch.history[dates[-1]] - html_diff = diff.render_diff(prev_fname, latest_fname, include_equal=False, line_feed_sep="
") + html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]), + newest_version_file_contents=watch.get_history_snapshot(dates[-1]), + include_equal=False, + line_feed_sep="
") + fe.content(content="

{}

{}".format(watch_title, html_diff), type='CDATA') @@ -847,28 +848,22 @@ def changedetection_app(config=None, datastore_o=None): # Save the current newest history as the most recently viewed datastore.set_last_viewed(uuid, time.time()) - newest_file = history[dates[-1]] - # Read as binary and force decode as UTF-8 # Windows may fail decode in python if we just use 'r' mode (chardet decode exception) try: - with open(newest_file, 'r', encoding='utf-8', errors='ignore') as f: - newest_version_file_contents = f.read() + newest_version_file_contents = watch.get_history_snapshot(dates[-1]) except Exception as e: - newest_version_file_contents = "Unable to read {}.\n".format(newest_file) + newest_version_file_contents = "Unable to read {}.\n".format(dates[-1]) previous_version = request.args.get('previous_version') - try: - previous_file = history[previous_version] - except KeyError: - # Not present, use a default value, the second one in the sorted list. - previous_file = history[dates[-2]] + previous_timestamp = dates[-2] + if previous_version: + previous_timestamp = previous_version try: - with open(previous_file, 'r', encoding='utf-8', errors='ignore') as f: - previous_version_file_contents = f.read() + previous_version_file_contents = watch.get_history_snapshot(previous_timestamp) except Exception as e: - previous_version_file_contents = "Unable to read {}.\n".format(previous_file) + previous_version_file_contents = "Unable to read {}.\n".format(previous_timestamp) screenshot_url = watch.get_screenshot() @@ -948,37 +943,35 @@ def changedetection_app(config=None, datastore_o=None): return output timestamp = list(watch.history.keys())[-1] - filename = watch.history[timestamp] try: - with open(filename, 'r', encoding='utf-8', errors='ignore') as f: - tmp = f.readlines() - - # Get what needs to be highlighted - ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text'] - - # .readlines will keep the \n, but we will parse it here again, in the future tidy this up - ignored_line_numbers = html_tools.strip_ignore_text(content="".join(tmp), - wordlist=ignore_rules, - mode='line numbers' - ) - - trigger_line_numbers = html_tools.strip_ignore_text(content="".join(tmp), - wordlist=watch['trigger_text'], - mode='line numbers' - ) - # Prepare the classes and lines used in the template - i=0 - for l in tmp: - classes=[] - i+=1 - if i in ignored_line_numbers: - classes.append('ignored') - if i in trigger_line_numbers: - classes.append('triggered') - content.append({'line': l, 'classes': ' '.join(classes)}) + tmp = watch.get_history_snapshot(timestamp).splitlines() + + # Get what needs to be highlighted + ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text'] + + # .readlines will keep the \n, but we will parse it here again, in the future tidy this up + ignored_line_numbers = html_tools.strip_ignore_text(content="\n".join(tmp), + wordlist=ignore_rules, + mode='line numbers' + ) + + trigger_line_numbers = html_tools.strip_ignore_text(content="\n".join(tmp), + wordlist=watch['trigger_text'], + mode='line numbers' + ) + # Prepare the classes and lines used in the template + i=0 + for l in tmp: + classes=[] + i+=1 + if i in ignored_line_numbers: + classes.append('ignored') + if i in trigger_line_numbers: + classes.append('triggered') + content.append({'line': l, 'classes': ' '.join(classes)}) except Exception as e: - content.append({'line': "File doesnt exist or unable to read file {}".format(filename), 'classes': ''}) + content.append({'line': f"File doesnt exist or unable to read timestamp {timestamp}", 'classes': ''}) output = render_template("preview.html", content=content, diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py index e28972d6..b30c0d63 100644 --- a/changedetectionio/api/api_v1.py +++ b/changedetectionio/api/api_v1.py @@ -179,9 +179,7 @@ class WatchSingleHistory(Resource): if timestamp == 'latest': timestamp = list(watch.history.keys())[-1] - # @todo - Check for UTF-8 compatability - with open(watch.history[timestamp], 'r') as f: - content = f.read() + content = watch.get_history_snapshot(timestamp) response = make_response(content, 200) response.mimetype = "text/plain" diff --git a/changedetectionio/diff.py b/changedetectionio/diff.py index e21bfa69..2b566ffc 100644 --- a/changedetectionio/diff.py +++ b/changedetectionio/diff.py @@ -31,14 +31,11 @@ def customSequenceMatcher(before, after, include_equal=False, include_removed=Tr # only_differences - only return info about the differences, no context # line_feed_sep could be "
" or "
  • " or "\n" etc -def render_diff(previous_file, newest_file, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"): - with open(newest_file, 'r') as f: - newest_version_file_contents = f.read() - newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()] - - if previous_file: - with open(previous_file, 'r') as f: - previous_version_file_contents = f.read() +def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"): + + newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()] + + if previous_version_file_contents: previous_version_file_contents = [line.rstrip() for line in previous_version_file_contents.splitlines()] else: previous_version_file_contents = "" diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 403ac858..9af922ae 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -241,9 +241,32 @@ class model(dict): bump = self.history return self.__newest_history_key + def get_history_snapshot(self, timestamp): + import brotli + filepath = self.history[timestamp] + + # See if a brotli versions exists and switch to that + if not filepath.endswith('.br') and os.path.isfile(f"{filepath}.br"): + filepath = f"{filepath}.br" + + # OR in the backup case that the .br does not exist, but the plain one does + if filepath.endswith('.br') and not os.path.isfile(filepath): + if os.path.isfile(filepath.replace('.br', '')): + filepath = filepath.replace('.br', '') + + if filepath.endswith('.br'): + # Brotli doesnt have a fileheader to detect it, so we rely on filename + # https://www.rfc-editor.org/rfc/rfc7932 + with open(filepath, 'rb') as f: + return(brotli.decompress(f.read()).decode('utf-8')) + + with open(filepath, 'r', encoding='utf-8', errors='ignore') as f: + return f.read() + # Save some text file to the appropriate path and bump the history # result_obj from fetch_site_status.run() def save_history_text(self, contents, timestamp, snapshot_id): + import brotli self.ensure_data_dir_exists() @@ -252,16 +275,21 @@ class model(dict): if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key): time.sleep(timestamp - self.__newest_history_key) - snapshot_fname = f"{snapshot_id}.txt" - - # Only write if it does not exist, this is so that we dont bother re-saving the same data by checksum under different filenames. - dest = os.path.join(self.watch_data_dir, snapshot_fname) - if not os.path.exists(dest): - # in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading - # most sites are utf-8 and some are even broken utf-8 - with open(dest, 'wb') as f: - f.write(contents) - f.close() + threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024)) + skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False')) + + if not skip_brotli and len(contents) > threshold: + snapshot_fname = f"{snapshot_id}.txt.br" + dest = os.path.join(self.watch_data_dir, snapshot_fname) + if not os.path.exists(dest): + with open(dest, 'wb') as f: + f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) + else: + snapshot_fname = f"{snapshot_id}.txt" + dest = os.path.join(self.watch_data_dir, snapshot_fname) + if not os.path.exists(dest): + with open(dest, 'wb') as f: + f.write(contents) # Append to index # @todo check last char was \n @@ -359,6 +387,7 @@ class model(dict): return fname return False + def pause(self): self['paused'] = True @@ -388,8 +417,8 @@ class model(dict): # self.history will be keyed with the full path for k, fname in self.history.items(): if os.path.isfile(fname): - with open(fname, "r") as f: - contents = f.read() + if True: + contents = self.get_history_snapshot(k) res = re.findall(regex, contents, re.MULTILINE) if res: if not csv_writer: diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh index 9803d78d..70184051 100755 --- a/changedetectionio/run_basic_tests.sh +++ b/changedetectionio/run_basic_tests.sh @@ -28,3 +28,10 @@ pytest tests/test_notification.py # Re-run with HIDE_REFERER set - could affect login export HIDE_REFERER=True pytest tests/test_access_control.py + +# Re-run a few tests that will trigger brotli based storage +export SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD=5 +pytest tests/test_access_control.py +pytest tests/test_notification.py +pytest tests/test_backend.py +pytest tests/test_rss.py diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index 4ace6015..2a8fd52f 100644 --- a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -198,8 +198,8 @@ def test_check_json_without_filter(client, live_server): ) # Should still see '"html": ""' - assert b'"<b>' in res.data - assert res.data.count(b'{\n') >= 2 + assert b'"html": "<b>"' in res.data + assert res.data.count(b'{') >= 2 res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data diff --git a/changedetectionio/tests/unit/test_notification_diff.py b/changedetectionio/tests/unit/test_notification_diff.py index 80f383a1..a6b7067f 100755 --- a/changedetectionio/tests/unit/test_notification_diff.py +++ b/changedetectionio/tests/unit/test_notification_diff.py @@ -13,21 +13,33 @@ class TestDiffBuilder(unittest.TestCase): def test_expected_diff_output(self): base_dir = os.path.dirname(__file__) - output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after.txt") + with open(base_dir + "/test-content/before.txt", 'r') as f: + previous_version_file_contents = f.read() + + with open(base_dir + "/test-content/after.txt", 'r') as f: + newest_version_file_contents = f.read() + + output = diff.render_diff(previous_version_file_contents, newest_version_file_contents) output = output.split("\n") self.assertIn('(changed) ok', output) self.assertIn('(into) xok', output) self.assertIn('(into) next-x-ok', output) self.assertIn('(added) and something new', output) - - output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after-2.txt") + with open(base_dir + "/test-content/after-2.txt", 'r') as f: + newest_version_file_contents = f.read() + output = diff.render_diff(previous_version_file_contents, newest_version_file_contents) output = output.split("\n") self.assertIn('(removed) for having learned computerese,', output) self.assertIn('(removed) I continue to examine bits, bytes and words', output) #diff_removed - output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after.txt", include_equal=False, include_removed=True, include_added=False) + with open(base_dir + "/test-content/before.txt", 'r') as f: + previous_version_file_contents = f.read() + + with open(base_dir + "/test-content/after.txt", 'r') as f: + newest_version_file_contents = f.read() + output = diff.render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=False) output = output.split("\n") self.assertIn('(changed) ok', output) self.assertIn('(into) xok', output) @@ -35,7 +47,9 @@ class TestDiffBuilder(unittest.TestCase): self.assertNotIn('(added) and something new', output) #diff_removed - output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after-2.txt", include_equal=False, include_removed=True, include_added=False) + with open(base_dir + "/test-content/after-2.txt", 'r') as f: + newest_version_file_contents = f.read() + output = diff.render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=False) output = output.split("\n") self.assertIn('(removed) for having learned computerese,', output) self.assertIn('(removed) I continue to examine bits, bytes and words', output) diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 2a7479dd..da570b58 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -69,18 +69,17 @@ class update_worker(threading.Thread): else: line_feed_sep = "\n" - with open(watch_history[dates[-1]], 'rb') as f: - snapshot_contents = f.read() + snapshot_contents = watch.get_history_snapshot(dates[-1]) n_object.update({ 'watch_url': watch['url'], 'uuid': watch_uuid, 'screenshot': watch.get_screenshot() if watch.get('notification_screenshot') else None, - 'current_snapshot': snapshot_contents.decode('utf-8'), - 'diff': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], line_feed_sep=line_feed_sep), - 'diff_added': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_removed=False, line_feed_sep=line_feed_sep), - 'diff_removed': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_added=False, line_feed_sep=line_feed_sep), - 'diff_full': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_equal=True, line_feed_sep=line_feed_sep) + 'current_snapshot': snapshot_contents, + 'diff': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), line_feed_sep=line_feed_sep), + 'diff_added': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_removed=False, line_feed_sep=line_feed_sep), + 'diff_removed': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_added=False, line_feed_sep=line_feed_sep), + 'diff_full': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_equal=True, line_feed_sep=line_feed_sep) }) logging.info (">> SENDING NOTIFICATION") self.notification_q.put(n_object)