Use brotli for reducing the size of the text snapshots (#1482)

pull/1483/head
dgtlmoon 2 years ago committed by GitHub
parent e6d2d87b31
commit 3b80bb2f0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -340,8 +340,6 @@ def changedetection_app(config=None, datastore_o=None):
if len(dates) < 2: if len(dates) < 2:
continue continue
prev_fname = watch.history[dates[-2]]
if not watch.viewed: if not watch.viewed:
# Re #239 - GUID needs to be individual for each event # Re #239 - GUID needs to be individual for each event
# @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228) # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228)
@ -362,9 +360,12 @@ def changedetection_app(config=None, datastore_o=None):
watch_title = watch.get('title') if watch.get('title') else watch.get('url') watch_title = watch.get('title') if watch.get('title') else watch.get('url')
fe.title(title=watch_title) fe.title(title=watch_title)
latest_fname = watch.history[dates[-1]]
html_diff = diff.render_diff(prev_fname, latest_fname, include_equal=False, line_feed_sep="<br>") html_diff = diff.render_diff(previous_version_file_contents=watch.get_history_snapshot(dates[-2]),
newest_version_file_contents=watch.get_history_snapshot(dates[-1]),
include_equal=False,
line_feed_sep="<br>")
fe.content(content="<html><body><h4>{}</h4>{}</body></html>".format(watch_title, html_diff), fe.content(content="<html><body><h4>{}</h4>{}</body></html>".format(watch_title, html_diff),
type='CDATA') type='CDATA')
@ -847,28 +848,22 @@ def changedetection_app(config=None, datastore_o=None):
# Save the current newest history as the most recently viewed # Save the current newest history as the most recently viewed
datastore.set_last_viewed(uuid, time.time()) datastore.set_last_viewed(uuid, time.time())
newest_file = history[dates[-1]]
# Read as binary and force decode as UTF-8 # Read as binary and force decode as UTF-8
# Windows may fail decode in python if we just use 'r' mode (chardet decode exception) # Windows may fail decode in python if we just use 'r' mode (chardet decode exception)
try: try:
with open(newest_file, 'r', encoding='utf-8', errors='ignore') as f: newest_version_file_contents = watch.get_history_snapshot(dates[-1])
newest_version_file_contents = f.read()
except Exception as e: except Exception as e:
newest_version_file_contents = "Unable to read {}.\n".format(newest_file) newest_version_file_contents = "Unable to read {}.\n".format(dates[-1])
previous_version = request.args.get('previous_version') previous_version = request.args.get('previous_version')
try: previous_timestamp = dates[-2]
previous_file = history[previous_version] if previous_version:
except KeyError: previous_timestamp = previous_version
# Not present, use a default value, the second one in the sorted list.
previous_file = history[dates[-2]]
try: try:
with open(previous_file, 'r', encoding='utf-8', errors='ignore') as f: previous_version_file_contents = watch.get_history_snapshot(previous_timestamp)
previous_version_file_contents = f.read()
except Exception as e: except Exception as e:
previous_version_file_contents = "Unable to read {}.\n".format(previous_file) previous_version_file_contents = "Unable to read {}.\n".format(previous_timestamp)
screenshot_url = watch.get_screenshot() screenshot_url = watch.get_screenshot()
@ -948,21 +943,19 @@ def changedetection_app(config=None, datastore_o=None):
return output return output
timestamp = list(watch.history.keys())[-1] timestamp = list(watch.history.keys())[-1]
filename = watch.history[timestamp]
try: try:
with open(filename, 'r', encoding='utf-8', errors='ignore') as f: tmp = watch.get_history_snapshot(timestamp).splitlines()
tmp = f.readlines()
# Get what needs to be highlighted # Get what needs to be highlighted
ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text'] ignore_rules = watch.get('ignore_text', []) + datastore.data['settings']['application']['global_ignore_text']
# .readlines will keep the \n, but we will parse it here again, in the future tidy this up # .readlines will keep the \n, but we will parse it here again, in the future tidy this up
ignored_line_numbers = html_tools.strip_ignore_text(content="".join(tmp), ignored_line_numbers = html_tools.strip_ignore_text(content="\n".join(tmp),
wordlist=ignore_rules, wordlist=ignore_rules,
mode='line numbers' mode='line numbers'
) )
trigger_line_numbers = html_tools.strip_ignore_text(content="".join(tmp), trigger_line_numbers = html_tools.strip_ignore_text(content="\n".join(tmp),
wordlist=watch['trigger_text'], wordlist=watch['trigger_text'],
mode='line numbers' mode='line numbers'
) )
@ -978,7 +971,7 @@ def changedetection_app(config=None, datastore_o=None):
content.append({'line': l, 'classes': ' '.join(classes)}) content.append({'line': l, 'classes': ' '.join(classes)})
except Exception as e: except Exception as e:
content.append({'line': "File doesnt exist or unable to read file {}".format(filename), 'classes': ''}) content.append({'line': f"File doesnt exist or unable to read timestamp {timestamp}", 'classes': ''})
output = render_template("preview.html", output = render_template("preview.html",
content=content, content=content,

@ -179,9 +179,7 @@ class WatchSingleHistory(Resource):
if timestamp == 'latest': if timestamp == 'latest':
timestamp = list(watch.history.keys())[-1] timestamp = list(watch.history.keys())[-1]
# @todo - Check for UTF-8 compatability content = watch.get_history_snapshot(timestamp)
with open(watch.history[timestamp], 'r') as f:
content = f.read()
response = make_response(content, 200) response = make_response(content, 200)
response.mimetype = "text/plain" response.mimetype = "text/plain"

@ -31,14 +31,11 @@ def customSequenceMatcher(before, after, include_equal=False, include_removed=Tr
# only_differences - only return info about the differences, no context # only_differences - only return info about the differences, no context
# line_feed_sep could be "<br>" or "<li>" or "\n" etc # line_feed_sep could be "<br>" or "<li>" or "\n" etc
def render_diff(previous_file, newest_file, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"): def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"):
with open(newest_file, 'r') as f:
newest_version_file_contents = f.read()
newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()] newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()]
if previous_file: if previous_version_file_contents:
with open(previous_file, 'r') as f:
previous_version_file_contents = f.read()
previous_version_file_contents = [line.rstrip() for line in previous_version_file_contents.splitlines()] previous_version_file_contents = [line.rstrip() for line in previous_version_file_contents.splitlines()]
else: else:
previous_version_file_contents = "" previous_version_file_contents = ""

@ -241,9 +241,32 @@ class model(dict):
bump = self.history bump = self.history
return self.__newest_history_key return self.__newest_history_key
def get_history_snapshot(self, timestamp):
    """Return the text of the snapshot stored in the history under `timestamp`.

    The path recorded in ``self.history[timestamp]`` is used as a starting
    point, then adjusted depending on what actually exists on disk:

    * if the recorded path is a plain file but a ``<path>.br`` sibling
      exists, the brotli-compressed sibling is read instead;
    * if the recorded path ends in ``.br`` but that file is missing while the
      same path without the ``.br`` extension exists, the plain file is read.

    Brotli streams have no file header to sniff, so the ``.br`` extension is
    the only thing that decides whether the file is decompressed.

    Both branches decode as UTF-8 and silently drop undecodable bytes, so a
    snapshot reads the same whether or not it was stored compressed.

    :param timestamp: key into ``self.history``.
    :return: the snapshot contents as ``str``.
    :raises: whatever ``self.history[timestamp]`` raises for an unknown key
        (presumably ``KeyError`` - history looks dict-like), and ``OSError``
        when neither the compressed nor the plain file can be opened.
    """
    br_suffix = '.br'
    filepath = self.history[timestamp]

    # See if a brotli version exists and switch to that
    if not filepath.endswith(br_suffix) and os.path.isfile(f"{filepath}{br_suffix}"):
        filepath = f"{filepath}{br_suffix}"

    # OR in the backup case that the .br does not exist, but the plain one does
    if filepath.endswith(br_suffix) and not os.path.isfile(filepath):
        # Strip only the trailing extension; str.replace() would also eat a
        # '.br' that happens to appear in a directory or file name.
        plain_path = filepath[:-len(br_suffix)]
        if os.path.isfile(plain_path):
            filepath = plain_path

    if filepath.endswith(br_suffix):
        # Imported here so plain-text snapshots never need the brotli module.
        import brotli
        # Brotli doesn't have a file header to detect it, so we rely on the filename
        # https://www.rfc-editor.org/rfc/rfc7932
        with open(filepath, 'rb') as f:
            # errors='ignore' mirrors the plain-text branch below
            return brotli.decompress(f.read()).decode('utf-8', errors='ignore')

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()
# Save some text file to the appropriate path and bump the history # Save some text file to the appropriate path and bump the history
# result_obj from fetch_site_status.run() # result_obj from fetch_site_status.run()
def save_history_text(self, contents, timestamp, snapshot_id): def save_history_text(self, contents, timestamp, snapshot_id):
import brotli
self.ensure_data_dir_exists() self.ensure_data_dir_exists()
@ -252,16 +275,21 @@ class model(dict):
if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key): if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key):
time.sleep(timestamp - self.__newest_history_key) time.sleep(timestamp - self.__newest_history_key)
snapshot_fname = f"{snapshot_id}.txt" threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False'))
# Only write if it does not exist, this is so that we dont bother re-saving the same data by checksum under different filenames. if not skip_brotli and len(contents) > threshold:
snapshot_fname = f"{snapshot_id}.txt.br"
dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest):
with open(dest, 'wb') as f:
f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
else:
snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.watch_data_dir, snapshot_fname) dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest): if not os.path.exists(dest):
# in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading
# most sites are utf-8 and some are even broken utf-8
with open(dest, 'wb') as f: with open(dest, 'wb') as f:
f.write(contents) f.write(contents)
f.close()
# Append to index # Append to index
# @todo check last char was \n # @todo check last char was \n
@ -359,6 +387,7 @@ class model(dict):
return fname return fname
return False return False
def pause(self): def pause(self):
self['paused'] = True self['paused'] = True
@ -388,8 +417,8 @@ class model(dict):
# self.history will be keyed with the full path # self.history will be keyed with the full path
for k, fname in self.history.items(): for k, fname in self.history.items():
if os.path.isfile(fname): if os.path.isfile(fname):
with open(fname, "r") as f: if True:
contents = f.read() contents = self.get_history_snapshot(k)
res = re.findall(regex, contents, re.MULTILINE) res = re.findall(regex, contents, re.MULTILINE)
if res: if res:
if not csv_writer: if not csv_writer:

@ -28,3 +28,10 @@ pytest tests/test_notification.py
# Re-run with HIDE_REFERER set - could affect login # Re-run with HIDE_REFERER set - could affect login
export HIDE_REFERER=True export HIDE_REFERER=True
pytest tests/test_access_control.py pytest tests/test_access_control.py
# Re-run a few tests that will trigger brotli based storage
export SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD=5
pytest tests/test_access_control.py
pytest tests/test_notification.py
pytest tests/test_backend.py
pytest tests/test_rss.py

@ -198,8 +198,8 @@ def test_check_json_without_filter(client, live_server):
) )
# Should still see '"html": "<b>"' # Should still see '"html": "<b>"'
assert b'&#34;&lt;b&gt;' in res.data assert b'&#34;html&#34;: &#34;&lt;b&gt;&#34;' in res.data
assert res.data.count(b'{\n') >= 2 assert res.data.count(b'{') >= 2
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data assert b'Deleted' in res.data

@ -13,21 +13,33 @@ class TestDiffBuilder(unittest.TestCase):
def test_expected_diff_output(self): def test_expected_diff_output(self):
base_dir = os.path.dirname(__file__) base_dir = os.path.dirname(__file__)
output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after.txt") with open(base_dir + "/test-content/before.txt", 'r') as f:
previous_version_file_contents = f.read()
with open(base_dir + "/test-content/after.txt", 'r') as f:
newest_version_file_contents = f.read()
output = diff.render_diff(previous_version_file_contents, newest_version_file_contents)
output = output.split("\n") output = output.split("\n")
self.assertIn('(changed) ok', output) self.assertIn('(changed) ok', output)
self.assertIn('(into) xok', output) self.assertIn('(into) xok', output)
self.assertIn('(into) next-x-ok', output) self.assertIn('(into) next-x-ok', output)
self.assertIn('(added) and something new', output) self.assertIn('(added) and something new', output)
with open(base_dir + "/test-content/after-2.txt", 'r') as f:
output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after-2.txt") newest_version_file_contents = f.read()
output = diff.render_diff(previous_version_file_contents, newest_version_file_contents)
output = output.split("\n") output = output.split("\n")
self.assertIn('(removed) for having learned computerese,', output) self.assertIn('(removed) for having learned computerese,', output)
self.assertIn('(removed) I continue to examine bits, bytes and words', output) self.assertIn('(removed) I continue to examine bits, bytes and words', output)
#diff_removed #diff_removed
output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after.txt", include_equal=False, include_removed=True, include_added=False) with open(base_dir + "/test-content/before.txt", 'r') as f:
previous_version_file_contents = f.read()
with open(base_dir + "/test-content/after.txt", 'r') as f:
newest_version_file_contents = f.read()
output = diff.render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=False)
output = output.split("\n") output = output.split("\n")
self.assertIn('(changed) ok', output) self.assertIn('(changed) ok', output)
self.assertIn('(into) xok', output) self.assertIn('(into) xok', output)
@ -35,7 +47,9 @@ class TestDiffBuilder(unittest.TestCase):
self.assertNotIn('(added) and something new', output) self.assertNotIn('(added) and something new', output)
#diff_removed #diff_removed
output = diff.render_diff(previous_file=base_dir + "/test-content/before.txt", newest_file=base_dir + "/test-content/after-2.txt", include_equal=False, include_removed=True, include_added=False) with open(base_dir + "/test-content/after-2.txt", 'r') as f:
newest_version_file_contents = f.read()
output = diff.render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=False)
output = output.split("\n") output = output.split("\n")
self.assertIn('(removed) for having learned computerese,', output) self.assertIn('(removed) for having learned computerese,', output)
self.assertIn('(removed) I continue to examine bits, bytes and words', output) self.assertIn('(removed) I continue to examine bits, bytes and words', output)

@ -69,18 +69,17 @@ class update_worker(threading.Thread):
else: else:
line_feed_sep = "\n" line_feed_sep = "\n"
with open(watch_history[dates[-1]], 'rb') as f: snapshot_contents = watch.get_history_snapshot(dates[-1])
snapshot_contents = f.read()
n_object.update({ n_object.update({
'watch_url': watch['url'], 'watch_url': watch['url'],
'uuid': watch_uuid, 'uuid': watch_uuid,
'screenshot': watch.get_screenshot() if watch.get('notification_screenshot') else None, 'screenshot': watch.get_screenshot() if watch.get('notification_screenshot') else None,
'current_snapshot': snapshot_contents.decode('utf-8'), 'current_snapshot': snapshot_contents,
'diff': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], line_feed_sep=line_feed_sep), 'diff': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), line_feed_sep=line_feed_sep),
'diff_added': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_removed=False, line_feed_sep=line_feed_sep), 'diff_added': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_removed=False, line_feed_sep=line_feed_sep),
'diff_removed': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_added=False, line_feed_sep=line_feed_sep), 'diff_removed': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_added=False, line_feed_sep=line_feed_sep),
'diff_full': diff.render_diff(watch_history[dates[-2]], watch_history[dates[-1]], include_equal=True, line_feed_sep=line_feed_sep) 'diff_full': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_equal=True, line_feed_sep=line_feed_sep)
}) })
logging.info (">> SENDING NOTIFICATION") logging.info (">> SENDING NOTIFICATION")
self.notification_q.put(n_object) self.notification_q.put(n_object)

Loading…
Cancel
Save