From 6f4fd011e3df5cde7ca2b40462f1a41ff3b3d7b8 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 17 Feb 2023 17:15:27 +0100 Subject: [PATCH] Don't rewrite/resave snapshot when it's the same data, just bump the history index, saves disk space. (#1414) --- changedetectionio/model/Watch.py | 17 ++++++++++------- changedetectionio/update_worker.py | 13 ++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 4f367a71..d25837e9 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -241,7 +241,7 @@ class model(dict): # Save some text file to the appropriate path and bump the history # result_obj from fetch_site_status.run() - def save_history_text(self, contents, timestamp): + def save_history_text(self, contents, timestamp, snapshot_id): self.ensure_data_dir_exists() @@ -250,13 +250,16 @@ class model(dict): if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key): time.sleep(timestamp - self.__newest_history_key) - snapshot_fname = "{}.txt".format(str(uuid.uuid4())) + snapshot_fname = f"{snapshot_id}.txt" - # in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading - # most sites are utf-8 and some are even broken utf-8 - with open(os.path.join(self.watch_data_dir, snapshot_fname), 'wb') as f: - f.write(contents) - f.close() + # Only write if it does not exist, this is so that we dont bother re-saving the same data by checksum under different filenames. 
+ dest = os.path.join(self.watch_data_dir, snapshot_fname) + if not os.path.exists(dest): + # in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading + # most sites are utf-8 and some are even broken utf-8 + with open(dest, 'wb') as f: + f.write(contents) + f.close() # Append to index # @todo check last char was \n diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index a77b5969..ccb01097 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -319,15 +319,14 @@ class update_worker(threading.Thread): # Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc if process_changedetection_results: try: - watch = self.datastore.data['watching'][uuid] - fname = "" # Saved history text filename + watch = self.datastore.data['watching'].get(uuid) + self.datastore.update_watch(uuid=uuid, update_obj=update_obj) - # For the FIRST time we check a site, or a change detected, save the snapshot. + # Also save the snapshot on the first time checked if changed_detected or not watch['last_checked']: - # A change was detected - watch.save_history_text(contents=contents, timestamp=str(round(time.time()))) - - self.datastore.update_watch(uuid=uuid, update_obj=update_obj) + watch.save_history_text(contents=contents, + timestamp=str(round(time.time())), + snapshot_id=update_obj.get('previous_md5', 'none')) # A change was detected if changed_detected: