From aa3c8a93701c383b0b4d2464dc277b4b4f46797b Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 31 May 2022 23:43:50 +0200 Subject: [PATCH] Move history data to a textfile, improves memory handling (#638) --- changedetectionio/__init__.py | 67 +++++------ changedetectionio/api/api_v1.py | 11 +- changedetectionio/fetch_site_status.py | 1 + changedetectionio/model/Watch.py | 108 ++++++++++++++++-- changedetectionio/store.py | 106 ++++++++--------- .../templates/watch-overview.html | 8 +- changedetectionio/tests/test_api.py | 17 +-- .../tests/test_element_removal.py | 5 +- .../tests/test_history_consistency.py | 84 ++++++++++++++ changedetectionio/tests/test_trigger.py | 9 +- changedetectionio/tests/test_trigger_regex.py | 13 ++- .../tests/test_trigger_regex_with_filter.py | 24 ++-- changedetectionio/tests/util.py | 19 +++ changedetectionio/update_worker.py | 16 +-- 14 files changed, 330 insertions(+), 158 deletions(-) create mode 100644 changedetectionio/tests/test_history_consistency.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index a5087e19..ed838ca6 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -178,6 +178,10 @@ def changedetection_app(config=None, datastore_o=None): global datastore datastore = datastore_o + # so far just for read-only via tests, but this will be moved eventually to be the main source + # (instead of the global var) + app.config['DATASTORE']=datastore_o + #app.config.update(config or {}) login_manager = flask_login.LoginManager(app) @@ -317,25 +321,19 @@ def changedetection_app(config=None, datastore_o=None): for watch in sorted_watches: - dates = list(watch['history'].keys()) + dates = list(watch.history.keys()) # Re #521 - Don't bother processing this one if theres less than 2 snapshots, means we never had a change detected. if len(dates) < 2: continue - # Convert to int, sort and back to str again - # @todo replace datastore getter that does this automatically - dates = [int(i) for i in dates] - dates.sort(reverse=True) - dates = [str(i) for i in dates] - prev_fname = watch['history'][dates[1]] + prev_fname = watch.history[dates[-2]] - if not watch['viewed']: + if not watch.viewed: # Re #239 - GUID needs to be individual for each event # @todo In the future make this a configurable link back (see work on BASE_URL https://github.com/dgtlmoon/changedetection.io/pull/228) guid = "{}/{}".format(watch['uuid'], watch['last_changed']) fe = fg.add_entry() - # Include a link to the diff page, they will have to login here to see if password protection is enabled. # Description is the page you watch, link takes you to the diff JS UI page base_url = datastore.data['settings']['application']['base_url'] @@ -350,13 +348,13 @@ def changedetection_app(config=None, datastore_o=None): watch_title = watch.get('title') if watch.get('title') else watch.get('url') fe.title(title=watch_title) - latest_fname = watch['history'][dates[0]] + latest_fname = watch.history[dates[-1]] html_diff = diff.render_diff(prev_fname, latest_fname, include_equal=False, line_feed_sep="
") fe.description(description="

{}

{}".format(watch_title, html_diff)) fe.guid(guid, permalink=False) - dt = datetime.datetime.fromtimestamp(int(watch['newest_history_key'])) + dt = datetime.datetime.fromtimestamp(int(watch.newest_history_key)) dt = dt.replace(tzinfo=pytz.UTC) fe.pubDate(dt) @@ -491,10 +489,10 @@ def changedetection_app(config=None, datastore_o=None): # 0 means that theres only one, so that there should be no 'unviewed' history available if newest_history_key == 0: - newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0] + newest_history_key = list(datastore.data['watching'][uuid].history.keys())[0] if newest_history_key: - with open(datastore.data['watching'][uuid]['history'][newest_history_key], + with open(datastore.data['watching'][uuid].history[newest_history_key], encoding='utf-8') as file: raw_content = file.read() @@ -588,12 +586,12 @@ def changedetection_app(config=None, datastore_o=None): # Reset the previous_md5 so we process a new snapshot including stripping ignore text. if form_ignore_text: - if len(datastore.data['watching'][uuid]['history']): + if len(datastore.data['watching'][uuid].history): extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid) # Reset the previous_md5 so we process a new snapshot including stripping ignore text. if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']: - if len(datastore.data['watching'][uuid]['history']): + if len(datastore.data['watching'][uuid].history): extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid) # Be sure proxy value is None @@ -754,7 +752,7 @@ def changedetection_app(config=None, datastore_o=None): # Save the current newest history as the most recently viewed for watch_uuid, watch in datastore.data['watching'].items(): - datastore.set_last_viewed(watch_uuid, watch['newest_history_key']) + datastore.set_last_viewed(watch_uuid, watch.newest_history_key) flash("Cleared all statuses.") return redirect(url_for('index')) @@ -774,20 +772,17 @@ def changedetection_app(config=None, datastore_o=None): flash("No history found for the specified link, bad link?", "error") return redirect(url_for('index')) - dates = list(watch['history'].keys()) - # Convert to int, sort and back to str again - # @todo replace datastore getter that does this automatically - dates = [int(i) for i in dates] - dates.sort(reverse=True) - dates = [str(i) for i in dates] + history = watch.history + dates = list(history.keys()) if len(dates) < 2: flash("Not enough saved change detection snapshots to produce a report.", "error") return redirect(url_for('index')) # Save the current newest history as the most recently viewed - datastore.set_last_viewed(uuid, dates[0]) - newest_file = watch['history'][dates[0]] + datastore.set_last_viewed(uuid, time.time()) + + newest_file = history[dates[-1]] try: with open(newest_file, 'r') as f: @@ -797,10 +792,10 @@ def changedetection_app(config=None, datastore_o=None): previous_version = request.args.get('previous_version') try: - previous_file = watch['history'][previous_version] + previous_file = history[previous_version] except KeyError: # Not present, use a default value, the second one in the sorted list. - previous_file = watch['history'][dates[1]] + previous_file = history[dates[-2]] try: with open(previous_file, 'r') as f: @@ -817,7 +812,7 @@ def changedetection_app(config=None, datastore_o=None): extra_stylesheets=extra_stylesheets, versions=dates[1:], uuid=uuid, - newest_version_timestamp=dates[0], + newest_version_timestamp=dates[-1], current_previous_version=str(previous_version), current_diff_url=watch['url'], extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']), @@ -845,9 +840,9 @@ def changedetection_app(config=None, datastore_o=None): flash("No history found for the specified link, bad link?", "error") return redirect(url_for('index')) - if len(watch['history']): - timestamps = sorted(watch['history'].keys(), key=lambda x: int(x)) - filename = watch['history'][timestamps[-1]] + if watch.history_n >0: + timestamps = sorted(watch.history.keys(), key=lambda x: int(x)) + filename = watch.history[timestamps[-1]] try: with open(filename, 'r') as f: tmp = f.readlines() @@ -1141,6 +1136,7 @@ def changedetection_app(config=None, datastore_o=None): # copy it to memory as trim off what we dont need (history) watch = deepcopy(datastore.data['watching'][uuid]) + # For older versions that are not a @property if (watch.get('history')): del (watch['history']) @@ -1249,6 +1245,7 @@ def notification_runner(): # Thread runner to check every minute, look for new watches to feed into the Queue. def ticker_thread_check_time_launch_checks(): from changedetectionio import update_worker + import logging # Spin up Workers that do the fetching # Can be overriden by ENV or use the default settings @@ -1267,9 +1264,10 @@ def ticker_thread_check_time_launch_checks(): running_uuids.append(t.current_uuid) # Re #232 - Deepcopy the data incase it changes while we're iterating through it all + watch_uuid_list = [] while True: try: - copied_datastore = deepcopy(datastore) + watch_uuid_list = datastore.data['watching'].keys() except RuntimeError as e: # RuntimeError: dictionary changed size during iteration time.sleep(0.1) @@ -1286,7 +1284,12 @@ def ticker_thread_check_time_launch_checks(): recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60)) recheck_time_system_seconds = datastore.threshold_seconds - for uuid, watch in copied_datastore.data['watching'].items(): + for uuid in watch_uuid_list: + + watch = datastore.data['watching'].get(uuid) + if not watch: + logging.error("Watch: {} no longer present.".format(uuid)) + continue # No need todo further processing if it's paused if watch['paused']: diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py index 4f178195..d61e93c0 100644 --- a/changedetectionio/api/api_v1.py +++ b/changedetectionio/api/api_v1.py @@ -28,8 +28,7 @@ class Watch(Resource): return "OK", 200 # Return without history, get that via another API call - watch['history_n'] = len(watch['history']) - del (watch['history']) + watch['history_n'] = watch.history_n return watch @auth.check_token @@ -52,7 +51,7 @@ class WatchHistory(Resource): watch = self.datastore.data['watching'].get(uuid) if not watch: abort(404, message='No watch exists with the UUID of {}'.format(uuid)) - return watch['history'], 200 + return watch.history, 200 class WatchSingleHistory(Resource): @@ -69,13 +68,13 @@ class WatchSingleHistory(Resource): if not watch: abort(404, message='No watch exists with the UUID of {}'.format(uuid)) - if not len(watch['history']): + if not len(watch.history): abort(404, message='Watch found but no history exists for the UUID {}'.format(uuid)) if timestamp == 'latest': - timestamp = list(watch['history'].keys())[-1] + timestamp = list(watch.history.keys())[-1] - with open(watch['history'][timestamp], 'r') as f: + with open(watch.history[timestamp], 'r') as f: content = f.read() response = make_response(content, 200) diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 8629f454..c0fbf2de 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -224,6 +224,7 @@ class perform_site_check(): result = html_tools.strip_ignore_text(content=str(stripped_text_from_html), wordlist=watch['trigger_text'], mode="line numbers") + # If it returned any lines that matched.. if result: blocked_by_not_found_trigger_text = False diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 672e0a3e..f884387f 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -1,5 +1,4 @@ import os - import uuid as uuid_builder minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 60)) @@ -12,22 +11,24 @@ from changedetectionio.notification import ( class model(dict): - base_config = { + __newest_history_key = None + __history_n=0 + + __base_config = { 'url': None, 'tag': None, 'last_checked': 0, 'last_changed': 0, 'paused': False, 'last_viewed': 0, # history key value of the last viewed via the [diff] link - 'newest_history_key': 0, + #'newest_history_key': 0, 'title': None, 'previous_md5': False, -# UUID not needed, should be generated only as a key -# 'uuid': + 'uuid': str(uuid_builder.uuid4()), 'headers': {}, # Extra headers to send 'body': None, 'method': 'GET', - 'history': {}, # Dict of timestamp and output stripped filename + #'history': {}, # Dict of timestamp and output stripped filename 'ignore_text': [], # List of text to ignore when calculating the comparison checksum # Custom notification content 'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise) @@ -48,10 +49,103 @@ class model(dict): } def __init__(self, *arg, **kw): - self.update(self.base_config) + import uuid + self.update(self.__base_config) + self.__datastore_path = kw['datastore_path'] + + self['uuid'] = str(uuid.uuid4()) + + del kw['datastore_path'] + + if kw.get('default'): + self.update(kw['default']) + del kw['default'] + # goes at the end so we update the default object with the initialiser super(model, self).__init__(*arg, **kw) + @property + def viewed(self): + if int(self.newest_history_key) <= int(self['last_viewed']): + return True + + return False + + @property + def history_n(self): + return self.__history_n + + @property + def history(self): + tmp_history = {} + import logging + import time + + # Read the history file as a dict + fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt") + if os.path.isfile(fname): + logging.debug("Disk IO accessed " + str(time.time())) + with open(fname, "r") as f: + tmp_history = dict(i.strip().split(',', 2) for i in f.readlines()) + + if len(tmp_history): + self.__newest_history_key = list(tmp_history.keys())[-1] + + self.__history_n = len(tmp_history) + + return tmp_history + + @property + def has_history(self): + fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt") + return os.path.isfile(fname) + + # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. + @property + def newest_history_key(self): + if self.__newest_history_key is not None: + return self.__newest_history_key + + if len(self.history) <= 1: + return 0 + + + bump = self.history + return self.__newest_history_key + + + # Save some text file to the appropriate path and bump the history + # result_obj from fetch_site_status.run() + def save_history_text(self, contents, timestamp): + import uuid + from os import mkdir, path, unlink + import logging + + output_path = "{}/{}".format(self.__datastore_path, self['uuid']) + + # Incase the operator deleted it, check and create. + if not os.path.isdir(output_path): + mkdir(output_path) + + snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4()) + logging.debug("Saving history text {}".format(snapshot_fname)) + + with open(snapshot_fname, 'wb') as f: + f.write(contents) + f.close() + + # Append to index + # @todo check last char was \n + index_fname = "{}/history.txt".format(output_path) + with open(index_fname, 'a') as f: + f.write("{},{}\n".format(timestamp, snapshot_fname)) + f.close() + + self.__newest_history_key = timestamp + self.__history_n+=1 + + #@todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status + return snapshot_fname @property def has_empty_checktime(self): diff --git a/changedetectionio/store.py b/changedetectionio/store.py index ddcf2453..9960ed9c 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -40,7 +40,7 @@ class ChangeDetectionStore: # Base definition for all watchers # deepcopy part of #569 - not sure why its needed exactly - self.generic_definition = deepcopy(Watch.model()) + self.generic_definition = deepcopy(Watch.model(datastore_path = datastore_path, default={})) if path.isfile('changedetectionio/source.txt'): with open('changedetectionio/source.txt') as f: @@ -71,13 +71,10 @@ class ChangeDetectionStore: if 'application' in from_disk['settings']: self.__data['settings']['application'].update(from_disk['settings']['application']) - # Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future. - # @todo pretty sure theres a python we todo this with an abstracted(?) object! + # Convert each existing watch back to the Watch.model object for uuid, watch in self.__data['watching'].items(): - _blank = deepcopy(self.generic_definition) - _blank.update(watch) - self.__data['watching'].update({uuid: _blank}) - self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid) + watch['uuid']=uuid + self.__data['watching'][uuid] = Watch.model(datastore_path=self.datastore_path, default=watch) print("Watching:", uuid, self.__data['watching'][uuid]['url']) # First time ran, doesnt exist. @@ -130,22 +127,6 @@ class ChangeDetectionStore: # Finally start the thread that will manage periodic data saves to JSON save_data_thread = threading.Thread(target=self.save_datastore).start() - # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. - def get_newest_history_key(self, uuid): - if len(self.__data['watching'][uuid]['history']) == 1: - return 0 - - dates = list(self.__data['watching'][uuid]['history'].keys()) - # Convert to int, sort and back to str again - # @todo replace datastore getter that does this automatically - dates = [int(i) for i in dates] - dates.sort(reverse=True) - if len(dates): - # always keyed as str - return str(dates[0]) - - return 0 - def set_last_viewed(self, uuid, timestamp): self.data['watching'][uuid].update({'last_viewed': int(timestamp)}) self.needs_write = True @@ -170,7 +151,6 @@ class ChangeDetectionStore: del (update_obj[dict_key]) self.__data['watching'][uuid].update(update_obj) - self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid) self.needs_write = True @@ -188,14 +168,14 @@ class ChangeDetectionStore: @property def data(self): has_unviewed = False - for uuid, v in self.__data['watching'].items(): - self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid) - if int(v['newest_history_key']) <= int(v['last_viewed']): - self.__data['watching'][uuid]['viewed'] = True + for uuid, watch in self.__data['watching'].items(): + #self.__data['watching'][uuid]['viewed']=True +# if int(watch.newest_history_key) <= int(watch['last_viewed']): +# self.__data['watching'][uuid]['viewed'] = True - else: - self.__data['watching'][uuid]['viewed'] = False - has_unviewed = True + # else: +# self.__data['watching'][uuid]['viewed'] = False +# has_unviewed = True # #106 - Be sure this is None on empty string, False, None, etc # Default var for fetch_backend @@ -239,11 +219,11 @@ class ChangeDetectionStore: # GitHub #30 also delete history records for uuid in self.data['watching']: - for path in self.data['watching'][uuid]['history'].values(): + for path in self.data['watching'][uuid].history.values(): self.unlink_history_file(path) else: - for path in self.data['watching'][uuid]['history'].values(): + for path in self.data['watching'][uuid].history.values(): self.unlink_history_file(path) del self.data['watching'][uuid] @@ -275,13 +255,14 @@ class ChangeDetectionStore: def scrub_watch(self, uuid): import pathlib - self.__data['watching'][uuid].update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'newest_history_key': 0, 'previous_md5': False}) + self.__data['watching'][uuid].update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'previous_md5': False}) self.needs_write_urgent = True for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"): unlink(item) def add_watch(self, url, tag="", extras=None, write_to_disk_now=True): + if extras is None: extras = {} # should always be str @@ -317,16 +298,15 @@ class ChangeDetectionStore: return False with self.lock: - # @todo use a common generic version of this - new_uuid = str(uuid_builder.uuid4()) + # #Re 569 - # Not sure why deepcopy was needed here, sometimes new watches would appear to already have 'history' set - # I assumed this would instantiate a new object but somehow an existing dict was getting used - new_watch = deepcopy(Watch.model({ + new_watch = Watch.model(datastore_path=self.datastore_path, default={ 'url': url, 'tag': tag - })) + }) + new_uuid = new_watch['uuid'] + logging.debug("Added URL {} - {}".format(url, new_uuid)) for k in ['uuid', 'history', 'last_checked', 'last_changed', 'newest_history_key', 'previous_md5', 'viewed']: if k in apply_extras: @@ -346,23 +326,6 @@ class ChangeDetectionStore: self.sync_to_json() return new_uuid - # Save some text file to the appropriate path and bump the history - # result_obj from fetch_site_status.run() - def save_history_text(self, watch_uuid, contents): - import uuid - - output_path = "{}/{}".format(self.datastore_path, watch_uuid) - # Incase the operator deleted it, check and create. - if not os.path.isdir(output_path): - mkdir(output_path) - - fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4()) - with open(fname, 'wb') as f: - f.write(contents) - f.close() - - return fname - def get_screenshot(self, watch_uuid): output_path = "{}/{}".format(self.datastore_path, watch_uuid) fname = "{}/last-screenshot.png".format(output_path) @@ -448,8 +411,8 @@ class ChangeDetectionStore: index=[] for uuid in self.data['watching']: - for id in self.data['watching'][uuid]['history']: - index.append(self.data['watching'][uuid]['history'][str(id)]) + for id in self.data['watching'][uuid].history: + index.append(self.data['watching'][uuid].history[str(id)]) import pathlib @@ -520,3 +483,28 @@ class ChangeDetectionStore: # Only upgrade individual watch time if it was set if watch.get('minutes_between_check', False): self.data['watching'][uuid]['time_between_check']['minutes'] = watch['minutes_between_check'] + + # Move the history list to a flat text file index + # Better than SQLite because this list is only appended to, and works across NAS / NFS type setups + def update_2(self): + # @todo test running this on a newly updated one (when this already ran) + for uuid, watch in self.data['watching'].items(): + history = [] + + if watch.get('history', False): + for d, p in watch['history'].items(): + d = int(d) # Used to be keyed as str, we'll fix this now too + history.append("{},{}\n".format(d,p)) + + if len(history): + target_path = os.path.join(self.datastore_path, uuid) + if os.path.exists(target_path): + with open(os.path.join(target_path, "history.txt"), "w") as f: + f.writelines(history) + else: + logging.warning("Datastore history directory {} does not exist, skipping history import.".format(target_path)) + + # No longer needed, dynamically pulled from the disk when needed. + # But we should set it back to a empty dict so we don't break if this schema runs on an earlier version. + # In the distant future we can remove this entirely + self.data['watching'][uuid]['history'] = {} diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index f2d0c857..ea866c87 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -46,7 +46,7 @@ {% if watch.last_error is defined and watch.last_error != False %}error{% endif %} {% if watch.last_notification_error is defined and watch.last_notification_error != False %}error{% endif %} {% if watch.paused is defined and watch.paused != False %}paused{% endif %} - {% if watch.newest_history_key| int > watch.last_viewed| int %}unviewed{% endif %} + {% if watch.newest_history_key| int > watch.last_viewed and watch.history_n>=2 %}unviewed{% endif %} {% if watch.uuid in queued_uuids %}queued{% endif %}"> {{ loop.index }} Pause @@ -68,7 +68,7 @@ {% endif %} {{watch|format_last_checked_time}} - {% if watch.history|length >= 2 and watch.last_changed %} + {% if watch.history_n >=2 and watch.last_changed %} {{watch.last_changed|format_timestamp_timeago}} {% else %} Not yet @@ -78,10 +78,10 @@ {% if watch.uuid in queued_uuids %}Queued{% else %}Recheck{% endif %} Edit - {% if watch.history|length >= 2 %} + {% if watch.history_n >= 2 %} Diff {% else %} - {% if watch.history|length == 1 %} + {% if watch.history_n == 1 %} Preview {% endif %} {% endif %} diff --git a/changedetectionio/tests/test_api.py b/changedetectionio/tests/test_api.py index 774dd634..5f982b69 100644 --- a/changedetectionio/tests/test_api.py +++ b/changedetectionio/tests/test_api.py @@ -2,7 +2,7 @@ import time from flask import url_for -from .util import live_server_setup +from .util import live_server_setup, extract_api_key_from_UI import json import uuid @@ -53,23 +53,10 @@ def is_valid_uuid(val): return False -# kinda funky, but works for now -def _extract_api_key_from_UI(client): - import re - res = client.get( - url_for("settings_page"), - ) - # {{api_key}} - - m = re.search('(.+?)', str(res.data)) - api_key = m.group(1) - return api_key.strip() - - def test_api_simple(client, live_server): live_server_setup(live_server) - api_key = _extract_api_key_from_UI(client) + api_key = extract_api_key_from_UI(client) # Create a watch set_original_response() diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py index 5f43f766..3e384b77 100644 --- a/changedetectionio/tests/test_element_removal.py +++ b/changedetectionio/tests/test_element_removal.py @@ -150,9 +150,8 @@ def test_element_removal_full(client, live_server): # Give the thread time to pick it up time.sleep(sleep_time_for_fetch_thread) - # No change yet - first check - res = client.get(url_for("index")) - assert b"unviewed" not in res.data + # so that we set the state to 'unviewed' after all the edits + client.get(url_for("diff_history_page", uuid="first")) # Make a change to header/footer/nav set_modified_response() diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py new file mode 100644 index 00000000..a7e46a57 --- /dev/null +++ b/changedetectionio/tests/test_history_consistency.py @@ -0,0 +1,84 @@ +#!/usr/bin/python3 + +import time +import os +import json +import logging +from flask import url_for +from .util import live_server_setup +from urllib.parse import urlparse, parse_qs + +def test_consistent_history(client, live_server): + live_server_setup(live_server) + + # Give the endpoint time to spin up + time.sleep(1) + r = range(1, 50) + + for one in r: + test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + + assert b"1 Imported" in res.data + + time.sleep(3) + while True: + res = client.get(url_for("index")) + logging.debug("Waiting for 'Checking now' to go away..") + if b'Checking now' not in res.data: + break + time.sleep(0.5) + + time.sleep(3) + # Essentially just triggers the DB write/update + res = client.post( + url_for("settings_page"), + data={"application-empty_pages_are_a_change": "", + "requests-time_between_check-minutes": 180, + 'application-fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Settings updated." in res.data + + # Give it time to write it out + time.sleep(3) + json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json') + + json_obj = None + with open(json_db_file, 'r') as f: + json_obj = json.load(f) + + # assert the right amount of watches was found in the JSON + assert len(json_obj['watching']) == len(r), "Correct number of watches was found in the JSON" + + # each one should have a history.txt containing just one line + for w in json_obj['watching'].keys(): + history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt') + assert os.path.isfile(history_txt_index_file), "History.txt should exist where I expect it - {}".format(history_txt_index_file) + + # Same like in model.Watch + with open(history_txt_index_file, "r") as f: + tmp_history = dict(i.strip().split(',', 2) for i in f.readlines()) + assert len(tmp_history) == 1, "History.txt should contain 1 line" + + # Should be two files,. the history.txt , and the snapshot.txt + files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, + w)) + # Find the snapshot one + for fname in files_in_watch_dir: + if fname != 'history.txt': + # contents should match what we requested as content returned from the test url + with open(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, fname), 'r') as snapshot_f: + contents = snapshot_f.read() + watch_url = json_obj['watching'][w]['url'] + u = urlparse(watch_url) + q = parse_qs(u[4]) + assert q['content'][0] == contents.strip(), "Snapshot file {} should contain {}".format(fname, q['content'][0]) + + + + assert len(files_in_watch_dir) == 2, "Should be just two files in the dir, history.txt and the snapshot" diff --git a/changedetectionio/tests/test_trigger.py b/changedetectionio/tests/test_trigger.py index cc622ca4..6305ff02 100644 --- a/changedetectionio/tests/test_trigger.py +++ b/changedetectionio/tests/test_trigger.py @@ -78,9 +78,6 @@ def test_trigger_functionality(client, live_server): # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) - # Goto the edit page, add our ignore text # Add our URL to the import page res = client.post( @@ -98,6 +95,12 @@ def test_trigger_functionality(client, live_server): ) assert bytes(trigger_text.encode('utf-8')) in res.data + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # so that we set the state to 'unviewed' after all the edits + client.get(url_for("diff_history_page", uuid="first")) + # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) diff --git a/changedetectionio/tests/test_trigger_regex.py b/changedetectionio/tests/test_trigger_regex.py index 4ff772c7..555b4649 100644 --- a/changedetectionio/tests/test_trigger_regex.py +++ b/changedetectionio/tests/test_trigger_regex.py @@ -42,9 +42,6 @@ def test_trigger_regex_functionality(client, live_server): ) assert b"1 Imported" in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(sleep_time_for_fetch_thread) @@ -60,7 +57,9 @@ def test_trigger_regex_functionality(client, live_server): "fetch_backend": "html_requests"}, follow_redirects=True ) - + time.sleep(sleep_time_for_fetch_thread) + # so that we set the state to 'unviewed' after all the edits + client.get(url_for("diff_history_page", uuid="first")) with open("test-datastore/endpoint-content.txt", "w") as f: f.write("some new noise") @@ -78,4 +77,8 @@ def test_trigger_regex_functionality(client, live_server): client.get(url_for("form_watch_checknow"), follow_redirects=True) time.sleep(sleep_time_for_fetch_thread) res = client.get(url_for("index")) - assert b'unviewed' in res.data \ No newline at end of file + assert b'unviewed' in res.data + + # Cleanup everything + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data \ No newline at end of file diff --git a/changedetectionio/tests/test_trigger_regex_with_filter.py b/changedetectionio/tests/test_trigger_regex_with_filter.py index 6ffe3a19..1f95046a 100644 --- a/changedetectionio/tests/test_trigger_regex_with_filter.py +++ b/changedetectionio/tests/test_trigger_regex_with_filter.py @@ -22,10 +22,9 @@ def set_original_ignore_response(): -def test_trigger_regex_functionality(client, live_server): +def test_trigger_regex_functionality_with_filter(client, live_server): live_server_setup(live_server) - sleep_time_for_fetch_thread = 3 set_original_ignore_response() @@ -42,26 +41,24 @@ def test_trigger_regex_functionality(client, live_server): ) assert b"1 Imported" in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - - # Give the thread time to pick it up + # it needs time to save the original version time.sleep(sleep_time_for_fetch_thread) - # It should report nothing found (just a new one shouldnt have anything) - res = client.get(url_for("index")) - assert b'unviewed' not in res.data - ### test regex with filter res = client.post( url_for("edit_page", uuid="first"), - data={"trigger_text": "/cool.stuff\d/", + data={"trigger_text": "/cool.stuff/", "url": test_url, "css_filter": '#in-here', "fetch_backend": "html_requests"}, follow_redirects=True ) + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + client.get(url_for("diff_history_page", uuid="first")) + # Check that we have the expected text.. but it's not in the css filter we want with open("test-datastore/endpoint-content.txt", "w") as f: f.write("some new noise with cool stuff2 ok") @@ -73,6 +70,7 @@ def test_trigger_regex_functionality(client, live_server): res = client.get(url_for("index")) assert b'unviewed' not in res.data + # now this should trigger something with open("test-datastore/endpoint-content.txt", "w") as f: f.write("some new noise with cool stuff6 ok") @@ -81,4 +79,6 @@ def test_trigger_regex_functionality(client, live_server): res = client.get(url_for("index")) assert b'unviewed' in res.data - +# Cleanup everything + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index d1457fab..af32d8ae 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 from flask import make_response, request +from flask import url_for def set_original_response(): test_return_data = """ @@ -55,14 +56,32 @@ def set_more_modified_response(): return None +# kinda funky, but works for now +def extract_api_key_from_UI(client): + import re + res = client.get( + url_for("settings_page"), + ) + # {{api_key}} + + m = re.search('(.+?)', str(res.data)) + api_key = m.group(1) + return api_key.strip() + def live_server_setup(live_server): @live_server.app.route('/test-endpoint') def test_endpoint(): ctype = request.args.get('content_type') status_code = request.args.get('status_code') + content = request.args.get('content') or None try: + if content is not None: + resp = make_response(content, status_code) + resp.headers['Content-Type'] = ctype if ctype else 'text/html' + return resp + # Tried using a global var here but didn't seem to work, so reading from a file instead. with open("test-datastore/endpoint-content.txt", "r") as f: resp = make_response(f.read(), status_code) diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 0e2b344f..a46c63c9 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -75,9 +75,7 @@ class update_worker(threading.Thread): # For the FIRST time we check a site, or a change detected, save the snapshot. if changed_detected or not watch['last_checked']: # A change was detected - fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) - # Should always be keyed by string(timestamp) - self.datastore.update_watch(uuid, {"history": {str(round(time.time())): fname}}) + fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time()))) # Generally update anything interesting returned self.datastore.update_watch(uuid=uuid, update_obj=update_obj) @@ -88,16 +86,10 @@ class update_worker(threading.Thread): print (">> Change detected in UUID {} - {}".format(uuid, watch['url'])) # Notifications should only trigger on the second time (first time, we gather the initial snapshot) - if len(watch['history']) > 1: + if watch.history_n >= 2: - dates = list(watch['history'].keys()) - # Convert to int, sort and back to str again - # @todo replace datastore getter that does this automatically - dates = [int(i) for i in dates] - dates.sort(reverse=True) - dates = [str(i) for i in dates] - - prev_fname = watch['history'][dates[1]] + dates = list(watch.history.keys()) + prev_fname = watch.history[dates[-2]] # Did it have any notification alerts to hit?