From b5c1fce13699b7668bbc58a376b2aaa9382b11c5 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 2 Jan 2022 22:28:34 +0100 Subject: [PATCH] Re #133 Option for ignoring whitespacing (#345) * Global setting option to ignore whitespace when detecting a change --- changedetectionio/__init__.py | 12 +- changedetectionio/fetch_site_status.py | 23 +-- changedetectionio/forms.py | 3 +- changedetectionio/store.py | 1 + changedetectionio/templates/settings.html | 19 ++- changedetectionio/tests/conftest.py | 3 +- changedetectionio/tests/test_ignore_text.py | 2 +- .../tests/test_ignorewhitespace.py | 96 ++++++++++++ changedetectionio/update_worker.py | 137 +++++++++--------- 9 files changed, 208 insertions(+), 88 deletions(-) create mode 100644 changedetectionio/tests/test_ignorewhitespace.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 24718357..be1fc6be 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -405,7 +405,7 @@ def changedetection_app(config=None, datastore_o=None): # Get the most recent one newest_history_key = datastore.get_val(uuid, 'newest_history_key') - # 0 means that theres only one, so that there should be no 'unviewed' history availabe + # 0 means that theres only one, so that there should be no 'unviewed' history available if newest_history_key == 0: newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0] @@ -418,7 +418,11 @@ def changedetection_app(config=None, datastore_o=None): stripped_content = handler.strip_ignore_text(raw_content, datastore.data['watching'][uuid]['ignore_text']) - checksum = hashlib.md5(stripped_content).hexdigest() + if datastore.data['settings']['application'].get('ignore_whitespace', False): + checksum = hashlib.md5(stripped_content.translate(None, b'\r\n\t ')).hexdigest() + else: + checksum = hashlib.md5(stripped_content).hexdigest() + return checksum return datastore.data['watching'][uuid]['previous_md5'] @@ -553,6 +557,7 @@ def 
changedetection_app(config=None, datastore_o=None): form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] + form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend'] form.notification_title.data = datastore.data['settings']['application']['notification_title'] @@ -580,7 +585,8 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['base_url'] = form.base_url.data datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data - + datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data + if form.trigger_check.data: if len(form.notification_urls.data): n_object = {'watch_url': "Test from changedetection.io!", diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index dec73987..0a957114 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -58,8 +58,7 @@ class perform_site_check(): watch = self.datastore.data['watching'][uuid] - update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'], - 'history': {}, + update_obj = { "last_checked": timestamp } @@ -137,8 +136,16 @@ class perform_site_check(): else: stripped_text_from_html = stripped_text_from_html.encode('utf8') + # Re #133 - if we should strip whitespaces from triggering the change detected comparison + if 
self.datastore.data['settings']['application'].get('ignore_whitespace', False): + fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() + else: + fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() - fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() + # On the first run of a site, watch['previous_md5'] will be an empty string, set it the current one. + if not len(watch['previous_md5']): + watch['previous_md5'] = fetched_md5 + update_obj["previous_md5"] = fetched_md5 blocked_by_not_found_trigger_text = False @@ -160,16 +167,12 @@ class perform_site_check(): break - # could be None or False depending on JSON type - # On the first run of a site, watch['previous_md5'] will be an empty string + if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5: changed_detected = True - - # Don't confuse people by updating as last-changed, when it actually just changed from None.. - if self.datastore.get_val(uuid, 'previous_md5'): - update_obj["last_changed"] = timestamp - update_obj["previous_md5"] = fetched_md5 + update_obj["last_changed"] = timestamp + # Extract title as title if is_html: diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 586a27bb..020d9fa8 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -258,4 +258,5 @@ class globalSettingsForm(commonSettingsForm): [validators.NumberRange(min=1)]) extract_title_as_title = BooleanField('Extract from document and use as watch title') base_url = StringField('Base URL', validators=[validators.Optional()]) - global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) \ No newline at end of file + global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) + ignore_whitespace = BooleanField('Ignore whitespace') \ No newline at end of file diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 3cc049c0..d27f0476 100644 --- 
a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -46,6 +46,7 @@ class ChangeDetectionStore: 'extract_title_as_title': False, 'fetch_backend': 'html_requests', 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum + 'ignore_whitespace': False, 'notification_urls': [], # Apprise URL list # Custom notification content 'notification_title': None, diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 5c031305..69c80686 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -69,15 +69,24 @@ <div class="tab-pane-inner" id="filters"> - <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span> - <fieldset class="pure-group"> + + <fieldset class="pure-group"> + {{ render_field(form.ignore_whitespace) }} + <span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/> + <i>Note:</i> Changing this will change the status of your existing watches, possibly triggering alerts, etc. + </span> + </fieldset> + + + <fieldset class="pure-group"> {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line /some.regex\d{2}/ for case-INsensitive regex ") }} - <span class="pure-form-message-inline"> - Each line processed separately, any line matching will be ignored.<br/> + <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/> + <span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/> Regular Expression support, wrap the line in forward slash <b>/regex/</b>. 
- </span> + </span> + </fieldset> </div> <div id="actions"> diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py index f34ed5bb..aced3075 100644 --- a/changedetectionio/tests/conftest.py +++ b/changedetectionio/tests/conftest.py @@ -18,7 +18,8 @@ def cleanup(datastore_path): 'url-watches.json', 'notification.txt', 'count.txt', - 'endpoint-content.txt'] + 'endpoint-content.txt' + ] for file in files: try: os.unlink("{}/{}".format(datastore_path, file)) diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 79aa761d..726a6f9b 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -235,4 +235,4 @@ def test_check_global_ignore_text_functionality(client, live_server): assert b'unviewed' in res.data res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) - assert b'Deleted' in res.data \ No newline at end of file + assert b'Deleted' in res.data diff --git a/changedetectionio/tests/test_ignorewhitespace.py b/changedetectionio/tests/test_ignorewhitespace.py new file mode 100644 index 00000000..062efd70 --- /dev/null +++ b/changedetectionio/tests/test_ignorewhitespace.py @@ -0,0 +1,96 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup + +def test_setup(live_server): + live_server_setup(live_server) + + +# Should be the same as set_original_ignore_response() but with a little more whitespacing +def set_original_ignore_response_but_with_whitespace(): + test_return_data = """<html> + <body> + Some initial text</br> + <p> + + + Which is across multiple lines</p> + <br> + </br> + + So let's see what happens. 
</br> + + + </body> + </html> + + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def set_original_ignore_response(): + test_return_data = """<html> + <body> + Some initial text</br> + <p>Which is across multiple lines</p> + </br> + So let's see what happens. </br> + </body> + </html> + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + + +# If there was only a change in the whitespace, then we shouldn't have a change detected +def test_check_ignore_whitespace(client, live_server): + sleep_time_for_fetch_thread = 3 + + # Give the endpoint time to spin up + time.sleep(1) + + set_original_ignore_response() + + # Go to the settings page and enable the ignore-whitespace option + res = client.post( + url_for("settings_page"), + data={ + "minutes_between_check": 180, + "ignore_whitespace": "y", + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + assert b"Settings updated." in res.data + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + set_original_ignore_response_but_with_whitespace() + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'/test-endpoint' in res.data diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index a8992d96..a4181426 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ 
-64,74 +64,77 @@ class update_worker(threading.Thread): self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) else: - if update_obj: - try: - self.datastore.update_watch(uuid=uuid, update_obj=update_obj) - if changed_detected: - n_object = {} - # A change was detected - fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) - - # Update history with the stripped text for future reference, this will also mean we save the first - # Should always be keyed by string(timestamp) - self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) - - watch = self.datastore.data['watching'][uuid] - - print (">> Change detected in UUID {} - {}".format(uuid, watch['url'])) - - # Notifications should only trigger on the second time (first time, we gather the initial snapshot) - if len(watch['history']) > 1: - - dates = list(watch['history'].keys()) - # Convert to int, sort and back to str again - # @todo replace datastore getter that does this automatically - dates = [int(i) for i in dates] - dates.sort(reverse=True) - dates = [str(i) for i in dates] - - prev_fname = watch['history'][dates[1]] - - - # Did it have any notification alerts to hit? - if len(watch['notification_urls']): - print(">>> Notifications queued for UUID from watch {}".format(uuid)) - n_object['notification_urls'] = watch['notification_urls'] - n_object['notification_title'] = watch['notification_title'] - n_object['notification_body'] = watch['notification_body'] - n_object['notification_format'] = watch['notification_format'] - - # No? 
maybe theres a global setting, queue them all - elif len(self.datastore.data['settings']['application']['notification_urls']): - print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid)) - n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] - n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title'] - n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body'] - n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format'] + try: + watch = self.datastore.data['watching'][uuid] + + # For the FIRST time we check a site, or a change detected, save the snapshot. + if changed_detected or not watch['last_checked']: + # A change was detected + fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) + # Should always be keyed by string(timestamp) + self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) + + # Generally update anything interesting returned + self.datastore.update_watch(uuid=uuid, update_obj=update_obj) + + # A change was detected + if changed_detected: + n_object = {} + print (">> Change detected in UUID {} - {}".format(uuid, watch['url'])) + + # Notifications should only trigger on the second time (first time, we gather the initial snapshot) + if len(watch['history']) > 1: + + dates = list(watch['history'].keys()) + # Convert to int, sort and back to str again + # @todo replace datastore getter that does this automatically + dates = [int(i) for i in dates] + dates.sort(reverse=True) + dates = [str(i) for i in dates] + + prev_fname = watch['history'][dates[1]] + + + # Did it have any notification alerts to hit? 
+ if len(watch['notification_urls']): + print(">>> Notifications queued for UUID from watch {}".format(uuid)) + n_object['notification_urls'] = watch['notification_urls'] + n_object['notification_title'] = watch['notification_title'] + n_object['notification_body'] = watch['notification_body'] + n_object['notification_format'] = watch['notification_format'] + + # No? maybe theres a global setting, queue them all + elif len(self.datastore.data['settings']['application']['notification_urls']): + print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid)) + n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] + n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title'] + n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body'] + n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format'] + else: + print(">>> NO notifications queued, watch and global notification URLs were empty.") + + # Only prepare to notify if the rules above matched + if 'notification_urls' in n_object: + # HTML needs linebreak, but MarkDown and Text can use a linefeed + if n_object['notification_format'] == 'HTML': + line_feed_sep = "</br>" else: - print(">>> NO notifications queued, watch and global notification URLs were empty.") - - # Only prepare to notify if the rules above matched - if 'notification_urls' in n_object: - # HTML needs linebreak, but MarkDown and Text can use a linefeed - if n_object['notification_format'] == 'HTML': - line_feed_sep = "</br>" - else: - line_feed_sep = "\n" - - from changedetectionio import diff - n_object.update({ - 'watch_url': watch['url'], - 'uuid': uuid, - 'current_snapshot': contents.decode('utf-8'), - 'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), - 'diff': diff.render_diff(prev_fname, fname, True, 
line_feed_sep=line_feed_sep) - }) - - self.notification_q.put(n_object) - - except Exception as e: - print("!!!! Exception in update_worker !!!\n", e) + line_feed_sep = "\n" + + from changedetectionio import diff + n_object.update({ + 'watch_url': watch['url'], + 'uuid': uuid, + 'current_snapshot': contents.decode('utf-8'), + 'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), + 'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep) + }) + + self.notification_q.put(n_object) + + except Exception as e: + # Catch everything possible here, so that if a worker crashes, we don't lose it until restart! + print("!!!! Exception in update_worker !!!\n", e) self.current_uuid = None # Done self.q.task_done()