From 616c0b3f65feeb9dbe21e83b06bbf42c0d27613e Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 2 Feb 2024 11:36:58 +0100 Subject: [PATCH] New text filter - Sort text alphabetically filter (#2153) --- changedetectionio/forms.py | 1 + changedetectionio/model/Watch.py | 1 + .../processors/text_json_diff.py | 6 ++ changedetectionio/templates/edit.html | 4 + changedetectionio/tests/test_unique_lines.py | 83 +++++++++++++++++-- 5 files changed, 86 insertions(+), 9 deletions(-) diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 9f72a748..a4480cc1 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -465,6 +465,7 @@ class watchForm(commonSettingsForm): method = SelectField('Request method', choices=valid_method, default=default_method) ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False) check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False) + sort_text_alphabetically = BooleanField('Sort text alphabetically', default=False) filter_text_added = BooleanField('Added lines', default=True) filter_text_replaced = BooleanField('Replaced/changed lines', default=True) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 2bb7a400..602df5cc 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -58,6 +58,7 @@ base_config = { 'previous_md5_before_filters': False, # Used for skipping changedetection entirely 'proxy': None, # Preferred proxy connection 'remote_server_reply': None, # From 'server' reply header + 'sort_text_alphabetically': False, 'subtractive_selectors': [], 'tag': '', # Old system of text name for a tag, to be removed 'tags': [], # list of UUIDs to App.Tags diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 619a2856..0f185150 100644 --- a/changedetectionio/processors/text_json_diff.py +++ 
b/changedetectionio/processors/text_json_diff.py @@ -204,6 +204,12 @@ class perform_site_check(difference_detection_processor): is_rss=is_rss # #1874 activate the <title workaround hack ) + if watch.get('sort_text_alphabetically') and stripped_text_from_html: + # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap + # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. + stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n') + stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() )) + # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index a5acd213..812ddb2b 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -339,6 +339,10 @@ nav <span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span> </fieldset> + <fieldset class="pure-control-group"> + {{ render_checkbox_field(form.sort_text_alphabetically) }} + <span class="pure-form-message-inline">Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.</span> + </fieldset> <fieldset class="pure-control-group"> {{ render_checkbox_field(form.check_unique_lines) }} <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span> diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py index e7132ca5..4eff11fd 100644 --- a/changedetectionio/tests/test_unique_lines.py +++ b/changedetectionio/tests/test_unique_lines.py @@ -2,7 +2,7 @@ import time from flask
import url_for -from .util import live_server_setup +from .util import live_server_setup, wait_for_all_checks def set_original_ignore_response(): @@ -34,6 +34,23 @@ def set_modified_swapped_lines(): with open("test-datastore/endpoint-content.txt", "w") as f: f.write(test_return_data) +def set_modified_swapped_lines_with_extra_text_for_sorting(): + test_return_data = """<html> + <body> + <p> Which is across multiple lines</p> + <p>Some initial text</p> + <p> So let's see what happens.</p> + <p>Z last</p> + <p>0 numerical</p> + <p>A uppercase</p> + <p>a lowercase</p> + </body> + </html> + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + def set_modified_with_trigger_text_response(): test_return_data = """<html> @@ -49,15 +66,14 @@ def set_modified_with_trigger_text_response(): with open("test-datastore/endpoint-content.txt", "w") as f: f.write(test_return_data) +def test_setup(client, live_server): + live_server_setup(live_server) def test_unique_lines_functionality(client, live_server): - live_server_setup(live_server) + #live_server_setup(live_server) - sleep_time_for_fetch_thread = 3 set_original_ignore_response() - # Give the endpoint time to spin up - time.sleep(1) # Add our URL to the import page test_url = url_for('test_endpoint', _external=True) @@ -67,7 +83,7 @@ def test_unique_lines_functionality(client, live_server): follow_redirects=True ) assert b"1 Imported" in res.data - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # Add our URL to the import page res = client.post( @@ -83,12 +99,11 @@ def test_unique_lines_functionality(client, live_server): # Make a change set_modified_swapped_lines() - time.sleep(sleep_time_for_fetch_thread) # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (no new 'unviewed' class) res 
= client.get(url_for("index")) @@ -97,7 +112,57 @@ def test_unique_lines_functionality(client, live_server): # Now set the content which contains the new text and re-ordered existing text set_modified_with_trigger_text_response() client.get(url_for("form_watch_checknow"), follow_redirects=True) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) res = client.get(url_for("index")) assert b'unviewed' in res.data + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + +def test_sort_lines_functionality(client, live_server): + #live_server_setup(live_server) + + set_modified_swapped_lines_with_extra_text_for_sorting() + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"sort_text_alphabetically": "n", + "url": test_url, + "fetch_backend": "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + + + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + wait_for_all_checks(client) + + + res = client.get(url_for("index")) + # Should be a change registered + assert b'unviewed' in res.data + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + assert res.data.find(b'0 numerical') < res.data.find(b'Z last') + assert res.data.find(b'A uppercase') < res.data.find(b'Z last') + assert res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines') + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data \ No newline at end of file