From fa4eeb24ccbe7cd9914922115a9f815dbe9c9075 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Fri, 2 Feb 2024 11:15:22 +0100
Subject: [PATCH] New filter - ability to sort alphabetically

---
 changedetectionio/forms.py                    |  1 +
 changedetectionio/model/Watch.py              |  1 +
 .../processors/text_json_diff.py              |  6 ++
 changedetectionio/templates/edit.html         |  4 ++
 changedetectionio/tests/test_unique_lines.py  | 71 ++++++++++++++++++-
 5 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 9f72a748..a4480cc1 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -465,6 +465,7 @@ class watchForm(commonSettingsForm):
     method = SelectField('Request method', choices=valid_method, default=default_method)
     ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
     check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False)
+    sort_text_alphabetically =  BooleanField('Sort text alphabetically', default=False)
 
     filter_text_added = BooleanField('Added lines', default=True)
     filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index 2bb7a400..602df5cc 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -58,6 +58,7 @@ base_config = {
     'previous_md5_before_filters': False,  # Used for skipping changedetection entirely
     'proxy': None,  # Preferred proxy connection
     'remote_server_reply': None, # From 'server' reply header
+    'sort_text_alphabetically': False,
     'subtractive_selectors': [],
     'tag': '', # Old system of text name for a tag, to be removed
     'tags': [], # list of UUIDs to App.Tags
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 619a2856..0f185150 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -204,6 +204,12 @@ class perform_site_check(difference_detection_processor):
                             is_rss=is_rss # #1874 activate the <title workaround hack
                         )
 
+        if watch.get('sort_text_alphabetically') and stripped_text_from_html:
+            # Note: a <p>something</p> adds an extra line feed to signify the paragraph gap, so we end up with
+            # 'Some text\n\n'. Sorting would move all of those empty lines to the start, so collapse each doubled line feed first.
+            stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n')
+            stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() ))
+
         # Re #340 - return the content before the 'ignore text' was applied
         text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
 
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index a5acd213..812ddb2b 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -339,6 +339,10 @@ nav
                     <span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
                 </fieldset>
 
+                <fieldset class="pure-control-group">
+                    {{ render_checkbox_field(form.sort_text_alphabetically) }}
+                    <span class="pure-form-message-inline">Helps reduce changes detected due to sites shuffling lines around; combine with <i>check unique lines</i> below.</span>
+                </fieldset>
                 <fieldset class="pure-control-group">
                     {{ render_checkbox_field(form.check_unique_lines) }}
                     <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py
index 90801873..4eff11fd 100644
--- a/changedetectionio/tests/test_unique_lines.py
+++ b/changedetectionio/tests/test_unique_lines.py
@@ -34,6 +34,23 @@ def set_modified_swapped_lines():
     with open("test-datastore/endpoint-content.txt", "w") as f:
         f.write(test_return_data)
 
+def set_modified_swapped_lines_with_extra_text_for_sorting():
+    test_return_data = """<html>
+     <body>
+     <p>&nbsp;Which is across multiple lines</p>     
+     <p>Some initial text</p>
+     <p>   So let's see what happens.</p>
+     <p>Z last</p>
+     <p>0 numerical</p>
+     <p>A uppercase</p>
+     <p>a lowercase</p>     
+     </body>
+     </html>
+    """
+    # Write this deliberately unsorted page (mixed case, a leading digit, padded lines) to the fixture file used by the sorting test.
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
 
 def set_modified_with_trigger_text_response():
     test_return_data = """<html>
@@ -49,9 +66,11 @@ def set_modified_with_trigger_text_response():
     with open("test-datastore/endpoint-content.txt", "w") as f:
         f.write(test_return_data)
 
+def test_setup(client, live_server):
+    live_server_setup(live_server)  # done once here; the tests below no longer call it themselves
 
 def test_unique_lines_functionality(client, live_server):
-    live_server_setup(live_server)
+    #live_server_setup(live_server)
 
 
     set_original_ignore_response()
@@ -96,4 +115,54 @@ def test_unique_lines_functionality(client, live_server):
     wait_for_all_checks(client)
     res = client.get(url_for("index"))
     assert b'unviewed' in res.data
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+def test_sort_lines_functionality(client, live_server):
+    """Enable 'sort_text_alphabetically' on a watch and check the preview lists its lines in sorted order."""
+    # The live server is not started here; test_setup() earlier in this module already calls live_server_setup().
+    set_modified_swapped_lines_with_extra_text_for_sorting()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    wait_for_all_checks(client)
+
+    # Turn on sorting for the watch we just imported ("y" = checkbox ticked)
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"sort_text_alphabetically": "y",
+              "url": test_url,
+              "fetch_backend": "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    wait_for_all_checks(client)
+
+
+    res = client.get(url_for("index"))
+    # Should be a change registered
+    assert b'unviewed' in res.data
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
 
+    # bytes.find() returns -1 when the needle is missing, so "-1 <" stops an absent line from passing the order check vacuously
+    assert -1 < res.data.find(b'0 numerical') < res.data.find(b'Z last')
+    assert -1 < res.data.find(b'A uppercase') < res.data.find(b'Z last')
+    assert -1 < res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines')
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
\ No newline at end of file