From 2b054ced8c15f6168acbcfb424b88bcc09073284 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 28 Jun 2022 18:34:32 +0200 Subject: [PATCH] [new filter] Filter option - Trigger only when NEW content (lines) are detected ( compared to earlier text snapshots ) (#685) --- changedetectionio/fetch_site_status.py | 11 ++ changedetectionio/forms.py | 1 + changedetectionio/model/Watch.py | 14 +++ changedetectionio/templates/edit.html | 6 ++ changedetectionio/tests/test_unique_lines.py | 104 +++++++++++++++++++ 5 files changed, 136 insertions(+) create mode 100644 changedetectionio/tests/test_unique_lines.py diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 2ddf5ca3..aca0fd41 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -1,4 +1,5 @@ import hashlib +import logging import os import re import time @@ -262,6 +263,16 @@ class perform_site_check(): if not watch['title'] or not len(watch['title']): update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) + if changed_detected: + if watch.get('check_unique_lines', False): + has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) + # One or more lines? unsure? 
def lines_contain_something_unique_compared_to_history(self, lines=None):
    """Tell whether ``lines`` contains a line never seen in any history snapshot.

    Every line (from ``lines`` and from each snapshot file) is compared after
    being decoded as UTF-8, stripped of surrounding whitespace and lower-cased,
    so re-ordered, re-indented or re-cased content does not count as new.

    Args:
        lines: Iterable of the freshly fetched text lines, as ``bytes`` (what
            the snapshot files contain) or ``str``. ``None`` or an empty
            iterable means "no lines".

    Returns:
        ``True`` if at least one normalised line is absent from *all* snapshot
        files listed in ``self.history``; ``False`` otherwise. With no history
        at all, any line is new, so the result is ``True`` when ``lines`` is
        non-empty.

    Raises:
        OSError: if a snapshot file listed in ``self.history`` cannot be read.
        UnicodeDecodeError: if a ``bytes`` line is not valid UTF-8.
    """

    def _normalise(raw):
        # Snapshots are read in binary mode; tolerate callers that pass text.
        text = raw.decode('utf-8') if isinstance(raw, (bytes, bytearray)) else raw
        return text.strip().lower()

    # Lines of the current fetch that have not been found in any snapshot yet.
    unseen = {_normalise(line) for line in (lines or ())}

    # The previous implementation returned True as soon as the current set of
    # lines differed from ANY single snapshot, which flagged plain re-orderings
    # as "new" whenever an older snapshot was shorter. Instead, strike out
    # every line that some snapshot already contains and see what is left.
    for snapshot_path in self.history.values():
        if not unseen:
            # Everything is already accounted for; skip the remaining files.
            break
        with open(snapshot_path, 'rb') as snapshot:
            unseen.difference_update(_normalise(line) for line in snapshot)

    return bool(unseen)
+
+ {{ render_checkbox_field(form.check_unique_lines) }} + Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch. +
+
{{ render_field(form.css_filter, placeholder=".class-name or #some-id, or other CSS selector rule.", class="m-d") }} diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py new file mode 100644 index 00000000..6fb2e420 --- /dev/null +++ b/changedetectionio/tests/test_unique_lines.py @@ -0,0 +1,104 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from .util import live_server_setup + + +def set_original_ignore_response(): + test_return_data = """ + +

Some initial text

+

Which is across multiple lines

+

So let's see what happens.

+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +# The same but just re-ordered the text +def set_modified_swapped_lines(): + # Re-ordered and with some whitespacing, should get stripped() too. + test_return_data = """ + +

Some initial text

+

So let's see what happens.

+

 Which is across multiple lines

+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def set_modified_with_trigger_text_response(): + test_return_data = """ + +

Some initial text

+

So let's see what happens.

+

and a new line!

+

Which is across multiple lines

+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def test_unique_lines_functionality(client, live_server): + live_server_setup(live_server) + + sleep_time_for_fetch_thread = 3 + + set_original_ignore_response() + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + time.sleep(sleep_time_for_fetch_thread) + + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"check_unique_lines": "y", + "url": test_url, + "fetch_backend": "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + assert b'unviewed' not in res.data + + # Make a change + set_modified_swapped_lines() + + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + + + # Now set the content which contains the new text and re-ordered existing text + set_modified_with_trigger_text_response() + client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) + assert b'unviewed' in res.data +