diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 5011afaf..d3ac30fc 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -480,8 +480,10 @@ class processor_text_json_diff_form(commonSettingsForm): body = TextAreaField('Request body', [validators.Optional()]) method = SelectField('Request method', choices=valid_method, default=default_method) ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False) - check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False) + check_unique_lines = BooleanField('Only trigger when unique lines appear in all history', default=False) + remove_duplicate_lines = BooleanField('Remove duplicate lines of text', default=False) sort_text_alphabetically = BooleanField('Sort text alphabetically', default=False) + trim_text_whitespace = BooleanField('Trim whitespace before and after text', default=False) filter_text_added = BooleanField('Added lines', default=True) filter_text_replaced = BooleanField('Replaced/changed lines', default=True) diff --git a/changedetectionio/model/__init__.py b/changedetectionio/model/__init__.py index e439de4f..3b95c91c 100644 --- a/changedetectionio/model/__init__.py +++ b/changedetectionio/model/__init__.py @@ -60,6 +60,8 @@ class watch_base(dict): 'time_between_check_use_default': True, 'title': None, 'track_ldjson_price_data': None, + 'trim_text_whitespace': False, + 'remove_duplicate_lines': False, 'trigger_text': [], # List of text or regex to wait for until a change is detected 'url': '', 'uuid': str(uuid.uuid4()), diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 77c37131..d8be0967 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -218,11 +218,19 @@ class perform_site_check(difference_detection_processor): is_rss=is_rss)) #1874 activate the something</p> will add an extra line feed to signify the paragraph gap # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. - stripped_text_from_html = stripped_text_from_html.replace('\n\n', '\n') - stripped_text_from_html = '\n'.join( sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower() )) + stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") + stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) + # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') @@ -304,7 +312,7 @@ class perform_site_check(difference_detection_processor): for match in res: regex_matched_output += [match] + [b'\n'] - # Now we will only show what the regex matched + ########################################################## stripped_text_from_html = b'' text_content_before_ignored_filter = b'' if regex_matched_output: @@ -312,6 +320,8 @@ class perform_site_check(difference_detection_processor): stripped_text_from_html = b''.join(regex_matched_output) text_content_before_ignored_filter = stripped_text_from_html + + # Re #133 - if we should strip whitespaces from triggering the change detected comparison if self.datastore.data['settings']['application'].get('ignore_whitespace', False): fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 8ce58755..118aaec6 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -331,11 +331,22 @@ nav <span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br> <span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span> </fieldset> - + <fieldset class="pure-control-group"> + {{ render_checkbox_field(form.check_unique_lines) }} + <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span> + </fieldset> + <fieldset class="pure-control-group"> + {{ render_checkbox_field(form.remove_duplicate_lines) }} + <span class="pure-form-message-inline">Remove duplicate lines of text</span> + </fieldset> <fieldset class="pure-control-group"> {{ render_checkbox_field(form.sort_text_alphabetically) }} <span class="pure-form-message-inline">Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.</span> </fieldset> + <fieldset class="pure-control-group"> + {{ render_checkbox_field(form.trim_text_whitespace) }} + <span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span> + </fieldset> <fieldset class="pure-control-group"> {{ render_checkbox_field(form.check_unique_lines) }} <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span> diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py index 65840073..f4148157 100644 --- a/changedetectionio/tests/test_unique_lines.py +++ b/changedetectionio/tests/test_unique_lines.py @@ -11,6 +11,8 @@ def set_original_ignore_response(): <p>Some initial text</p> <p>Which is across multiple lines</p> <p>So let's see what happens.</p> + <p>  So let's see what happens. <br> </p> + <p>A - sortable line</p> </body> </html> """ @@ -164,5 +166,52 @@ def test_sort_lines_functionality(client, live_server, measure_memory_usage): assert res.data.find(b'A uppercase') < res.data.find(b'Z last') assert res.data.find(b'Some initial text') < res.data.find(b'Which is across multiple lines') + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + +def test_extra_filters(client, live_server, measure_memory_usage): + #live_server_setup(live_server) + + set_original_ignore_response() + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"remove_duplicate_lines": "y", + "trim_text_whitespace": "y", + "sort_text_alphabetically": "", # leave this OFF for testing + "url": test_url, + "fetch_backend": "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + # Give the thread time to pick it up + wait_for_all_checks(client) + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + wait_for_all_checks(client) + + res = client.get( + url_for("preview_page", uuid="first") + ) + + assert res.data.count(b"see what happens.") == 1 + + # still should remain unsorted ('A - sortable line') stays at the end + assert res.data.find(b'A - sortable line') > res.data.find(b'Which is across multiple lines') + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data \ No newline at end of file