diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 23ea2dce..7a453abd 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -601,6 +601,16 @@ def changedetection_app(config=None, datastore_o=None): if datastore.proxy_list is not None and form.data['proxy'] == '': extra_update_obj['proxy'] = None + # Unsetting all filter_text methods should make it go back to default + # This particularly affects tests running + if 'filter_text_added' in form.data and not form.data.get('filter_text_added') \ + and 'filter_text_replaced' in form.data and not form.data.get('filter_text_replaced') \ + and 'filter_text_removed' in form.data and not form.data.get('filter_text_removed'): + extra_update_obj['filter_text_added'] = True + extra_update_obj['filter_text_replaced'] = True + extra_update_obj['filter_text_removed'] = True + + datastore.data['watching'][uuid].update(form.data) datastore.data['watching'][uuid].update(extra_update_obj) diff --git a/changedetectionio/diff.py b/changedetectionio/diff.py index 2b566ffc..c3d8b0cd 100644 --- a/changedetectionio/diff.py +++ b/changedetectionio/diff.py @@ -10,7 +10,7 @@ def same_slicer(l, a, b): return l[a:b] # like .compare but a little different output -def customSequenceMatcher(before, after, include_equal=False, include_removed=True, include_added=True): +def customSequenceMatcher(before, after, include_equal=False, include_removed=True, include_added=True, include_replaced=True, include_change_type_prefix=True): cruncher = difflib.SequenceMatcher(isjunk=lambda x: x in " \\t", a=before, b=after) # @todo Line-by-line mode instead of buncghed, including `after` that is not in `before` (maybe unset?) 
@@ -19,19 +19,23 @@ def customSequenceMatcher(before, after, include_equal=False, include_removed=Tr g = before[alo:ahi] yield g elif include_removed and tag == 'delete': - g = ["(removed) " + i for i in same_slicer(before, alo, ahi)] + row_prefix = "(removed) " if include_change_type_prefix else '' + g = [ row_prefix + i for i in same_slicer(before, alo, ahi)] yield g - elif tag == 'replace': - g = ["(changed) " + i for i in same_slicer(before, alo, ahi)] - g += ["(into) " + i for i in same_slicer(after, blo, bhi)] + elif include_replaced and tag == 'replace': + row_prefix = "(changed) " if include_change_type_prefix else '' + g = [row_prefix + i for i in same_slicer(before, alo, ahi)] + row_prefix = "(into) " if include_change_type_prefix else '' + g += [row_prefix + i for i in same_slicer(after, blo, bhi)] yield g elif include_added and tag == 'insert': - g = ["(added) " + i for i in same_slicer(after, blo, bhi)] + row_prefix = "(added) " if include_change_type_prefix else '' + g = [row_prefix + i for i in same_slicer(after, blo, bhi)] yield g # only_differences - only return info about the differences, no context # line_feed_sep could be "
" or "
  • " or "\n" etc -def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"): +def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, include_replaced=True, line_feed_sep="\n", include_change_type_prefix=True): newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()] @@ -40,9 +44,13 @@ def render_diff(previous_version_file_contents, newest_version_file_contents, in else: previous_version_file_contents = "" - rendered_diff = customSequenceMatcher(previous_version_file_contents, - newest_version_file_contents, - include_equal, include_removed, include_added) + rendered_diff = customSequenceMatcher(before=previous_version_file_contents, + after=newest_version_file_contents, + include_equal=include_equal, + include_removed=include_removed, + include_added=include_added, + include_replaced=include_replaced, + include_change_type_prefix=include_change_type_prefix) # Recursively join lists f = lambda L: line_feed_sep.join([f(x) if type(x) is list else x for x in L]) diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 9c086e37..55566a01 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -399,13 +399,19 @@ class watchForm(commonSettingsForm): body = TextAreaField('Request body', [validators.Optional()]) method = SelectField('Request method', choices=valid_method, default=default_method) ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False) - check_unique_lines = BooleanField('Only trigger when new lines appear', default=False) + check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False) + + filter_text_added = BooleanField('Added lines', default=True) + filter_text_replaced = BooleanField('Replaced/changed 
lines', default=True) + filter_text_removed = BooleanField('Removed lines', default=True) + + # @todo this class could be moved to its own text_json_diff_watchForm and this goes to restock_diff_Watchform perhaps in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True) trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()]) if os.getenv("PLAYWRIGHT_DRIVER_URL"): browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10) - text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()]) + text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()]) webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()]) save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 9af922ae..c2b48c2e 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -26,6 +26,9 @@ base_config = { 'fetch_backend': 'system', # plaintext, playwright etc 'processor': 'text_json_diff', # could be restock_diff or others from .processors 'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), + 'filter_text_added': True, + 'filter_text_replaced': True, + 'filter_text_removed': True, 'has_ldjson_price_data': None, 'track_ldjson_price_data': None, 'headers': {}, # Extra headers to send @@ -326,7 +329,8 @@ class model(dict): # Compare each lines (set) against each history text file (set) looking for something new.. 
existing_history = set({}) for k, v in self.history.items(): - alist = set([line.decode('utf-8').strip().lower() for line in open(v, 'rb')]) + content = self.get_history_snapshot(k) + alist = set([line.strip().lower() for line in content.splitlines()]) existing_history = existing_history.union(alist) # Check that everything in local_lines(new stuff) already exists in existing_history - it should @@ -454,3 +458,38 @@ class model(dict): # Return list of tags, stripped and lowercase, used for searching def all_tags(self): return [s.strip().lower() for s in self.get('tag','').split(',')] + + def has_special_diff_filter_options_set(self): + + # All False - nothing would be done, so act like it's not processable + if not self.get('filter_text_added', True) and not self.get('filter_text_replaced', True) and not self.get('filter_text_removed', True): + return False + + # Or one is set + if not self.get('filter_text_added', True) or not self.get('filter_text_replaced', True) or not self.get('filter_text_removed', True): + return True + + # None is set + return False + + + def get_last_fetched_before_filters(self): + import brotli + filepath = os.path.join(self.watch_data_dir, 'last-fetched.br') + + if not os.path.isfile(filepath): + # If a previous attempt doesnt yet exist, just snarf the previous snapshot instead + dates = list(self.history.keys()) + if len(dates): + return self.get_history_snapshot(dates[-1]) + else: + return '' + + with open(filepath, 'rb') as f: + return(brotli.decompress(f.read()).decode('utf-8')) + + def save_last_fetched_before_filters(self, contents): + import brotli + filepath = os.path.join(self.watch_data_dir, 'last-fetched.br') + with open(filepath, 'wb') as f: + f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 14ce14f3..cf85522a 100644 --- a/changedetectionio/processors/text_json_diff.py +++ 
b/changedetectionio/processors/text_json_diff.py @@ -279,6 +279,34 @@ class perform_site_check(difference_detection_processor): # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') + + # @todo whitespace coming from missing rtrim()? + # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. + # Rewrite's the processing text based on only what diff result they want to see + if watch.has_special_diff_filter_options_set() and len(watch.history.keys()): + # Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences + from .. import diff + # needs to not include (added) etc or it may get used twice + # Replace the processed text with the preferred result + rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(), + newest_version_file_contents=stripped_text_from_html, + include_equal=False, # not the same lines + include_added=watch.get('filter_text_added', True), + include_removed=watch.get('filter_text_removed', True), + include_replaced=watch.get('filter_text_replaced', True), + line_feed_sep="\n", + include_change_type_prefix=False) + + watch.save_last_fetched_before_filters(text_content_before_ignored_filter) + + if not rendered_diff and stripped_text_from_html: + # We had some content, but no differences were found + # Store our new file as the MD5 so it will trigger in the future + c = hashlib.md5(text_content_before_ignored_filter.translate(None, b'\r\n\t ')).hexdigest() + return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') + else: + stripped_text_from_html = rendered_diff + # Treat pages with no renderable text content as a change? 
No by default empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: @@ -337,6 +365,7 @@ class perform_site_check(difference_detection_processor): blocked = True # Filter and trigger works the same, so reuse it # It should return the line numbers that match + # Unblock flow if the trigger was found (some text remained after stripped what didnt match) result = html_tools.strip_ignore_text(content=str(stripped_text_from_html), wordlist=trigger_text, mode="line numbers") diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh index 70184051..d9fa9ff0 100755 --- a/changedetectionio/run_basic_tests.sh +++ b/changedetectionio/run_basic_tests.sh @@ -35,3 +35,4 @@ pytest tests/test_access_control.py pytest tests/test_notification.py pytest tests/test_backend.py pytest tests/test_rss.py +pytest tests/test_unique_lines.py \ No newline at end of file diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss index 4ed3412f..4ec31ed3 100644 --- a/changedetectionio/static/styles/scss/styles.scss +++ b/changedetectionio/static/styles/scss/styles.scss @@ -893,6 +893,21 @@ body.full-width { font-size: .875em; } } + .text-filtering { + h3 { + margin-top: 0; + } + border: 1px solid #ccc; + padding: 1rem; + border-radius: 5px; + margin-bottom: 1rem; + fieldset:last-of-type { + padding-bottom: 0; + .pure-control-group { + padding-bottom: 0; + } + } + } } ul { diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index c8497847..1e0559f4 100644 --- a/changedetectionio/static/styles/styles.css +++ b/changedetectionio/static/styles/styles.css @@ -872,6 +872,17 @@ body.full-width .edit-form { color: var(--color-text-input-description); } .edit-form .pure-form-message-inline code { font-size: .875em; } + 
.edit-form .text-filtering { + border: 1px solid #ccc; + padding: 1rem; + border-radius: 5px; + margin-bottom: 1rem; } + .edit-form .text-filtering h3 { + margin-top: 0; } + .edit-form .text-filtering fieldset:last-of-type { + padding-bottom: 0; } + .edit-form .text-filtering fieldset:last-of-type .pure-control-group { + padding-bottom: 0; } ul { padding-left: 1em; diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 298b8c90..b8f0a747 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -244,12 +244,6 @@ User-Agent: wonderbra 1.0") }}
  • -
    -
    - {{ render_checkbox_field(form.check_unique_lines) }} - Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch. -
    -
    {% set field = render_field(form.include_filters, rows=5, @@ -287,37 +281,39 @@ xpath://body/div/span[contains(@class, 'example-class')]", href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help.
    -
    - {{ render_field(form.subtractive_selectors, rows=5, placeholder="header +
    + {{ render_field(form.subtractive_selectors, rows=5, placeholder="header footer nav .stockticker") }} - +
    • Remove HTML element(s) by CSS selector before text conversion.
    • Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML.
    -
    -
    - {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line -/some.regex\d{2}/ for case-INsensitive regex - ") }} - - - +
    +
    +
    +

    Text filtering

    + Limit trigger/ignore/block/extract to:
    + {{ render_checkbox_field(form.filter_text_added) }} + {{ render_checkbox_field(form.filter_text_replaced) }} + {{ render_checkbox_field(form.filter_text_removed) }} + Note: Depending on the length and similarity of the text on each line, the algorithm may, for example, report an addition instead of a replacement. + So it's always better to select Added+Replaced when you're interested in new content.
    + When content is merely moved in a list, it will also trigger an addition; consider enabling Only trigger when unique lines appear +
    - +
    + {{ render_checkbox_field(form.check_unique_lines) }} + Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch. +
    {{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line /some.regex\d{2}/ for case-INsensitive regex - ") }} +") }}
    • Text to wait for before triggering a change/notification, all text and regex are tested case-insensitive.
    • @@ -328,6 +324,21 @@ nav
    +
    + {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line +/some.regex\d{2}/ for case-INsensitive regex +") }} + +
      +
    • Each line is processed separately; any matching line will be ignored (removed before creating the checksum)
    • +
    • Regular Expression support, wrap the entire line in forward slash /regex/
    • +
    • Changing this will affect the comparison checksum which may trigger an alert
    • +
    • Use the preview/show current tab to see ignores
    • +
    +
    + +
    +
    {{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock @@ -362,6 +373,7 @@ Unavailable") }}
    +
    {% endif %} diff --git a/changedetectionio/tests/test_add_replace_remove_filter.py b/changedetectionio/tests/test_add_replace_remove_filter.py new file mode 100644 index 00000000..aac41f92 --- /dev/null +++ b/changedetectionio/tests/test_add_replace_remove_filter.py @@ -0,0 +1,99 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from .util import live_server_setup +from changedetectionio import html_tools + + +def set_original(excluding=None): + test_return_data = """ + +

    Some initial text

    +

    So let's see what happens.

    +

    and a new line!

    +

    The golden line

    +

    A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"

    +

    Something irrelevant

    + + + """ + + if excluding: + output = "" + for i in test_return_data.splitlines(): + if not excluding in i: + output += f"{i}\n" + + test_return_data = output + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def test_check_removed_line_contains_trigger(client, live_server): + live_server_setup(live_server) + + sleep_time_for_fetch_thread = 3 + + # Give the endpoint time to spin up + time.sleep(1) + set_original() + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"trigger_text": 'The golden line', + "url": test_url, + 'fetch_backend': "html_requests", + 'filter_text_removed': 'y'}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + time.sleep(sleep_time_for_fetch_thread) + set_original(excluding='Something irrelevant') + + # A line thats not the trigger should not trigger anything + res = client.get(url_for("form_watch_checknow"), follow_redirects=True) + assert b'1 watches queued for rechecking.' 
in res.data + time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + + # The trigger line is REMOVED, this should trigger + set_original(excluding='The golden line') + client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) + assert b'unviewed' in res.data + + + # Now add it back, and we should not get a trigger + client.get(url_for("mark_all_viewed"), follow_redirects=True) + set_original(excluding=None) + client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + + # Remove it again, and we should get a trigger + set_original(excluding='The golden line') + client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) + assert b'unviewed' in res.data + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data diff --git a/changedetectionio/tests/test_block_while_text_present.py b/changedetectionio/tests/test_block_while_text_present.py index 0f38bf36..0538060b 100644 --- a/changedetectionio/tests/test_block_while_text_present.py +++ b/changedetectionio/tests/test_block_while_text_present.py @@ -87,7 +87,10 @@ def test_check_block_changedetection_text_NOT_present(client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"text_should_not_be_present": ignore_text, "url": test_url, 'fetch_backend': "html_requests"}, + data={"text_should_not_be_present": ignore_text, + "url": test_url, + 'fetch_backend': "html_requests" + }, follow_redirects=True ) assert b"Updated watch." 
in res.data @@ -129,7 +132,6 @@ def test_check_block_changedetection_text_NOT_present(client, live_server): set_modified_response_minus_block_text() client.get(url_for("form_watch_checknow"), follow_redirects=True) time.sleep(sleep_time_for_fetch_thread) - res = client.get(url_for("index")) assert b'unviewed' in res.data diff --git a/changedetectionio/tests/test_unique_lines.py b/changedetectionio/tests/test_unique_lines.py index 6fb2e420..e7132ca5 100644 --- a/changedetectionio/tests/test_unique_lines.py +++ b/changedetectionio/tests/test_unique_lines.py @@ -94,7 +94,6 @@ def test_unique_lines_functionality(client, live_server): res = client.get(url_for("index")) assert b'unviewed' not in res.data - # Now set the content which contains the new text and re-ordered existing text set_modified_with_trigger_text_response() client.get(url_for("form_watch_checknow"), follow_redirects=True) diff --git a/changedetectionio/tests/unit/test_notification_diff.py b/changedetectionio/tests/unit/test_notification_diff.py index a6b7067f..6f323146 100755 --- a/changedetectionio/tests/unit/test_notification_diff.py +++ b/changedetectionio/tests/unit/test_notification_diff.py @@ -19,8 +19,12 @@ class TestDiffBuilder(unittest.TestCase): with open(base_dir + "/test-content/after.txt", 'r') as f: newest_version_file_contents = f.read() - output = diff.render_diff(previous_version_file_contents, newest_version_file_contents) + output = diff.render_diff(previous_version_file_contents=previous_version_file_contents, + newest_version_file_contents=newest_version_file_contents) + output = output.split("\n") + + self.assertIn('(changed) ok', output) self.assertIn('(into) xok', output) self.assertIn('(into) next-x-ok', output) diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index da570b58..85bb0ff6 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -322,6 +322,7 @@ class update_worker(threading.Thread): 
self.cleanup_error_artifacts(uuid) + # # Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc if process_changedetection_results: try: