Ability to set which text to process triggers on (added, removed, changed) according to the difference (#1483)

2 years ago · 55b6ae86e8
parent 66b892f770
commit 55b6ae86e8
14 changed files with 277 additions and 41 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -601,6 +601,16 @@ def changedetection_app(config=None, datastore_o=None):
            if datastore.proxy_list is not None and form.data['proxy'] == '':
                extra_update_obj['proxy'] = None

+            # Unsetting all filter_text methods should make it go back to default
+            # This particularly affects tests running
+            if 'filter_text_added' in form.data and not form.data.get('filter_text_added') \
+                    and 'filter_text_replaced' in form.data and not form.data.get('filter_text_replaced') \
+                    and 'filter_text_removed' in form.data and not form.data.get('filter_text_removed'):
+                extra_update_obj['filter_text_added'] = True
+                extra_update_obj['filter_text_replaced'] = True
+                extra_update_obj['filter_text_removed'] = True
+
+
            datastore.data['watching'][uuid].update(form.data)
            datastore.data['watching'][uuid].update(extra_update_obj)

--- a/changedetectionio/diff.py
+++ b/changedetectionio/diff.py
@ -10,7 +10,7 @@ def same_slicer(l, a, b):
        return l[a:b]

 # like .compare but a little different output
-def customSequenceMatcher(before, after, include_equal=False, include_removed=True, include_added=True):
+def customSequenceMatcher(before, after, include_equal=False, include_removed=True, include_added=True, include_replaced=True, include_change_type_prefix=True):
    cruncher = difflib.SequenceMatcher(isjunk=lambda x: x in " \\t", a=before, b=after)

    # @todo Line-by-line mode instead of buncghed, including `after` that is not in `before` (maybe unset?)
@ -19,19 +19,23 @@ def customSequenceMatcher(before, after, include_equal=False, include_removed=Tr
            g = before[alo:ahi]
            yield g
        elif include_removed and tag == 'delete':
-            g = ["(removed) " + i for i in same_slicer(before, alo, ahi)]
+            row_prefix = "(removed) " if include_change_type_prefix else ''
+            g = [ row_prefix + i for i in same_slicer(before, alo, ahi)]
            yield g
-        elif tag == 'replace':
-            g = ["(changed) " + i for i in same_slicer(before, alo, ahi)]
-            g += ["(into) " + i for i in same_slicer(after, blo, bhi)]
+        elif include_replaced and tag == 'replace':
+            row_prefix = "(changed) " if include_change_type_prefix else ''
+            g = [row_prefix + i for i in same_slicer(before, alo, ahi)]
+            row_prefix = "(into) " if include_change_type_prefix else ''
+            g += [row_prefix + i for i in same_slicer(after, blo, bhi)]
            yield g
        elif include_added and tag == 'insert':
-            g = ["(added) " + i for i in same_slicer(after, blo, bhi)]
+            row_prefix = "(added) " if include_change_type_prefix else ''
+            g = [row_prefix + i for i in same_slicer(after, blo, bhi)]
            yield g

 # only_differences - only return info about the differences, no context
 # line_feed_sep could be "<br>" or "<li>" or "\n" etc
-def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, line_feed_sep="\n"):
+def render_diff(previous_version_file_contents, newest_version_file_contents, include_equal=False, include_removed=True, include_added=True, include_replaced=True, line_feed_sep="\n", include_change_type_prefix=True):

    newest_version_file_contents = [line.rstrip() for line in newest_version_file_contents.splitlines()]

@ -40,9 +44,13 @@ def render_diff(previous_version_file_contents, newest_version_file_contents, in
    else:
        previous_version_file_contents = ""

-    rendered_diff = customSequenceMatcher(previous_version_file_contents,
-                                          newest_version_file_contents,
-                                          include_equal, include_removed, include_added)
+    rendered_diff = customSequenceMatcher(before=previous_version_file_contents,
+                                          after=newest_version_file_contents,
+                                          include_equal=include_equal,
+                                          include_removed=include_removed,
+                                          include_added=include_added,
+                                          include_replaced=include_replaced,
+                                          include_change_type_prefix=include_change_type_prefix)

    # Recursively join lists
    f = lambda L: line_feed_sep.join([f(x) if type(x) is list else x for x in L])
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -399,13 +399,19 @@ class watchForm(commonSettingsForm):
    body = TextAreaField('Request body', [validators.Optional()])
    method = SelectField('Request method', choices=valid_method, default=default_method)
    ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
-    check_unique_lines = BooleanField('Only trigger when new lines appear', default=False)
+    check_unique_lines = BooleanField('Only trigger when unique lines appear', default=False)
+
+    filter_text_added = BooleanField('Added lines', default=True)
+    filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
+    filter_text_removed = BooleanField('Removed lines', default=True)
+
+    # @todo this class could be moved to its own text_json_diff_watchForm and this goes to restock_diff_Watchform perhaps
    in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True)

    trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10)
-    text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
+    text_should_not_be_present = StringListField('Block change-detection while text matches', [validators.Optional(), ValidateListRegex()])
    webdriver_js_execute_code = TextAreaField('Execute JavaScript before change detection', render_kw={"rows": "5"}, validators=[validators.Optional()])

    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -26,6 +26,9 @@ base_config = {
    'fetch_backend': 'system', # plaintext, playwright etc
    'processor': 'text_json_diff', # could be restock_diff or others from .processors
    'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
+    'filter_text_added': True,
+    'filter_text_replaced': True,
+    'filter_text_removed': True,
    'has_ldjson_price_data': None,
    'track_ldjson_price_data': None,
    'headers': {},  # Extra headers to send
@ -326,7 +329,8 @@ class model(dict):
        # Compare each lines (set) against each history text file (set) looking for something new..
        existing_history = set({})
        for k, v in self.history.items():
-            alist = set([line.decode('utf-8').strip().lower() for line in open(v, 'rb')])
+            content = self.get_history_snapshot(k)
+            alist = set([line.strip().lower() for line in content.splitlines()])
            existing_history = existing_history.union(alist)

        # Check that everything in local_lines(new stuff) already exists in existing_history - it should
@ -454,3 +458,38 @@ class model(dict):
    # Return list of tags, stripped and lowercase, used for searching
    def all_tags(self):
        return [s.strip().lower() for s in self.get('tag','').split(',')]
+
+    def has_special_diff_filter_options_set(self):
+
+        # All False - nothing would be done, so act like it's not processable
+        if not self.get('filter_text_added', True) and not self.get('filter_text_replaced', True) and not self.get('filter_text_removed', True):
+            return False
+
+        # At least one (but not all) of the change types is disabled, so a special filter is in effect
+        if not self.get('filter_text_added', True) or not self.get('filter_text_replaced', True) or not self.get('filter_text_removed', True):
+            return True
+
+        # All change types are enabled (the default), so no special filtering is needed
+        return False
+
+
+    def get_last_fetched_before_filters(self):
+        import brotli
+        filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
+
+        if not os.path.isfile(filepath):
+            # If a previously fetched copy doesn't exist yet, fall back to the most recent history snapshot instead
+            dates = list(self.history.keys())
+            if len(dates):
+                return self.get_history_snapshot(dates[-1])
+            else:
+                return ''
+
+        with open(filepath, 'rb') as f:
+            return(brotli.decompress(f.read()).decode('utf-8'))
+
+    def save_last_fetched_before_filters(self, contents):
+        import brotli
+        filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
+        with open(filepath, 'wb') as f:
+            f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@ -279,6 +279,34 @@ class perform_site_check(difference_detection_processor):
        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

+
+        # @todo whitespace coming from missing rtrim()?
+        # Depending on the watch's preferences, replace the processed text (stripped_text_from_html) with only the kinds of change they want to know about.
+        # Rewrites the processed text based only on the diff result types they want to see
+        if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
+            # Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences
+            from .. import diff
+            # needs to not include (added) etc or it may get used twice
+            # Replace the processed text with the preferred result
+            rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
+                                                       newest_version_file_contents=stripped_text_from_html,
+                                                       include_equal=False,  # not the same lines
+                                                       include_added=watch.get('filter_text_added', True),
+                                                       include_removed=watch.get('filter_text_removed', True),
+                                                       include_replaced=watch.get('filter_text_replaced', True),
+                                                       line_feed_sep="\n",
+                                                       include_change_type_prefix=False)
+
+            watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
+
+            if not rendered_diff and stripped_text_from_html:
+                # We had some content, but no differences were found
+                # Store our new file as the MD5 so it will trigger in the future
+                c = hashlib.md5(text_content_before_ignored_filter.translate(None, b'\r\n\t ')).hexdigest()
+                return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
+            else:
+                stripped_text_from_html = rendered_diff
+
        # Treat pages with no renderable text content as a change? No by default
        empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
@ -337,6 +365,7 @@ class perform_site_check(difference_detection_processor):
            blocked = True
            # Filter and trigger works the same, so reuse it
            # It should return the line numbers that match
+            # Unblock flow if the trigger was found (some text remained after stripping what didn't match)
            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
                                                  wordlist=trigger_text,
                                                  mode="line numbers")
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@ -35,3 +35,4 @@ pytest tests/test_access_control.py
 pytest tests/test_notification.py
 pytest tests/test_backend.py
 pytest tests/test_rss.py
+pytest tests/test_unique_lines.py
--- a/changedetectionio/static/styles/scss/styles.scss
+++ b/changedetectionio/static/styles/scss/styles.scss
@ -893,6 +893,21 @@ body.full-width {
      font-size: .875em;
    }
  }
+  .text-filtering {
+    h3 {
+      margin-top: 0;
+    }
+    border: 1px solid #ccc;
+    padding: 1rem;
+    border-radius: 5px;
+    margin-bottom: 1rem;
+    fieldset:last-of-type {
+      padding-bottom: 0;
+      .pure-control-group {
+        padding-bottom: 0;
+      }
+    }
+  }
 }

 ul {
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@ -872,6 +872,17 @@ body.full-width .edit-form {
    color: var(--color-text-input-description); }
    .edit-form .pure-form-message-inline code {
      font-size: .875em; }
+  .edit-form .text-filtering {
+    border: 1px solid #ccc;
+    padding: 1rem;
+    border-radius: 5px;
+    margin-bottom: 1rem; }
+    .edit-form .text-filtering h3 {
+      margin-top: 0; }
+    .edit-form .text-filtering fieldset:last-of-type {
+      padding-bottom: 0; }
+      .edit-form .text-filtering fieldset:last-of-type .pure-control-group {
+        padding-bottom: 0; }

 ul {
  padding-left: 1em;
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -244,12 +244,6 @@ User-Agent: wonderbra 1.0") }}
                                </li>
                            </ul>
                    </div>
-                    <fieldset>
-                        <div class="pure-control-group">
-                            {{ render_checkbox_field(form.check_unique_lines) }}
-                            <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
-                        </div>
-                    </fieldset>
                    <div class="pure-control-group">
                        {% set field = render_field(form.include_filters,
                            rows=5,
@ -287,7 +281,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br>
                </span>
                    </div>
-                    <div class="pure-control-group">
+                <fieldset class="pure-control-group">
                    {{ render_field(form.subtractive_selectors, rows=5, placeholder="header
 footer
 nav
@ -298,20 +292,22 @@ nav
                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
                        </ul>
                      </span>
-                    </div>
-                <fieldset class="pure-group">
-                    {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
-/some.regex\d{2}/ for case-INsensitive regex
-                    ") }}
-                    <span class="pure-form-message-inline">
-                        <ul>
-                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
-                            <li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
-                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
-                            <li>Use the preview/show current tab to see ignores</li>
-                        </ul>
-                </span>
+                </fieldset>
+                <div class="text-filtering">
+                <fieldset class="pure-group" id="text-filtering-type-options">
+                    <h3>Text filtering</h3>
+                        Limit trigger/ignore/block/extract to;<br>
+                        {{ render_checkbox_field(form.filter_text_added) }}
+                        {{ render_checkbox_field(form.filter_text_replaced) }}
+                        {{ render_checkbox_field(form.filter_text_removed) }}
+                    <span class="pure-form-message-inline">Note: Depending on the length and similarity of the text on each line, the algorithm may consider an <strong>addition</strong> instead of <strong>replacement</strong> for example.</span>
+                    <span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
+                    <span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
+                </fieldset>

+                <fieldset class="pure-control-group">
+                    {{ render_checkbox_field(form.check_unique_lines) }}
+                    <span class="pure-form-message-inline">Good for websites that just move the content around, and you want to know when NEW content is added, compares new lines against all history for this watch.</span>
                </fieldset>
                <fieldset>
                    <div class="pure-control-group">
@ -328,6 +324,21 @@ nav
                        </span>
                    </div>
                </fieldset>
+                <fieldset class="pure-group">
+                    {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
+/some.regex\d{2}/ for case-INsensitive regex
+") }}
+                    <span class="pure-form-message-inline">
+                        <ul>
+                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
+                            <li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
+                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
+                            <li>Use the preview/show current tab to see ignores</li>
+                        </ul>
+                </span>
+
+                </fieldset>
+
                <fieldset>
                    <div class="pure-control-group">
                        {{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock
@ -363,6 +374,7 @@ Unavailable") }}
                    </div>
                </fieldset>
                </div>
+            </div>
            {% endif %}

            {% if watch['processor'] == 'restock_diff' %}
--- a/changedetectionio/tests/test_add_replace_remove_filter.py
+++ b/changedetectionio/tests/test_add_replace_remove_filter.py
@ -0,0 +1,99 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from .util import live_server_setup
+from changedetectionio import html_tools
+
+
+def set_original(excluding=None):
+    test_return_data = """<html>
+     <body>
+     <p>Some initial text</p>
+     <p>So let's see what happens.</p>
+     <p>and a new line!</p>
+     <p>The golden line</p>
+     <p>A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"</p>
+     <p>Something irrelevant</p>          
+     </body>
+     </html>
+    """
+
+    if excluding:
+        output = ""
+        for i in test_return_data.splitlines():
+            if not excluding in i:
+                output += f"{i}\n"
+
+        test_return_data = output
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def test_check_removed_line_contains_trigger(client, live_server):
+    live_server_setup(live_server)
+
+    sleep_time_for_fetch_thread = 3
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+    set_original()
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Go to the edit page and set our trigger text,
+    # submitting only the 'filter_text_removed' checkbox
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"trigger_text": 'The golden line',
+              "url": test_url,
+              'fetch_backend': "html_requests",
+              'filter_text_removed': 'y'},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+    time.sleep(sleep_time_for_fetch_thread)
+    set_original(excluding='Something irrelevant')
+
+    # Removing a line that's not the trigger should not trigger anything
+    res = client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    assert b'1 watches queued for rechecking.' in res.data
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    # The trigger line is REMOVED,  this should trigger
+    set_original(excluding='The golden line')
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+
+    # Now add it back, and we should not get a trigger
+    client.get(url_for("mark_all_viewed"), follow_redirects=True)
+    set_original(excluding=None)
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    # Remove it again, and we should get a trigger
+    set_original(excluding='The golden line')
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
--- a/changedetectionio/tests/test_block_while_text_present.py
+++ b/changedetectionio/tests/test_block_while_text_present.py
@ -87,7 +87,10 @@ def test_check_block_changedetection_text_NOT_present(client, live_server):
    # Add our URL to the import page
    res = client.post(
        url_for("edit_page", uuid="first"),
-        data={"text_should_not_be_present": ignore_text, "url": test_url, 'fetch_backend': "html_requests"},
+        data={"text_should_not_be_present": ignore_text,
+              "url": test_url,
+              'fetch_backend': "html_requests"
+              },
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
@ -129,7 +132,6 @@ def test_check_block_changedetection_text_NOT_present(client, live_server):
    set_modified_response_minus_block_text()
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
-
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

--- a/changedetectionio/tests/test_unique_lines.py
+++ b/changedetectionio/tests/test_unique_lines.py
@ -94,7 +94,6 @@ def test_unique_lines_functionality(client, live_server):
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

-
    # Now set the content which contains the new text and re-ordered existing text
    set_modified_with_trigger_text_response()
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
--- a/changedetectionio/tests/unit/test_notification_diff.py
+++ b/changedetectionio/tests/unit/test_notification_diff.py
@ -19,8 +19,12 @@ class TestDiffBuilder(unittest.TestCase):
        with open(base_dir + "/test-content/after.txt", 'r') as f:
            newest_version_file_contents = f.read()

-        output = diff.render_diff(previous_version_file_contents, newest_version_file_contents)
+        output = diff.render_diff(previous_version_file_contents=previous_version_file_contents,
+                                  newest_version_file_contents=newest_version_file_contents)
+
        output = output.split("\n")
+
+
        self.assertIn('(changed) ok', output)
        self.assertIn('(into) xok', output)
        self.assertIn('(into) next-x-ok', output)
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@ -322,6 +322,7 @@ class update_worker(threading.Thread):

                        self.cleanup_error_artifacts(uuid)

+                    #
                    # Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc
                    if process_changedetection_results:
                        try: