From 7da32f9ac36c44c244a326d9d24dcf20b1c33b40 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Wed, 15 Jun 2022 22:56:43 +0200
Subject: [PATCH] New filter - Block change-detection if text matches - for
 example, block change-detection while the text "out of stock" is on the page,
 know when the text is no longer on the page (#698)

---
 changedetectionio/fetch_site_status.py        |  28 +++-
 changedetectionio/forms.py                    |   2 +
 changedetectionio/model/Watch.py              |   3 +-
 changedetectionio/run_all_tests.sh            |   2 -
 changedetectionio/store.py                    |  15 +-
 changedetectionio/templates/edit.html         |  16 ++
 changedetectionio/tests/conftest.py           |   2 +
 .../tests/test_block_while_text_present.py    | 137 ++++++++++++++++++
 changedetectionio/update_worker.py            |  13 +-
 9 files changed, 198 insertions(+), 20 deletions(-)
 create mode 100644 changedetectionio/tests/test_block_while_text_present.py

diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 7a4f0a2e..b5eef3ab 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -225,25 +225,40 @@ class perform_site_check():
             fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
 
         ############ Blocking rules, after checksum #################
-        blocked_by_not_found_trigger_text = False
+        blocked = False
 
         if len(watch['trigger_text']):
-            # Yeah, lets block first until something matches
-            blocked_by_not_found_trigger_text = True
+            # Assume blocked
+            blocked = True
             # Filter and trigger works the same, so reuse it
             # It should return the line numbers that match
             result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
                                                   wordlist=watch['trigger_text'],
                                                   mode="line numbers")
-            # If it returned any lines that matched..
+            # Unblock if the trigger was found
             if result:
-                blocked_by_not_found_trigger_text = False
+                blocked = False
 
-        if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
+
+        if len(watch['text_should_not_be_present']):
+            # If anything matched, then we should block a change from happening
+            result = html_tools.strip_ignore_text(content=str(stripped_text_from_html),
+                                                  wordlist=watch['text_should_not_be_present'],
+                                                  mode="line numbers")
+            if result:
+                blocked = True
+
+        # The main thing that all this at the moment comes down to :)
+        if watch['previous_md5'] != fetched_md5:
             changed_detected = True
 
+        # Looks like something changed, but did it match all the rules?
+        if blocked:
+            changed_detected = False
+        else:
             update_obj["last_changed"] = timestamp
 
+
         # Extract title as title
         if is_html:
             if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']:
@@ -257,5 +272,4 @@ class perform_site_check():
         if not watch.get('previous_md5'):
             watch['previous_md5'] = fetched_md5
 
-
         return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 4b672cb2..dc6f3082 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -341,6 +341,8 @@ class watchForm(commonSettingsForm):
     method = SelectField('Request method', choices=valid_method, default=default_method)
     ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
     trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
+    text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()])
+
     save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
     save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
     proxy = RadioField('Proxy')
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index f55ffce0..64f299fd 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -38,6 +38,7 @@ class model(dict):
             'extract_text': [],  # Extract text by regex after filters
             'subtractive_selectors': [],
             'trigger_text': [],  # List of text or regex to wait for until a change is detected
+            'text_should_not_be_present': [], # Text that should not be present
             'fetch_backend': None,
             'extract_title_as_title': False,
             'proxy': None, # Preferred proxy connection
@@ -85,7 +86,7 @@ class model(dict):
         # Read the history file as a dict
         fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt")
         if os.path.isfile(fname):
-            logging.debug("Disk IO accessed " + str(time.time()))
+            logging.debug("Reading history index " + str(time.time()))
             with open(fname, "r") as f:
                 tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
 
diff --git a/changedetectionio/run_all_tests.sh b/changedetectionio/run_all_tests.sh
index 625429c7..c2bbf9aa 100755
--- a/changedetectionio/run_all_tests.sh
+++ b/changedetectionio/run_all_tests.sh
@@ -9,8 +9,6 @@
 # exit when any command fails
 set -e
 
-export MINIMUM_SECONDS_RECHECK_TIME=0
-
 find tests/test_*py -type f|while read test_name
 do
   echo "TEST RUNNING $test_name"
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index 4c020515..fca06438 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -290,14 +290,15 @@ class ChangeDetectionStore:
                                      headers={'App-Guid': self.__data['app_guid']})
                 res = r.json()
 
-                # List of permisable stuff we accept from the wild internet
+                # List of permissible attributes we accept from the wild internet
                 for k in ['url', 'tag',
-                                   'paused', 'title',
-                                   'previous_md5', 'headers',
-                                   'body', 'method',
-                                   'ignore_text', 'css_filter',
-                                   'subtractive_selectors', 'trigger_text',
-                                   'extract_title_as_title', 'extract_text']:
+                          'paused', 'title',
+                          'previous_md5', 'headers',
+                          'body', 'method',
+                          'ignore_text', 'css_filter',
+                          'subtractive_selectors', 'trigger_text',
+                          'extract_title_as_title', 'extract_text',
+                          'text_should_not_be_present']:
                     if res.get(k):
                         apply_extras[k] = res[k]
 
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 009f6ad0..6a72153c 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -199,6 +199,22 @@ nav
                         </span>
                     </div>
                 </fieldset>
+                <fieldset>
+                    <div class="pure-control-group">
+                        {{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock
+Sold out
+Not in stock
+Unavailable") }}
+                        <span class="pure-form-message-inline">
+                            <ul>
+                                <li>Block change-detection while this text is on the page, all text and regex are tested <i>case-insensitive</i>, good for waiting for when a product is available again</li>
+                                <li>Block text is processed from the result-text that comes out of any CSS/JSON Filters for this watch</li>
+                                <li>The change is blocked if any one of these lines is found on the page (think of each line as "OR")</li>
+                                <li>Note: Wrap in forward slash / to use regex  example: <code>/foo\d/</code></li>
+                            </ul>
+                        </span>
+                    </div>
+                </fieldset>
                 <fieldset>
                     <div class="pure-control-group">
                         {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py
index 258ce6a1..2d0b1349 100644
--- a/changedetectionio/tests/conftest.py
+++ b/changedetectionio/tests/conftest.py
@@ -32,6 +32,8 @@ def app(request):
     """Create application for the tests."""
     datastore_path = "./test-datastore"
 
+    # Set the minimum recheck time to 0 so the tests are not delayed in fetching
+    os.environ["MINIMUM_SECONDS_RECHECK_TIME"] = "0"
     try:
         os.mkdir(datastore_path)
     except FileExistsError:
diff --git a/changedetectionio/tests/test_block_while_text_present.py b/changedetectionio/tests/test_block_while_text_present.py
new file mode 100644
index 00000000..e2236e62
--- /dev/null
+++ b/changedetectionio/tests/test_block_while_text_present.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+from changedetectionio import html_tools
+
+def set_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def set_modified_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some NEW nice initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <p>new ignore stuff</p>
+     <p>out of stock</p>
+     <p>blah</p>
+     </body>
+     </html>
+
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+# Same as the modified response, but without the blocking text ("out of stock") and with a new "now on sale" line
+def set_modified_response_minus_block_text():
+    test_return_data = """<html>
+       <body>
+     Some NEW nice initial text</br>
+     <p>Which is across multiple lines</p>
+     <p>now on sale $2/p>
+     </br>
+     So let's see what happens.  </br>
+     <p>new ignore stuff</p>
+     <p>blah</p>
+     </body>
+     </html>
+
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def test_check_block_changedetection_text_NOT_present(client, live_server):
+    sleep_time_for_fetch_thread = 3
+    live_server_setup(live_server)
+    # Use mixed case in "out of stoCk" to prove the match is case-insensitive.
+    ignore_text = "out of stoCk\r\nfoobar"
+
+    set_original_ignore_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Go to the edit page and set the block text
+    # (the 'text_should_not_be_present' field) on the watch we just imported
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"text_should_not_be_present": ignore_text, "url": test_url, 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+    # Check it saved
+    res = client.get(
+        url_for("edit_page", uuid="first"),
+    )
+    assert bytes(ignore_text.encode('utf-8')) in res.data
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+    assert b'/test-endpoint' in res.data
+
+    # The page changed, BUT the block text ("out of stock") is now on it, so we should not see a change
+    set_modified_original_ignore_response()
+
+    # Trigger a check
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (no new 'unviewed' class)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+    assert b'/test-endpoint' in res.data
+
+
+    # Now we set a change where the text is gone, it should now trigger
+    set_modified_response_minus_block_text()
+    client.get(url_for("form_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
+
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index b72b21ca..bda34fef 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -98,9 +98,16 @@ class update_worker(threading.Thread):
 
                                 # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
                                 if watch.history_n >= 2:
-
-                                    dates = list(watch.history.keys())
-                                    prev_fname = watch.history[dates[-2]]
+                                    print(">> Change detected in UUID {} - {}".format(uuid, watch['url']))
+                                    watch_history = watch.history
+                                    dates = list(watch_history.keys())
+                                    # Theoretically it's possible that this could be just 1 long,
+                                    # - In the case that the timestamp key was not unique
+                                    if len(dates) == 1:
+                                        raise ValueError(
+                                            "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
+                                        )
+                                    prev_fname = watch_history[dates[-2]]
 
 
                                     # Did it have any notification alerts to hit?