From 7da32f9ac36c44c244a326d9d24dcf20b1c33b40 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 15 Jun 2022 22:56:43 +0200 Subject: [PATCH] New filter - Block change-detection if text matches - for example, block change-detection while the text "out of stock" is on the page, know when the text is no longer on the page (#698) --- changedetectionio/fetch_site_status.py | 28 +++- changedetectionio/forms.py | 2 + changedetectionio/model/Watch.py | 3 +- changedetectionio/run_all_tests.sh | 2 - changedetectionio/store.py | 15 +- changedetectionio/templates/edit.html | 16 ++ changedetectionio/tests/conftest.py | 2 + .../tests/test_block_while_text_present.py | 137 ++++++++++++++++++ changedetectionio/update_worker.py | 13 +- 9 files changed, 198 insertions(+), 20 deletions(-) create mode 100644 changedetectionio/tests/test_block_while_text_present.py diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 7a4f0a2e..b5eef3ab 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -225,25 +225,40 @@ class perform_site_check(): fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() ############ Blocking rules, after checksum ################# - blocked_by_not_found_trigger_text = False + blocked = False if len(watch['trigger_text']): - # Yeah, lets block first until something matches - blocked_by_not_found_trigger_text = True + # Assume blocked + blocked = True # Filter and trigger works the same, so reuse it # It should return the line numbers that match result = html_tools.strip_ignore_text(content=str(stripped_text_from_html), wordlist=watch['trigger_text'], mode="line numbers") - # If it returned any lines that matched.. 
+ # Unblock if the trigger was found if result: - blocked_by_not_found_trigger_text = False + blocked = False - if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5: + + if len(watch['text_should_not_be_present']): + # If anything matched, then we should block a change from happening + result = html_tools.strip_ignore_text(content=str(stripped_text_from_html), + wordlist=watch['text_should_not_be_present'], + mode="line numbers") + if result: + blocked = True + + # The main thing that all this at the moment comes down to :) + if watch['previous_md5'] != fetched_md5: changed_detected = True + # Looks like something changed, but did it match all the rules? + if blocked: + changed_detected = False + else: update_obj["last_changed"] = timestamp + # Extract title as title if is_html: if self.datastore.data['settings']['application']['extract_title_as_title'] or watch['extract_title_as_title']: @@ -257,5 +272,4 @@ class perform_site_check(): if not watch.get('previous_md5'): watch['previous_md5'] = fetched_md5 - return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 4b672cb2..dc6f3082 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -341,6 +341,8 @@ class watchForm(commonSettingsForm): method = SelectField('Request method', choices=valid_method, default=default_method) ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False) trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()]) + text_should_not_be_present = StringListField('Block change-detection if text matches', [validators.Optional(), ValidateListRegex()]) + save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": 
"pure-button pure-button-primary"}) proxy = RadioField('Proxy') diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index f55ffce0..64f299fd 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -38,6 +38,7 @@ class model(dict): 'extract_text': [], # Extract text by regex after filters 'subtractive_selectors': [], 'trigger_text': [], # List of text or regex to wait for until a change is detected + 'text_should_not_be_present': [], # Text that should not present 'fetch_backend': None, 'extract_title_as_title': False, 'proxy': None, # Preferred proxy connection @@ -85,7 +86,7 @@ class model(dict): # Read the history file as a dict fname = os.path.join(self.__datastore_path, self.get('uuid'), "history.txt") if os.path.isfile(fname): - logging.debug("Disk IO accessed " + str(time.time())) + logging.debug("Reading history index " + str(time.time())) with open(fname, "r") as f: tmp_history = dict(i.strip().split(',', 2) for i in f.readlines()) diff --git a/changedetectionio/run_all_tests.sh b/changedetectionio/run_all_tests.sh index 625429c7..c2bbf9aa 100755 --- a/changedetectionio/run_all_tests.sh +++ b/changedetectionio/run_all_tests.sh @@ -9,8 +9,6 @@ # exit when any command fails set -e -export MINIMUM_SECONDS_RECHECK_TIME=0 - find tests/test_*py -type f|while read test_name do echo "TEST RUNNING $test_name" diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 4c020515..fca06438 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -290,14 +290,15 @@ class ChangeDetectionStore: headers={'App-Guid': self.__data['app_guid']}) res = r.json() - # List of permisable stuff we accept from the wild internet + # List of permissible attributes we accept from the wild internet for k in ['url', 'tag', - 'paused', 'title', - 'previous_md5', 'headers', - 'body', 'method', - 'ignore_text', 'css_filter', - 'subtractive_selectors', 'trigger_text', - 'extract_title_as_title', 
'extract_text']: + 'paused', 'title', + 'previous_md5', 'headers', + 'body', 'method', + 'ignore_text', 'css_filter', + 'subtractive_selectors', 'trigger_text', + 'extract_title_as_title', 'extract_text', + 'text_should_not_be_present']: if res.get(k): apply_extras[k] = res[k] diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 009f6ad0..6a72153c 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -199,6 +199,22 @@ nav +
+
+ {{ render_field(form.text_should_not_be_present, rows=5, placeholder="For example: Out of stock +Sold out +Not in stock +Unavailable") }} + +
    +
  • Block change-detection while this text is on the page. All text and regex are tested case-insensitively. Useful for waiting until a product is available again.
  • +
  • Block text is processed from the result-text that comes out of any CSS/JSON Filters for this watch
  • +
  • None of the lines listed here may be present on the page (think of each line as "OR": a match on any single line blocks the change)
  • +
  • Note: Wrap in forward slashes / to use a regex, for example: /foo\d/
  • +
+
+
+
{{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }} diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py index 258ce6a1..2d0b1349 100644 --- a/changedetectionio/tests/conftest.py +++ b/changedetectionio/tests/conftest.py @@ -32,6 +32,8 @@ def app(request): """Create application for the tests.""" datastore_path = "./test-datastore" + # So they don't delay in fetching + os.environ["MINIMUM_SECONDS_RECHECK_TIME"] = "0" try: os.mkdir(datastore_path) except FileExistsError: diff --git a/changedetectionio/tests/test_block_while_text_present.py b/changedetectionio/tests/test_block_while_text_present.py new file mode 100644 index 00000000..e2236e62 --- /dev/null +++ b/changedetectionio/tests/test_block_while_text_present.py @@ -0,0 +1,137 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup +from changedetectionio import html_tools + +def set_original_ignore_response(): + test_return_data = """ + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+ + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def set_modified_original_ignore_response(): + test_return_data = """ + + Some NEW nice initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+

new ignore stuff

+

out of stock

+

blah

+ + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +# Same as the modified response, but WITHOUT the blocking text ('out of stock'), so a change should now be detected +def set_modified_response_minus_block_text(): + test_return_data = """ + + Some NEW nice initial text
+

Which is across multiple lines

+

now on sale $2</p> +
+ So let's see what happens.
+

new ignore stuff

+

blah

+ + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def test_check_block_changedetection_text_NOT_present(client, live_server): + sleep_time_for_fetch_thread = 3 + live_server_setup(live_server) + # Use a mix of case in ZzZ to prove it works case-insensitive. + ignore_text = "out of stoCk\r\nfoobar" + + set_original_ignore_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"text_should_not_be_present": ignore_text, "url": test_url, 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." 
in res.data + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + # Check it saved + res = client.get( + url_for("edit_page", uuid="first"), + ) + assert bytes(ignore_text.encode('utf-8')) in res.data + + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'/test-endpoint' in res.data + + # The page changed, BUT the text is still there, just the rest of it changes, we should not see a change + set_modified_original_ignore_response() + + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'/test-endpoint' in res.data + + + # Now we set a change where the text is gone, it should now trigger + set_modified_response_minus_block_text() + client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(sleep_time_for_fetch_thread) + + res = client.get(url_for("index")) + assert b'unviewed' in res.data + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index b72b21ca..bda34fef 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -98,9 +98,16 @@ class update_worker(threading.Thread): # Notifications should only trigger on the second time (first time, we gather the initial snapshot) if watch.history_n >= 2: - - dates = list(watch.history.keys()) - prev_fname = watch.history[dates[-2]] + print(">> Change detected in UUID {} - 
{}".format(uuid, watch['url'])) + watch_history = watch.history + dates = list(watch_history.keys()) + # Theoretically it's possible that this could be just 1 long, + # - In the case that the timestamp key was not unique + if len(dates) == 1: + raise ValueError( + "History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?" + ) + prev_fname = watch_history[dates[-2]] # Did it have any notification alerts to hit?