From a82fad705919190069e0162bdaa1e090735b4537 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sat, 23 Jul 2022 17:15:27 +0200 Subject: [PATCH] Send notification when CSS/xPath filter is missing after more than 6 (configurable) attempts (#771) --- changedetectionio/forms.py | 7 + changedetectionio/html_tools.py | 18 ++- changedetectionio/model/App.py | 3 + changedetectionio/model/Watch.py | 2 + changedetectionio/notification.py | 1 - changedetectionio/templates/edit.html | 6 + changedetectionio/templates/settings.html | 8 +- .../tests/test_filter_failure_notification.py | 123 ++++++++++++++++++ changedetectionio/update_worker.py | 51 +++++++- 9 files changed, 208 insertions(+), 11 deletions(-) create mode 100644 changedetectionio/tests/test_filter_failure_notification.py diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 4ad1b1a7..9d29c5c1 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -349,6 +349,8 @@ class watchForm(commonSettingsForm): save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"}) proxy = RadioField('Proxy') + filter_failure_notification_send = BooleanField( + 'Send a notification when the filter can no longer be found on the page', default=False) def validate(self, **kwargs): if not super().validate(): @@ -387,6 +389,11 @@ class globalSettingsApplicationForm(commonSettingsForm): api_access_token_enabled = BooleanField('API access token security check enabled', default=True, validators=[validators.Optional()]) password = SaltyPasswordField() + filter_failure_notification_threshold_attempts = IntegerField('Number of times the filter can be missing before sending a notification', + render_kw={"style": "width: 5em;"}, + validators=[validators.NumberRange(min=0, + message="Should contain zero or more attempts")]) + class globalSettingsForm(Form): # Define these as 
FormFields/"sub forms", this way it matches the JSON storage diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index cc6a476d..6b73e5d1 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -1,5 +1,4 @@ import json -import re from typing import List from bs4 import BeautifulSoup @@ -8,16 +7,23 @@ import re from inscriptis import get_text from inscriptis.model.config import ParserConfig +class FilterNotFoundInResponse(ValueError): + def __init__(self, msg): + ValueError.__init__(self, msg) class JSONNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) + # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches def css_filter(css_filter, html_content): soup = BeautifulSoup(html_content, "html.parser") html_block = "" - for item in soup.select(css_filter, separator=""): + r = soup.select(css_filter, separator="") + if len(r) == 0: + raise FilterNotFoundInResponse(css_filter) + for item in r: html_block += str(item) return html_block + "\n" @@ -42,8 +48,12 @@ def xpath_filter(xpath_filter, html_content): tree = html.fromstring(bytes(html_content, encoding='utf-8')) html_block = "" - for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}): - html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"
" + r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}) + if len(r) == 0: + raise FilterNotFoundInResponse(xpath_filter) + + for item in r: + html_block += etree.tostring(item, pretty_print=True).decode('utf-8') + "
" return html_block diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 21ed1431..6e74d483 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -5,6 +5,8 @@ from changedetectionio.notification import ( default_notification_title, ) +_FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6 + class model(dict): base_config = { 'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!", @@ -30,6 +32,7 @@ class model(dict): 'extract_title_as_title': False, 'empty_pages_are_a_change': False, 'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"), + 'filter_failure_notification_threshold_attempts': _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT, 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_subtractive_selectors': [], 'ignore_whitespace': True, diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 6c22b390..0e80d41b 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -41,6 +41,8 @@ class model(dict): 'trigger_text': [], # List of text or regex to wait for until a change is detected 'text_should_not_be_present': [], # Text that should not present 'fetch_backend': None, + 'filter_failure_notification_send': True, + 'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine. 
'extract_title_as_title': False, 'check_unique_lines': False, # On change-detected, compare against all history if its something new 'proxy': None, # Preferred proxy connection diff --git a/changedetectionio/notification.py b/changedetectionio/notification.py index 8b396275..b0def158 100644 --- a/changedetectionio/notification.py +++ b/changedetectionio/notification.py @@ -34,7 +34,6 @@ def process_notification(n_object, datastore): valid_notification_formats[default_notification_format], ) - # Insert variables into the notification content notification_parameters = create_notification_parameters(n_object, datastore) diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 5ba754d5..c706b0b2 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -62,6 +62,12 @@
{{ render_checkbox_field(form.extract_title_as_title) }}
+
+ {{ render_checkbox_field(form.filter_failure_notification_send) }} + + Sends a notification when the filter can no longer be seen on the page, good for knowing when the page changed and your filter will not work anymore. + +
diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 57ba0e57..8b3e2e8d 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -36,7 +36,13 @@ {{ render_field(form.requests.form.jitter_seconds, class="jitter_seconds") }} Example - 3 seconds random jitter could trigger up to 3 seconds earlier or up to 3 seconds later - +
+ {{ render_field(form.application.form.filter_failure_notification_threshold_attempts, class="filter_failure_notification_threshold_attempts") }} + After this many consecutive times that the CSS/xPath filter is missing, send a notification +
+ Set to 0 to disable +
+
{% if not hide_remove_pass %} {% if current_user.is_authenticated %} diff --git a/changedetectionio/tests/test_filter_failure_notification.py b/changedetectionio/tests/test_filter_failure_notification.py new file mode 100644 index 00000000..17dff1f3 --- /dev/null +++ b/changedetectionio/tests/test_filter_failure_notification.py @@ -0,0 +1,123 @@ +import os +import time +import re +from flask import url_for +from .util import set_original_response, live_server_setup +from changedetectionio.model import App + + +def set_response_with_filter(): + test_return_data = """ + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+
Some text thats the same
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + + +# Hard to just add more live server URLs when one test is already running (I think) +# So we add our test here (was in a different file) +def test_check_notification(client, live_server): + live_server_setup(live_server) + set_original_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("form_watch_add"), + data={"url": test_url, "tag": ''}, + follow_redirects=True + ) + assert b"Watch added" in res.data + + # Give the thread time to pick up the first version + time.sleep(3) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + url = url_for('test_notification_endpoint', _external=True) + notification_url = url.replace('http', 'json') + + print(">>>> Notification URL: " + notification_url) + + # Just a regular notification setting, this will be used by the special 'filter not found' notification + notification_form_data = {"notification_urls": notification_url, + "notification_title": "New ChangeDetection.io Notification - {watch_url}", + "notification_body": "BASE URL: {base_url}\n" + "Watch URL: {watch_url}\n" + "Watch UUID: {watch_uuid}\n" + "Watch title: {watch_title}\n" + "Watch tag: {watch_tag}\n" + "Preview: {preview_url}\n" + "Diff URL: {diff_url}\n" + "Snapshot: {current_snapshot}\n" + "Diff: {diff}\n" + "Diff Full: {diff_full}\n" + ":-)", + "notification_format": "Text"} + + notification_form_data.update({ + "url": test_url, + "tag": "my tag", + "title": "my title", + "headers": "", + "css_filter": '#nope-doesnt-exist', + "fetch_backend": "html_requests"}) + + res = client.post( + url_for("edit_page", uuid="first"), + data=notification_form_data, + follow_redirects=True + ) + assert b"Updated watch." 
in res.data + time.sleep(3) + + # Now the notification should not exist, because we didnt reach the threshold + assert not os.path.isfile("test-datastore/notification.txt") + + for i in range(0, App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT): + res = client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(3) + + # We should see something in the frontend + assert b'Did the page change its layout' in res.data + + # Now it should exist and contain our "filter not found" alert + assert os.path.isfile("test-datastore/notification.txt") + notification = False + with open("test-datastore/notification.txt", 'r') as f: + notification = f.read() + assert 'CSS/xPath filter was not present in the page' in notification + assert '#nope-doesnt-exist' in notification + + # Remove it and prove that it doesnt trigger when not expected + os.unlink("test-datastore/notification.txt") + set_response_with_filter() + + for i in range(0, App._FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT): + client.get(url_for("form_watch_checknow"), follow_redirects=True) + time.sleep(3) + + # It should have sent a notification, but.. 
+ assert os.path.isfile("test-datastore/notification.txt") + # but it should not contain the info about the failed filter + with open("test-datastore/notification.txt", 'r') as f: + notification = f.read() + assert not 'CSS/xPath filter was not present in the page' in notification + + # cleanup for the next + client.get( + url_for("form_delete", uuid="all"), + follow_redirects=True + ) diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 7202142d..a9acf502 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -3,6 +3,8 @@ import queue import time from changedetectionio import content_fetcher +from changedetectionio.html_tools import FilterNotFoundInResponse + # A single update worker # # Requests for checking on a single site(watch) from a queue of watches @@ -19,6 +21,32 @@ class update_worker(threading.Thread): self.datastore = datastore super().__init__(*args, **kwargs) + def send_filter_failure_notification(self, uuid): + + threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts') + watch = self.datastore.data['watching'].get(uuid, False) + + n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page', + 'notification_body': "Your configured CSS/xPath filter of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format( + watch['css_filter'], + threshold), + 'notification_format': 'text'} + + if len(watch['notification_urls']): + n_object['notification_urls'] = watch['notification_urls'] + + elif len(self.datastore.data['settings']['application']['notification_urls']): + n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] + + # Only prepare to notify if the rules above matched + if 
'notification_urls' in n_object: + n_object.update({ + 'watch_url': watch['url'], + 'uuid': uuid + }) + self.notification_q.put(n_object) + print("Sent filter not found notification for {}".format(uuid)) + def run(self): from changedetectionio import fetch_site_status @@ -55,11 +83,23 @@ class update_worker(threading.Thread): except content_fetcher.ReplyWithContentButNoText as e: # Totally fine, it's by choice - just continue on, nothing more to care about # Page had elements/content but no renderable text - if self.datastore.data['watching'].get(uuid, False) and self.datastore.data['watching'][uuid].get('css_filter'): - self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (CSS / xPath Filter not found in page?)"}) - else: - self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."}) - pass + self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found."}) + except FilterNotFoundInResponse as e: + err_text = "Filter '{}' not found - Did the page change its layout?".format(str(e)) + c = 0 + if self.datastore.data['watching'].get(uuid, False): + c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5) + c += 1 + + # Send notification if we reached the threshold? + threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0) + print("Filter for {} not found, consecutive_filter_failures: {}".format(uuid, c)) + if threshold >0 and c >= threshold: + self.send_filter_failure_notification(uuid) + c = 0 + + self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, + 'consecutive_filter_failures': c}) except content_fetcher.EmptyReply as e: # Some kind of custom to-str handler in the exception handler that does this? 
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code) @@ -89,6 +129,7 @@ class update_worker(threading.Thread): fname = watch.save_history_text(contents=contents, timestamp=str(round(time.time()))) # Generally update anything interesting returned + update_obj['consecutive_filter_failures'] = 0 self.datastore.update_watch(uuid=uuid, update_obj=update_obj) # A change was detected