From d4359c2e678ded4635bbd9993f1c0a558d9388d0 Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Sat, 12 Mar 2022 13:29:30 +0100 Subject: [PATCH] Add filter to remove elements by CSS rule from HTML before change detection is run (#445) --- changedetectionio/__init__.py | 4 + changedetectionio/fetch_site_status.py | 23 ++- changedetectionio/forms.py | 117 +++++++----- changedetectionio/html_tools.py | 22 ++- changedetectionio/store.py | 23 ++- changedetectionio/templates/edit.html | 13 +- changedetectionio/templates/settings.html | 13 +- .../tests/test_element_removal.py | 168 ++++++++++++++++++ 8 files changed, 321 insertions(+), 62 deletions(-) create mode 100644 changedetectionio/tests/test_element_removal.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index a6595383..e7f4b87a 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -35,6 +35,7 @@ from flask import ( url_for, ) from flask_login import login_required + from changedetectionio import html_tools __version__ = '0.39.9' @@ -526,6 +527,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['watching'][uuid]['css_filter'] = form.css_filter.data.strip() + datastore.data['watching'][uuid]['subtractive_selectors'] = form.subtractive_selectors.data # Reset the previous_md5 so we process a new snapshot including stripping ignore text. if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']: @@ -598,6 +600,7 @@ def changedetection_app(config=None, datastore_o=None): if request.method == 'GET': form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] + form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] @@ -626,6 +629,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['notification_format'] = form.notification_format.data datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['base_url'] = form.base_url.data + datastore.data['settings']['application']['global_subtractive_selectors'] = form.global_subtractive_selectors.data datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 35ef2a01..fa5660f8 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -1,11 +1,11 @@ -import time -from changedetectionio import content_fetcher -from changedetectionio import html_tools import hashlib -from inscriptis import get_text -import urllib3 -from . import html_tools import re +import time + +import urllib3 +from inscriptis import get_text + +from changedetectionio import content_fetcher, html_tools urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -72,8 +72,15 @@ class perform_site_check(): is_json = 'application/json' in fetcher.headers.get('Content-Type', '') is_html = not is_json css_filter_rule = watch['css_filter'] + subtractive_selectors = watch.get( + "subtractive_selectors", [] + ) + self.datastore.data["settings"]["application"].get( + "global_subtractive_selectors", [] + ) has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) + has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip()) + if is_json and not has_filter_rule: css_filter_rule = "json:$" has_filter_rule = True @@ -100,11 +107,11 @@ class perform_site_check(): else: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) - + if has_subtractive_selectors: + html_content = html_tools.element_removal(subtractive_selectors, html_content) # get_text() via inscriptis stripped_text_from_html = get_text(html_content) - # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index facfc7bd..59bb800c 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -1,13 +1,30 @@ -from wtforms import Form, SelectField, RadioField, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \ - Field +import re -from wtforms import widgets, SubmitField -from wtforms.validators import ValidationError +from wtforms import ( + BooleanField, + Field, + Form, + IntegerField, + PasswordField, + RadioField, + SelectField, + StringField, + SubmitField, + TextAreaField, + fields, + validators, + widgets, +) from wtforms.fields import html5 -from changedetectionio import content_fetcher -import re +from wtforms.validators import ValidationError -from changedetectionio.notification import default_notification_format, valid_notification_formats, default_notification_body, default_notification_title +from changedetectionio import content_fetcher +from changedetectionio.notification import ( + default_notification_body, + default_notification_format, + default_notification_title, + valid_notification_formats, +) valid_method = { 'GET', @@ -45,8 +62,8 @@ class SaltyPasswordField(StringField): encrypted_password = "" def build_password(self, password): - import hashlib import base64 + import hashlib import secrets # Make a new salt on every new password and store it with the password @@ -104,9 +121,10 @@ class ValidateContentFetcherIsReady(object): self.message = message def __call__(self, form, field): - from changedetectionio import content_fetcher import urllib3.exceptions + from changedetectionio import content_fetcher + # Better would be a radiohandler that keeps a reference to each class if field.data is not None: klass = getattr(content_fetcher, field.data) @@ -213,52 +231,69 @@ class ValidateListRegex(object): except re.error: message = field.gettext('RegEx \'%s\' is not a valid regular expression.') raise ValidationError(message % (line)) - -class ValidateCSSJSONXPathInput(object): + +class ValidateCSSJSONXPATHInput(object): """ Filter validation @todo CSS validator ;) """ - def __init__(self, message=None): + def __init__(self, message=None, allow_xpath=True, allow_json=True): self.message = message + self.allow_xpath = allow_xpath + self.allow_json = allow_json def __call__(self, form, field): + if isinstance(field.data, str): + data = [field.data] + else: + data = field.data + + for line in data: # Nothing to see here - if not len(field.data.strip()): - return + if not len(line.strip()): + return - # Does it look like XPath? - if field.data.strip()[0] == '/': - from lxml import html, etree - tree = html.fromstring("") + # Does it look like XPath? + if line.strip()[0] == '/': + if not self.allow_xpath: + raise ValidationError("XPath not permitted in this field!") + from lxml import etree, html + tree = html.fromstring("") - try: - tree.xpath(field.data.strip()) - except etree.XPathEvalError as e: - message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') - raise ValidationError(message % (field.data, str(e))) - except: - raise ValidationError("A system-error occurred when validating your XPath expression") + try: + tree.xpath(line.strip()) + except etree.XPathEvalError as e: + message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') + raise ValidationError(message % (line, str(e))) + except: + raise ValidationError("A system-error occurred when validating your XPath expression") - if 'json:' in field.data: - from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError - from jsonpath_ng.ext import parse + if 'json:' in line: + if not self.allow_json: + raise ValidationError("JSONPath not permitted in this field!") - input = field.data.replace('json:', '') + from jsonpath_ng.exceptions import ( + JsonPathLexerError, + JsonPathParserError, + ) + from jsonpath_ng.ext import parse - try: - parse(input) - except (JsonPathParserError, JsonPathLexerError) as e: - message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)') - raise ValidationError(message % (input, str(e))) - except: - raise ValidationError("A system-error occurred when validating your JSONPath expression") + input = line.replace('json:', '') - # Re #265 - maybe in the future fetch the page and offer a - # warning/notice that its possible the rule doesnt yet match anything? + try: + parse(input) + except (JsonPathParserError, JsonPathLexerError) as e: + message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)') + raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your JSONPath expression") + # Re #265 - maybe in the future fetch the page and offer a + # warning/notice that its possible the rule doesnt yet match anything? + + class quickWatchForm(Form): # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5 # `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run @@ -282,7 +317,8 @@ class watchForm(commonSettingsForm): minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck', [validators.Optional(), validators.NumberRange(min=1)]) - css_filter = StringField('CSS/JSON/XPath Filter', [ValidateCSSJSONXPathInput()]) + css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()]) + subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) title = StringField('Title') ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) @@ -314,5 +350,6 @@ class globalSettingsForm(commonSettingsForm): [validators.NumberRange(min=1)]) extract_title_as_title = BooleanField('Extract from document and use as watch title') base_url = StringField('Base URL', validators=[validators.Optional()]) + global_subtractive_selectors = StringListField('Ignore elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) - ignore_whitespace = BooleanField('Ignore whitespace') \ No newline at end of file + ignore_whitespace = BooleanField('Ignore whitespace') diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index d251aebc..dc7c846c 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -1,7 +1,10 @@ import json +import re +from typing import List + from bs4 import BeautifulSoup from jsonpath_ng.ext import parse -import re + class JSONNotFound(ValueError): def __init__(self, msg): @@ -16,11 +19,22 @@ def css_filter(css_filter, html_content): return html_block + "\n" +def subtractive_css_selector(css_selector, html_content): + soup = BeautifulSoup(html_content, "html.parser") + for item in soup.select(css_selector): + item.decompose() + return str(soup) + + +def element_removal(selectors: List[str], html_content): + """Joins individual filters into one css filter.""" + selector = ",".join(selectors) + return subtractive_css_selector(selector, html_content) + # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content): - from lxml import html - from lxml import etree + from lxml import etree, html tree = html.fromstring(html_content) html_block = "" @@ -151,4 +165,4 @@ def strip_ignore_text(content, wordlist, mode="content"): if mode == "line numbers": return ignored_line_numbers - return "\n".encode('utf8').join(output) \ No newline at end of file + return "\n".encode('utf8').join(output) diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 86081735..5e3902b8 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -1,15 +1,19 @@ -from os import unlink, path, mkdir import json +import logging +import os +import threading +import time import uuid as uuid_builder -from threading import Lock from copy import deepcopy +from os import mkdir, path, unlink +from threading import Lock -import logging -import time -import threading -import os +from changedetectionio.notification import ( + default_notification_body, + default_notification_format, + default_notification_title, +) -from changedetectionio.notification import default_notification_format, default_notification_body, default_notification_title # Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods? # Open a github issue if you know something :) @@ -46,6 +50,7 @@ class ChangeDetectionStore: 'extract_title_as_title': False, 'fetch_backend': 'html_requests', 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum + 'global_subtractive_selectors': [], 'ignore_whitespace': False, 'notification_urls': [], # Apprise URL list # Custom notification content @@ -82,6 +87,7 @@ class ChangeDetectionStore: 'notification_body': default_notification_body, 'notification_format': default_notification_format, 'css_filter': "", + 'subtractive_selectors': [], 'trigger_text': [], # List of text or regex to wait for until a change is detected 'fetch_backend': None, 'extract_title_as_title': False @@ -144,8 +150,8 @@ class ChangeDetectionStore: unlink(password_reset_lockfile) if not 'app_guid' in self.__data: - import sys import os + import sys if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ: self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4()) else: @@ -430,6 +436,7 @@ class ChangeDetectionStore: index.append(self.data['watching'][uuid]['history'][str(id)]) import pathlib + # Only in the sub-directories for item in pathlib.Path(self.datastore_path).rglob("*/*txt"): if not str(item) in index: diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 6b0be692..97e3195a 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -122,7 +122,18 @@ User-Agent: wonderbra 1.0") }} href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/> </span> </div> - + <fieldset class="pure-group"> + {{ render_field(form.subtractive_selectors, rows=5, placeholder="header +footer +nav +.stockticker") }} + <span class="pure-form-message-inline"> + <ul> + <li> Remove HTML element(s) by CSS selector before text conversion. </li> + <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li> + </ul> + </span> + </fieldset> </fieldset> <fieldset class="pure-group"> {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 3157bca6..a5a7d8da 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -83,7 +83,18 @@ </span> </fieldset> - + <fieldset class="pure-group"> + {{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header +footer +nav +.stockticker") }} + <span class="pure-form-message-inline"> + <ul> + <li> Remove HTML element(s) by CSS selector before text conversion. </li> + <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li> + </ul> + </span> + </fieldset> <fieldset class="pure-group"> {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line /some.regex\d{2}/ for case-INsensitive regex diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py new file mode 100644 index 00000000..aab4875d --- /dev/null +++ b/changedetectionio/tests/test_element_removal.py @@ -0,0 +1,168 @@ +#!/usr/bin/python3 + +import time + +from flask import url_for + +from ..html_tools import * +from .util import live_server_setup + + +def test_setup(live_server): + live_server_setup(live_server) + + +def set_original_response(): + test_return_data = """<html> + <header> + <h2>Header</h2> + </header> + <nav> + <ul> + <li><a href="#">A</a></li> + <li><a href="#">B</a></li> + <li><a href="#">C</a></li> + </ul> + </nav> + <body> + Some initial text</br> + <p>Which is across multiple lines</p> + </br> + So let's see what happens. </br> + <div id="changetext">Some text that will change</div> + </body> + <footer> + <p>Footer</p> + </footer> + </html> + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def set_modified_response(): + test_return_data = """<html> + <header> + <h2>Header changed</h2> + </header> + <nav> + <ul> + <li><a href="#">A changed</a></li> + <li><a href="#">B</a></li> + <li><a href="#">C</a></li> + </ul> + </nav> + <body> + Some initial text</br> + <p>Which is across multiple lines</p> + </br> + So let's see what happens. </br> + <div id="changetext">Some text that changes</div> + </body> + <footer> + <p>Footer changed</p> + </footer> + </html> + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def test_element_removal_output(): + from changedetectionio import fetch_site_status + from inscriptis import get_text + + # Check text with sub-parts renders correctly + content = """<html> + <header> + <h2>Header</h2> + </header> + <nav> + <ul> + <li><a href="#">A</a></li> + </ul> + </nav> + <body> + Some initial text</br> + <p>across multiple lines</p> + <div id="changetext">Some text that changes</div> + </body> + <footer> + <p>Footer</p> + </footer> + </html> + """ + html_blob = element_removal( + ["header", "footer", "nav", "#changetext"], html_content=content + ) + text = get_text(html_blob) + assert ( + text + == """Some initial text + +across multiple lines +""" + ) + + +def test_element_removal_full(client, live_server): + sleep_time_for_fetch_thread = 3 + + set_original_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for("test_endpoint", _external=True) + res = client.post( + url_for("import_page"), data={"urls": test_url}, follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Goto the edit page, add the filter data + # Not sure why \r needs to be added - absent of the #changetext this is not necessary + subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext" + res = client.post( + url_for("edit_page", uuid="first"), + data={ + "subtractive_selectors": subtractive_selectors_data, + "url": test_url, + "tag": "", + "headers": "", + "fetch_backend": "html_requests", + }, + follow_redirects=True, + ) + assert b"Updated watch." in res.data + + # Check it saved + res = client.get( + url_for("edit_page", uuid="first"), + ) + assert bytes(subtractive_selectors_data.encode("utf-8")) in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # No change yet - first check + res = client.get(url_for("index")) + assert b"unviewed" not in res.data + + # Make a change to header/footer/nav + set_modified_response() + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # There should not be an unviewed change, as changes should be removed + res = client.get(url_for("index")) + assert b"unviewed" not in res.data