diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index a6595383..e7f4b87a 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -35,6 +35,7 @@ from flask import ( url_for, ) from flask_login import login_required + from changedetectionio import html_tools __version__ = '0.39.9' @@ -526,6 +527,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['watching'][uuid]['css_filter'] = form.css_filter.data.strip() + datastore.data['watching'][uuid]['subtractive_selectors'] = form.subtractive_selectors.data # Reset the previous_md5 so we process a new snapshot including stripping ignore text. if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']: @@ -598,6 +600,7 @@ def changedetection_app(config=None, datastore_o=None): if request.method == 'GET': form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] + form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] @@ -626,6 +629,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['notification_format'] = form.notification_format.data datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['base_url'] = form.base_url.data + datastore.data['settings']['application']['global_subtractive_selectors'] = form.global_subtractive_selectors.data datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 35ef2a01..fa5660f8 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -1,11 +1,11 @@ -import time -from changedetectionio import content_fetcher -from changedetectionio import html_tools import hashlib -from inscriptis import get_text -import urllib3 -from . import html_tools import re +import time + +import urllib3 +from inscriptis import get_text + +from changedetectionio import content_fetcher, html_tools urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -72,8 +72,15 @@ class perform_site_check(): is_json = 'application/json' in fetcher.headers.get('Content-Type', '') is_html = not is_json css_filter_rule = watch['css_filter'] + subtractive_selectors = watch.get( + "subtractive_selectors", [] + ) + self.datastore.data["settings"]["application"].get( + "global_subtractive_selectors", [] + ) has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) + has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip()) + if is_json and not has_filter_rule: css_filter_rule = "json:$" has_filter_rule = True @@ -100,11 +107,11 @@ class perform_site_check(): else: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) - + if has_subtractive_selectors: + html_content = html_tools.element_removal(subtractive_selectors, html_content) # get_text() via inscriptis stripped_text_from_html = get_text(html_content) - # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index facfc7bd..59bb800c 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -1,13 +1,30 @@ -from wtforms import Form, SelectField, RadioField, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \ - Field +import re -from wtforms import widgets, SubmitField -from wtforms.validators import ValidationError +from wtforms import ( + BooleanField, + Field, + Form, + IntegerField, + PasswordField, + RadioField, + SelectField, + StringField, + SubmitField, + TextAreaField, + fields, + validators, + widgets, +) from wtforms.fields import html5 -from changedetectionio import content_fetcher -import re +from wtforms.validators import ValidationError -from changedetectionio.notification import default_notification_format, valid_notification_formats, default_notification_body, default_notification_title +from changedetectionio import content_fetcher +from changedetectionio.notification import ( + default_notification_body, + default_notification_format, + default_notification_title, + valid_notification_formats, +) valid_method = { 'GET', @@ -45,8 +62,8 @@ class SaltyPasswordField(StringField): encrypted_password = "" def build_password(self, password): - import hashlib import base64 + import hashlib import secrets # Make a new salt on every new password and store it with the password @@ -104,9 +121,10 @@ class ValidateContentFetcherIsReady(object): self.message = message def __call__(self, form, field): - from changedetectionio import content_fetcher import urllib3.exceptions + from changedetectionio import content_fetcher + # Better would be a radiohandler that keeps a reference to each class if field.data is not None: klass = getattr(content_fetcher, field.data) @@ -213,52 +231,69 @@ class ValidateListRegex(object): except re.error: message = field.gettext('RegEx \'%s\' is not a valid regular expression.') raise ValidationError(message % (line)) - -class ValidateCSSJSONXPathInput(object): + +class ValidateCSSJSONXPATHInput(object): """ Filter validation @todo CSS validator ;) """ - def __init__(self, message=None): + def __init__(self, message=None, allow_xpath=True, allow_json=True): self.message = message + self.allow_xpath = allow_xpath + self.allow_json = allow_json def __call__(self, form, field): + if isinstance(field.data, str): + data = [field.data] + else: + data = field.data + + for line in data: # Nothing to see here - if not len(field.data.strip()): - return + if not len(line.strip()): + return - # Does it look like XPath? - if field.data.strip()[0] == '/': - from lxml import html, etree - tree = html.fromstring("") + # Does it look like XPath? + if line.strip()[0] == '/': + if not self.allow_xpath: + raise ValidationError("XPath not permitted in this field!") + from lxml import etree, html + tree = html.fromstring("") - try: - tree.xpath(field.data.strip()) - except etree.XPathEvalError as e: - message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') - raise ValidationError(message % (field.data, str(e))) - except: - raise ValidationError("A system-error occurred when validating your XPath expression") + try: + tree.xpath(line.strip()) + except etree.XPathEvalError as e: + message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') + raise ValidationError(message % (line, str(e))) + except: + raise ValidationError("A system-error occurred when validating your XPath expression") - if 'json:' in field.data: - from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError - from jsonpath_ng.ext import parse + if 'json:' in line: + if not self.allow_json: + raise ValidationError("JSONPath not permitted in this field!") - input = field.data.replace('json:', '') + from jsonpath_ng.exceptions import ( + JsonPathLexerError, + JsonPathParserError, + ) + from jsonpath_ng.ext import parse - try: - parse(input) - except (JsonPathParserError, JsonPathLexerError) as e: - message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)') - raise ValidationError(message % (input, str(e))) - except: - raise ValidationError("A system-error occurred when validating your JSONPath expression") + input = line.replace('json:', '') - # Re #265 - maybe in the future fetch the page and offer a - # warning/notice that its possible the rule doesnt yet match anything? + try: + parse(input) + except (JsonPathParserError, JsonPathLexerError) as e: + message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)') + raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your JSONPath expression") + # Re #265 - maybe in the future fetch the page and offer a + # warning/notice that its possible the rule doesnt yet match anything? + + class quickWatchForm(Form): # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5 # `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run @@ -282,7 +317,8 @@ class watchForm(commonSettingsForm): minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck', [validators.Optional(), validators.NumberRange(min=1)]) - css_filter = StringField('CSS/JSON/XPath Filter', [ValidateCSSJSONXPathInput()]) + css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()]) + subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) title = StringField('Title') ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) @@ -314,5 +350,6 @@ class globalSettingsForm(commonSettingsForm): [validators.NumberRange(min=1)]) extract_title_as_title = BooleanField('Extract