diff --git a/README.md b/README.md
index 77a45f6d..97dcc408 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,8 @@ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/dat
```bash
docker-compose pull && docker-compose up -d
```
+### Filters
+XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, and you can use XPath expressions exported from various XPath element query creation tools.
### Notifications
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 28c27420..7f678657 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -114,15 +114,17 @@ class perform_site_check():
if 'json:' in css_filter_rule:
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
is_html = False
- else:
- # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
- stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
if is_html:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content = fetcher.content
if has_filter_rule:
- html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+ # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
+ if css_filter_rule[0] == '/':
+ html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+ else:
+ # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+ html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
# get_text() via inscriptis
stripped_text_from_html = get_text(html_content)
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 020d9fa8..bd40435a 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -181,7 +181,7 @@ class ValidateListRegex(object):
message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
raise ValidationError(message % (line))
-class ValidateCSSJSONInput(object):
+class ValidateCSSJSONXPATHInput(object):
"""
Filter validation
@todo CSS validator ;)
@@ -191,6 +191,24 @@ class ValidateCSSJSONInput(object):
self.message = message
def __call__(self, form, field):
+
+ # Nothing to see here
+ if not len(field.data.strip()):
+ return
+
+ # Does it look like XPath?
+ if field.data.strip()[0] == '/':
+ from lxml import html, etree
+ tree = html.fromstring("")
+
+ try:
+ tree.xpath(field.data.strip())
+ except etree.XPathEvalError as e:
+ message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+ raise ValidationError(message % (field.data, str(e)))
+ except:
+ raise ValidationError("A system-error occurred when validating your XPath expression")
+
if 'json:' in field.data:
from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
from jsonpath_ng.ext import parse
@@ -202,6 +220,8 @@ class ValidateCSSJSONInput(object):
except (JsonPathParserError, JsonPathLexerError) as e:
message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
raise ValidationError(message % (input, str(e)))
+ except:
+ raise ValidationError("A system-error occurred when validating your JSONPath expression")
# Re #265 - maybe in the future fetch the page and offer a
# warning/notice that its possible the rule doesnt yet match anything?
@@ -229,7 +249,7 @@ class watchForm(commonSettingsForm):
minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
[validators.Optional(), validators.NumberRange(min=1)])
- css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
+ css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
title = StringField('Title')
ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 5c795c23..7a6b91c6 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -16,6 +16,21 @@ def css_filter(css_filter, html_content):
return html_block + "\n"
+
+# Return the elements matched by the XPath rule, serialised and concatenated into one UTF-8 decoded str ("" when nothing matches)
+def xpath_filter(xpath_filter, html_content):
+    from lxml import html
+    from lxml import etree
+
+    tree = html.fromstring(html_content)
+    html_block = ""
+
+    for item in tree.xpath(xpath_filter.strip()):
+        html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"\n"
+
+    return html_block
+
+
# Extract/find element
def extract_element(find='title', html_content=''):
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index f30c0705..466b7318 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -95,8 +95,10 @@ User-Agent: wonderbra 1.0") }}
Which is across multiple lines
+ + So let's see what happens. +Which is across multiple lines
+ + So let's see what happens. THIS CHANGES AND SHOULDNT TRIGGER A CHANGE +