diff --git a/README.md b/README.md
index 67de6c11..e0a941e4 100644
--- a/README.md
+++ b/README.md
@@ -268,3 +268,7 @@ I offer commercial support, this software is depended on by network security, ae
[license-shield]: https://img.shields.io/github/license/dgtlmoon/changedetection.io.svg?style=for-the-badge
[release-link]: https://github.com/dgtlmoon/changedetection.io/releases
[docker-link]: https://hub.docker.com/r/dgtlmoon/changedetection.io
+
+## Third-party licenses
+
+changedetectionio.html_tools.elementpath_tostring: Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati), Licensed under [MIT license](https://github.com/sissaschool/elementpath/blob/master/LICENSE)
diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html
index 6372156d..449ba382 100644
--- a/changedetectionio/blueprint/tags/templates/edit-tag.html
+++ b/changedetectionio/blueprint/tags/templates/edit-tag.html
@@ -69,11 +69,12 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{% endif %}
-
XPath - Limit text to this XPath rule, simply start with a forward-slash,
+
XPath - Limit text to this XPath rule; simply start with a forward-slash. To use XPath explicitly, or when the rule starts with an XPath function, prefix it with xpath:
Example: Get all titles from an RSS feed //title/text()
+
To use XPath1.0: Prefix with xpath1:
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index b3de842b..c640b218 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -328,11 +328,30 @@ class ValidateCSSJSONXPATHInput(object):
return
# Does it look like XPath?
- if line.strip()[0] == '/':
+ if line.strip()[0] == '/' or line.strip().startswith('xpath:'):
if not self.allow_xpath:
raise ValidationError("XPath not permitted in this field!")
from lxml import etree, html
+ import elementpath
+ # xpath 2.0-3.1
+ from elementpath.xpath3 import XPath3Parser
tree = html.fromstring("")
+ line = line.replace('xpath:', '')
+
+ try:
+ elementpath.select(tree, line.strip(), parser=XPath3Parser)
+ except elementpath.ElementPathError as e:
+ message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+ raise ValidationError(message % (line, str(e)))
+ except:
+ raise ValidationError("A system-error occurred when validating your XPath expression")
+
+ if line.strip().startswith('xpath1:'):
+ if not self.allow_xpath:
+ raise ValidationError("XPath not permitted in this field!")
+ from lxml import etree, html
+ tree = html.fromstring("")
+ line = line.replace('xpath1:', '')
try:
tree.xpath(line.strip())
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 19ca653b..7c9844c8 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -69,10 +69,89 @@ def element_removal(selectors: List[str], html_content):
selector = ",".join(selectors)
return subtractive_css_selector(selector, html_content)
+def elementpath_tostring(obj):
+    """
+    Convert one item returned by elementpath.select() to its string form.
+
+    Adapted from elementpath's own string conversion:
+    # The MIT License (MIT), Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati)
+    # https://github.com/sissaschool/elementpath/blob/dfcc2fd3d6011b16e02bf30459a7924f547b47d0/elementpath/xpath_tokens.py#L1038
+
+    :param obj: None, an elementpath.XPathNode, a bool, a Decimal, a float or any other value
+    :return: str
+        - None           -> ''
+        - XPathNode      -> the node's string_value
+        - bool           -> 'true' / 'false'
+        - Decimal        -> fixed-point notation without trailing fractional zeros
+        - float          -> 'NaN', 'INF' / '-INF', or the number without a trailing '.0';
+                            exponent notation is upper-cased and loses its '+' sign
+        - anything else  -> str(obj)
+    """
+
+    import elementpath
+    from decimal import Decimal
+    import math
+
+    if obj is None:
+        return ''
+    # https://elementpath.readthedocs.io/en/latest/xpath_api.html#elementpath.select
+    elif isinstance(obj, elementpath.XPathNode):
+        return obj.string_value
+    # bool is tested before the numeric types so True/False never fall through to str()
+    elif isinstance(obj, bool):
+        return 'true' if obj else 'false'
+    elif isinstance(obj, Decimal):
+        value = format(obj, 'f')
+        if '.' in value:
+            return value.rstrip('0').rstrip('.')
+        return value
+
+    elif isinstance(obj, float):
+        if math.isnan(obj):
+            return 'NaN'
+        elif math.isinf(obj):
+            return str(obj).upper()
+
+        # Split off the exponent first: trimming zeros on the whole string would
+        # also eat trailing zeros of the exponent ('1.5e+20' -> '1.5e+2').
+        mantissa, has_exponent, exponent = str(obj).partition('e')
+        if '.' in mantissa:
+            mantissa = mantissa.rstrip('0').rstrip('.')
+        if has_exponent:
+            return mantissa + 'E' + exponent.replace('+', '')
+        return mantissa
+
+    return str(obj)
# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
+    """
+    Run an XPath expression (elementpath XPath3Parser) over the document and
+    return everything it matched as one str.
+
+    :param xpath_filter: XPath expression; surrounding whitespace is stripped
+    :param html_content: document source as str (encoded to UTF-8 before parsing)
+    :param append_pretty_line_formatting: when True, put TEXT_FILTER_LIST_LINE_SUFFIX
+        between matches that would not otherwise end up on their own line
+    :param is_rss: parse with an XML parser that keeps CDATA instead of the HTML parser
+    :return: str - concatenation of all matches (markup for elements, text otherwise)
+    """
     from lxml import etree, html
+    import elementpath
+    # XPath3Parser: xpath 2.0-3.1
+    from elementpath.xpath3 import XPath3Parser
+
+    if is_rss:
+        # So that we can keep CDATA for cdata_in_document_to_text() to process
+        doc_parser = etree.XMLParser(strip_cdata=False)
+    else:
+        doc_parser = etree.HTMLParser()
+
+    document = html.fromstring(bytes(html_content, encoding='utf-8'), parser=doc_parser)
+
+    #@note: //title/text() wont work where CDATA..
+    matches = elementpath.select(
+        document,
+        xpath_filter.strip(),
+        namespaces={'re': 'http://exslt.org/regular-expressions'},
+        parser=XPath3Parser,
+    )
+
+    # A result that is not a list is wrapped so the loop below handles both shapes
+    if type(matches) is not list:
+        matches = [matches]
+
+    # Divs are converted to 4 whitespaces by inscriptis
+    line_breaking_tags = ['br', 'hr', 'div', 'p']
+    collected = ""
+
+    for match in matches:
+        # When there's more than 1 match, add the suffix to separate each line,
+        # but only where the match is not itself something that will cause
+        # Inscriptis to add a newline.
+        # (This way each 'match' reliably has a new-line in the diff)
+        breaks_line_itself = hasattr(match, 'tag') and match.tag in line_breaking_tags
+        if append_pretty_line_formatting and len(collected) and not breaks_line_itself:
+            collected += TEXT_FILTER_LIST_LINE_SUFFIX
+
+        match_type = type(match)
+        if match_type == str:
+            collected += match
+        elif issubclass(match_type, (etree._Element, etree._ElementTree)):
+            collected += etree.tostring(match, pretty_print=True).decode('utf-8')
+        else:
+            # Numbers, booleans, elementpath nodes, ...
+            collected += elementpath_tostring(match)
+
+    return collected
+
+# Return str Utf-8 of matched rules
+# 'xpath1:'
+def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
+ from lxml import etree, html
parser = None
if is_rss:
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index fc35b135..b8cf8a9e 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -173,6 +173,11 @@ class perform_site_check(difference_detection_processor):
html_content=self.fetcher.content,
append_pretty_line_formatting=not watch.is_source_type_url,
is_rss=is_rss)
+ elif filter_rule.startswith('xpath1:'):
+ html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
+ html_content=fetcher.content,
+ append_pretty_line_formatting=not is_source,
+ is_rss=is_rss)
else:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content += html_tools.include_filters(include_filters=filter_rule,
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index c00018c4..9522d582 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -847,4 +847,14 @@ class ChangeDetectionStore:
if not watch.get('date_created'):
self.data['watching'][uuid]['date_created'] = i
i+=1
- return
\ No newline at end of file
+ return
+
+ # #1774 - protect xpath1 against migration
+    def update_14(self):
+        """
+        Re-prefix the stored XPath include_filters of every watch with 'xpath1:'.
+
+        A filter starting with '/' gets 'xpath1:' prepended; a filter starting
+        with 'xpath:' has that first prefix replaced by 'xpath1:'. Every other
+        filter is left untouched. The lists are modified in place.
+        """
+        for uuid in self.__data["watching"]:
+            filters = self.__data["watching"][uuid]['include_filters']
+            if not filters:
+                continue
+
+            for position, rule in enumerate(filters):
+                if rule.startswith('/'):
+                    filters[position] = 'xpath1:' + rule
+                elif rule.startswith('xpath:'):
+                    # Only the leading prefix, never a later 'xpath:' inside the rule
+                    filters[position] = rule.replace('xpath:', 'xpath1:', 1)
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 270cdbce..e6882280 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -290,11 +290,12 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{% endif %}
-
XPath - Limit text to this XPath rule, simply start with a forward-slash,
+
XPath - Limit text to this XPath rule; simply start with a forward-slash. To use XPath explicitly, or when the rule starts with an XPath function, prefix it with xpath: