From 59d31bf76f395f95433908829301fd317df6bc4c Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Wed, 5 Jan 2022 17:58:07 +0100
Subject: [PATCH] XPath support (#355)

* XPath support and minor improvements to form validation
---
 README.md                                     |   2 +
 changedetectionio/fetch_site_status.py        |  10 +-
 changedetectionio/forms.py                    |  24 +++-
 changedetectionio/html_tools.py               |  15 +++
 changedetectionio/templates/edit.html         |   4 +-
 .../tests/test_xpath_selector.py              | 118 ++++++++++++++++++
 requirements.txt                              |   5 +-
 7 files changed, 170 insertions(+), 8 deletions(-)
 create mode 100644 changedetectionio/tests/test_xpath_selector.py
diff --git a/README.md b/README.md
index 77a45f6d..97dcc408 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,8 @@ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/dat
 ```bash
 docker-compose pull && docker-compose up -d
 ```
+### Filters
+XPath, JSONPath and CSS support comes baked in! You can be as specific as you need; for example, you can use XPath expressions exported from the various XPath element query creation tools.
 
 ### Notifications
 
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 28c27420..7f678657 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -114,15 +114,17 @@ class perform_site_check():
                 if 'json:' in css_filter_rule:
                     stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
                     is_html = False
-                else:
-                    # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                    stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
 
             if is_html:
                 # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                 html_content = fetcher.content
                 if has_filter_rule:
-                    html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+                    # A rule starting with "/" is XPath; strip first so detection matches the form validator, which also strips
+                    if css_filter_rule.strip().startswith('/'):
+                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                    else:
+                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
 
                 # get_text() via inscriptis
                 stripped_text_from_html = get_text(html_content)
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 020d9fa8..bd40435a 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -181,7 +181,7 @@ class ValidateListRegex(object):
                     message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                     raise ValidationError(message % (line))
 
-class ValidateCSSJSONInput(object):
+class ValidateCSSJSONXPATHInput(object):
     """
     Filter validation
     @todo CSS validator ;)
@@ -191,6 +191,24 @@ class ValidateCSSJSONInput(object):
         self.message = message
 
     def __call__(self, form, field):
+
+        # A blank filter means "no filter", there is nothing to validate
+        if not field.data.strip():
+            return
+
+        # A rule that starts with "/" is treated as XPath
+        if field.data.strip()[0] == '/':
+            from lxml import html, etree
+            tree = html.fromstring("<html></html>")
+
+            try:
+                tree.xpath(field.data.strip())
+            except etree.XPathEvalError as e:
+                message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+                raise ValidationError(message % (field.data, str(e)))
+            except Exception:
+                raise ValidationError("A system-error occurred when validating your XPath expression")
+
         if 'json:' in field.data:
             from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
             from jsonpath_ng.ext import parse
@@ -202,6 +220,8 @@ class ValidateCSSJSONInput(object):
             except (JsonPathParserError, JsonPathLexerError) as e:
                 message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
                 raise ValidationError(message % (input, str(e)))
+            except Exception:
+                raise ValidationError("A system-error occurred when validating your JSONPath expression")
 
             # Re #265 - maybe in the future fetch the page and offer a
             # warning/notice that its possible the rule doesnt yet match anything?
@@ -229,7 +249,7 @@ class watchForm(commonSettingsForm):
 
     minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
                                                [validators.Optional(), validators.NumberRange(min=1)])
-    css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
+    css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
     title = StringField('Title')
 
     ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 5c795c23..7a6b91c6 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -16,6 +16,42 @@ def css_filter(css_filter, html_content):
 
     return html_block + "\n"
 
+
+def xpath_filter(xpath_filter, html_content):
+    """Return everything in html_content matched by an XPath rule as one string.
+
+    :param xpath_filter: XPath expression, surrounding whitespace is ignored.
+    :param html_content: HTML/XML markup to search.
+    :return: every match followed by "<br/>". Elements are serialised as
+             markup, any other result (attribute value, text(), number,
+             boolean) is emitted as escaped text. "" when nothing matches.
+    """
+    from html import escape
+
+    from lxml import html
+    from lxml import etree
+
+    tree = html.fromstring(html_content)
+    results = tree.xpath(xpath_filter.strip())
+
+    # Expressions such as "//a = 'x'" evaluate to a bare bool/float/str
+    # instead of a list of matches.
+    if not isinstance(results, list):
+        results = [results]
+
+    blocks = []
+    for item in results:
+        if etree.iselement(item):
+            blocks.append(etree.tostring(item, pretty_print=True).decode('utf-8'))
+        else:
+            # "//a/@href", "//p/text()" etc. yield strings, which
+            # etree.tostring() rejects with a TypeError. Escape them because
+            # the caller treats the returned string as HTML.
+            blocks.append(escape(str(item), quote=False))
+
+    return "".join(block + "<br/>" for block in blocks)
+
+
 # Extract/find element
 def extract_element(find='title', html_content=''):
 
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index f30c0705..466b7318 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -95,8 +95,10 @@ User-Agent: wonderbra 1.0") }}
                         <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                         <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
                                 href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
+                        <li>XPATH - Limit text to this XPath rule; the rule must start with a forward-slash, for example <b>//*[contains(@class, 'sametext')]</b>, <a
+                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                     </ul>
-                    Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! <a
+                    Please be sure that you thoroughly understand how to write CSS, JSONPath or XPath selector rules before filing an issue on GitHub! <a
                                 href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
                 </span>
                     </div>
diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py
new file mode 100644
index 00000000..c5646c81
--- /dev/null
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -0,0 +1,118 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+from ..html_tools import *
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+def set_original_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <div class="sametext">Some text thats the same</div>
+     <div class="changetext">Some text that will change</div>
+     </body>
+     </html>
+    """
+    # Write the baseline page to the fixture file (presumably what the 'test_endpoint' route serves - it is defined outside this file)
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
+def set_modified_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  THIS CHANGES AND SHOULDNT TRIGGER A CHANGE</br>
+     <div class="sametext">Some text thats the same</div>
+     <div class="changetext">Some new text</div>
+     </body>
+     </html>
+    """
+    # Overwrite the fixture file: the body text and the 'changetext' div differ from the baseline, the 'sametext' div does not
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None
+
+
+def test_check_markup_xpath_filter_restriction(client, live_server):
+    sleep_time_for_fetch_thread = 3
+    # Filter on the element that stays the same, so changes elsewhere on the page must not be reported
+    xpath_filter = "//*[contains(@class, 'sametext')]"
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Go to the edit page and set the XPath rule as this watch's filter
+    # (XPath rules are submitted through the same "css_filter" field as CSS/JSON rules)
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # View the diff page to reset the watch state back to "viewed"
+    client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True)
+
+    # Change the page content outside of the filtered 'sametext' element
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+    # Nothing inside the XPath-filtered element changed, so the watch must not be flagged as unviewed
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+def test_xpath_validation(client, live_server):
+    # An invalid XPath rule must be rejected by the edit form with a validation error
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    # The rule starts with "/" so it is validated as XPath, but it is not a valid expression
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"is not a valid XPath expression" in res.data
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 23583d11..688ad92b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,8 +26,11 @@ paho-mqtt
 # ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
 cryptography ~= 3.4
 
-# Used for CSS filtering, replace with soupsieve and lxml for xpath
+# Used for CSS filtering
 bs4
 
+# XPath filtering. lxml is only an optional parser for bs4, not a hard dependency, so list it explicitly.
+lxml
+
 # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
 selenium ~= 4.1.0