From 59d31bf76f395f95433908829301fd317df6bc4c Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 5 Jan 2022 17:58:07 +0100 Subject: [PATCH] XPath support (#355) * XPath support and minor improvements to form validation --- README.md | 2 + changedetectionio/fetch_site_status.py | 10 +- changedetectionio/forms.py | 24 +++- changedetectionio/html_tools.py | 15 +++ changedetectionio/templates/edit.html | 4 +- .../tests/test_xpath_selector.py | 118 ++++++++++++++++++ requirements.txt | 5 +- 7 files changed, 170 insertions(+), 8 deletions(-) create mode 100644 changedetectionio/tests/test_xpath_selector.py diff --git a/README.md b/README.md index 77a45f6d..97dcc408 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,8 @@ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/dat ```bash docker-compose pull && docker-compose up -d ``` +### Filters +XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, and you can use XPath expressions exported from various XPath element query creation tools. 
### Notifications diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 28c27420..7f678657 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -114,15 +114,17 @@ class perform_site_check(): if 'json:' in css_filter_rule: stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) is_html = False - else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) if is_html: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = fetcher.content if has_filter_rule: - html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) + # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." + if css_filter_rule[0] == '/': + html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content) + else: + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) # get_text() via inscriptis stripped_text_from_html = get_text(html_content) diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 020d9fa8..bd40435a 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -181,7 +181,7 @@ class ValidateListRegex(object): message = field.gettext('RegEx \'%s\' is not a valid regular expression.') raise ValidationError(message % (line)) -class ValidateCSSJSONInput(object): +class ValidateCSSJSONXPATHInput(object): """ Filter validation @todo CSS validator ;) @@ -191,6 +191,24 @@ class ValidateCSSJSONInput(object): self.message = message def __call__(self, form, field): + + # Nothing 
to see here + if not len(field.data.strip()): + return + + # Does it look like XPath? + if field.data.strip()[0] == '/': + from lxml import html, etree + tree = html.fromstring("") + + try: + tree.xpath(field.data.strip()) + except etree.XPathEvalError as e: + message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') + raise ValidationError(message % (field.data, str(e))) + except: + raise ValidationError("A system-error occurred when validating your XPath expression") + if 'json:' in field.data: from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError from jsonpath_ng.ext import parse @@ -202,6 +220,8 @@ class ValidateCSSJSONInput(object): except (JsonPathParserError, JsonPathLexerError) as e: message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)') raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your JSONPath expression") # Re #265 - maybe in the future fetch the page and offer a # warning/notice that its possible the rule doesnt yet match anything? 
@@ -229,7 +249,7 @@ class watchForm(commonSettingsForm): minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck', [validators.Optional(), validators.NumberRange(min=1)]) - css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()]) + css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()]) title = StringField('Title') ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 5c795c23..7a6b91c6 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -16,6 +16,21 @@ def css_filter(css_filter, html_content): return html_block + "\n" + +# Return str Utf-8 of matched rules +def xpath_filter(xpath_filter, html_content): + from lxml import html + from lxml import etree + + tree = html.fromstring(html_content) + html_block = "" + + for item in tree.xpath(xpath_filter.strip()): + html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"
" + + return html_block + + # Extract/find element def extract_element(find='title', html_content=''): diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index f30c0705..466b7318 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -95,8 +95,10 @@ User-Agent: wonderbra 1.0") }}
  • CSS - Limit text to this CSS rule, only text matching this CSS rule is included.
  • JSON - Limit text to this JSON rule, using JSONPath, prefix with "json:", test your JSONPath here
  • +
  • XPATH - Limit text to this XPath rule; simply start the rule with a forward-slash, for example //*[contains(@class, 'sametext')]. Test your XPath here
  • - Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! here for more CSS selector help.
    diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py new file mode 100644 index 00000000..c5646c81 --- /dev/null +++ b/changedetectionio/tests/test_xpath_selector.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup + +from ..html_tools import * + +def test_setup(live_server): + live_server_setup(live_server) + +def set_original_response(): + test_return_data = """ + + Some initial text
    +

    Which is across multiple lines

    +
    + So let's see what happens.
    +
    Some text thats the same
    +
    Some text that will change
    + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + +def set_modified_response(): + test_return_data = """ + + Some initial text
    +

    Which is across multiple lines

    +
    + So let's see what happens. THIS CHANGES AND SHOULDNT TRIGGER A CHANGE
    +
    Some text thats the same
    +
    Some new text
    + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + return None + + +def test_check_markup_xpath_filter_restriction(client, live_server): + sleep_time_for_fetch_thread = 3 + + xpath_filter = "//*[contains(@class, 'sametext')]" + + set_original_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." 
in res.data + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # view it/reset state back to viewed + client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True) + + # Make a change + set_modified_response() + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + +def test_xpath_validation(client, live_server): + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"is not a valid XPath expression" in res.data \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 23583d11..688ad92b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,8 +26,11 @@ paho-mqtt # ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly cryptography ~= 3.4 -# Used for CSS filtering, replace with soupsieve and lxml for xpath +# Used for CSS filtering bs4 +# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe. +lxml + # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0 selenium ~= 4.1.0