From b72ecfa85245cbcafa1f5273a05ca6aa4d0709de Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Jun 2023 15:22:13 +0200 Subject: [PATCH] WIP --- changedetectionio/forms.py | 3 ++- changedetectionio/html_tools.py | 3 ++- changedetectionio/tests/test_xpath_selector.py | 9 +++++---- requirements.txt | 2 ++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 7199b445..3cbe20db 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -316,10 +316,11 @@ class ValidateCSSJSONXPATHInput(object): if not self.allow_xpath: raise ValidationError("XPath not permitted in this field!") from lxml import etree, html + import elementpath tree = html.fromstring("") try: - tree.xpath(line.strip()) + elementpath.select(tree, line) except etree.XPathEvalError as e: message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') raise ValidationError(message % (line, str(e))) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 0cdaeea4..eeaa68f4 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -51,12 +51,13 @@ def element_removal(selectors: List[str], html_content): # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False): + import elementpath from lxml import etree, html tree = html.fromstring(bytes(html_content, encoding='utf-8')) html_block = "" - r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}) + r = elementpath.select(tree, xpath_filter.strip()) #@note: //title/text() wont work where CDATA.. for element in r: diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py index fa8f4e8d..a40ec8b1 100644 --- a/changedetectionio/tests/test_xpath_selector.py +++ b/changedetectionio/tests/test_xpath_selector.py @@ -2,7 +2,7 @@ import time from flask import url_for -from . util import live_server_setup +from .util import live_server_setup, wait_for_all_checks from ..html_tools import * @@ -164,6 +164,7 @@ def test_check_xpath_text_function_utf8(client, live_server): assert b'Deleted' in res.data def test_check_markup_xpath_filter_restriction(client, live_server): + live_server_setup(live_server) sleep_time_for_fetch_thread = 3 xpath_filter = "//*[contains(@class, 'sametext')]" @@ -183,7 +184,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server): assert b"1 Imported" in res.data # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # Goto the edit page, add our ignore text # Add our URL to the import page @@ -195,7 +196,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server): assert b"Updated watch." in res.data # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # view it/reset state back to viewed client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True) @@ -206,7 +207,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server): # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) res = client.get(url_for("index")) assert b'unviewed' not in res.data diff --git a/requirements.txt b/requirements.txt index 2e53e1b7..6d51bfd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -42,6 +42,8 @@ paho-mqtt # (introduced once apprise became a dep) cryptography~=3.4 +elementpath + # Used for CSS filtering beautifulsoup4