From dc936a2e8a631268d637bee65ca3e99d5396eacf Mon Sep 17 00:00:00 2001 From: Michael McMillan Date: Tue, 17 Sep 2024 22:43:04 +0200 Subject: [PATCH] Filters - Add support for also removing HTML elements using XPath selectors (#2632) --- .../blueprint/tags/templates/edit-tag.html | 8 ++++--- changedetectionio/forms.py | 4 ++-- changedetectionio/html_tools.py | 22 ++++++++++++++++--- .../templates/_common_fields.html | 2 +- changedetectionio/templates/edit.html | 9 ++++---- changedetectionio/templates/settings.html | 8 ++++--- .../tests/test_element_removal.py | 14 +++++++++++- 7 files changed, 50 insertions(+), 17 deletions(-) diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html index 99f86005..a713cf6a 100644 --- a/changedetectionio/blueprint/tags/templates/edit-tag.html +++ b/changedetectionio/blueprint/tags/templates/edit-tag.html @@ -89,11 +89,13 @@ xpath://body/div/span[contains(@class, 'example-class')]", {{ render_field(form.subtractive_selectors, rows=5, placeholder="header footer nav -.stockticker") }} +.stockticker +//*[contains(text(), 'Advertisement')]") }} diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 1b718cfe..5011afaf 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -469,7 +469,7 @@ class processor_text_json_diff_form(commonSettingsForm): include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='') - subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) + subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)]) extract_text = StringListField('Extract text', [ValidateListRegex()]) @@ -576,7 +576,7 @@ class globalSettingsApplicationForm(commonSettingsForm): empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False) fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) - global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) + global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)]) ignore_whitespace = BooleanField('Ignore whitespace') password = SaltyPasswordField() pager_size = IntegerField('Pager size', diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index ffe00cd0..7c2e1eba 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -1,4 +1,5 @@ from typing import List +from lxml import etree import json import re @@ -57,11 +58,26 @@ def subtractive_css_selector(css_selector, html_content): item.decompose() return str(soup) +def subtractive_xpath_selector(xpath_selector, html_content): + html_tree = etree.HTML(html_content) + elements_to_remove = html_tree.xpath(xpath_selector) + + for element in elements_to_remove: + element.getparent().remove(element) + + modified_html = etree.tostring(html_tree, method="html").decode("utf-8") + return modified_html def element_removal(selectors: List[str], html_content): - """Joins individual filters into one css filter.""" - selector = ",".join(selectors) - return subtractive_css_selector(selector, html_content) + """Removes elements that match a list of CSS or xPath selectors.""" + modified_html = html_content + for selector in selectors: + if selector.startswith(('xpath:', 'xpath1:', '//')): + xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:') + modified_html = subtractive_xpath_selector(xpath_selector, modified_html) + else: + modified_html = subtractive_css_selector(selector, modified_html) + return modified_html def elementpath_tostring(obj): """ diff --git a/changedetectionio/templates/_common_fields.html b/changedetectionio/templates/_common_fields.html index 0b80a701..9447f903 100644 --- a/changedetectionio/templates/_common_fields.html +++ b/changedetectionio/templates/_common_fields.html @@ -15,7 +15,7 @@ Tip: Use AppRise Notification URLs for notification to just about any service! Please read the notification services wiki here for important configuration notes.

Show advanced help and tips
-