From dc936a2e8a631268d637bee65ca3e99d5396eacf Mon Sep 17 00:00:00 2001
From: Michael McMillan
Date: Tue, 17 Sep 2024 22:43:04 +0200
Subject: [PATCH] Filters - Add support for also removing HTML elements using
XPath selectors (#2632)
---
.../blueprint/tags/templates/edit-tag.html | 8 ++++---
changedetectionio/forms.py | 4 ++--
changedetectionio/html_tools.py | 22 ++++++++++++++++---
.../templates/_common_fields.html | 2 +-
changedetectionio/templates/edit.html | 9 ++++----
changedetectionio/templates/settings.html | 8 ++++---
.../tests/test_element_removal.py | 14 +++++++++++-
7 files changed, 50 insertions(+), 17 deletions(-)
diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html
index 99f86005..a713cf6a 100644
--- a/changedetectionio/blueprint/tags/templates/edit-tag.html
+++ b/changedetectionio/blueprint/tags/templates/edit-tag.html
@@ -89,11 +89,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{{ render_field(form.subtractive_selectors, rows=5, placeholder="header
footer
nav
-.stockticker") }}
+.stockticker
+//*[contains(text(), 'Advertisement')]") }}
- - Remove HTML element(s) by CSS selector before text conversion.
- - Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML.
+ - Remove HTML element(s) by CSS and XPath selectors before text conversion.
+ - Don't paste HTML here; use only CSS and XPath selectors.
+ - Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML.
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 1b718cfe..5011afaf 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -469,7 +469,7 @@ class processor_text_json_diff_form(commonSettingsForm):
include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='')
- subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
+ subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])
extract_text = StringListField('Extract text', [ValidateListRegex()])
@@ -576,7 +576,7 @@ class globalSettingsApplicationForm(commonSettingsForm):
empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False)
fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
- global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
+ global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])
ignore_whitespace = BooleanField('Ignore whitespace')
password = SaltyPasswordField()
pager_size = IntegerField('Pager size',
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index ffe00cd0..7c2e1eba 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -1,4 +1,5 @@
from typing import List
+from lxml import etree
import json
import re
@@ -57,11 +58,26 @@ def subtractive_css_selector(css_selector, html_content):
item.decompose()
return str(soup)
+def subtractive_xpath_selector(xpath_selector, html_content):
+ html_tree = etree.HTML(html_content)
+ elements_to_remove = html_tree.xpath(xpath_selector)
+
+ for element in elements_to_remove:
+ element.getparent().remove(element)
+
+ modified_html = etree.tostring(html_tree, method="html").decode("utf-8")
+ return modified_html
def element_removal(selectors: List[str], html_content):
- """Joins individual filters into one css filter."""
- selector = ",".join(selectors)
- return subtractive_css_selector(selector, html_content)
+ """Removes elements that match a list of CSS or XPath selectors."""
+ modified_html = html_content
+ for selector in selectors:
+ if selector.startswith(('xpath:', 'xpath1:', '//')):
+ xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
+ modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
+ else:
+ modified_html = subtractive_css_selector(selector, modified_html)
+ return modified_html
def elementpath_tostring(obj):
"""
diff --git a/changedetectionio/templates/_common_fields.html b/changedetectionio/templates/_common_fields.html
index 0b80a701..9447f903 100644
--- a/changedetectionio/templates/_common_fields.html
+++ b/changedetectionio/templates/_common_fields.html
@@ -15,7 +15,7 @@
Tip: Use AppRise Notification URLs for notification to just about any service! Please read the notification services wiki here for important configuration notes.
Show advanced help and tips
-
+
discord://
(or https://discord.com/api/webhooks...
)) only supports a maximum 2,000 characters of notification text, including the title.
tgram://
bots can't send messages to other bots, so you should specify chat ID of non-bot user.
tgram://
only supports very limited HTML and can fail when extra tags are sent, read more here (or use plaintext/markdown format)
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 9d974c44..c5cc9725 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -310,12 +310,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
footer
nav
-.stockticker") }}
+.stockticker
+//*[contains(text(), 'Advertisement')]") }}
- - Remove HTML element(s) by CSS selector before text conversion.
- - Don't paste HTML here, use only CSS selectors
- - Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML.
+ - Remove HTML element(s) by CSS and XPath selectors before text conversion.
+ - Don't paste HTML here; use only CSS and XPath selectors.
+ - Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML.
diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html
index f1131f94..e9911770 100644
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -155,11 +155,13 @@
{{ render_field(form.application.form.global_subtractive_selectors, rows=5, placeholder="header
footer
nav
-.stockticker") }}
+.stockticker
+//*[contains(text(), 'Advertisement')]") }}
- - Remove HTML element(s) by CSS selector before text conversion.
- - Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML.
+ - Remove HTML element(s) by CSS and XPath selectors before text conversion.
+ - Don't paste HTML here; use only CSS and XPath selectors.
+ - Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML.
diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py
index 121d2ab9..b7fda560 100644
--- a/changedetectionio/tests/test_element_removal.py
+++ b/changedetectionio/tests/test_element_removal.py
@@ -87,6 +87,9 @@ def test_element_removal_output():
Some initial text
across multiple lines
Some text that changes
+ Some text should be matched by xPath // selector
+ Some text should be matched by xPath selector
+ Some text should be matched by xPath1 selector