Filters - Add support for also removing HTML elements using XPath selectors (#2632)

pull/1945/head^2
Michael McMillan 3 months ago committed by GitHub
parent 8c1527c1ad
commit dc936a2e8a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -89,11 +89,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{{ render_field(form.subtractive_selectors, rows=5, placeholder="header {{ render_field(form.subtractive_selectors, rows=5, placeholder="header
footer footer
nav nav
.stockticker") }} .stockticker
//*[contains(text(), 'Advertisement')]") }}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li> Remove HTML element(s) by CSS selector before text conversion. </li> <li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
<li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li> <li> Don't paste HTML here, use only CSS and XPath selectors </li>
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
</ul> </ul>
</span> </span>
</fieldset> </fieldset>

@ -469,7 +469,7 @@ class processor_text_json_diff_form(commonSettingsForm):
include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='') include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='')
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])
extract_text = StringListField('Extract text', [ValidateListRegex()]) extract_text = StringListField('Extract text', [ValidateListRegex()])
@ -576,7 +576,7 @@ class globalSettingsApplicationForm(commonSettingsForm):
empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False) empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False)
fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_json=False)])
ignore_whitespace = BooleanField('Ignore whitespace') ignore_whitespace = BooleanField('Ignore whitespace')
password = SaltyPasswordField() password = SaltyPasswordField()
pager_size = IntegerField('Pager size', pager_size = IntegerField('Pager size',

@ -1,4 +1,5 @@
from typing import List from typing import List
from lxml import etree
import json import json
import re import re
@ -57,11 +58,26 @@ def subtractive_css_selector(css_selector, html_content):
item.decompose() item.decompose()
return str(soup) return str(soup)
def subtractive_xpath_selector(xpath_selector, html_content):
html_tree = etree.HTML(html_content)
elements_to_remove = html_tree.xpath(xpath_selector)
for element in elements_to_remove:
element.getparent().remove(element)
modified_html = etree.tostring(html_tree, method="html").decode("utf-8")
return modified_html
def element_removal(selectors: List[str], html_content): def element_removal(selectors: List[str], html_content):
"""Joins individual filters into one css filter.""" """Removes elements that match a list of CSS or xPath selectors."""
selector = ",".join(selectors) modified_html = html_content
return subtractive_css_selector(selector, html_content) for selector in selectors:
if selector.startswith(('xpath:', 'xpath1:', '//')):
xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
else:
modified_html = subtractive_css_selector(selector, modified_html)
return modified_html
def elementpath_tostring(obj): def elementpath_tostring(obj):
""" """

@ -15,7 +15,7 @@
<strong>Tip:</strong> Use <a target=_new href="https://github.com/caronc/apprise">AppRise Notification URLs</a> for notification to just about any service! <i><a target=_new href="https://github.com/dgtlmoon/changedetection.io/wiki/Notification-configuration-notes">Please read the notification services wiki here for important configuration notes</a></i>.<br> <strong>Tip:</strong> Use <a target=_new href="https://github.com/caronc/apprise">AppRise Notification URLs</a> for notification to just about any service! <i><a target=_new href="https://github.com/dgtlmoon/changedetection.io/wiki/Notification-configuration-notes">Please read the notification services wiki here for important configuration notes</a></i>.<br>
</p> </p>
<div data-target="#advanced-help-notifications" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div> <div data-target="#advanced-help-notifications" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</div>
<ul style="display: none" id="advanced-help-notifications"> <ul style="display: none" id="advanced-help-notifications">
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_discord">discord://</a></code> (or <code>https://discord.com/api/webhooks...</code>)) only supports a maximum <strong>2,000 characters</strong> of notification text, including the title.</li> <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_discord">discord://</a></code> (or <code>https://discord.com/api/webhooks...</code>)) only supports a maximum <strong>2,000 characters</strong> of notification text, including the title.</li>
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> bots can't send messages to other bots, so you should specify chat ID of non-bot user.</li> <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> bots can't send messages to other bots, so you should specify chat ID of non-bot user.</li>
<li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> only supports very limited HTML and can fail when extra tags are sent, <a href="https://core.telegram.org/bots/api#html-style">read more here</a> (or use plaintext/markdown format)</li> <li><code><a target=_new href="https://github.com/caronc/apprise/wiki/Notify_telegram">tgram://</a></code> only supports very limited HTML and can fail when extra tags are sent, <a href="https://core.telegram.org/bots/api#html-style">read more here</a> (or use plaintext/markdown format)</li>

@ -310,12 +310,13 @@ xpath://body/div/span[contains(@class, 'example-class')]",
{{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header {{ render_field(form.subtractive_selectors, rows=5, placeholder=has_tag_filters_extra+"header
footer footer
nav nav
.stockticker") }} .stockticker
//*[contains(text(), 'Advertisement')]") }}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li> Remove HTML element(s) by CSS selector before text conversion. </li> <li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
<li> Don't paste HTML here, use only CSS selectors </li> <li> Don't paste HTML here, use only CSS and XPath selectors </li>
<li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li> <li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
</ul> </ul>
</span> </span>
</fieldset> </fieldset>

@ -155,11 +155,13 @@
{{ render_field(form.application.form.global_subtractive_selectors, rows=5, placeholder="header {{ render_field(form.application.form.global_subtractive_selectors, rows=5, placeholder="header
footer footer
nav nav
.stockticker") }} .stockticker
//*[contains(text(), 'Advertisement')]") }}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li> Remove HTML element(s) by CSS selector before text conversion. </li> <li> Remove HTML element(s) by CSS and XPath selectors before text conversion. </li>
<li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li> <li> Don't paste HTML here, use only CSS and XPath selectors </li>
<li> Add multiple elements, CSS or XPath selectors per line to ignore multiple parts of the HTML. </li>
</ul> </ul>
</span> </span>
</fieldset> </fieldset>

@ -87,6 +87,9 @@ def test_element_removal_output():
Some initial text<br> Some initial text<br>
<p>across multiple lines</p> <p>across multiple lines</p>
<div id="changetext">Some text that changes</div> <div id="changetext">Some text that changes</div>
<div>Some text should be matched by xPath // selector</div>
<div>Some text should be matched by xPath selector</div>
<div>Some text should be matched by xPath1 selector</div>
</body> </body>
<footer> <footer>
<p>Footer</p> <p>Footer</p>
@ -94,7 +97,16 @@ def test_element_removal_output():
</html> </html>
""" """
html_blob = element_removal( html_blob = element_removal(
["header", "footer", "nav", "#changetext"], html_content=content [
"header",
"footer",
"nav",
"#changetext",
"//*[contains(text(), 'xPath // selector')]",
"xpath://*[contains(text(), 'xPath selector')]",
"xpath1://*[contains(text(), 'xPath1 selector')]"
],
html_content=content
) )
text = get_text(html_blob) text = get_text(html_blob)
assert ( assert (

Loading…
Cancel
Save