Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes

pull/535/head
dgtlmoon 3 years ago committed by GitHub
parent 1890881977
commit 9809af142d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -641,6 +641,7 @@ def changedetection_app(config=None, datastore_o=None):
form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors'] form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace'] form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
form.render_anchor_tag_content.data = datastore.data['settings']['application']['render_anchor_tag_content']
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend'] form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
form.notification_title.data = datastore.data['settings']['application']['notification_title'] form.notification_title.data = datastore.data['settings']['application']['notification_title']
@ -671,6 +672,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data
datastore.data['settings']['application']['render_anchor_tag_content'] = form.render_anchor_tag_content.data
if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password: if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password:
datastore.data['settings']['application']['password'] = form.password.encrypted_password datastore.data['settings']['application']['password'] = form.password.encrypted_password

@ -4,7 +4,6 @@ import re
import time import time
import urllib3 import urllib3
from inscriptis import get_text
from changedetectionio import content_fetcher, html_tools from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -88,7 +87,7 @@ class perform_site_check():
has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip()) has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
if is_json and not has_filter_rule: if is_json and not has_filter_rule:
css_filter_rule = "json:$" css_filter_rule = "json:$"
has_filter_rule = True has_filter_rule = True
@ -117,9 +116,14 @@ class perform_site_check():
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
if has_subtractive_selectors: if has_subtractive_selectors:
html_content = html_tools.element_removal(subtractive_selectors, html_content) html_content = html_tools.element_removal(subtractive_selectors, html_content)
# get_text() via inscriptis # extract text
stripped_text_from_html = get_text(html_content) stripped_text_from_html = \
html_tools.html_to_text(
html_content,
render_anchor_tag_content=self.datastore.data["settings"][
"application"].get(
"render_anchor_tag_content", False)
)
# Re #340 - return the content before the 'ignore text' was applied # Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

@ -231,7 +231,7 @@ class ValidateListRegex(object):
except re.error: except re.error:
message = field.gettext('RegEx \'%s\' is not a valid regular expression.') message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
raise ValidationError(message % (line)) raise ValidationError(message % (line))
class ValidateCSSJSONXPATHInput(object): class ValidateCSSJSONXPATHInput(object):
""" """
Filter validation Filter validation
@ -293,7 +293,7 @@ class ValidateCSSJSONXPATHInput(object):
# Re #265 - maybe in the future fetch the page and offer a # Re #265 - maybe in the future fetch the page and offer a
# warning/notice that its possible the rule doesnt yet match anything? # warning/notice that its possible the rule doesnt yet match anything?
class quickWatchForm(Form): class quickWatchForm(Form):
# https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5 # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
# `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run # `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
@ -352,6 +352,10 @@ class globalSettingsForm(commonSettingsForm):
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
ignore_whitespace = BooleanField('Ignore whitespace') ignore_whitespace = BooleanField('Ignore whitespace')
render_anchor_tag_content = BooleanField('Render Anchor Tag Content',
default=False)
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?') real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?')
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"}) removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})

@ -4,6 +4,9 @@ from typing import List
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse from jsonpath_ng.ext import parse
import re
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
class JSONNotFound(ValueError): class JSONNotFound(ValueError):
@ -25,12 +28,12 @@ def subtractive_css_selector(css_selector, html_content):
item.decompose() item.decompose()
return str(soup) return str(soup)
def element_removal(selectors: List[str], html_content): def element_removal(selectors: List[str], html_content):
"""Joins individual filters into one css filter.""" """Joins individual filters into one css filter."""
selector = ",".join(selectors) selector = ",".join(selectors)
return subtractive_css_selector(selector, html_content) return subtractive_css_selector(selector, html_content)
# Return str Utf-8 of matched rules # Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content): def xpath_filter(xpath_filter, html_content):
@ -167,3 +170,35 @@ def strip_ignore_text(content, wordlist, mode="content"):
return ignored_line_numbers return ignored_line_numbers
return "\n".encode('utf8').join(output) return "\n".encode('utf8').join(output)
def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
"""Converts html string to a string with just the text. If ignoring
rendering anchor tag content is enable, anchor tag content are also
included in the text
:param html_content: string with html content
:param render_anchor_tag_content: boolean flag indicating whether to extract
hyperlinks (the anchor tag content) together with text. This refers to the
'href' inside 'a' tags.
Anchor tag content is rendered in the following manner:
'[ text ](anchor tag content)'
:return: extracted text from the HTML
"""
# if anchor tag content flag is set to True define a config for
# extracting this content
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]}, display_links=True
)
# otherwise set config to None
else:
parser_config = None
# get text and annotations via inscriptis
text_content = get_text(html_content, config=parser_config)
return text_content

@ -52,6 +52,7 @@ class ChangeDetectionStore:
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [], 'global_subtractive_selectors': [],
'ignore_whitespace': False, 'ignore_whitespace': False,
'render_anchor_tag_content': False,
'notification_urls': [], # Apprise URL list 'notification_urls': [], # Apprise URL list
# Custom notification content # Custom notification content
'notification_title': default_notification_title, 'notification_title': default_notification_title,

@ -91,10 +91,16 @@
<fieldset class="pure-group"> <fieldset class="pure-group">
{{ render_field(form.ignore_whitespace) }} {{ render_field(form.ignore_whitespace) }}
<span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/> <span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/>
<i>Note:</i> Changing this will change the status of your existing watches, possibily trigger alerts etc. <i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
</span>
</fieldset>
<fieldset class="pure-group">
{{ render_field(form.render_anchor_tag_content) }}
<span class="pure-form-message-inline">Render anchor tag content, default disabled, when enabled renders links as <code>(link text)[https://somesite.com]</code>
<br/>
<i>Note:</i> Changing this could affect the content of your existing watches, possibly trigger alerts etc.
</span> </span>
</fieldset> </fieldset>
<fieldset class="pure-group"> <fieldset class="pure-group">
{{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header {{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
footer footer

@ -26,7 +26,8 @@ def test_snapshot_api_detects_change(client, live_server):
time.sleep(1) time.sleep(1)
# Add our URL to the import page # Add our URL to the import page
test_url = url_for('test_endpoint', content_type="text/plain", _external=True) test_url = url_for('test_endpoint', content_type="text/plain",
_external=True)
res = client.post( res = client.post(
url_for("import_page"), url_for("import_page"),
data={"urls": test_url}, data={"urls": test_url},

@ -0,0 +1,38 @@
#!/usr/bin/python3
"""Test suite for the method to extract text from an html string"""
from ..html_tools import html_to_text
def test_html_to_text_func():
test_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
<a href="/first_link"> More Text </a>
</br>
So let's see what happens. </br>
<a href="second_link.com"> Even More Text </a>
</body>
</html>
"""
# extract text, with 'render_anchor_tag_content' set to False
text_content = html_to_text(test_html, render_anchor_tag_content=False)
no_links_text = \
"Some initial text\n\nWhich is across multiple " \
"lines\n\nMore Text So let's see what happens. Even More Text"
# check that no links are in the extracted text
assert text_content == no_links_text
# extract text, with 'render_anchor_tag_content' set to True
text_content = html_to_text(test_html, render_anchor_tag_content=True)
links_text = \
"Some initial text\n\nWhich is across multiple lines\n\n[ More Text " \
"](/first_link) So let's see what happens. [ Even More Text ]" \
"(second_link.com)"
# check that links are present in the extracted text
assert text_content == links_text

@ -0,0 +1,219 @@
#!/usr/bin/python3
"""Test suite for the render/not render anchor tag content functionality"""
import time
from flask import url_for
from .util import live_server_setup
def test_setup(live_server):
live_server_setup(live_server)
def set_original_ignore_response():
test_return_data = """<html>
<body>
Some initial text</br>
<a href="/original_link"> Some More Text </a>
</br>
So let's see what happens. </br>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
# Should be the same as set_original_ignore_response() but with a different
# link
def set_modified_ignore_response():
test_return_data = """<html>
<body>
Some initial text</br>
<a href="/modified_link"> Some More Text </a>
</br>
So let's see what happens. </br>
</body>
</html>
"""
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(test_return_data)
def test_render_anchor_tag_content_true(client, live_server):
"""Testing that the link changes are detected when
render_anchor_tag_content setting is set to true"""
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
# set original html text
set_original_ignore_response()
# Goto the settings page, choose not to ignore links
res = client.post(
url_for("settings_page"),
data={
"minutes_between_check": 180,
"render_anchor_tag_content": "true",
"fetch_backend": "html_requests",
},
follow_redirects=True,
)
assert b"Settings updated." in res.data
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
res = client.post(
url_for("import_page"), data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# set a new html text with a modified link
set_modified_ignore_response()
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# check that the anchor tag content is rendered
res = client.get(url_for("preview_page", uuid="first"))
assert '(/modified_link)' in res.data.decode()
# since the link has changed, and we chose to render anchor tag content,
# we should detect a change (new 'unviewed' class)
res = client.get(url_for("index"))
assert b"unviewed" in res.data
assert b"/test-endpoint" in res.data
# Cleanup everything
res = client.get(url_for("api_delete", uuid="all"),
follow_redirects=True)
assert b'Deleted' in res.data
def test_render_anchor_tag_content_false(client, live_server):
"""Testing that anchor tag content changes are ignored when
render_anchor_tag_content setting is set to false"""
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
# set the original html text
set_original_ignore_response()
# Goto the settings page, choose to ignore hyperlinks
res = client.post(
url_for("settings_page"),
data={
"minutes_between_check": 180,
"render_anchor_tag_content": "false",
"fetch_backend": "html_requests",
},
follow_redirects=True,
)
assert b"Settings updated." in res.data
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
res = client.post(
url_for("import_page"), data={"urls": test_url}, follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# set a new html text, with a modified link
set_modified_ignore_response()
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# check that the anchor tag content is not rendered
res = client.get(url_for("preview_page", uuid="first"))
assert '(/modified_link)' not in res.data.decode()
# even though the link has changed, we shouldn't detect a change since
# we selected to not render anchor tag content (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b"unviewed" not in res.data
assert b"/test-endpoint" in res.data
# Cleanup everything
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_render_anchor_tag_content_default(client, live_server):
"""Testing that anchor tag content changes are ignored when the
render_anchor_tag_content setting is not explicitly selected"""
sleep_time_for_fetch_thread = 3
# Give the endpoint time to spin up
time.sleep(1)
# set the original html text
set_original_ignore_response()
# Goto the settings page, not passing the render_anchor_tag_content setting
res = client.post(
url_for("settings_page"),
data={
"minutes_between_check": 180,
"fetch_backend": "html_requests",
},
follow_redirects=True,
)
assert b"Settings updated." in res.data
# Add our URL to the import page
test_url = url_for("test_endpoint", _external=True)
res = client.post(
url_for("import_page"), data={"urls": test_url}, follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# set a new html text, with a modified link
set_modified_ignore_response()
time.sleep(sleep_time_for_fetch_thread)
# Trigger a check
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# check that the anchor tag content is not rendered
res = client.get(url_for("preview_page", uuid="first"))
assert '(/modified_link)' not in res.data.decode()
# even though the link has changed, we shouldn't detect a change since
# we did not select the setting and the default behaviour is to not
# render anchor tag content (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b"unviewed" not in res.data
assert b"/test-endpoint" in res.data
# Cleanup everything
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
Loading…
Cancel
Save