diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 7b25a5c8..e29553ee 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -641,6 +641,7 @@ def changedetection_app(config=None, datastore_o=None): form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace'] + form.render_anchor_tag_content.data = datastore.data['settings']['application']['render_anchor_tag_content'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend'] form.notification_title.data = datastore.data['settings']['application']['notification_title'] @@ -671,6 +672,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data + datastore.data['settings']['application']['render_anchor_tag_content'] = form.render_anchor_tag_content.data if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password: datastore.data['settings']['application']['password'] = form.password.encrypted_password diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 786e7a2f..aef24bcd 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -4,7 +4,6 @@ import re import time import urllib3 -from inscriptis import get_text from changedetectionio import content_fetcher, html_tools 
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -88,7 +87,7 @@ class perform_site_check(): has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip()) - + if is_json and not has_filter_rule: css_filter_rule = "json:$" has_filter_rule = True @@ -117,9 +116,14 @@ class perform_site_check(): html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) if has_subtractive_selectors: html_content = html_tools.element_removal(subtractive_selectors, html_content) - # get_text() via inscriptis - stripped_text_from_html = get_text(html_content) - + # extract text + stripped_text_from_html = \ + html_tools.html_to_text( + html_content, + render_anchor_tag_content=self.datastore.data["settings"][ + "application"].get( + "render_anchor_tag_content", False) + ) # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 4dc7921a..2f28a62d 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -231,7 +231,7 @@ class ValidateListRegex(object): except re.error: message = field.gettext('RegEx \'%s\' is not a valid regular expression.') raise ValidationError(message % (line)) - + class ValidateCSSJSONXPATHInput(object): """ Filter validation @@ -293,7 +293,7 @@ class ValidateCSSJSONXPATHInput(object): # Re #265 - maybe in the future fetch the page and offer a # warning/notice that its possible the rule doesnt yet match anything? - + class quickWatchForm(Form): # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5 # `require_tld` = False is needed even for the test harness "http://localhost:5005.." 
to run @@ -352,6 +352,10 @@ class globalSettingsForm(commonSettingsForm): global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) ignore_whitespace = BooleanField('Ignore whitespace') + + render_anchor_tag_content = BooleanField('Render Anchor Tag Content', + default=False) + save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?') removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"}) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index fda2cf25..d355c209 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -4,6 +4,9 @@ from typing import List from bs4 import BeautifulSoup from jsonpath_ng.ext import parse +import re +from inscriptis import get_text +from inscriptis.model.config import ParserConfig class JSONNotFound(ValueError): @@ -25,12 +28,12 @@ def subtractive_css_selector(css_selector, html_content): item.decompose() return str(soup) - + def element_removal(selectors: List[str], html_content): """Joins individual filters into one css filter.""" selector = ",".join(selectors) return subtractive_css_selector(selector, html_content) - + # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content): @@ -167,3 +170,35 @@ def strip_ignore_text(content, wordlist, mode="content"): return ignored_line_numbers return "\n".encode('utf8').join(output) + + +def html_to_text(html_content: str, render_anchor_tag_content=False) -> str: + """Converts html string to a string with just the text. 
If ignoring + rendering anchor tag content is enable, anchor tag content are also + included in the text + + :param html_content: string with html content + :param render_anchor_tag_content: boolean flag indicating whether to extract + hyperlinks (the anchor tag content) together with text. This refers to the + 'href' inside 'a' tags. + Anchor tag content is rendered in the following manner: + '[ text ](anchor tag content)' + :return: extracted text from the HTML + """ + # if anchor tag content flag is set to True define a config for + # extracting this content + if render_anchor_tag_content: + + parser_config = ParserConfig( + annotation_rules={"a": ["hyperlink"]}, display_links=True + ) + + # otherwise set config to None + else: + parser_config = None + + # get text and annotations via inscriptis + text_content = get_text(html_content, config=parser_config) + + return text_content + diff --git a/changedetectionio/store.py b/changedetectionio/store.py index e37710de..4aa5cc3e 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -52,6 +52,7 @@ class ChangeDetectionStore: 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_subtractive_selectors': [], 'ignore_whitespace': False, + 'render_anchor_tag_content': False, 'notification_urls': [], # Apprise URL list # Custom notification content 'notification_title': default_notification_title, diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index fac9da3a..996c58d4 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -91,10 +91,16 @@
{{ render_field(form.ignore_whitespace) }} Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.
- Note: Changing this will change the status of your existing watches, possibily trigger alerts etc. + Note: Changing this will change the status of your existing watches, possibly trigger alerts etc. +
+
+
+ {{ render_field(form.render_anchor_tag_content) }} + Render anchor tag content (default: disabled). When enabled, links are rendered as [ link text ](https://somesite.com) +
+ Note: Changing this could affect the content of your existing watches, possibly trigger alerts etc.
-
{{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header footer diff --git a/changedetectionio/tests/test_api.py b/changedetectionio/tests/test_api.py index 370fd10a..19d53ed7 100644 --- a/changedetectionio/tests/test_api.py +++ b/changedetectionio/tests/test_api.py @@ -26,7 +26,8 @@ def test_snapshot_api_detects_change(client, live_server): time.sleep(1) # Add our URL to the import page - test_url = url_for('test_endpoint', content_type="text/plain", _external=True) + test_url = url_for('test_endpoint', content_type="text/plain", + _external=True) res = client.post( url_for("import_page"), data={"urls": test_url}, diff --git a/changedetectionio/tests/test_html_to_text.py b/changedetectionio/tests/test_html_to_text.py new file mode 100644 index 00000000..2dcabdc3 --- /dev/null +++ b/changedetectionio/tests/test_html_to_text.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +"""Test suite for the method to extract text from an html string""" +from ..html_tools import html_to_text + + +def test_html_to_text_func(): + test_html = """ + + Some initial text
+

Which is across multiple lines

+ More Text +
+ So let's see what happens.
+ Even More Text + + + """ + + # extract text, with 'render_anchor_tag_content' set to False + text_content = html_to_text(test_html, render_anchor_tag_content=False) + + no_links_text = \ + "Some initial text\n\nWhich is across multiple " \ + "lines\n\nMore Text So let's see what happens. Even More Text" + + # check that no links are in the extracted text + assert text_content == no_links_text + + # extract text, with 'render_anchor_tag_content' set to True + text_content = html_to_text(test_html, render_anchor_tag_content=True) + + links_text = \ + "Some initial text\n\nWhich is across multiple lines\n\n[ More Text " \ + "](/first_link) So let's see what happens. [ Even More Text ]" \ + "(second_link.com)" + + # check that links are present in the extracted text + assert text_content == links_text diff --git a/changedetectionio/tests/test_ignorehyperlinks.py b/changedetectionio/tests/test_ignorehyperlinks.py new file mode 100644 index 00000000..9356305b --- /dev/null +++ b/changedetectionio/tests/test_ignorehyperlinks.py @@ -0,0 +1,219 @@ +#!/usr/bin/python3 +"""Test suite for the render/not render anchor tag content functionality""" + +import time +from flask import url_for +from .util import live_server_setup + + +def test_setup(live_server): + live_server_setup(live_server) + +def set_original_ignore_response(): + test_return_data = """ + + Some initial text
+ Some More Text +
+ So let's see what happens.
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +# Should be the same as set_original_ignore_response() but with a different +# link +def set_modified_ignore_response(): + test_return_data = """ + + Some initial text
+ Some More Text +
+ So let's see what happens.
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + +def test_render_anchor_tag_content_true(client, live_server): + """Testing that the link changes are detected when + render_anchor_tag_content setting is set to true""" + sleep_time_for_fetch_thread = 3 + + # Give the endpoint time to spin up + time.sleep(1) + + # set original html text + set_original_ignore_response() + + # Goto the settings page, choose not to ignore links + res = client.post( + url_for("settings_page"), + data={ + "minutes_between_check": 180, + "render_anchor_tag_content": "true", + "fetch_backend": "html_requests", + }, + follow_redirects=True, + ) + assert b"Settings updated." in res.data + + # Add our URL to the import page + test_url = url_for("test_endpoint", _external=True) + res = client.post( + url_for("import_page"), data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # set a new html text with a modified link + set_modified_ignore_response() + time.sleep(sleep_time_for_fetch_thread) + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # check that the anchor tag content is rendered + res = client.get(url_for("preview_page", uuid="first")) + assert '(/modified_link)' in res.data.decode() + + # since the link has changed, and we chose to render anchor tag content, + # we should detect a change (new 'unviewed' class) + res = client.get(url_for("index")) + assert b"unviewed" in res.data + assert b"/test-endpoint" in res.data + + # Cleanup everything + res = client.get(url_for("api_delete", uuid="all"), + follow_redirects=True) + assert b'Deleted' in res.data + + +def test_render_anchor_tag_content_false(client, live_server): + """Testing that 
anchor tag content changes are ignored when + render_anchor_tag_content setting is set to false""" + sleep_time_for_fetch_thread = 3 + + # Give the endpoint time to spin up + time.sleep(1) + + # set the original html text + set_original_ignore_response() + + # Goto the settings page, choose to ignore hyperlinks + res = client.post( + url_for("settings_page"), + data={ + "minutes_between_check": 180, + "render_anchor_tag_content": "false", + "fetch_backend": "html_requests", + }, + follow_redirects=True, + ) + assert b"Settings updated." in res.data + + # Add our URL to the import page + test_url = url_for("test_endpoint", _external=True) + res = client.post( + url_for("import_page"), data={"urls": test_url}, follow_redirects=True + ) + assert b"1 Imported" in res.data + + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # set a new html text, with a modified link + set_modified_ignore_response() + time.sleep(sleep_time_for_fetch_thread) + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # check that the anchor tag content is not rendered + res = client.get(url_for("preview_page", uuid="first")) + assert '(/modified_link)' not in res.data.decode() + + # even though the link has changed, we shouldn't detect a change since + # we selected to not render anchor tag content (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b"unviewed" not in res.data + assert b"/test-endpoint" in res.data + + # Cleanup everything + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + +def test_render_anchor_tag_content_default(client, live_server): + """Testing that anchor tag content changes are ignored when the + render_anchor_tag_content setting is not explicitly selected""" + 
sleep_time_for_fetch_thread = 3 + + # Give the endpoint time to spin up + time.sleep(1) + + # set the original html text + set_original_ignore_response() + + # Goto the settings page, not passing the render_anchor_tag_content setting + res = client.post( + url_for("settings_page"), + data={ + "minutes_between_check": 180, + "fetch_backend": "html_requests", + }, + follow_redirects=True, + ) + assert b"Settings updated." in res.data + + # Add our URL to the import page + test_url = url_for("test_endpoint", _external=True) + res = client.post( + url_for("import_page"), data={"urls": test_url}, follow_redirects=True + ) + assert b"1 Imported" in res.data + + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # set a new html text, with a modified link + set_modified_ignore_response() + time.sleep(sleep_time_for_fetch_thread) + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # check that the anchor tag content is not rendered + res = client.get(url_for("preview_page", uuid="first")) + assert '(/modified_link)' not in res.data.decode() + + # even though the link has changed, we shouldn't detect a change since + # we did not select the setting and the default behaviour is to not + # render anchor tag content (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b"unviewed" not in res.data + assert b"/test-endpoint" in res.data + + # Cleanup everything + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data