Option to render links as [Some Text ](/link), adds the ability to change-detect on hyperlink changes

3 years ago · 9809af142d
parent 1890881977
commit 9809af142d
9 changed files with 322 additions and 12 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -641,6 +641,7 @@ def changedetection_app(config=None, datastore_o=None):
            form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
            form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
            form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
            form.render_anchor_tag_content.data = datastore.data['settings']['application']['render_anchor_tag_content']
            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
            form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
            form.notification_title.data = datastore.data['settings']['application']['notification_title']
@ -671,6 +672,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['settings']['application']['global_ignore_text'] =  form.global_ignore_text.data
            datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
            datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data
            datastore.data['settings']['application']['render_anchor_tag_content'] = form.render_anchor_tag_content.data
            if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password:
                datastore.data['settings']['application']['password'] = form.password.encrypted_password
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -4,7 +4,6 @@ import re
 import time
 import urllib3
 from inscriptis import get_text
 from changedetectionio import content_fetcher, html_tools
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -88,7 +87,7 @@ class perform_site_check():
            has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
            has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
-            
+
            if is_json and not has_filter_rule:
                css_filter_rule = "json:$"
                has_filter_rule = True
@ -117,9 +116,14 @@ class perform_site_check():
                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
                    if has_subtractive_selectors:
                        html_content = html_tools.element_removal(subtractive_selectors, html_content)
-                    # get_text() via inscriptis
+                    # extract text
-                    stripped_text_from_html = get_text(html_content)
+                    stripped_text_from_html = \
-
+                        html_tools.html_to_text(
                            html_content,
                            render_anchor_tag_content=self.datastore.data["settings"][
                                "application"].get(
                                "render_anchor_tag_content", False)
                        )
            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -231,7 +231,7 @@ class ValidateListRegex(object):
                except re.error:
                    message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                    raise ValidationError(message % (line))
-              
+
 class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
@ -293,7 +293,7 @@ class ValidateCSSJSONXPATHInput(object):
                # Re #265 - maybe in the future fetch the page and offer a
                # warning/notice that its possible the rule doesnt yet match anything?
-            
+
 class quickWatchForm(Form):
    # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
    # `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
@ -352,6 +352,10 @@ class globalSettingsForm(commonSettingsForm):
    global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
    ignore_whitespace = BooleanField('Ignore whitespace')
    render_anchor_tag_content = BooleanField('Render Anchor Tag Content',
                                             default=False)
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
    real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?')
    removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -4,6 +4,9 @@ from typing import List
 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse
 import re
 from inscriptis import get_text
 from inscriptis.model.config import ParserConfig
 class JSONNotFound(ValueError):
@ -25,12 +28,12 @@ def subtractive_css_selector(css_selector, html_content):
        item.decompose()
    return str(soup)
-    
+
 def element_removal(selectors: List[str], html_content):
    """Apply several subtractive CSS selectors to the HTML in one go.

    :param selectors: individual CSS selectors
    :param html_content: HTML to filter
    :return: whatever subtractive_css_selector returns for the combined
    selector
    """
    # A comma-separated list of selectors is itself a single CSS selector
    # group, so one call covers all of them.
    return subtractive_css_selector(",".join(selectors), html_content)
-    
+
 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content):
@ -167,3 +170,35 @@ def strip_ignore_text(content, wordlist, mode="content"):
        return ignored_line_numbers
    return "\n".encode('utf8').join(output)
 def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    """Convert an HTML string to plain text via inscriptis.

    When rendering of anchor tag content is enabled, the link target (the
    'href' of each 'a' tag) is included in the text next to the link text,
    in the following manner:
    '[ text ](anchor tag content)'

    :param html_content: string with html content
    :param render_anchor_tag_content: flag indicating whether hyperlinks
    should be rendered together with the text
    :return: extracted text from the HTML
    """
    # A parser config is only needed when links have to be displayed;
    # otherwise no config (None) is handed to inscriptis.
    link_config = (
        ParserConfig(annotation_rules={"a": ["hyperlink"]}, display_links=True)
        if render_anchor_tag_content
        else None
    )
    return get_text(html_content, config=link_config)
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -52,6 +52,7 @@ class ChangeDetectionStore:
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
                    'global_subtractive_selectors': [],
                    'ignore_whitespace': False,
                    'render_anchor_tag_content': False,
                    'notification_urls': [], # Apprise URL list
                    # Custom notification content
                    'notification_title': default_notification_title,
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@ -91,10 +91,16 @@
                    <fieldset class="pure-group">
                    {{ render_field(form.ignore_whitespace) }}
                    <span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/>
-                    <i>Note:</i> Changing this will change the status of your existing watches, possibily trigger alerts etc.
+                    <i>Note:</i> Changing this will change the status of your existing watches, possibly trigger alerts etc.
                    </span>
                    </fieldset>
                <fieldset class="pure-group">
                    {{ render_field(form.render_anchor_tag_content) }}
                    <span class="pure-form-message-inline">Render anchor tag content (disabled by default). When enabled, links are rendered as <code>[ link text ](https://somesite.com)</code>
                        <br/>
                    <i>Note:</i> Changing this could affect the content of your existing watches, possibly trigger alerts etc.
                    </span>
                    </fieldset>
                    <fieldset class="pure-group">
                      {{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
 footer
--- a/changedetectionio/tests/test_api.py
+++ b/changedetectionio/tests/test_api.py
@ -26,7 +26,8 @@ def test_snapshot_api_detects_change(client, live_server):
    time.sleep(1)
    # Add our URL to the import page
-    test_url = url_for('test_endpoint', content_type="text/plain", _external=True)
+    test_url = url_for('test_endpoint', content_type="text/plain",
                       _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
--- a/changedetectionio/tests/test_html_to_text.py
+++ b/changedetectionio/tests/test_html_to_text.py
@ -0,0 +1,38 @@
 #!/usr/bin/python3
 """Test suite for the method to extract text from an html string"""
 from ..html_tools import html_to_text
 def test_html_to_text_func():
    """html_to_text must omit link targets by default and render them as
    '[ text ](href)' when render_anchor_tag_content is True."""
    sample_page = """<html>
       <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
     <a href="/first_link"> More Text </a>
     </br>
     So let's see what happens.  </br>
     <a href="second_link.com"> Even More Text </a>
     </body>
     </html>
    """
    # flag off: only the visible text is expected, no link targets
    plain_result = html_to_text(sample_page, render_anchor_tag_content=False)
    expected_plain = (
        "Some initial text\n\n"
        "Which is across multiple lines\n\n"
        "More Text So let's see what happens. Even More Text"
    )
    assert plain_result == expected_plain
    # flag on: each anchor is expected as '[ text ](href)'
    linked_result = html_to_text(sample_page, render_anchor_tag_content=True)
    expected_linked = (
        "Some initial text\n\n"
        "Which is across multiple lines\n\n"
        "[ More Text ](/first_link) So let's see what happens. "
        "[ Even More Text ](second_link.com)"
    )
    assert linked_result == expected_linked
--- a/changedetectionio/tests/test_ignorehyperlinks.py
+++ b/changedetectionio/tests/test_ignorehyperlinks.py
@ -0,0 +1,219 @@
 #!/usr/bin/python3
 """Test suite for the render/not render anchor tag content functionality"""
 import time
 from flask import url_for
 from .util import live_server_setup
 def test_setup(live_server):
    """Run the shared live-server setup before the tests in this module."""
    live_server_setup(live_server)
 def set_original_ignore_response():
    """Write the page containing the original link ('/original_link') to
    the endpoint content file."""
    original_page = """<html>
       <body>
     Some initial text</br>
     <a href="/original_link"> Some More Text </a>
     </br>
     So let's see what happens.  </br>
     </body>
     </html>
    """
    # Overwrite whatever content was written to the file before
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(original_page)
 # Should be the same as set_original_ignore_response() but with a different
 # link
 def set_modified_ignore_response():
    """Write the page containing the modified link ('/modified_link') to
    the endpoint content file."""
    modified_page = """<html>
       <body>
     Some initial text</br>
     <a href="/modified_link"> Some More Text </a>
     </br>
     So let's see what happens.  </br>
     </body>
     </html>
    """
    # Overwrite whatever content was written to the file before
    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
        endpoint_file.write(modified_page)
 def test_render_anchor_tag_content_true(client, live_server):
    """With render_anchor_tag_content set to true, a changed link target
    must show up in the preview and be flagged as a change."""
    fetch_wait = 3
    # Let the endpoint spin up first
    time.sleep(1)
    # Serve the page with the original link
    set_original_ignore_response()
    # Enable anchor tag rendering through the settings page
    settings_form = {
        "minutes_between_check": 180,
        "render_anchor_tag_content": "true",
        "fetch_backend": "html_requests",
    }
    settings_response = client.post(
        url_for("settings_page"), data=settings_form, follow_redirects=True
    )
    assert b"Settings updated." in settings_response.data
    # Register the test endpoint through the import page
    watch_url = url_for("test_endpoint", _external=True)
    import_response = client.post(
        url_for("import_page"), data={"urls": watch_url}, follow_redirects=True
    )
    assert b"1 Imported" in import_response.data
    time.sleep(fetch_wait)
    # Trigger a check while the original page is being served
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    # Switch to the page that differs only in its link target
    set_modified_ignore_response()
    time.sleep(fetch_wait)
    # Trigger a check against the modified page
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    # Allow the fetch thread to pick it up
    time.sleep(fetch_wait)
    # The preview must contain the rendered link target
    preview = client.get(url_for("preview_page", uuid="first"))
    assert '(/modified_link)' in preview.data.decode()
    # The link changed and is rendered, so the watch list must carry the
    # 'unviewed' class
    overview = client.get(url_for("index"))
    assert b"unviewed" in overview.data
    assert b"/test-endpoint" in overview.data
    # Remove all watches again
    cleanup = client.get(
        url_for("api_delete", uuid="all"), follow_redirects=True
    )
    assert b'Deleted' in cleanup.data
 def test_render_anchor_tag_content_false(client, live_server):
    """With render_anchor_tag_content set to false, a changed link target
    must neither appear in the preview nor be flagged as a change."""
    fetch_wait = 3
    # Let the endpoint spin up first
    time.sleep(1)
    # Serve the page with the original link
    set_original_ignore_response()
    # Explicitly disable anchor tag rendering through the settings page
    settings_form = {
        "minutes_between_check": 180,
        "render_anchor_tag_content": "false",
        "fetch_backend": "html_requests",
    }
    settings_response = client.post(
        url_for("settings_page"), data=settings_form, follow_redirects=True
    )
    assert b"Settings updated." in settings_response.data
    # Register the test endpoint through the import page
    watch_url = url_for("test_endpoint", _external=True)
    import_response = client.post(
        url_for("import_page"), data={"urls": watch_url}, follow_redirects=True
    )
    assert b"1 Imported" in import_response.data
    time.sleep(fetch_wait)
    # Trigger a check while the original page is being served
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    # Switch to the page that differs only in its link target
    set_modified_ignore_response()
    time.sleep(fetch_wait)
    # Trigger a check against the modified page
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    # Allow the fetch thread to pick it up
    time.sleep(fetch_wait)
    # The preview must not contain the link target
    preview = client.get(url_for("preview_page", uuid="first"))
    assert '(/modified_link)' not in preview.data.decode()
    # Only the link changed and links are not rendered, so the watch list
    # must not carry the 'unviewed' class
    overview = client.get(url_for("index"))
    assert b"unviewed" not in overview.data
    assert b"/test-endpoint" in overview.data
    # Remove all watches again
    cleanup = client.get(
        url_for("api_delete", uuid="all"), follow_redirects=True
    )
    assert b'Deleted' in cleanup.data
 def test_render_anchor_tag_content_default(client, live_server):
    """When render_anchor_tag_content is left out of the settings form, a
    changed link target must neither appear in the preview nor be flagged
    as a change."""
    fetch_wait = 3
    # Let the endpoint spin up first
    time.sleep(1)
    # Serve the page with the original link
    set_original_ignore_response()
    # Save the settings without sending render_anchor_tag_content at all
    settings_form = {
        "minutes_between_check": 180,
        "fetch_backend": "html_requests",
    }
    settings_response = client.post(
        url_for("settings_page"), data=settings_form, follow_redirects=True
    )
    assert b"Settings updated." in settings_response.data
    # Register the test endpoint through the import page
    watch_url = url_for("test_endpoint", _external=True)
    import_response = client.post(
        url_for("import_page"), data={"urls": watch_url}, follow_redirects=True
    )
    assert b"1 Imported" in import_response.data
    time.sleep(fetch_wait)
    # Trigger a check while the original page is being served
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    # Switch to the page that differs only in its link target
    set_modified_ignore_response()
    time.sleep(fetch_wait)
    # Trigger a check against the modified page
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    # Allow the fetch thread to pick it up
    time.sleep(fetch_wait)
    # The preview must not contain the link target
    preview = client.get(url_for("preview_page", uuid="first"))
    assert '(/modified_link)' not in preview.data.decode()
    # Only the link changed and the default is to not render anchor tag
    # content, so the watch list must not carry the 'unviewed' class
    overview = client.get(url_for("index"))
    assert b"unviewed" not in overview.data
    assert b"/test-endpoint" in overview.data
    # Remove all watches again
    cleanup = client.get(
        url_for("api_delete", uuid="all"), follow_redirects=True
    )
    assert b'Deleted' in cleanup.data