Add filter to remove elements by CSS rule from HTML before change detection is run (#445)

3 years ago · d4359c2e67
parent 44fc804991
commit d4359c2e67
8 changed files with 321 additions and 62 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -35,6 +35,7 @@ from flask import (
    url_for,
 )
 from flask_login import login_required
+
 from changedetectionio import html_tools

 __version__ = '0.39.9'
@ -526,6 +527,7 @@ def changedetection_app(config=None, datastore_o=None):


            datastore.data['watching'][uuid]['css_filter'] = form.css_filter.data.strip()
+            datastore.data['watching'][uuid]['subtractive_selectors'] = form.subtractive_selectors.data

            # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
            if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']:
@ -598,6 +600,7 @@ def changedetection_app(config=None, datastore_o=None):
        if request.method == 'GET':
            form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
            form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
+            form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
            form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
            form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
@ -626,6 +629,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['settings']['application']['notification_format'] = form.notification_format.data
            datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
            datastore.data['settings']['application']['base_url'] = form.base_url.data
+            datastore.data['settings']['application']['global_subtractive_selectors'] = form.global_subtractive_selectors.data
            datastore.data['settings']['application']['global_ignore_text'] =  form.global_ignore_text.data
            datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data

--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -1,11 +1,11 @@
-import time
-from changedetectionio import content_fetcher
-from changedetectionio import html_tools
 import hashlib
-from inscriptis import get_text
-import urllib3
-from . import html_tools
 import re
+import time
+
+import urllib3
+from inscriptis import get_text
+
+from changedetectionio import content_fetcher, html_tools

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@ -72,8 +72,15 @@ class perform_site_check():
            is_json = 'application/json' in fetcher.headers.get('Content-Type', '')
            is_html = not is_json
            css_filter_rule = watch['css_filter']
+            subtractive_selectors = watch.get(
+                "subtractive_selectors", []
+            ) + self.datastore.data["settings"]["application"].get(
+                "global_subtractive_selectors", []
+            )

            has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
+            has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
+            
            if is_json and not has_filter_rule:
                css_filter_rule = "json:$"
                has_filter_rule = True
@ -100,11 +107,11 @@ class perform_site_check():
                        else:
                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
-
+                    if has_subtractive_selectors:
+                        html_content = html_tools.element_removal(subtractive_selectors, html_content)
                    # get_text() via inscriptis
                    stripped_text_from_html = get_text(html_content)

-
            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -1,13 +1,30 @@
-from wtforms import Form, SelectField, RadioField, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \
-    Field
+import re

-from wtforms import widgets, SubmitField
-from wtforms.validators import ValidationError
+from wtforms import (
+    BooleanField,
+    Field,
+    Form,
+    IntegerField,
+    PasswordField,
+    RadioField,
+    SelectField,
+    StringField,
+    SubmitField,
+    TextAreaField,
+    fields,
+    validators,
+    widgets,
+)
 from wtforms.fields import html5
-from changedetectionio import content_fetcher
-import re
+from wtforms.validators import ValidationError

-from changedetectionio.notification import default_notification_format, valid_notification_formats, default_notification_body, default_notification_title
+from changedetectionio import content_fetcher
+from changedetectionio.notification import (
+    default_notification_body,
+    default_notification_format,
+    default_notification_title,
+    valid_notification_formats,
+)

 valid_method = {
    'GET',
@ -45,8 +62,8 @@ class SaltyPasswordField(StringField):
    encrypted_password = ""

    def build_password(self, password):
-        import hashlib
        import base64
+        import hashlib
        import secrets

        # Make a new salt on every new password and store it with the password
@ -104,9 +121,10 @@ class ValidateContentFetcherIsReady(object):
        self.message = message

    def __call__(self, form, field):
-        from changedetectionio import content_fetcher
        import urllib3.exceptions

+        from changedetectionio import content_fetcher
+
        # Better would be a radiohandler that keeps a reference to each class
        if field.data is not None:
            klass = getattr(content_fetcher, field.data)
@ -214,50 +232,67 @@ class ValidateListRegex(object):
                    message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                    raise ValidationError(message % (line))
              
-class ValidateCSSJSONXPathInput(object):
+class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
    @todo CSS validator ;)
+
+    field.data may be a single string or a list of strings; each non-blank
+    entry is validated on its own.
+
+    allow_xpath: when False, an entry starting with '/' raises ValidationError.
+    allow_json: when False, an entry containing 'json:' raises ValidationError.
    """

-    def __init__(self, message=None):
+    def __init__(self, message=None, allow_xpath=True, allow_json=True):
        self.message = message
+        self.allow_xpath = allow_xpath
+        self.allow_json = allow_json

    def __call__(self, form, field):

+        if isinstance(field.data, str):
+            data = [field.data]
+        else:
+            data = field.data
+
+        for line in data:
        # Nothing to see here
-        if not len(field.data.strip()):
-            return
+            if not len(line.strip()):
+                # Skip the blank entry only; returning here would leave every
+                # following entry unvalidated.
+                continue

-        # Does it look like XPath?
-        if field.data.strip()[0] == '/':
-            from lxml import html, etree
-            tree = html.fromstring("<html></html>")
+            # Does it look like XPath?
+            if line.strip()[0] == '/':
+                if not self.allow_xpath:
+                    raise ValidationError("XPath not permitted in this field!")
+                from lxml import etree, html
+                tree = html.fromstring("<html></html>")

-            try:
-                tree.xpath(field.data.strip())
-            except etree.XPathEvalError as e:
-                message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
-                raise ValidationError(message % (field.data, str(e)))
-            except:
-                raise ValidationError("A system-error occurred when validating your XPath expression")
+                try:
+                    tree.xpath(line.strip())
+                except etree.XPathEvalError as e:
+                    message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+                    raise ValidationError(message % (line, str(e)))
+                except:
+                    raise ValidationError("A system-error occurred when validating your XPath expression")

-        if 'json:' in field.data:
-            from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
-            from jsonpath_ng.ext import parse
+            if 'json:' in line:
+                if not self.allow_json:
+                    raise ValidationError("JSONPath not permitted in this field!")

-            input = field.data.replace('json:', '')
+                from jsonpath_ng.exceptions import (
+                    JsonPathLexerError,
+                    JsonPathParserError,
+                )
+                from jsonpath_ng.ext import parse

-            try:
-                parse(input)
-            except (JsonPathParserError, JsonPathLexerError) as e:
-                message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
-                raise ValidationError(message % (input, str(e)))
-            except:
-                raise ValidationError("A system-error occurred when validating your JSONPath expression")
+                input = line.replace('json:', '')
+
+                try:
+                    parse(input)
+                except (JsonPathParserError, JsonPathLexerError) as e:
+                    message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
+                    raise ValidationError(message % (input, str(e)))
+                except:
+                    raise ValidationError("A system-error occurred when validating your JSONPath expression")
+
+                # Re #265 - maybe in the future fetch the page and offer a
+                # warning/notice that its possible the rule doesnt yet match anything?

-            # Re #265 - maybe in the future fetch the page and offer a
-            # warning/notice that its possible the rule doesnt yet match anything?
            
 class quickWatchForm(Form):
    # https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
@ -282,7 +317,8 @@ class watchForm(commonSettingsForm):

    minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck',
                                               [validators.Optional(), validators.NumberRange(min=1)])
-    css_filter = StringField('CSS/JSON/XPath Filter', [ValidateCSSJSONXPathInput()])
+    css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
+    subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
    title = StringField('Title')

    ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
@ -314,5 +350,6 @@ class globalSettingsForm(commonSettingsForm):
                                               [validators.NumberRange(min=1)])
    extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
    base_url = StringField('Base URL', validators=[validators.Optional()])
+    global_subtractive_selectors = StringListField('Ignore elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
    global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
    ignore_whitespace = BooleanField('Ignore whitespace')
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -1,7 +1,10 @@
 import json
+import re
+from typing import List
+
 from bs4 import BeautifulSoup
 from jsonpath_ng.ext import parse
-import re
+

 class JSONNotFound(ValueError):
    def __init__(self, msg):
@ -16,11 +19,22 @@ def css_filter(css_filter, html_content):

    return html_block + "\n"

+def subtractive_css_selector(css_selector, html_content):
+    """Return html_content re-serialised with all css_selector matches removed.
+
+    The document is parsed with BeautifulSoup's "html.parser" backend, every
+    element matched by css_selector is decomposed (dropped from the tree),
+    and the remaining tree is converted back to a string.
+    """
+    document = BeautifulSoup(html_content, "html.parser")
+    matched_elements = document.select(css_selector)
+    for element in matched_elements:
+        element.decompose()
+    return str(document)
+
+    
+def element_removal(selectors: List[str], html_content):
+    """Remove every element matching any of the given CSS selectors.
+
+    Blank or whitespace-only entries are dropped before the remaining
+    selectors are joined into one comma-separated selector group, since an
+    empty member (as in "header,,nav") does not form a valid selector list.
+    When no usable selector is left, html_content is returned unchanged.
+    """
+    usable = [entry.strip() for entry in selectors if entry and entry.strip()]
+    if not usable:
+        return html_content
+    return subtractive_css_selector(",".join(usable), html_content)
+    

 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content):
-    from lxml import html
-    from lxml import etree
+    from lxml import etree, html

    tree = html.fromstring(html_content)
    html_block = ""
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -1,15 +1,19 @@
-from os import unlink, path, mkdir
 import json
+import logging
+import os
+import threading
+import time
 import uuid as uuid_builder
-from threading import Lock
 from copy import deepcopy
+from os import mkdir, path, unlink
+from threading import Lock

-import logging
-import time
-import threading
-import os
+from changedetectionio.notification import (
+    default_notification_body,
+    default_notification_format,
+    default_notification_title,
+)

-from changedetectionio.notification import default_notification_format, default_notification_body, default_notification_title

 # Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
 # Open a github issue if you know something :)
@ -46,6 +50,7 @@ class ChangeDetectionStore:
                    'extract_title_as_title': False,
                    'fetch_backend': 'html_requests',
                    'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
+                    'global_subtractive_selectors': [],
                    'ignore_whitespace': False,
                    'notification_urls': [], # Apprise URL list
                    # Custom notification content
@ -82,6 +87,7 @@ class ChangeDetectionStore:
            'notification_body': default_notification_body,
            'notification_format': default_notification_format,
            'css_filter': "",
+            'subtractive_selectors': [],
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'fetch_backend': None,
            'extract_title_as_title': False
@ -144,8 +150,8 @@ class ChangeDetectionStore:
            unlink(password_reset_lockfile)

        if not 'app_guid' in self.__data:
-            import sys
            import os
+            import sys
            if "pytest" in sys.modules or "PYTEST_CURRENT_TEST" in os.environ:
                self.__data['app_guid'] = "test-" + str(uuid_builder.uuid4())
            else:
@ -430,6 +436,7 @@ class ChangeDetectionStore:
                index.append(self.data['watching'][uuid]['history'][str(id)])

        import pathlib
+
        # Only in the sub-directories
        for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
            if not str(item) in index:
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -122,7 +122,18 @@ User-Agent: wonderbra 1.0") }}
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
                </span>
                    </div>
-
+                    <fieldset class="pure-group">
+                      {{ render_field(form.subtractive_selectors, rows=5, placeholder="header
+footer
+nav
+.stockticker") }}
+                      <span class="pure-form-message-inline">
+                        <ul>
+                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
+                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                        </ul>
+                      </span>
+                    </fieldset>
                </fieldset>
                <fieldset class="pure-group">
                    {{ render_field(form.ignore_text, rows=5, placeholder="Some text to ignore in a line
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@ -83,7 +83,18 @@
                    </span>
                    </fieldset>

-
+                    <fieldset class="pure-group">
+                      {{ render_field(form.global_subtractive_selectors, rows=5, placeholder="header
+footer
+nav
+.stockticker") }}
+                      <span class="pure-form-message-inline">
+                        <ul>
+                          <li> Remove HTML element(s) by CSS selector before text conversion. </li>
+                          <li> Add multiple elements or CSS selectors per line to ignore multiple parts of the HTML. </li>
+                        </ul>
+                      </span>
+                    </fieldset>
                    <fieldset class="pure-group">
                    {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line
 /some.regex\d{2}/ for case-INsensitive regex
--- a/changedetectionio/tests/test_element_removal.py
+++ b/changedetectionio/tests/test_element_removal.py
@ -0,0 +1,168 @@
+#!/usr/bin/python3
+
+import time
+
+from flask import url_for
+
+from ..html_tools import *
+from .util import live_server_setup
+
+
+def test_setup(live_server):
+    # Hands the live_server fixture to the shared live_server_setup helper
+    # imported from .util; what that helper configures is not visible here.
+    live_server_setup(live_server)
+
+
+def set_original_response():
+    """Write the baseline HTML page to test-datastore/endpoint-content.txt."""
+    baseline_page = """<html>
+    <header>
+    <h2>Header</h2>
+    </header>
+    <nav>
+    <ul>
+      <li><a href="#">A</a></li>
+      <li><a href="#">B</a></li>
+      <li><a href="#">C</a></li>
+    </ul>
+    </nav>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+    <div id="changetext">Some text that will change</div>
+     </body>
+    <footer>
+    <p>Footer</p>
+    </footer>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
+        endpoint_file.write(baseline_page)
+
+
+def set_modified_response():
+    """Write the altered HTML page to test-datastore/endpoint-content.txt.
+
+    Compared with the page written by set_original_response, only text
+    inside <header>, <nav>, <footer> and #changetext differs.
+    """
+    altered_page = """<html>
+    <header>
+    <h2>Header changed</h2>
+    </header>
+    <nav>
+    <ul>
+      <li><a href="#">A changed</a></li>
+      <li><a href="#">B</a></li>
+      <li><a href="#">C</a></li>
+    </ul>
+    </nav>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+    <div id="changetext">Some text that changes</div>
+     </body>
+    <footer>
+    <p>Footer changed</p>
+    </footer>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as endpoint_file:
+        endpoint_file.write(altered_page)
+
+
+def test_element_removal_output():
+    """element_removal() output, rendered by inscriptis, keeps only the body text."""
+    from changedetectionio import fetch_site_status
+    from inscriptis import get_text
+
+    # Page containing every element type that the selectors below target
+    page_html = """<html>
+    <header>
+    <h2>Header</h2>
+    </header>
+    <nav>
+    <ul>
+      <li><a href="#">A</a></li>
+    </ul>
+    </nav>
+       <body>
+     Some initial text</br>
+     <p>across multiple lines</p>
+     <div id="changetext">Some text that changes</div>
+     </body>
+    <footer>
+    <p>Footer</p>
+    </footer>
+     </html>
+    """
+    selectors = ["header", "footer", "nav", "#changetext"]
+    remaining_html = element_removal(selectors, html_content=page_html)
+    rendered = get_text(remaining_html)
+
+    expected = """Some initial text
+
+across multiple lines
+"""
+    assert rendered == expected
+
+
+def test_element_removal_full(client, live_server):
+    """End-to-end check that removed elements do not trigger a change.
+
+    Imports a watch for the test endpoint, saves subtractive selectors for
+    header/footer/nav/#changetext on it, runs a first check, then rewrites the
+    endpoint content via set_modified_response() and runs a second check. The
+    index page must not contain "unviewed" after either check.
+    """
+    sleep_time_for_fetch_thread = 3
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for("test_endpoint", _external=True)
+    res = client.post(
+        url_for("import_page"), data={"urls": test_url}, follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Goto the edit page, add the filter data
+    # NOTE: entries are separated with \r\n. The original author was unsure why
+    # the \r is required; without the #changetext entry it was not necessary.
+    subtractive_selectors_data = "header\r\nfooter\r\nnav\r\n#changetext"
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={
+            "subtractive_selectors": subtractive_selectors_data,
+            "url": test_url,
+            "tag": "",
+            "headers": "",
+            "fetch_backend": "html_requests",
+        },
+        follow_redirects=True,
+    )
+    assert b"Updated watch." in res.data
+
+    # Check it saved
+    res = client.get(
+        url_for("edit_page", uuid="first"),
+    )
+    assert bytes(subtractive_selectors_data.encode("utf-8")) in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # No change yet - first check
+    res = client.get(url_for("index"))
+    assert b"unviewed" not in res.data
+
+    #  Make a change to header/footer/nav
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # There should not be an unviewed change, as changes should be removed
+    res = client.get(url_for("index"))
+    assert b"unviewed" not in res.data