Auto extract html title as title (#102)

* Auto-extract <title> as the watch title; minor refactor of the HTML tooling
4 years ago · 25185e6d00
parent 9af1ea9fc0
commit 25185e6d00
10 changed files with 65 additions and 23 deletions
--- a/backend/init.py
+++ b/backend/init.py
@ -442,6 +442,7 @@ def changedetection_app(config=None, datastore_o=None):
        if request.method == 'GET':
            form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
            form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
            # Password unset is a GET
            if request.values.get('removepassword') == 'true':
@ -454,6 +455,7 @@ def changedetection_app(config=None, datastore_o=None):
            datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
            datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
            datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data
            if len(form.notification_urls.data):
                import apprise
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@ -3,18 +3,10 @@ import requests
 import hashlib
 from inscriptis import get_text
 import urllib3
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+from . import html_tools
-# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 class css_filter(object):
     """Select the parts of an HTML document that match a CSS rule."""

     def apply(self, css_filter, html_content):
         """Return the markup of every element matching ``css_filter``.

         ``html_content`` is parsed with BeautifulSoup's ``html.parser``
         backend. Each matched element is rendered with ``str()`` and the
         pieces are concatenated in the order ``select`` yields them. A
         single trailing newline is always appended, so an empty match
         produces ``"\n"``.
         """
         # Imported lazily, exactly as the original method did.
         from bs4 import BeautifulSoup

         parsed = BeautifulSoup(html_content, "html.parser")
         fragments = [str(node) for node in parsed.select(css_filter, separator="")]
         return "".join(fragments) + "\n"
 # Some common stuff here that can be moved to a base class
 class perform_site_check():
@ -59,6 +51,7 @@ class perform_site_check():
    def run(self, uuid):
        timestamp = int(time.time())  # used for storage etc too
        stripped_text_from_html = False
        changed_detected = False
@ -98,8 +91,7 @@ class perform_site_check():
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
            if css_filter_rule and len(css_filter_rule.strip()):
-                filter = css_filter()
+                html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
                html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
            stripped_text_from_html = get_text(html)
@ -150,4 +142,10 @@ class perform_site_check():
                update_obj["previous_md5"] = fetched_md5
            # Extract title as title
            if self.datastore.data['settings']['application']['extract_title_as_title']:
                if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
                    update_obj['title'] = html_tools.extract_element(find='title', html_content=html)
        return changed_detected, update_obj, stripped_text_from_html
--- a/backend/forms.py
+++ b/backend/forms.py
@ -128,4 +128,5 @@ class globalSettingsForm(Form):
                                               [validators.NumberRange(min=1)])
    notification_urls = StringListField('Notification URL List')
    extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
    trigger_check = BooleanField('Send test notification on save')
--- a/backend/html_tools.py
+++ b/backend/html_tools.py
@ -0,0 +1,23 @@
 from bs4 import BeautifulSoup
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def css_filter(css_filter, html_content):
     """Return the markup of every element matching a CSS selector.

     ``html_content`` is parsed with BeautifulSoup's ``html.parser`` backend.
     Each element returned by ``select`` is rendered with ``str()`` and the
     pieces are joined in the order they were yielded. A single trailing
     newline is always appended, so an empty match produces ``"\n"``.
     """
     parsed = BeautifulSoup(html_content, "html.parser")
     fragments = [str(node) for node in parsed.select(css_filter, separator="")]
     return "".join(fragments) + "\n"
 # Extract/find element
 def extract_element(find='title', html_content=''):
     """Return the stripped text of the element located by ``find``.

     ``html_content`` is parsed with BeautifulSoup's ``html.parser`` backend
     and ``soup.find(find)`` picks the element. The result is that element's
     ``.string`` with surrounding whitespace removed.

     Returns ``False`` (not ``None``) when no element is found or when the
     element's ``.string`` is ``None``.
     """
     document = BeautifulSoup(html_content, 'html.parser')
     element = document.find(find)

     # Nothing usable: keep the False sentinel the callers receive today.
     if not element or element.string is None:
         return False

     return element.string.strip()
--- a/backend/store.py
+++ b/backend/store.py
@ -38,6 +38,7 @@ class ChangeDetectionStore:
                },
                'application': {
                    'password': False,
                    'extract_title_as_title': False,
                    'notification_urls': [] # Apprise URL list
                }
            }
--- a/backend/templates/settings.html
+++ b/backend/templates/settings.html
@ -16,6 +16,10 @@
                    {{ render_field(form.password, size=10) }}
                {% endif %}
            </div>
            <div class="pure-control-group">
                {{ render_field(form.extract_title_as_title) }}
                  <span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
            </div>
            <div class="pure-control-group">
                {{ render_field(form.notification_urls, rows=5, placeholder="Gitter - gitter://token/room
 Office365 - o365://TenantID:AccountEmail/ClientID/ClientSecret/TargetEmail
@ -27,7 +31,6 @@ SMTPS - mailtos://user:pass@mail.domain.com?to=receivingAddress@example.com
                <div class="pure-controls">
                    <span class="pure-form-message-inline"><label for="trigger-test-notification" class="pure-checkbox">
                        <input type="checkbox" id="trigger-test-notification" name="trigger-test-notification"> Send test notification on save.</label></span>
                </div>
            <br/>
--- a/backend/tests/test_backend.py
+++ b/backend/tests/test_backend.py
@ -8,8 +8,6 @@ from . util import set_original_response, set_modified_response, live_server_set
 sleep_time_for_fetch_thread = 3
 def test_check_basic_change_detection_functionality(client, live_server):
    set_original_response()
    live_server_setup(live_server)
@ -82,15 +80,27 @@ def test_check_basic_change_detection_functionality(client, live_server):
        # It should report nothing found (no new 'unviewed' class)
        res = client.get(url_for("index"))
        assert b'unviewed' not in res.data
        assert b'head title' not in res.data # Should not be present because this is off by default
        assert b'test-endpoint' in res.data
    set_original_response()
    # Enable auto pickup of <title> in settings
    res = client.post(
        url_for("settings_page"),
        data={"extract_title_as_title": "1", "minutes_between_check": 180},
        follow_redirects=True
    )
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
    # It should have picked up the <title>
    assert b'head title' in res.data
    #
    # Cleanup everything
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
--- a/backend/tests/test_css_selector.py
+++ b/backend/tests/test_css_selector.py
@ -4,6 +4,8 @@ import time
 from flask import url_for
 from . util import live_server_setup
 from ..html_tools import *
 def test_setup(live_server):
    """Pass ``live_server`` to ``live_server_setup`` (imported from ``.util``).

    What the helper configures is not visible in this file.
    """
    live_server_setup(live_server)
@ -48,11 +50,9 @@ def test_css_filter_output():
    from backend import fetch_site_status
    from inscriptis import get_text
    css_filter = fetch_site_status.css_filter()
    # Check text with sub-parts renders correctly
    content = """<html> <body><div id="thingthing" >  Some really <b>bold</b> text  </div> </body> </html>"""
-    html_blob = css_filter.apply(css_filter="#thingthing", html_content=content)
+    html_blob = css_filter(css_filter="#thingthing", html_content=content)
    text = get_text(html_blob)
    assert text == "  Some really bold text"
@ -61,7 +61,7 @@ def test_css_filter_output():
    <div class="parts">Block A</div> <div class="parts">Block B</div></body> 
    </html>
 """
-    html_blob = css_filter.apply(css_filter=".parts", html_content=content)
+    html_blob = css_filter(css_filter=".parts", html_content=content)
    text = get_text(html_blob)
    # Divs are converted to 4 whitespaces by inscriptis
--- a/backend/tests/util.py
+++ b/backend/tests/util.py
@ -3,7 +3,8 @@
 def set_original_response():
    test_return_data = """<html>
-       <body>
+    <head><title>head title</title></head>
    <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
     </br>
@ -18,7 +19,8 @@ def set_original_response():
 def set_modified_response():
    test_return_data = """<html>
-       <body>
+    <head><title>modified head title</title></head>
    <body>
     Some initial text</br>
     <p>which has this one new line</p>
     </br>
--- a/backend/update_worker.py
+++ b/backend/update_worker.py
@ -31,8 +31,10 @@ class update_worker(threading.Thread):
                    try:
                        changed_detected, result, contents = update_handler.run(uuid)
-                    except PermissionError as s:
+                    except PermissionError as e:
-                        self.app.logger.error("File permission error updating", uuid, str(s))
+                        self.app.logger.error("File permission error updating", uuid, str(e))
                    except Exception as e:
                        self.app.logger.error("Exception reached", uuid, str(e))
                    else:
                        if result:
                            try: