Auto extract html title as title (#102)

* Auto-extract <title> as the watch title; minor refactor of the HTML tooling
4 years ago · 25185e6d00
parent 9af1ea9fc0
commit 25185e6d00
10 changed files with 65 additions and 23 deletions
--- a/backend/init.py
+++ b/backend/init.py
@ -442,6 +442,7 @@ def changedetection_app(config=None, datastore_o=None):
        if request.method == 'GET':
            form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
            form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
+            form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']

            # Password unset is a GET
            if request.values.get('removepassword') == 'true':
@ -454,6 +455,7 @@ def changedetection_app(config=None, datastore_o=None):

            datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
            datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
+            datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data

            if len(form.notification_urls.data):
                import apprise
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@ -3,18 +3,10 @@ import requests
 import hashlib
 from inscriptis import get_text
 import urllib3
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+from . import html_tools

-# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
-class css_filter(object):
-    def apply(self, css_filter, html_content):
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html_content, "html.parser")
-        html_block = ""
-        for item in soup.select(css_filter, separator=""):
-            html_block += str(item)
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-        return html_block+"\n"

 # Some common stuff here that can be moved to a base class
 class perform_site_check():
@ -59,6 +51,7 @@ class perform_site_check():

    def run(self, uuid):
        timestamp = int(time.time())  # used for storage etc too
+
        stripped_text_from_html = False
        changed_detected = False

@ -98,8 +91,7 @@ class perform_site_check():
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
            if css_filter_rule and len(css_filter_rule.strip()):
-                filter = css_filter()
-                html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
+                html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)

            stripped_text_from_html = get_text(html)

@ -150,4 +142,10 @@ class perform_site_check():

                update_obj["previous_md5"] = fetched_md5

+            # When enabled in settings, use the page's <title> as the watch title if the watch has none set
+            if self.datastore.data['settings']['application']['extract_title_as_title']:
+                if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
+                    update_obj['title'] = html_tools.extract_element(find='title', html_content=html)
+
+
        return changed_detected, update_obj, stripped_text_from_html
--- a/backend/forms.py
+++ b/backend/forms.py
@ -128,4 +128,5 @@ class globalSettingsForm(Form):
                                               [validators.NumberRange(min=1)])

    notification_urls = StringListField('Notification URL List')
+    extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
    trigger_check = BooleanField('Send test notification on save')
--- a/backend/html_tools.py
+++ b/backend/html_tools.py
@ -0,0 +1,23 @@
+from bs4 import BeautifulSoup
+
+
+# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
+def css_filter(css_filter, html_content):
+    soup = BeautifulSoup(html_content, "html.parser")
+    html_block = ""
+    for item in soup.select(css_filter, separator=""):
+        html_block += str(item)
+
+    return html_block + "\n"
+
+
+# Return the stripped text of the first element matching `find`, or False when it is absent or its .string is None
+def extract_element(find='title', html_content=''):
+    html_title = False
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+    title = soup.find(find)
+    if title and title.string is not None:
+        html_title = title.string.strip()
+
+    return html_title
--- a/backend/store.py
+++ b/backend/store.py
@ -38,6 +38,7 @@ class ChangeDetectionStore:
                },
                'application': {
                    'password': False,
+                    'extract_title_as_title': False,
                    'notification_urls': [] # Apprise URL list
                }
            }
--- a/backend/templates/settings.html
+++ b/backend/templates/settings.html
@ -16,6 +16,10 @@
                    {{ render_field(form.password, size=10) }}
                {% endif %}
            </div>
+            <div class="pure-control-group">
+                {{ render_field(form.extract_title_as_title) }}
+                  <span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
+            </div>
            <div class="pure-control-group">
                {{ render_field(form.notification_urls, rows=5, placeholder="Gitter - gitter://token/room
 Office365 - o365://TenantID:AccountEmail/ClientID/ClientSecret/TargetEmail
@ -27,7 +31,6 @@ SMTPS - mailtos://user:pass@mail.domain.com?to=receivingAddress@example.com
                <div class="pure-controls">
                    <span class="pure-form-message-inline"><label for="trigger-test-notification" class="pure-checkbox">
                        <input type="checkbox" id="trigger-test-notification" name="trigger-test-notification"> Send test notification on save.</label></span>
-
                </div>

            <br/>
--- a/backend/tests/test_backend.py
+++ b/backend/tests/test_backend.py
@ -8,8 +8,6 @@ from . util import set_original_response, set_modified_response, live_server_set
 sleep_time_for_fetch_thread = 3


-
-
 def test_check_basic_change_detection_functionality(client, live_server):
    set_original_response()
    live_server_setup(live_server)
@ -82,15 +80,27 @@ def test_check_basic_change_detection_functionality(client, live_server):
        # It should report nothing found (no new 'unviewed' class)
        res = client.get(url_for("index"))
        assert b'unviewed' not in res.data
+        assert b'head title' not in res.data # Should not be present because this is off by default
        assert b'test-endpoint' in res.data

    set_original_response()

+    # Enable auto pickup of <title> in settings
+    res = client.post(
+        url_for("settings_page"),
+        data={"extract_title_as_title": "1", "minutes_between_check": 180},
+        follow_redirects=True
+    )
+
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
+
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
+    # It should have picked up the <title>
+    assert b'head title' in res.data

+    #
    # Cleanup everything
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
--- a/backend/tests/test_css_selector.py
+++ b/backend/tests/test_css_selector.py
@ -4,6 +4,8 @@ import time
 from flask import url_for
 from . util import live_server_setup

+from ..html_tools import *
+
 def test_setup(live_server):
    live_server_setup(live_server)

@ -48,11 +50,9 @@ def test_css_filter_output():
    from backend import fetch_site_status
    from inscriptis import get_text

-    css_filter = fetch_site_status.css_filter()
-
    # Check text with sub-parts renders correctly
    content = """<html> <body><div id="thingthing" >  Some really <b>bold</b> text  </div> </body> </html>"""
-    html_blob = css_filter.apply(css_filter="#thingthing", html_content=content)
+    html_blob = css_filter(css_filter="#thingthing", html_content=content)
    text = get_text(html_blob)
    assert text == "  Some really bold text"

@ -61,7 +61,7 @@ def test_css_filter_output():
    <div class="parts">Block A</div> <div class="parts">Block B</div></body> 
    </html>
 """
-    html_blob = css_filter.apply(css_filter=".parts", html_content=content)
+    html_blob = css_filter(css_filter=".parts", html_content=content)
    text = get_text(html_blob)

    # Divs are converted to 4 whitespaces by inscriptis
--- a/backend/tests/util.py
+++ b/backend/tests/util.py
@ -3,6 +3,7 @@

 def set_original_response():
    test_return_data = """<html>
+    <head><title>head title</title></head>
    <body>
     Some initial text</br>
     <p>Which is across multiple lines</p>
@ -18,6 +19,7 @@ def set_original_response():

 def set_modified_response():
    test_return_data = """<html>
+    <head><title>modified head title</title></head>
    <body>
     Some initial text</br>
     <p>which has this one new line</p>
--- a/backend/update_worker.py
+++ b/backend/update_worker.py
@ -31,8 +31,10 @@ class update_worker(threading.Thread):
                    try:
                        changed_detected, result, contents = update_handler.run(uuid)

-                    except PermissionError as s:
-                        self.app.logger.error("File permission error updating", uuid, str(s))
+                    except PermissionError as e:
+                        self.app.logger.error("File permission error updating", uuid, str(e))
+                    except Exception as e:
+                        self.app.logger.error("Exception reached", uuid, str(e))
                    else:
                        if result:
                            try: