Auto extract html title as title (#102)

* Auto extract <title> as watch title, Minor refactor for html tooling
pull/107/head
dgtlmoon 3 years ago committed by GitHub
parent 9af1ea9fc0
commit 25185e6d00
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -442,6 +442,7 @@ def changedetection_app(config=None, datastore_o=None):
if request.method == 'GET': if request.method == 'GET':
form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
# Password unset is a GET # Password unset is a GET
if request.values.get('removepassword') == 'true': if request.values.get('removepassword') == 'true':
@ -454,6 +455,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data
if len(form.notification_urls.data): if len(form.notification_urls.data):
import apprise import apprise

@ -3,18 +3,10 @@ import requests
import hashlib import hashlib
from inscriptis import get_text from inscriptis import get_text
import urllib3 import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) from . import html_tools
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class css_filter(object):
def apply(self, css_filter, html_content):
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
html_block = ""
for item in soup.select(css_filter, separator=""):
html_block += str(item)
return html_block+"\n"
# Some common stuff here that can be moved to a base class # Some common stuff here that can be moved to a base class
class perform_site_check(): class perform_site_check():
@ -59,6 +51,7 @@ class perform_site_check():
def run(self, uuid): def run(self, uuid):
timestamp = int(time.time()) # used for storage etc too timestamp = int(time.time()) # used for storage etc too
stripped_text_from_html = False stripped_text_from_html = False
changed_detected = False changed_detected = False
@ -98,8 +91,7 @@ class perform_site_check():
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter'] css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
if css_filter_rule and len(css_filter_rule.strip()): if css_filter_rule and len(css_filter_rule.strip()):
filter = css_filter() html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
stripped_text_from_html = get_text(html) stripped_text_from_html = get_text(html)
@ -150,4 +142,10 @@ class perform_site_check():
update_obj["previous_md5"] = fetched_md5 update_obj["previous_md5"] = fetched_md5
# Extract title as title
if self.datastore.data['settings']['application']['extract_title_as_title']:
if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=html)
return changed_detected, update_obj, stripped_text_from_html return changed_detected, update_obj, stripped_text_from_html

@ -128,4 +128,5 @@ class globalSettingsForm(Form):
[validators.NumberRange(min=1)]) [validators.NumberRange(min=1)])
notification_urls = StringListField('Notification URL List') notification_urls = StringListField('Notification URL List')
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
trigger_check = BooleanField('Send test notification on save') trigger_check = BooleanField('Send test notification on save')

@ -0,0 +1,23 @@
from bs4 import BeautifulSoup
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def css_filter(css_filter, html_content):
    """Return the markup of every element matching a CSS rule.

    Args:
        css_filter: CSS selector handed to BeautifulSoup's ``select()``.
        html_content: The HTML document (or fragment) to search.

    Returns:
        The string form of each matching element, concatenated in match
        order and terminated by one trailing newline. When nothing
        matches, the result is only that newline.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    matches = parsed.select(css_filter, separator="")
    return "".join(str(element) for element in matches) + "\n"
# Extract/find element
def extract_element(find='title', html_content=''):
    """Return the stripped text of the first element with the given tag name.

    Args:
        find: Tag name to look up with BeautifulSoup's ``find()``;
            defaults to ``'title'``.
        html_content: The HTML document (or fragment) to search.

    Returns:
        The element's ``.string`` with surrounding whitespace removed, or
        ``False`` when no such element exists or its ``.string`` is None.
    """
    element = BeautifulSoup(html_content, 'html.parser').find(find)
    # Guard clause: nothing found, or the element exposes no single string.
    if not element or element.string is None:
        return False
    return element.string.strip()

@ -38,6 +38,7 @@ class ChangeDetectionStore:
}, },
'application': { 'application': {
'password': False, 'password': False,
'extract_title_as_title': False,
'notification_urls': [] # Apprise URL list 'notification_urls': [] # Apprise URL list
} }
} }

@ -16,6 +16,10 @@
{{ render_field(form.password, size=10) }} {{ render_field(form.password, size=10) }}
{% endif %} {% endif %}
</div> </div>
<div class="pure-control-group">
{{ render_field(form.extract_title_as_title) }}
<span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span>
</div>
<div class="pure-control-group"> <div class="pure-control-group">
{{ render_field(form.notification_urls, rows=5, placeholder="Gitter - gitter://token/room {{ render_field(form.notification_urls, rows=5, placeholder="Gitter - gitter://token/room
Office365 - o365://TenantID:AccountEmail/ClientID/ClientSecret/TargetEmail Office365 - o365://TenantID:AccountEmail/ClientID/ClientSecret/TargetEmail
@ -27,7 +31,6 @@ SMTPS - mailtos://user:pass@mail.domain.com?to=receivingAddress@example.com
<div class="pure-controls"> <div class="pure-controls">
<span class="pure-form-message-inline"><label for="trigger-test-notification" class="pure-checkbox"> <span class="pure-form-message-inline"><label for="trigger-test-notification" class="pure-checkbox">
<input type="checkbox" id="trigger-test-notification" name="trigger-test-notification"> Send test notification on save.</label></span> <input type="checkbox" id="trigger-test-notification" name="trigger-test-notification"> Send test notification on save.</label></span>
</div> </div>
<br/> <br/>

@ -8,8 +8,6 @@ from . util import set_original_response, set_modified_response, live_server_set
sleep_time_for_fetch_thread = 3 sleep_time_for_fetch_thread = 3
def test_check_basic_change_detection_functionality(client, live_server): def test_check_basic_change_detection_functionality(client, live_server):
set_original_response() set_original_response()
live_server_setup(live_server) live_server_setup(live_server)
@ -82,15 +80,27 @@ def test_check_basic_change_detection_functionality(client, live_server):
# It should report nothing found (no new 'unviewed' class) # It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' not in res.data assert b'unviewed' not in res.data
assert b'head title' not in res.data # Should not be present because this is off by default
assert b'test-endpoint' in res.data assert b'test-endpoint' in res.data
set_original_response() set_original_response()
# Enable auto pickup of <title> in settings
res = client.post(
url_for("settings_page"),
data={"extract_title_as_title": "1", "minutes_between_check": 180},
follow_redirects=True
)
client.get(url_for("api_watch_checknow"), follow_redirects=True) client.get(url_for("api_watch_checknow"), follow_redirects=True)
time.sleep(sleep_time_for_fetch_thread) time.sleep(sleep_time_for_fetch_thread)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' in res.data assert b'unviewed' in res.data
# It should have picked up the <title>
assert b'head title' in res.data
#
# Cleanup everything # Cleanup everything
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data assert b'Deleted' in res.data

@ -4,6 +4,8 @@ import time
from flask import url_for from flask import url_for
from . util import live_server_setup from . util import live_server_setup
from ..html_tools import *
def test_setup(live_server): def test_setup(live_server):
live_server_setup(live_server) live_server_setup(live_server)
@ -48,11 +50,9 @@ def test_css_filter_output():
from backend import fetch_site_status from backend import fetch_site_status
from inscriptis import get_text from inscriptis import get_text
css_filter = fetch_site_status.css_filter()
# Check text with sub-parts renders correctly # Check text with sub-parts renders correctly
content = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>""" content = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>"""
html_blob = css_filter.apply(css_filter="#thingthing", html_content=content) html_blob = css_filter(css_filter="#thingthing", html_content=content)
text = get_text(html_blob) text = get_text(html_blob)
assert text == " Some really bold text" assert text == " Some really bold text"
@ -61,7 +61,7 @@ def test_css_filter_output():
<div class="parts">Block A</div> <div class="parts">Block B</div></body> <div class="parts">Block A</div> <div class="parts">Block B</div></body>
</html> </html>
""" """
html_blob = css_filter.apply(css_filter=".parts", html_content=content) html_blob = css_filter(css_filter=".parts", html_content=content)
text = get_text(html_blob) text = get_text(html_blob)
# Divs are converted to 4 whitespaces by inscriptis # Divs are converted to 4 whitespaces by inscriptis

@ -3,7 +3,8 @@
def set_original_response(): def set_original_response():
test_return_data = """<html> test_return_data = """<html>
<body> <head><title>head title</title></head>
<body>
Some initial text</br> Some initial text</br>
<p>Which is across multiple lines</p> <p>Which is across multiple lines</p>
</br> </br>
@ -18,7 +19,8 @@ def set_original_response():
def set_modified_response(): def set_modified_response():
test_return_data = """<html> test_return_data = """<html>
<body> <head><title>modified head title</title></head>
<body>
Some initial text</br> Some initial text</br>
<p>which has this one new line</p> <p>which has this one new line</p>
</br> </br>

@ -31,8 +31,10 @@ class update_worker(threading.Thread):
try: try:
changed_detected, result, contents = update_handler.run(uuid) changed_detected, result, contents = update_handler.run(uuid)
except PermissionError as s: except PermissionError as e:
self.app.logger.error("File permission error updating", uuid, str(s)) self.app.logger.error("File permission error updating", uuid, str(e))
except Exception as e:
self.app.logger.error("Exception reached", uuid, str(e))
else: else:
if result: if result:
try: try:

Loading…
Cancel
Save