diff --git a/backend/__init__.py b/backend/__init__.py index ce9ac721..9c00e2d4 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -442,6 +442,7 @@ def changedetection_app(config=None, datastore_o=None): if request.method == 'GET': form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] + form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] # Password unset is a GET if request.values.get('removepassword') == 'true': @@ -454,6 +455,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data + datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data if len(form.notification_urls.data): import apprise diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index e0296b45..12216e19 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -3,18 +3,10 @@ import requests import hashlib from inscriptis import get_text import urllib3 -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +from . import html_tools -# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches -class css_filter(object): - def apply(self, css_filter, html_content): - from bs4 import BeautifulSoup - soup = BeautifulSoup(html_content, "html.parser") - html_block = "" - for item in soup.select(css_filter, separator=""): - html_block += str(item) +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - return html_block+"\n" # Some common stuff here that can be moved to a base class class perform_site_check(): @@ -59,6 +51,7 @@ class perform_site_check(): def run(self, uuid): timestamp = int(time.time()) # used for storage etc too + stripped_text_from_html = False changed_detected = False @@ -98,8 +91,7 @@ class perform_site_check(): # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text css_filter_rule = self.datastore.data['watching'][uuid]['css_filter'] if css_filter_rule and len(css_filter_rule.strip()): - filter = css_filter() - html = filter.apply(css_filter=css_filter_rule, html_content=r.content) + html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content) stripped_text_from_html = get_text(html) @@ -150,4 +142,10 @@ class perform_site_check(): update_obj["previous_md5"] = fetched_md5 + # Extract title as title + if self.datastore.data['settings']['application']['extract_title_as_title']: + if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']): + update_obj['title'] = html_tools.extract_element(find='title', html_content=html) + + return changed_detected, update_obj, stripped_text_from_html diff --git a/backend/forms.py b/backend/forms.py index 13935abd..ea8fd995 100644 --- a/backend/forms.py +++ b/backend/forms.py @@ -128,4 +128,5 @@ class globalSettingsForm(Form): [validators.NumberRange(min=1)]) notification_urls = StringListField('Notification URL List') + extract_title_as_title = BooleanField('Extract from document and use as watch title') trigger_check = BooleanField('Send test notification on save') diff --git a/backend/html_tools.py b/backend/html_tools.py new file mode 100644 index 00000000..904910d2 --- /dev/null +++ b/backend/html_tools.py @@ -0,0 +1,23 @@ +from bs4 import BeautifulSoup + + +# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches +def css_filter(css_filter, html_content): + soup = BeautifulSoup(html_content, "html.parser") + html_block = "" + for item in soup.select(css_filter, separator=""): + html_block += str(item) + + return html_block + "\n" + + +# Extract/find element +def extract_element(find='title', html_content=''): + html_title = False + + soup = BeautifulSoup(html_content, 'html.parser') + title = soup.find(find) + if title and title.string is not None: + html_title = title.string.strip() + + return html_title diff --git a/backend/store.py b/backend/store.py index 46d108c1..fd5ec895 100644 --- a/backend/store.py +++ b/backend/store.py @@ -38,6 +38,7 @@ class ChangeDetectionStore: }, 'application': { 'password': False, + 'extract_title_as_title': False, 'notification_urls': [] # Apprise URL list } } diff --git a/backend/templates/settings.html b/backend/templates/settings.html index e136ad93..457e5f4b 100644 --- a/backend/templates/settings.html +++ b/backend/templates/settings.html @@ -16,6 +16,10 @@ {{ render_field(form.password, size=10) }} {% endif %} </div> + <div class="pure-control-group"> + {{ render_field(form.extract_title_as_title) }} + <span class="pure-form-message-inline">Note: This will automatically apply to all existing watches.</span> + </div> <div class="pure-control-group"> {{ render_field(form.notification_urls, rows=5, placeholder="Gitter - gitter://token/room Office365 - o365://TenantID:AccountEmail/ClientID/ClientSecret/TargetEmail @@ -27,7 +31,6 @@ SMTPS - mailtos://user:pass@mail.domain.com?to=receivingAddress@example.com <div class="pure-controls"> <span class="pure-form-message-inline"><label for="trigger-test-notification" class="pure-checkbox"> <input type="checkbox" id="trigger-test-notification" name="trigger-test-notification"> Send test notification on save.</label></span> - </div> <br/> diff --git a/backend/tests/test_backend.py b/backend/tests/test_backend.py index 502cd998..8f944605 100644 --- a/backend/tests/test_backend.py +++ b/backend/tests/test_backend.py @@ -8,8 +8,6 @@ from . util import set_original_response, set_modified_response, live_server_set sleep_time_for_fetch_thread = 3 - - def test_check_basic_change_detection_functionality(client, live_server): set_original_response() live_server_setup(live_server) @@ -82,15 +80,27 @@ def test_check_basic_change_detection_functionality(client, live_server): # It should report nothing found (no new 'unviewed' class) res = client.get(url_for("index")) assert b'unviewed' not in res.data + assert b'head title' not in res.data # Should not be present because this is off by default assert b'test-endpoint' in res.data set_original_response() + # Enable auto pickup of <title> in settings + res = client.post( + url_for("settings_page"), + data={"extract_title_as_title": "1", "minutes_between_check": 180}, + follow_redirects=True + ) + client.get(url_for("api_watch_checknow"), follow_redirects=True) time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) assert b'unviewed' in res.data + # It should have picked up the <title> + assert b'head title' in res.data + # # Cleanup everything res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data diff --git a/backend/tests/test_css_selector.py b/backend/tests/test_css_selector.py index 40d7e23a..6425600c 100644 --- a/backend/tests/test_css_selector.py +++ b/backend/tests/test_css_selector.py @@ -4,6 +4,8 @@ import time from flask import url_for from . util import live_server_setup +from ..html_tools import * + def test_setup(live_server): live_server_setup(live_server) @@ -48,11 +50,9 @@ def test_css_filter_output(): from backend import fetch_site_status from inscriptis import get_text - css_filter = fetch_site_status.css_filter() - # Check text with sub-parts renders correctly content = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>""" - html_blob = css_filter.apply(css_filter="#thingthing", html_content=content) + html_blob = css_filter(css_filter="#thingthing", html_content=content) text = get_text(html_blob) assert text == " Some really bold text" @@ -61,7 +61,7 @@ def test_css_filter_output(): <div class="parts">Block A</div> <div class="parts">Block B</div></body> </html> """ - html_blob = css_filter.apply(css_filter=".parts", html_content=content) + html_blob = css_filter(css_filter=".parts", html_content=content) text = get_text(html_blob) # Divs are converted to 4 whitespaces by inscriptis diff --git a/backend/tests/util.py b/backend/tests/util.py index 8a14ee63..05a63445 100644 --- a/backend/tests/util.py +++ b/backend/tests/util.py @@ -3,7 +3,8 @@ def set_original_response(): test_return_data = """<html> - <body> + <head><title>head title + Some initial text

Which is across multiple lines


@@ -18,7 +19,8 @@ def set_original_response(): def set_modified_response(): test_return_data = """ - + modified head title + Some initial text

which has this one new line


diff --git a/backend/update_worker.py b/backend/update_worker.py index 5d1db851..9f02f95c 100644 --- a/backend/update_worker.py +++ b/backend/update_worker.py @@ -31,8 +31,10 @@ class update_worker(threading.Thread): try: changed_detected, result, contents = update_handler.run(uuid) - except PermissionError as s: - self.app.logger.error("File permission error updating", uuid, str(s)) + except PermissionError as e: + self.app.logger.error("File permission error updating", uuid, str(e)) + except Exception as e: + self.app.logger.error("Exception reached", uuid, str(e)) else: if result: try: