diff --git a/backend/__init__.py b/backend/__init__.py
index ce9ac721..9c00e2d4 100644
--- a/backend/__init__.py
+++ b/backend/__init__.py
@@ -442,6 +442,7 @@ def changedetection_app(config=None, datastore_o=None):
if request.method == 'GET':
form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
+ form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
# Password unset is a GET
if request.values.get('removepassword') == 'true':
@@ -454,6 +455,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
+ datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data
if len(form.notification_urls.data):
import apprise
diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index e0296b45..12216e19 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -3,18 +3,10 @@ import requests
import hashlib
from inscriptis import get_text
import urllib3
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+from . import html_tools
-# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
-class css_filter(object):
- def apply(self, css_filter, html_content):
- from bs4 import BeautifulSoup
- soup = BeautifulSoup(html_content, "html.parser")
- html_block = ""
- for item in soup.select(css_filter, separator=""):
- html_block += str(item)
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
- return html_block+"\n"
# Some common stuff here that can be moved to a base class
class perform_site_check():
@@ -59,6 +51,7 @@ class perform_site_check():
def run(self, uuid):
timestamp = int(time.time()) # used for storage etc too
+
stripped_text_from_html = False
changed_detected = False
@@ -98,8 +91,7 @@ class perform_site_check():
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
if css_filter_rule and len(css_filter_rule.strip()):
- filter = css_filter()
- html = filter.apply(css_filter=css_filter_rule, html_content=r.content)
+ html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
stripped_text_from_html = get_text(html)
@@ -150,4 +142,10 @@ class perform_site_check():
update_obj["previous_md5"] = fetched_md5
+ # Extract title as title
+ if self.datastore.data['settings']['application']['extract_title_as_title']:
+ if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
+ update_obj['title'] = html_tools.extract_element(find='title', html_content=html)
+
+
return changed_detected, update_obj, stripped_text_from_html
diff --git a/backend/forms.py b/backend/forms.py
index 13935abd..ea8fd995 100644
--- a/backend/forms.py
+++ b/backend/forms.py
@@ -128,4 +128,5 @@ class globalSettingsForm(Form):
[validators.NumberRange(min=1)])
notification_urls = StringListField('Notification URL List')
+ extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
trigger_check = BooleanField('Send test notification on save')
diff --git a/backend/html_tools.py b/backend/html_tools.py
new file mode 100644
index 00000000..904910d2
--- /dev/null
+++ b/backend/html_tools.py
@@ -0,0 +1,32 @@
+from bs4 import BeautifulSoup
+
+
+def css_filter(css_filter, html_content):
+    """Return the HTML of every element matching a CSS rule.
+
+    :param css_filter: CSS selector handed to BeautifulSoup's ``select()``.
+    :param html_content: HTML markup to search.
+    :return: the matching elements serialised with ``str()`` and
+        concatenated in the order ``select()`` yields them, followed by
+        one trailing newline (so no match yields just a newline).
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+    # The former ``separator=""`` keyword is dropped: it is not a documented
+    # ``select()`` parameter. NOTE(review): it looked like a no-op; confirm
+    # against the pinned bs4/soupsieve versions.
+    return "".join(str(item) for item in soup.select(css_filter)) + "\n"
+
+
+def extract_element(find='title', html_content=''):
+    """Return the stripped text of the first element named ``find``.
+
+    :param find: tag name passed to BeautifulSoup's ``find()``.
+    :param html_content: HTML markup to search.
+    :return: the element's ``.string`` with surrounding whitespace
+        removed, or ``False`` when no element is found or its
+        ``.string`` is ``None``.
+    """
+    element = BeautifulSoup(html_content, 'html.parser').find(find)
+    if not element or element.string is None:
+        return False
+    return element.string.strip()
diff --git a/backend/store.py b/backend/store.py
index 46d108c1..fd5ec895 100644
--- a/backend/store.py
+++ b/backend/store.py
@@ -38,6 +38,7 @@ class ChangeDetectionStore:
},
'application': {
'password': False,
+ 'extract_title_as_title': False,
'notification_urls': [] # Apprise URL list
}
}
diff --git a/backend/templates/settings.html b/backend/templates/settings.html
index e136ad93..457e5f4b 100644
--- a/backend/templates/settings.html
+++ b/backend/templates/settings.html
@@ -16,6 +16,10 @@
{{ render_field(form.password, size=10) }}
{% endif %}
+
+ {{ render_field(form.extract_title_as_title) }}
+ Note: This will automatically apply to all existing watches.
+
diff --git a/backend/tests/test_backend.py b/backend/tests/test_backend.py
index 502cd998..8f944605 100644
--- a/backend/tests/test_backend.py
+++ b/backend/tests/test_backend.py
@@ -8,8 +8,6 @@ from . util import set_original_response, set_modified_response, live_server_set
sleep_time_for_fetch_thread = 3
-
-
def test_check_basic_change_detection_functionality(client, live_server):
set_original_response()
live_server_setup(live_server)
@@ -82,15 +80,27 @@ def test_check_basic_change_detection_functionality(client, live_server):
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
+ assert b'head title' not in res.data # Should not be present because this is off by default
assert b'test-endpoint' in res.data
set_original_response()
+ # Enable auto pickup of <title> in settings
+ res = client.post(
+ url_for("settings_page"),
+ data={"extract_title_as_title": "1", "minutes_between_check": 180},
+ follow_redirects=True
+ )
+
client.get(url_for("api_watch_checknow"), follow_redirects=True)
time.sleep(sleep_time_for_fetch_thread)
+
res = client.get(url_for("index"))
assert b'unviewed' in res.data
+ # It should have picked up the <title>
+ assert b'head title' in res.data
+ #
# Cleanup everything
res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
diff --git a/backend/tests/test_css_selector.py b/backend/tests/test_css_selector.py
index 40d7e23a..6425600c 100644
--- a/backend/tests/test_css_selector.py
+++ b/backend/tests/test_css_selector.py
@@ -4,6 +4,8 @@ import time
from flask import url_for
from . util import live_server_setup
+from ..html_tools import *
+
def test_setup(live_server):
live_server_setup(live_server)
@@ -48,11 +50,9 @@ def test_css_filter_output():
from backend import fetch_site_status
from inscriptis import get_text
- css_filter = fetch_site_status.css_filter()
-
# Check text with sub-parts renders correctly
content = """
Some really bold text
"""
- html_blob = css_filter.apply(css_filter="#thingthing", html_content=content)
+ html_blob = css_filter(css_filter="#thingthing", html_content=content)
text = get_text(html_blob)
assert text == " Some really bold text"
@@ -61,7 +61,7 @@ def test_css_filter_output():