diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 7b25a5c8..e29553ee 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -641,6 +641,7 @@ def changedetection_app(config=None, datastore_o=None):
form.global_subtractive_selectors.data = datastore.data['settings']['application']['global_subtractive_selectors']
form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
+ form.render_anchor_tag_content.data = datastore.data['settings']['application']['render_anchor_tag_content']
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
form.notification_title.data = datastore.data['settings']['application']['notification_title']
@@ -671,6 +672,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
datastore.data['settings']['application']['real_browser_save_screenshot'] = form.real_browser_save_screenshot.data
+ datastore.data['settings']['application']['render_anchor_tag_content'] = form.render_anchor_tag_content.data
if not os.getenv("SALTED_PASS", False) and form.password.encrypted_password:
datastore.data['settings']['application']['password'] = form.password.encrypted_password
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 786e7a2f..aef24bcd 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -4,7 +4,6 @@ import re
import time
import urllib3
-from inscriptis import get_text
from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -88,7 +87,7 @@ class perform_site_check():
has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
-
+
if is_json and not has_filter_rule:
css_filter_rule = "json:$"
has_filter_rule = True
@@ -117,9 +116,14 @@ class perform_site_check():
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
if has_subtractive_selectors:
html_content = html_tools.element_removal(subtractive_selectors, html_content)
- # get_text() via inscriptis
- stripped_text_from_html = get_text(html_content)
-
+ # extract text
+ stripped_text_from_html = \
+ html_tools.html_to_text(
+ html_content,
+ render_anchor_tag_content=self.datastore.data["settings"][
+ "application"].get(
+ "render_anchor_tag_content", False)
+ )
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 4dc7921a..2f28a62d 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -231,7 +231,7 @@ class ValidateListRegex(object):
except re.error:
message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
raise ValidationError(message % (line))
-
+
class ValidateCSSJSONXPATHInput(object):
"""
Filter validation
@@ -293,7 +293,7 @@ class ValidateCSSJSONXPATHInput(object):
# Re #265 - maybe in the future fetch the page and offer a
# warning/notice that its possible the rule doesnt yet match anything?
-
+
class quickWatchForm(Form):
# https://wtforms.readthedocs.io/en/2.3.x/fields/#module-wtforms.fields.html5
# `require_tld` = False is needed even for the test harness "http://localhost:5005.." to run
@@ -352,6 +352,10 @@ class globalSettingsForm(commonSettingsForm):
global_subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
ignore_whitespace = BooleanField('Ignore whitespace')
+
+ render_anchor_tag_content = BooleanField('Render Anchor Tag Content',
+ default=False)
+
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?')
removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"})
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index fda2cf25..d355c209 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -4,6 +4,9 @@ from typing import List
from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse
+import re
+from inscriptis import get_text
+from inscriptis.model.config import ParserConfig
class JSONNotFound(ValueError):
@@ -25,12 +28,12 @@ def subtractive_css_selector(css_selector, html_content):
item.decompose()
return str(soup)
-
+
def element_removal(selectors: List[str], html_content):
"""Joins individual filters into one css filter."""
selector = ",".join(selectors)
return subtractive_css_selector(selector, html_content)
-
+
# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content):
@@ -167,3 +170,35 @@ def strip_ignore_text(content, wordlist, mode="content"):
return ignored_line_numbers
return "\n".encode('utf8').join(output)
+
+
+def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
+    """Converts an HTML string to a string with just the text. If rendering
+    of anchor tag content is enabled, the anchor tag content is also
+    included in the text.
+
+ :param html_content: string with html content
+ :param render_anchor_tag_content: boolean flag indicating whether to extract
+ hyperlinks (the anchor tag content) together with text. This refers to the
+ 'href' inside 'a' tags.
+ Anchor tag content is rendered in the following manner:
+ '[ text ](anchor tag content)'
+ :return: extracted text from the HTML
+ """
+ # if anchor tag content flag is set to True define a config for
+ # extracting this content
+ if render_anchor_tag_content:
+
+ parser_config = ParserConfig(
+ annotation_rules={"a": ["hyperlink"]}, display_links=True
+ )
+
+ # otherwise set config to None
+ else:
+ parser_config = None
+
+    # get the text via inscriptis
+ text_content = get_text(html_content, config=parser_config)
+
+ return text_content
+
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index e37710de..4aa5cc3e 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -52,6 +52,7 @@ class ChangeDetectionStore:
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'global_subtractive_selectors': [],
'ignore_whitespace': False,
+ 'render_anchor_tag_content': False,
'notification_urls': [], # Apprise URL list
# Custom notification content
'notification_title': default_notification_title,
diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html
index fac9da3a..996c58d4 100644
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@@ -91,10 +91,16 @@
+
-