Re #133 Option for ignoring whitespacing (#345)

* Global setting option to ignore whitespace when detecting a change
pull/344/head^2
dgtlmoon 3 years ago committed by GitHub
parent 489671dcca
commit b5c1fce136
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -405,7 +405,7 @@ def changedetection_app(config=None, datastore_o=None):
# Get the most recent one # Get the most recent one
newest_history_key = datastore.get_val(uuid, 'newest_history_key') newest_history_key = datastore.get_val(uuid, 'newest_history_key')
# 0 means that theres only one, so that there should be no 'unviewed' history availabe # 0 means that theres only one, so that there should be no 'unviewed' history available
if newest_history_key == 0: if newest_history_key == 0:
newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0] newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0]
@ -418,7 +418,11 @@ def changedetection_app(config=None, datastore_o=None):
stripped_content = handler.strip_ignore_text(raw_content, stripped_content = handler.strip_ignore_text(raw_content,
datastore.data['watching'][uuid]['ignore_text']) datastore.data['watching'][uuid]['ignore_text'])
if datastore.data['settings']['application'].get('ignore_whitespace', False):
checksum = hashlib.md5(stripped_content.translate(None, b'\r\n\t ')).hexdigest()
else:
checksum = hashlib.md5(stripped_content).hexdigest() checksum = hashlib.md5(stripped_content).hexdigest()
return checksum return checksum
return datastore.data['watching'][uuid]['previous_md5'] return datastore.data['watching'][uuid]['previous_md5']
@ -553,6 +557,7 @@ def changedetection_app(config=None, datastore_o=None):
form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text']
form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace']
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend'] form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
form.notification_title.data = datastore.data['settings']['application']['notification_title'] form.notification_title.data = datastore.data['settings']['application']['notification_title']
@ -580,6 +585,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
datastore.data['settings']['application']['base_url'] = form.base_url.data datastore.data['settings']['application']['base_url'] = form.base_url.data
datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data
datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data
if form.trigger_check.data: if form.trigger_check.data:
if len(form.notification_urls.data): if len(form.notification_urls.data):

@ -58,8 +58,7 @@ class perform_site_check():
watch = self.datastore.data['watching'][uuid] watch = self.datastore.data['watching'][uuid]
update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'], update_obj = {
'history': {},
"last_checked": timestamp "last_checked": timestamp
} }
@ -137,9 +136,17 @@ class perform_site_check():
else: else:
stripped_text_from_html = stripped_text_from_html.encode('utf8') stripped_text_from_html = stripped_text_from_html.encode('utf8')
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
else:
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
# On the first run of a site, watch['previous_md5'] will be an empty string, set it the current one.
if not len(watch['previous_md5']):
watch['previous_md5'] = fetched_md5
update_obj["previous_md5"] = fetched_md5
blocked_by_not_found_trigger_text = False blocked_by_not_found_trigger_text = False
if len(watch['trigger_text']): if len(watch['trigger_text']):
@ -160,16 +167,12 @@ class perform_site_check():
break break
# could be None or False depending on JSON type
# On the first run of a site, watch['previous_md5'] will be an empty string
if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5: if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
changed_detected = True changed_detected = True
update_obj["previous_md5"] = fetched_md5
# Don't confuse people by updating as last-changed, when it actually just changed from None..
if self.datastore.get_val(uuid, 'previous_md5'):
update_obj["last_changed"] = timestamp update_obj["last_changed"] = timestamp
update_obj["previous_md5"] = fetched_md5
# Extract title as title # Extract title as title
if is_html: if is_html:

@ -259,3 +259,4 @@ class globalSettingsForm(commonSettingsForm):
extract_title_as_title = BooleanField('Extract <title> from document and use as watch title') extract_title_as_title = BooleanField('Extract <title> from document and use as watch title')
base_url = StringField('Base URL', validators=[validators.Optional()]) base_url = StringField('Base URL', validators=[validators.Optional()])
global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
ignore_whitespace = BooleanField('Ignore whitespace')

@ -46,6 +46,7 @@ class ChangeDetectionStore:
'extract_title_as_title': False, 'extract_title_as_title': False,
'fetch_backend': 'html_requests', 'fetch_backend': 'html_requests',
'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum
'ignore_whitespace': False,
'notification_urls': [], # Apprise URL list 'notification_urls': [], # Apprise URL list
# Custom notification content # Custom notification content
'notification_title': None, 'notification_title': None,

@ -69,15 +69,24 @@
<div class="tab-pane-inner" id="filters"> <div class="tab-pane-inner" id="filters">
<span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span>
<fieldset class="pure-group">
{{ render_field(form.ignore_whitespace) }}
<span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/>
<i>Note:</i> Changing this will change the status of your existing watches, possibly triggering alerts etc.
</span>
</fieldset>
<fieldset class="pure-group"> <fieldset class="pure-group">
{{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line
/some.regex\d{2}/ for case-INsensitive regex /some.regex\d{2}/ for case-INsensitive regex
") }} ") }}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/>
Each line processed separately, any line matching will be ignored.<br/> <span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/>
Regular Expression support, wrap the line in forward slash <b>/regex/</b>. Regular Expression support, wrap the line in forward slash <b>/regex/</b>.
</span> </span>
</fieldset>
</div> </div>
<div id="actions"> <div id="actions">

@ -18,7 +18,8 @@ def cleanup(datastore_path):
'url-watches.json', 'url-watches.json',
'notification.txt', 'notification.txt',
'count.txt', 'count.txt',
'endpoint-content.txt'] 'endpoint-content.txt'
]
for file in files: for file in files:
try: try:
os.unlink("{}/{}".format(datastore_path, file)) os.unlink("{}/{}".format(datastore_path, file))

@ -0,0 +1,96 @@
#!/usr/bin/python3
import time
from flask import url_for
from . util import live_server_setup
def test_setup(live_server):
    """Hand the ``live_server`` fixture to ``live_server_setup`` before the other tests run."""
    live_server_setup(live_server)
# Should be the same as set_original_ignore_response() but with a little more whitespacing
def set_original_ignore_response_but_with_whitespace():
    """Write the whitespace-padded variant of the test page to disk.

    Overwrites ``test-datastore/endpoint-content.txt`` with the HTML below,
    which carries additional line breaks compared with the page written by
    ``set_original_ignore_response()``.
    """
    endpoint_file = "test-datastore/endpoint-content.txt"
    padded_html = """<html>
<body>
Some initial text</br>
<p>
Which is across multiple lines</p>
<br>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open(endpoint_file, "w") as out:
        out.write(padded_html)
def set_original_ignore_response():
    """Write the baseline test page to disk.

    Overwrites ``test-datastore/endpoint-content.txt`` with the HTML below.
    """
    endpoint_file = "test-datastore/endpoint-content.txt"
    baseline_html = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open(endpoint_file, "w") as out:
        out.write(baseline_html)
# If only the whitespace changed, no change should be detected
def test_check_ignore_whitespace(client, live_server):
    """Verify a whitespace-only edit is not reported once 'ignore_whitespace' is enabled.

    Flow: write the baseline page, switch on the global ``ignore_whitespace``
    setting, import the test endpoint as a watch, request a check, swap in the
    whitespace-padded page, request another check, and finally assert that the
    index page shows the watch without the ``unviewed`` marker.
    """
    fetch_wait_seconds = 3

    def request_recheck():
        # Ask the application to check the watches now
        client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the endpoint time to spin up
    time.sleep(1)

    set_original_ignore_response()

    # Submit the settings form with the 'ignore_whitespace' checkbox ticked
    settings_form = {
        "minutes_between_check": 180,
        "ignore_whitespace": "y",
        'fetch_backend': "html_requests",
    }
    response = client.post(
        url_for("settings_page"),
        data=settings_form,
        follow_redirects=True,
    )
    assert b"Settings updated." in response.data

    # Register the test endpoint through the import page
    test_url = url_for('test_endpoint', _external=True)
    response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in response.data

    time.sleep(fetch_wait_seconds)
    request_recheck()

    # Serve the same page again, this time with extra whitespace
    set_original_ignore_response_but_with_whitespace()

    time.sleep(fetch_wait_seconds)
    request_recheck()

    # Give the thread time to pick it up
    time.sleep(fetch_wait_seconds)

    # Nothing should be reported as changed (no new 'unviewed' class)
    response = client.get(url_for("index"))
    assert b'unviewed' not in response.data
    assert b'/test-endpoint' in response.data

@ -64,20 +64,22 @@ class update_worker(threading.Thread):
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
else: else:
if update_obj:
try: try:
self.datastore.update_watch(uuid=uuid, update_obj=update_obj) watch = self.datastore.data['watching'][uuid]
if changed_detected:
n_object = {} # For the FIRST time we check a site, or a change detected, save the snapshot.
if changed_detected or not watch['last_checked']:
# A change was detected # A change was detected
fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents)
# Update history with the stripped text for future reference, this will also mean we save the first
# Should always be keyed by string(timestamp) # Should always be keyed by string(timestamp)
self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}})
watch = self.datastore.data['watching'][uuid] # Generally update anything interesting returned
self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
# A change was detected
if changed_detected:
n_object = {}
print (">> Change detected in UUID {} - {}".format(uuid, watch['url'])) print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
# Notifications should only trigger on the second time (first time, we gather the initial snapshot) # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
@ -131,6 +133,7 @@ class update_worker(threading.Thread):
self.notification_q.put(n_object) self.notification_q.put(n_object)
except Exception as e: except Exception as e:
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
print("!!!! Exception in update_worker !!!\n", e) print("!!!! Exception in update_worker !!!\n", e)
self.current_uuid = None # Done self.current_uuid = None # Done

Loading…
Cancel
Save