UTF-8 handling fixes, Improvements to whitespace filtering (#2691)

pull/2696/head
dgtlmoon 3 months ago committed by GitHub
parent f38429ec93
commit 5a768d7db3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -17,7 +17,6 @@
</script> </script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<!--<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>-->
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<div class="edit-form monospaced-textarea"> <div class="edit-form monospaced-textarea">

@ -476,7 +476,7 @@ class processor_text_json_diff_form(commonSettingsForm):
title = StringField('Title', default='') title = StringField('Title', default='')
ignore_text = StringListField('Remove lines containing', [ValidateListRegex()]) ignore_text = StringListField('Ignore lines containing', [ValidateListRegex()])
headers = StringDictKeyValue('Request headers') headers = StringDictKeyValue('Request headers')
body = TextAreaField('Request body', [validators.Optional()]) body = TextAreaField('Request body', [validators.Optional()])
method = SelectField('Request method', choices=valid_method, default=default_method) method = SelectField('Request method', choices=valid_method, default=default_method)

@ -3,11 +3,11 @@ from lxml import etree
import json import json
import re import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here # 'price' , 'lowPrice', 'highPrice' are usually under here
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here # All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# - "line numbers" return a list of line numbers that match (int list) # - "line numbers" return a list of line numbers that match (int list)
# #
# wordlist - list of regex's (str) or words (str) # wordlist - list of regex's (str) or words (str)
# Preserves all linefeeds and other whitespace; it's not the job of this function to remove that
def strip_ignore_text(content, wordlist, mode="content"): def strip_ignore_text(content, wordlist, mode="content"):
i = 0 i = 0
output = [] output = []
@ -341,32 +342,30 @@ def strip_ignore_text(content, wordlist, mode="content"):
else: else:
ignore_text.append(k.strip()) ignore_text.append(k.strip())
for line in content.splitlines(): for line in content.splitlines(keepends=True):
i += 1 i += 1
# Always ignore blank lines in this mode. (when this function gets called) # Always ignore blank lines in this mode. (when this function gets called)
got_match = False got_match = False
if len(line.strip()): for l in ignore_text:
for l in ignore_text: if l.lower() in line.lower():
if l.lower() in line.lower(): got_match = True
got_match = True
if not got_match: if not got_match:
for r in ignore_regex: for r in ignore_regex:
if r.search(line): if r.search(line):
got_match = True got_match = True
if not got_match:
# Not ignored
output.append(line.encode('utf8'))
else:
ignored_line_numbers.append(i)
if not got_match:
# Not ignored, and should preserve "keepends"
output.append(line)
else:
ignored_line_numbers.append(i)
# Used for finding out what to highlight # Used for finding out what to highlight
if mode == "line numbers": if mode == "line numbers":
return ignored_line_numbers return ignored_line_numbers
return "\n".encode('utf8').join(output) return ''.join(output)
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
from xml.sax.saxutils import escape as xml_escape from xml.sax.saxutils import escape as xml_escape

@ -6,6 +6,8 @@ import re
from pathlib import Path from pathlib import Path
from loguru import logger from loguru import logger
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
# Allowable protocols, protects against javascript: etc # Allowable protocols, protects against javascript: etc
# file:// is further checked by ALLOW_FILE_URI # file:// is further checked by ALLOW_FILE_URI
SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):' SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
@ -312,13 +314,13 @@ class model(watch_base):
dest = os.path.join(self.watch_data_dir, snapshot_fname) dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest): if not os.path.exists(dest):
with open(dest, 'wb') as f: with open(dest, 'wb') as f:
f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT))
else: else:
snapshot_fname = f"{snapshot_id}.txt" snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.watch_data_dir, snapshot_fname) dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest): if not os.path.exists(dest):
with open(dest, 'wb') as f: with open(dest, 'wb') as f:
f.write(contents) f.write(contents.encode('utf-8'))
# Append to index # Append to index
# @todo check last char was \n # @todo check last char was \n
@ -350,14 +352,32 @@ class model(watch_base):
return seconds return seconds
# Iterate over all history texts and see if something new exists # Iterate over all history texts and see if something new exists
def lines_contain_something_unique_compared_to_history(self, lines: list): # Always applying .strip() to start/end but optionally replace any other whitespace
local_lines = set([l.decode('utf-8').strip().lower() for l in lines]) def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False):
local_lines = []
if lines:
if ignore_whitespace:
if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
else:
local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
else:
if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
local_lines = set([l.strip().lower() for l in lines])
else:
local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
# Compare each lines (set) against each history text file (set) looking for something new.. # Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({}) existing_history = set({})
for k, v in self.history.items(): for k, v in self.history.items():
content = self.get_history_snapshot(k) content = self.get_history_snapshot(k)
alist = set([line.strip().lower() for line in content.splitlines()])
if ignore_whitespace:
alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()])
else:
alist = set([line.strip().lower() for line in content.splitlines()])
existing_history = existing_history.union(alist) existing_history = existing_history.union(alist)
# Check that everything in local_lines(new stuff) already exists in existing_history - it should # Check that everything in local_lines(new stuff) already exists in existing_history - it should

@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor):
# Always record the new checksum # Always record the new checksum
update_obj["previous_md5"] = fetched_md5 update_obj["previous_md5"] = fetched_md5
return changed_detected, update_obj, snapshot_content.encode('utf-8').strip() return changed_detected, update_obj, snapshot_content.strip()

@ -46,6 +46,9 @@ def prepare_filter_prevew(datastore, watch_uuid):
text_after_filter = '' text_after_filter = ''
text_before_filter = '' text_before_filter = ''
trigger_line_numbers = []
ignore_line_numbers = []
tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid)) tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid))
if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir): if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir):
@ -72,7 +75,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
) )
# Use the last loaded HTML as the input # Use the last loaded HTML as the input
update_handler.datastore = datastore update_handler.datastore = datastore
update_handler.fetcher.content = decompressed_data update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string
update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')
# Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk
@ -84,9 +87,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
text_after_filter = future1.result() text_after_filter = future1.result()
text_before_filter = future2.result() text_before_filter = future2.result()
trigger_line_numbers = []
try: try:
trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter,
wordlist=tmp_watch['trigger_text'], wordlist=tmp_watch['trigger_text'],
mode='line numbers' mode='line numbers'
@ -94,6 +95,15 @@ def prepare_filter_prevew(datastore, watch_uuid):
except Exception as e: except Exception as e:
text_before_filter = f"Error: {str(e)}" text_before_filter = f"Error: {str(e)}"
try:
text_to_ignore = tmp_watch.get('ignore_text', []) + datastore.data['settings']['application'].get('global_ignore_text', [])
ignore_line_numbers = html_tools.strip_ignore_text(content=text_after_filter,
wordlist=text_to_ignore,
mode='line numbers'
)
except Exception as e:
text_before_filter = f"Error: {str(e)}"
logger.trace(f"Parsed in {time.time() - now:.3f}s") logger.trace(f"Parsed in {time.time() - now:.3f}s")
return jsonify( return jsonify(
@ -102,6 +112,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter, 'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter,
'duration': time.time() - now, 'duration': time.time() - now,
'trigger_line_numbers': trigger_line_numbers, 'trigger_line_numbers': trigger_line_numbers,
'ignore_line_numbers': ignore_line_numbers,
} }
) )

@ -7,7 +7,7 @@ import re
import urllib3 import urllib3
from changedetectionio.processors import difference_detection_processor from changedetectionio.processors import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger from loguru import logger
@ -36,7 +36,6 @@ class PDFToHTMLToolNotFound(ValueError):
class perform_site_check(difference_detection_processor): class perform_site_check(difference_detection_processor):
def run_changedetection(self, watch, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
changed_detected = False changed_detected = False
html_content = "" html_content = ""
screenshot = False # as bytes screenshot = False # as bytes
@ -205,18 +204,9 @@ class perform_site_check(difference_detection_processor):
if watch.get('trim_text_whitespace'): if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()) stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
# Re #340 - return the content before the 'ignore text' was applied # Re #340 - return the content before the 'ignore text' was applied
# Also used to calculate/show what was removed # Also used to calculate/show what was removed
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') text_content_before_ignored_filter = stripped_text_from_html
# @todo whitespace coming from missing rtrim()? # @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
@ -236,12 +226,12 @@ class perform_site_check(difference_detection_processor):
line_feed_sep="\n", line_feed_sep="\n",
include_change_type_prefix=False) include_change_type_prefix=False)
watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter) watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8'))
if not rendered_diff and stripped_text_from_html: if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found # We had some content, but no differences were found
# Store our new file as the MD5 so it will trigger in the future # Store our new file as the MD5 so it will trigger in the future
c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest() c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
else: else:
stripped_text_from_html = rendered_diff stripped_text_from_html = rendered_diff
@ -262,14 +252,6 @@ class perform_site_check(difference_detection_processor):
update_obj["last_check_status"] = self.fetcher.get_last_status_code() update_obj["last_check_status"] = self.fetcher.get_last_status_code()
# If there's text to skip
# @todo we could abstract out the get_text() to handle this cleaner
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
if len(text_to_ignore):
stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
else:
stripped_text_from_html = stripped_text_from_html.encode('utf8')
# 615 Extract text by regex # 615 Extract text by regex
extract_text = watch.get('extract_text', []) extract_text = watch.get('extract_text', [])
if len(extract_text) > 0: if len(extract_text) > 0:
@ -278,39 +260,53 @@ class perform_site_check(difference_detection_processor):
# incase they specified something in '/.../x' # incase they specified something in '/.../x'
if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE): if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
result = re.findall(regex.encode('utf-8'), stripped_text_from_html) result = re.findall(regex, stripped_text_from_html)
for l in result: for l in result:
if type(l) is tuple: if type(l) is tuple:
# @todo - some formatter option default (between groups) # @todo - some formatter option default (between groups)
regex_matched_output += list(l) + [b'\n'] regex_matched_output += list(l) + ['\n']
else: else:
# @todo - some formatter option default (between each ungrouped result) # @todo - some formatter option default (between each ungrouped result)
regex_matched_output += [l] + [b'\n'] regex_matched_output += [l] + ['\n']
else: else:
# Doesnt look like regex, just hunt for plaintext and return that which matches # Doesnt look like regex, just hunt for plaintext and return that which matches
# `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE) r = re.compile(re.escape(s_re), re.IGNORECASE)
res = r.findall(stripped_text_from_html) res = r.findall(stripped_text_from_html)
if res: if res:
for match in res: for match in res:
regex_matched_output += [match] + [b'\n'] regex_matched_output += [match] + ['\n']
########################################################## ##########################################################
stripped_text_from_html = b'' stripped_text_from_html = ''
text_content_before_ignored_filter = b''
if regex_matched_output: if regex_matched_output:
# @todo some formatter for presentation? # @todo some formatter for presentation?
stripped_text_from_html = b''.join(regex_matched_output) stripped_text_from_html = ''.join(regex_matched_output)
text_content_before_ignored_filter = stripped_text_from_html
if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
### CALCULATE MD5
# If there's text to ignore
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
text_for_checksuming = stripped_text_from_html
if text_to_ignore:
text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
# Re #133 - if we should strip whitespaces from triggering the change detected comparison # Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False): if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
else: else:
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()
############ Blocking rules, after checksum ################# ############ Blocking rules, after checksum #################
blocked = False blocked = False
@ -350,7 +346,13 @@ class perform_site_check(difference_detection_processor):
if changed_detected: if changed_detected:
if watch.get('check_unique_lines', False): if watch.get('check_unique_lines', False):
has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace')
has_unique_lines = watch.lines_contain_something_unique_compared_to_history(
lines=stripped_text_from_html.splitlines(),
ignore_whitespace=ignore_whitespace
)
# One or more lines? unsure? # One or more lines? unsure?
if not has_unique_lines: if not has_unique_lines:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False") logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")

@ -42,8 +42,12 @@ function request_textpreview_update() {
{ {
'color': '#ee0000', 'color': '#ee0000',
'lines': data['trigger_line_numbers'] 'lines': data['trigger_line_numbers']
},
{
'color': '#757575',
'lines': data['ignore_line_numbers']
} }
]); ])
}).fail(function (error) { }).fail(function (error) {
if (error.statusText === 'abort') { if (error.statusText === 'abort') {
console.log('Request was aborted due to a new request being fired.'); console.log('Request was aborted due to a new request being fired.');
@ -76,8 +80,8 @@ $(document).ready(function () {
$('body').toggleClass('preview-text-enabled') $('body').toggleClass('preview-text-enabled')
request_textpreview_update(); request_textpreview_update();
const method = $('body').hasClass('preview-text-enabled') ? 'on' : 'off'; const method = $('body').hasClass('preview-text-enabled') ? 'on' : 'off';
$('textarea:visible')[method]('keyup blur', request_textpreview_update.throttle(1000)); $('#filters-and-triggers textarea')[method]('blur', request_textpreview_update.throttle(1000));
$('input:visible')[method]('keyup blur change', request_textpreview_update.throttle(1000)); $('#filters-and-triggers input')[method]('change', request_textpreview_update.throttle(1000));
$("#filters-and-triggers-tab")[method]('click', request_textpreview_update.throttle(1000)); $("#filters-and-triggers-tab")[method]('click', request_textpreview_update.throttle(1000));
}); });
$('.minitabs-wrapper').miniTabs({ $('.minitabs-wrapper').miniTabs({

@ -4,6 +4,7 @@ from flask import (
flash flash
) )
from .html_tools import TRANSLATE_WHITESPACE_TABLE
from . model import App, Watch from . model import App, Watch
from copy import deepcopy, copy from copy import deepcopy, copy
from os import path, unlink from os import path, unlink
@ -750,17 +751,17 @@ class ChangeDetectionStore:
def update_5(self): def update_5(self):
# If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings # If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings
# In other words - the watch notification_title and notification_body are not needed if they are the same as the default one # In other words - the watch notification_title and notification_body are not needed if they are the same as the default one
current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) current_system_title = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
for uuid, watch in self.data['watching'].items(): for uuid, watch in self.data['watching'].items():
try: try:
watch_body = watch.get('notification_body', '') watch_body = watch.get('notification_body', '')
if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body: if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body:
# Looks the same as the default one, so unset it # Looks the same as the default one, so unset it
watch['notification_body'] = None watch['notification_body'] = None
watch_title = watch.get('notification_title', '') watch_title = watch.get('notification_title', '')
if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title: if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title:
# Looks the same as the default one, so unset it # Looks the same as the default one, so unset it
watch['notification_title'] = None watch['notification_title'] = None
except Exception as e: except Exception as e:

@ -26,7 +26,6 @@
</script> </script>
<script src="{{url_for('static_content', group='js', filename='plugins.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='plugins.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
{% if playwright_enabled %} {% if playwright_enabled %}
@ -330,9 +329,9 @@ nav
{{ render_checkbox_field(form.filter_text_added) }} {{ render_checkbox_field(form.filter_text_added) }}
{{ render_checkbox_field(form.filter_text_replaced) }} {{ render_checkbox_field(form.filter_text_replaced) }}
{{ render_checkbox_field(form.filter_text_removed) }} {{ render_checkbox_field(form.filter_text_removed) }}
<span class="pure-form-message-inline">Note: Depending on the length and similarity of the text on each line, the algorithm may consider an <strong>addition</strong> instead of <strong>replacement</strong> for example.</span> <span class="pure-form-message-inline">Note: Depending on the length and similarity of the text on each line, the algorithm may consider an <strong>addition</strong> instead of <strong>replacement</strong> for example.</span><br>
<span class="pure-form-message-inline">So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br> <span class="pure-form-message-inline">&nbsp;So it's always better to select <strong>Added</strong>+<strong>Replaced</strong> when you're interested in new content.</span><br>
<span class="pure-form-message-inline">When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span> <span class="pure-form-message-inline">&nbsp;When content is merely moved in a list, it will also trigger an <strong>addition</strong>, consider enabling <code><strong>Only trigger when unique lines appear</strong></code></span>
</fieldset> </fieldset>
<fieldset class="pure-control-group"> <fieldset class="pure-control-group">
{{ render_checkbox_field(form.check_unique_lines) }} {{ render_checkbox_field(form.check_unique_lines) }}
@ -371,7 +370,7 @@ nav
") }} ") }}
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li>Matching text will be <strong>removed</strong> from the text snapshot</li> <li>Matching text will be <strong>ignored</strong> in the text snapshot (you can still see it but it wont trigger a change)</li>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li> <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li> <li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>
<li>Changing this will affect the comparison checksum which may trigger an alert</li> <li>Changing this will affect the comparison checksum which may trigger an alert</li>

@ -172,7 +172,7 @@ nav
<span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br> <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br>
<span class="pure-form-message-inline"> <span class="pure-form-message-inline">
<ul> <ul>
<li>Matching text will be <strong>removed</strong> from the text snapshot</li> <li>Matching text will be <strong>ignored</strong> in the text snapshot (you can still see it but it wont trigger a change)</li>
<li>Note: This is applied globally in addition to the per-watch rules.</li> <li>Note: This is applied globally in addition to the per-watch rules.</li>
<li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li> <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
<li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li> <li>Regular Expression support, wrap the entire line in forward slash <code>/regex/</code></li>

@ -71,7 +71,7 @@ def test_setup(client, live_server, measure_memory_usage):
live_server_setup(live_server) live_server_setup(live_server)
def test_check_filter_multiline(client, live_server, measure_memory_usage): def test_check_filter_multiline(client, live_server, measure_memory_usage):
#live_server_setup(live_server) # live_server_setup(live_server)
set_multiline_response() set_multiline_response()
# Add our URL to the import page # Add our URL to the import page

@ -33,13 +33,17 @@ def test_strip_regex_text_func():
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
assert b"but 1 lines" in stripped_content assert "but 1 lines" in stripped_content
assert b"igNORe-cAse text" not in stripped_content assert "igNORe-cAse text" not in stripped_content
assert b"but 1234 lines" not in stripped_content assert "but 1234 lines" not in stripped_content
assert b"really" not in stripped_content assert "really" not in stripped_content
assert b"not this" not in stripped_content assert "not this" not in stripped_content
# Check line number reporting # Check line number reporting
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers") stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
assert stripped_content == [2, 5, 6, 7, 8, 10] assert stripped_content == [2, 5, 6, 7, 8, 10]
# Check that linefeeds are preserved when there are no matching ignores
content = "some text\n\nand other text\n"
stripped_content = html_tools.strip_ignore_text(content, ignore_lines)
assert content == stripped_content

@ -22,10 +22,15 @@ def test_strip_text_func():
ignore_lines = ["sometimes"] ignore_lines = ["sometimes"]
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
assert "sometimes" not in stripped_content
assert "Some content" in stripped_content
assert b"sometimes" not in stripped_content # Check that line feeds don't get chewed up when something is found
assert b"Some content" in stripped_content test_content = "Some initial text\n\nWhich is across multiple lines\n\nZZZZz\n\n\nSo let's see what happens."
ignore = ['something irrelevent but just to check', 'XXXXX', 'YYYYY', 'ZZZZZ']
stripped_content = html_tools.strip_ignore_text(test_content, ignore)
assert stripped_content == "Some initial text\n\nWhich is across multiple lines\n\n\n\nSo let's see what happens."
def set_original_ignore_response(): def set_original_ignore_response():
test_return_data = """<html> test_return_data = """<html>
@ -141,8 +146,6 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa
# Just to be sure.. set a regular modified change.. # Just to be sure.. set a regular modified change..
set_modified_original_ignore_response() set_modified_original_ignore_response()
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
@ -153,17 +156,17 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa
res = client.get(url_for("preview_page", uuid="first")) res = client.get(url_for("preview_page", uuid="first"))
# Should no longer be in the preview # SHOULD be in the preview, it was added in set_modified_original_ignore_response()
assert b'new ignore stuff' not in res.data # and we have "new ignore stuff" in ignore_text
# it is only ignored, it is not removed (it will be highlighted too)
assert b'new ignore stuff' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data assert b'Deleted' in res.data
# When adding some ignore text, it should not trigger a change, even if something else on that line changes
def test_check_global_ignore_text_functionality(client, live_server, measure_memory_usage): def test_check_global_ignore_text_functionality(client, live_server, measure_memory_usage):
#live_server_setup(live_server)
# Give the endpoint time to spin up
time.sleep(1)
ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ" ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ"
set_original_ignore_response() set_original_ignore_response()
@ -172,6 +175,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
url_for("settings_page"), url_for("settings_page"),
data={ data={
"requests-time_between_check-minutes": 180, "requests-time_between_check-minutes": 180,
"application-ignore_whitespace": "y",
"application-global_ignore_text": ignore_text, "application-global_ignore_text": ignore_text,
'application-fetch_backend': "html_requests" 'application-fetch_backend': "html_requests"
}, },
@ -192,9 +196,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
# Give the thread time to pick it up # Give the thread time to pick it up
wait_for_all_checks(client) wait_for_all_checks(client)
#Adding some ignore text should not trigger a change
# Goto the edit page of the item, add our ignore text
# Add our URL to the import page
res = client.post( res = client.post(
url_for("edit_page", uuid="first"), url_for("edit_page", uuid="first"),
data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"}, data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"},
@ -210,20 +212,15 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
# Trigger a check # Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True) client.get(url_for("form_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
wait_for_all_checks(client) wait_for_all_checks(client)
# It should report nothing found (no new 'unviewed' class), adding random ignore text should not cause a change
# so that we are sure everything is viewed and in a known 'nothing changed' state
res = client.get(url_for("diff_history_page", uuid="first"))
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' not in res.data assert b'unviewed' not in res.data
assert b'/test-endpoint' in res.data assert b'/test-endpoint' in res.data
#####
# Make a change which includes the ignore text, it should be ignored and no 'change' triggered
# Make a change which includes the ignore text # It adds text with "ZZZZzzzz" and "ZZZZ" is in the ignore list
set_modified_ignore_response() set_modified_ignore_response()
# Trigger a check # Trigger a check
@ -233,6 +230,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem
# It should report nothing found (no new 'unviewed' class) # It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index")) res = client.get(url_for("index"))
assert b'unviewed' not in res.data assert b'unviewed' not in res.data
assert b'/test-endpoint' in res.data assert b'/test-endpoint' in res.data

@ -18,12 +18,13 @@ class TestDiffBuilder(unittest.TestCase):
watch['last_viewed'] = 110 watch['last_viewed'] = 110
watch.save_history_text(contents=b"hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4())) # Contents from the browser/requests/etc are always returned as str; a Python str is a sequence of Unicode code points (not raw bytes)
watch.save_history_text(contents=b"hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4())) watch.save_history_text(contents="hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4()))
watch.save_history_text(contents=b"hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4())) watch.save_history_text(contents="hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4()))
watch.save_history_text(contents=b"hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4())) watch.save_history_text(contents="hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4()))
watch.save_history_text(contents=b"hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4())) watch.save_history_text(contents="hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4()))
watch.save_history_text(contents=b"hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4())) watch.save_history_text(contents="hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4()))
watch.save_history_text(contents="hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4()))
p = watch.get_next_snapshot_key_to_last_viewed p = watch.get_next_snapshot_key_to_last_viewed
assert p == "112", "Correct last-viewed timestamp was detected" assert p == "112", "Correct last-viewed timestamp was detected"

@ -286,8 +286,8 @@ class update_worker(threading.Thread):
# Re #342 # Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc # We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)): # if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes") # raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e: except PermissionError as e:
logger.critical(f"File permission error updating file, watch: {uuid}") logger.critical(f"File permission error updating file, watch: {uuid}")
logger.critical(str(e)) logger.critical(str(e))

Loading…
Cancel
Save