Misc fixes - juggling utf-8 not needed

(its utf-16 by default python string)
refactor-filters
dgtlmoon 3 months ago
parent 783926962d
commit 3b6ae70c9c

@ -17,7 +17,6 @@
</script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<!--<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>-->
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<div class="edit-form monospaced-textarea">

@ -357,7 +357,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
if not got_match:
# Not ignored
output.append(line.encode('utf8'))
output.append(line)
else:
ignored_line_numbers.append(i)
@ -366,7 +366,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
if mode == "line numbers":
return ignored_line_numbers
return "\n".encode('utf8').join(output)
return "\n".join(output)
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
from xml.sax.saxutils import escape as xml_escape

@ -312,13 +312,13 @@ class model(watch_base):
dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest):
with open(dest, 'wb') as f:
f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT))
else:
snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest):
with open(dest, 'wb') as f:
f.write(contents)
f.write(contents.encode('utf-8'))
# Append to index
# @todo check last char was \n

@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor):
# Always record the new checksum
update_obj["previous_md5"] = fetched_md5
return changed_detected, update_obj, snapshot_content.encode('utf-8').strip()
return changed_detected, update_obj, snapshot_content.strip()

@ -72,7 +72,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
)
# Use the last loaded HTML as the input
update_handler.datastore = datastore
update_handler.fetcher.content = decompressed_data
update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string
update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')
# Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk

@ -205,18 +205,9 @@ class perform_site_check(difference_detection_processor):
if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
# Re #340 - return the content before the 'ignore text' was applied
# Also used to calculate/show what was removed
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
text_content_before_ignored_filter = stripped_text_from_html
# @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
@ -235,12 +226,12 @@ class perform_site_check(difference_detection_processor):
line_feed_sep="\n",
include_change_type_prefix=False)
watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter)
watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8'))
if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found
# Store our new file as the MD5 so it will trigger in the future
c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest()
c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
else:
stripped_text_from_html = rendered_diff
@ -264,10 +255,8 @@ class perform_site_check(difference_detection_processor):
# If there's text to skip
# @todo we could abstract out the get_text() to handle this cleaner
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
if len(text_to_ignore):
if text_to_ignore:
stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
else:
stripped_text_from_html = stripped_text_from_html.encode('utf8')
# 615 Extract text by regex
extract_text = watch.get('extract_text', [])
@ -301,15 +290,22 @@ class perform_site_check(difference_detection_processor):
if regex_matched_output:
# @todo some formatter for presentation?
stripped_text_from_html = b''.join(regex_matched_output)
text_content_before_ignored_filter = stripped_text_from_html
if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
else:
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest()
############ Blocking rules, after checksum #################
blocked = False

@ -26,7 +26,6 @@
</script>
<script src="{{url_for('static_content', group='js', filename='plugins.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
{% if playwright_enabled %}

@ -286,8 +286,8 @@ class update_worker(threading.Thread):
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
# if not isinstance(contents, (bytes, bytearray)):
# raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
logger.critical(f"File permission error updating file, watch: {uuid}")
logger.critical(str(e))

Loading…
Cancel
Save