Misc fixes - juggling utf-8 not needed

(Python 3 strings are Unicode natively, so manual utf-8 encode/decode is unnecessary until the I/O boundary)
refactor-filters
dgtlmoon 3 months ago
parent 783926962d
commit 3b6ae70c9c

@ -17,7 +17,6 @@
</script> </script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<!--<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>-->
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<div class="edit-form monospaced-textarea"> <div class="edit-form monospaced-textarea">

@ -357,7 +357,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
if not got_match: if not got_match:
# Not ignored # Not ignored
output.append(line.encode('utf8')) output.append(line)
else: else:
ignored_line_numbers.append(i) ignored_line_numbers.append(i)
@ -366,7 +366,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
if mode == "line numbers": if mode == "line numbers":
return ignored_line_numbers return ignored_line_numbers
return "\n".encode('utf8').join(output) return "\n".join(output)
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
from xml.sax.saxutils import escape as xml_escape from xml.sax.saxutils import escape as xml_escape

@ -312,13 +312,13 @@ class model(watch_base):
dest = os.path.join(self.watch_data_dir, snapshot_fname) dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest): if not os.path.exists(dest):
with open(dest, 'wb') as f: with open(dest, 'wb') as f:
f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT))
else: else:
snapshot_fname = f"{snapshot_id}.txt" snapshot_fname = f"{snapshot_id}.txt"
dest = os.path.join(self.watch_data_dir, snapshot_fname) dest = os.path.join(self.watch_data_dir, snapshot_fname)
if not os.path.exists(dest): if not os.path.exists(dest):
with open(dest, 'wb') as f: with open(dest, 'wb') as f:
f.write(contents) f.write(contents.encode('utf-8'))
# Append to index # Append to index
# @todo check last char was \n # @todo check last char was \n

@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor):
# Always record the new checksum # Always record the new checksum
update_obj["previous_md5"] = fetched_md5 update_obj["previous_md5"] = fetched_md5
return changed_detected, update_obj, snapshot_content.encode('utf-8').strip() return changed_detected, update_obj, snapshot_content.strip()

@ -72,7 +72,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
) )
# Use the last loaded HTML as the input # Use the last loaded HTML as the input
update_handler.datastore = datastore update_handler.datastore = datastore
update_handler.fetcher.content = decompressed_data update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string
update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')
# Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk

@ -205,18 +205,9 @@ class perform_site_check(difference_detection_processor):
if watch.get('trim_text_whitespace'): if watch.get('trim_text_whitespace'):
stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()) stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
# Re #340 - return the content before the 'ignore text' was applied # Re #340 - return the content before the 'ignore text' was applied
# Also used to calculate/show what was removed # Also used to calculate/show what was removed
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') text_content_before_ignored_filter = stripped_text_from_html
# @todo whitespace coming from missing rtrim()? # @todo whitespace coming from missing rtrim()?
# stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
@ -235,12 +226,12 @@ class perform_site_check(difference_detection_processor):
line_feed_sep="\n", line_feed_sep="\n",
include_change_type_prefix=False) include_change_type_prefix=False)
watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter) watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8'))
if not rendered_diff and stripped_text_from_html: if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found # We had some content, but no differences were found
# Store our new file as the MD5 so it will trigger in the future # Store our new file as the MD5 so it will trigger in the future
c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest() c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
else: else:
stripped_text_from_html = rendered_diff stripped_text_from_html = rendered_diff
@ -264,10 +255,8 @@ class perform_site_check(difference_detection_processor):
# If there's text to skip # If there's text to skip
# @todo we could abstract out the get_text() to handle this cleaner # @todo we could abstract out the get_text() to handle this cleaner
text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
if len(text_to_ignore): if text_to_ignore:
stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore) stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
else:
stripped_text_from_html = stripped_text_from_html.encode('utf8')
# 615 Extract text by regex # 615 Extract text by regex
extract_text = watch.get('extract_text', []) extract_text = watch.get('extract_text', [])
@ -301,15 +290,22 @@ class perform_site_check(difference_detection_processor):
if regex_matched_output: if regex_matched_output:
# @todo some formatter for presentation? # @todo some formatter for presentation?
stripped_text_from_html = b''.join(regex_matched_output) stripped_text_from_html = b''.join(regex_matched_output)
text_content_before_ignored_filter = stripped_text_from_html
if watch.get('remove_duplicate_lines'):
stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
if watch.get('sort_text_alphabetically'):
# Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
# we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
# Re #133 - if we should strip whitespaces from triggering the change detected comparison # Re #133 - if we should strip whitespaces from triggering the change detected comparison
if self.datastore.data['settings']['application'].get('ignore_whitespace', False): if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() fetched_md5 = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
else: else:
fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest()
############ Blocking rules, after checksum ################# ############ Blocking rules, after checksum #################
blocked = False blocked = False

@ -26,7 +26,6 @@
</script> </script>
<script src="{{url_for('static_content', group='js', filename='plugins.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='plugins.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
<script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script> <script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
{% if playwright_enabled %} {% if playwright_enabled %}

@ -286,8 +286,8 @@ class update_worker(threading.Thread):
# Re #342 # Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc # We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)): # if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes") # raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e: except PermissionError as e:
logger.critical(f"File permission error updating file, watch: {uuid}") logger.critical(f"File permission error updating file, watch: {uuid}")
logger.critical(str(e)) logger.critical(str(e))

Loading…
Cancel
Save