From 3b6ae70c9c57b45ed4d52a0a7ad3853ad2c6c50a Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 9 Oct 2024 13:11:04 +0200 Subject: [PATCH] Misc fixes - juggling utf-8 not needed (its utf-16 by default python string) --- .../blueprint/tags/templates/edit-tag.html | 1 - changedetectionio/html_tools.py | 4 +-- changedetectionio/model/Watch.py | 4 +-- .../processors/restock_diff/processor.py | 2 +- .../processors/text_json_diff/__init__.py | 2 +- .../processors/text_json_diff/processor.py | 32 ++++++++----------- changedetectionio/templates/edit.html | 1 - changedetectionio/update_worker.py | 4 +-- 8 files changed, 22 insertions(+), 28 deletions(-) diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html index a713cf6a..e527ea52 100644 --- a/changedetectionio/blueprint/tags/templates/edit-tag.html +++ b/changedetectionio/blueprint/tags/templates/edit-tag.html @@ -17,7 +17,6 @@ -
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 7c2e1eba..646d71a1 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -357,7 +357,7 @@ def strip_ignore_text(content, wordlist, mode="content"): if not got_match: # Not ignored - output.append(line.encode('utf8')) + output.append(line) else: ignored_line_numbers.append(i) @@ -366,7 +366,7 @@ def strip_ignore_text(content, wordlist, mode="content"): if mode == "line numbers": return ignored_line_numbers - return "\n".encode('utf8').join(output) + return "\n".join(output) def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: from xml.sax.saxutils import escape as xml_escape diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index b52d37fb..c6d71854 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -312,13 +312,13 @@ class model(watch_base): dest = os.path.join(self.watch_data_dir, snapshot_fname) if not os.path.exists(dest): with open(dest, 'wb') as f: - f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) + f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT)) else: snapshot_fname = f"{snapshot_id}.txt" dest = os.path.join(self.watch_data_dir, snapshot_fname) if not os.path.exists(dest): with open(dest, 'wb') as f: - f.write(contents) + f.write(contents.encode('utf-8')) # Append to index # @todo check last char was \n diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 0f490221..911e1838 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor): # Always record the new checksum update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, snapshot_content.encode('utf-8').strip() + 
return changed_detected, update_obj, snapshot_content.strip() diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py index f87aa350..ed0746f2 100644 --- a/changedetectionio/processors/text_json_diff/__init__.py +++ b/changedetectionio/processors/text_json_diff/__init__.py @@ -72,7 +72,7 @@ def prepare_filter_prevew(datastore, watch_uuid): ) # Use the last loaded HTML as the input update_handler.datastore = datastore - update_handler.fetcher.content = decompressed_data + update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index a35724b5..7dad3dc7 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -205,18 +205,9 @@ class perform_site_check(difference_detection_processor): if watch.get('trim_text_whitespace'): stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()) - if watch.get('remove_duplicate_lines'): - stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())) - - if watch.get('sort_text_alphabetically'): - # Note: Because a

<p>something</p>

will add an extra line feed to signify the paragraph gap - # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. - stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") - stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) - # Re #340 - return the content before the 'ignore text' was applied # Also used to calculate/show what was removed - text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') + text_content_before_ignored_filter = stripped_text_from_html # @todo whitespace coming from missing rtrim()? # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. @@ -235,12 +226,12 @@ class perform_site_check(difference_detection_processor): line_feed_sep="\n", include_change_type_prefix=False) - watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter) + watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8')) if not rendered_diff and stripped_text_from_html: # We had some content, but no differences were found # Store our new file as the MD5 so it will trigger in the future - c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest() + c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest() return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') else: stripped_text_from_html = rendered_diff @@ -264,10 +255,8 @@ class perform_site_check(difference_detection_processor): # If there's text to skip # @todo we could abstract out the get_text() to handle this cleaner text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) - if len(text_to_ignore): + if text_to_ignore: stripped_text_from_html = 
html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore) - else: - stripped_text_from_html = stripped_text_from_html.encode('utf8') # 615 Extract text by regex extract_text = watch.get('extract_text', []) @@ -301,15 +290,22 @@ class perform_site_check(difference_detection_processor): if regex_matched_output: # @todo some formatter for presentation? stripped_text_from_html = b''.join(regex_matched_output) - text_content_before_ignored_filter = stripped_text_from_html + if watch.get('remove_duplicate_lines'): + stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())) + if watch.get('sort_text_alphabetically'): + # Note: Because a

<p>something</p>

will add an extra line feed to signify the paragraph gap + # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. + stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") + stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) + # Re #133 - if we should strip whitespaces from triggering the change detected comparison if self.datastore.data['settings']['application'].get('ignore_whitespace', False): - fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() + fetched_md5 = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest() else: - fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() + fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest() ############ Blocking rules, after checksum ################# blocked = False diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 9d9f48ff..1f7d363f 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -26,7 +26,6 @@ - {% if playwright_enabled %} diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 97e1ec27..ebb3ada7 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -286,8 +286,8 @@ class update_worker(threading.Thread): # Re #342 # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. 
# We then convert/.decode('utf-8') for the notification etc - if not isinstance(contents, (bytes, bytearray)): - raise Exception("Error - returned data from the fetch handler SHOULD be bytes") +# if not isinstance(contents, (bytes, bytearray)): +# raise Exception("Error - returned data from the fetch handler SHOULD be bytes") except PermissionError as e: logger.critical(f"File permission error updating file, watch: {uuid}") logger.critical(str(e))