Misc fixes - juggling utf-8 not needed

(its utf-16 by default python string)
3 months ago · 3b6ae70c9c
parent 783926962d
commit 3b6ae70c9c
8 changed files with 22 additions and 28 deletions
--- a/changedetectionio/blueprint/tags/templates/edit-tag.html
+++ b/changedetectionio/blueprint/tags/templates/edit-tag.html
@ -17,7 +17,6 @@
 </script>

 <script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
-<!--<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>-->
 <script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>

 <div class="edit-form monospaced-textarea">
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -357,7 +357,7 @@ def strip_ignore_text(content, wordlist, mode="content"):

            if not got_match:
                # Not ignored
-                output.append(line.encode('utf8'))
+                output.append(line)
            else:
                ignored_line_numbers.append(i)

@ -366,7 +366,7 @@ def strip_ignore_text(content, wordlist, mode="content"):
    if mode == "line numbers":
        return ignored_line_numbers

-    return "\n".encode('utf8').join(output)
+    return "\n".join(output)

 def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    from xml.sax.saxutils import escape as xml_escape
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -312,13 +312,13 @@ class model(watch_base):
            dest = os.path.join(self.watch_data_dir, snapshot_fname)
            if not os.path.exists(dest):
                with open(dest, 'wb') as f:
-                    f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
+                    f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT))
        else:
            snapshot_fname = f"{snapshot_id}.txt"
            dest = os.path.join(self.watch_data_dir, snapshot_fname)
            if not os.path.exists(dest):
                with open(dest, 'wb') as f:
-                    f.write(contents)
+                    f.write(contents.encode('utf-8'))

        # Append to index
        # @todo check last char was \n
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor):
        # Always record the new checksum
        update_obj["previous_md5"] = fetched_md5

-        return changed_detected, update_obj, snapshot_content.encode('utf-8').strip()
+        return changed_detected, update_obj, snapshot_content.strip()
--- a/changedetectionio/processors/text_json_diff/init.py
+++ b/changedetectionio/processors/text_json_diff/init.py
@ -72,7 +72,7 @@ def prepare_filter_prevew(datastore, watch_uuid):
                                                                 )
            # Use the last loaded HTML as the input
            update_handler.datastore = datastore
-            update_handler.fetcher.content = decompressed_data
+            update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string
            update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')

            # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@ -205,18 +205,9 @@ class perform_site_check(difference_detection_processor):
        if watch.get('trim_text_whitespace'):
            stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())

-        if watch.get('remove_duplicate_lines'):
-            stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))
-
-        if watch.get('sort_text_alphabetically'):
-            # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
-            # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
-            stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
-            stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
-
        # Re #340 - return the content before the 'ignore text' was applied
        # Also used to calculate/show what was removed
-        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
+        text_content_before_ignored_filter = stripped_text_from_html

        # @todo whitespace coming from missing rtrim()?
        # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
@ -235,12 +226,12 @@ class perform_site_check(difference_detection_processor):
                                             line_feed_sep="\n",
                                             include_change_type_prefix=False)

-            watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter)
+            watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8'))

            if not rendered_diff and stripped_text_from_html:
                # We had some content, but no differences were found
                # Store our new file as the MD5 so it will trigger in the future
-                c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest()
+                c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
                return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
            else:
                stripped_text_from_html = rendered_diff
@ -264,10 +255,8 @@ class perform_site_check(difference_detection_processor):
        # If there's text to skip
        # @todo we could abstract out the get_text() to handle this cleaner
        text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', [])
-        if len(text_to_ignore):
+        if text_to_ignore:
            stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore)
-        else:
-            stripped_text_from_html = stripped_text_from_html.encode('utf8')

        # 615 Extract text by regex
        extract_text = watch.get('extract_text', [])
@ -301,15 +290,22 @@ class perform_site_check(difference_detection_processor):
            if regex_matched_output:
                # @todo some formatter for presentation?
                stripped_text_from_html = b''.join(regex_matched_output)
-                text_content_before_ignored_filter = stripped_text_from_html

+        if watch.get('remove_duplicate_lines'):
+            stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()))


+        if watch.get('sort_text_alphabetically'):
+            # Note: Because a <p>something</p> will add an extra line feed to signify the paragraph gap
+            # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here.
+            stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n")
+            stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower()))
+
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
-            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
+            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
        else:
-            fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
+            fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest()

        ############ Blocking rules, after checksum #################
        blocked = False
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -26,7 +26,6 @@
 </script>
 <script src="{{url_for('static_content', group='js', filename='plugins.js')}}" defer></script>
 <script src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
-<script src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
 <script src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
 <script src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
 {% if playwright_enabled %}
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@ -286,8 +286,8 @@ class update_worker(threading.Thread):
                        # Re #342
                        # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
                        # We then convert/.decode('utf-8') for the notification etc
-                        if not isinstance(contents, (bytes, bytearray)):
-                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
+#                        if not isinstance(contents, (bytes, bytearray)):
+#                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
                    except PermissionError as e:
                        logger.critical(f"File permission error updating file, watch: {uuid}")
                        logger.critical(str(e))