From 5a768d7db38a46183947c0a8cd4ad9a382b1f42e Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 10 Oct 2024 14:59:39 +0200 Subject: [PATCH] UTF-8 handling fixes, Improvements to whitespace filtering (#2691) --- .../blueprint/tags/templates/edit-tag.html | 1 - changedetectionio/forms.py | 2 +- changedetectionio/html_tools.py | 35 +++++---- changedetectionio/model/Watch.py | 30 ++++++-- .../processors/restock_diff/processor.py | 2 +- .../processors/text_json_diff/__init__.py | 17 ++++- .../processors/text_json_diff/processor.py | 72 ++++++++++--------- changedetectionio/static/js/watch-settings.js | 10 ++- changedetectionio/store.py | 9 +-- changedetectionio/templates/edit.html | 9 ++- changedetectionio/templates/settings.html | 2 +- changedetectionio/tests/test_extract_regex.py | 2 +- .../tests/test_ignore_regex_text.py | 14 ++-- changedetectionio/tests/test_ignore_text.py | 42 ++++++----- .../tests/unit/test_watch_model.py | 13 ++-- changedetectionio/update_worker.py | 4 +- 16 files changed, 151 insertions(+), 113 deletions(-) diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html index a713cf6a..e527ea52 100644 --- a/changedetectionio/blueprint/tags/templates/edit-tag.html +++ b/changedetectionio/blueprint/tags/templates/edit-tag.html @@ -17,7 +17,6 @@ -
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 28fdfeb9..19056b5d 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -476,7 +476,7 @@ class processor_text_json_diff_form(commonSettingsForm): title = StringField('Title', default='') - ignore_text = StringListField('Remove lines containing', [ValidateListRegex()]) + ignore_text = StringListField('Ignore lines containing', [ValidateListRegex()]) headers = StringDictKeyValue('Request headers') body = TextAreaField('Request body', [validators.Optional()]) method = SelectField('Request method', choices=valid_method, default=default_method) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 7c2e1eba..6e4ebca0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -3,11 +3,11 @@ from lxml import etree import json import re - # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" - +TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ') PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' + # 'price' , 'lowPrice', 'highPrice' are usually under here # All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] @@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None # - "line numbers" return a list of line numbers that match (int list) # # wordlist - list of regex's (str) or words (str) +# Preserves all linefeeds and other whitespacing, its not the job of this to remove that def strip_ignore_text(content, wordlist, mode="content"): i = 0 output = [] @@ -341,32 +342,30 @@ def strip_ignore_text(content, wordlist, mode="content"): else: ignore_text.append(k.strip()) - for line in content.splitlines(): + for line in content.splitlines(keepends=True): i += 1 # Always ignore blank lines in this mode. (when this function gets called) got_match = False - if len(line.strip()): - for l in ignore_text: - if l.lower() in line.lower(): - got_match = True + for l in ignore_text: + if l.lower() in line.lower(): + got_match = True - if not got_match: - for r in ignore_regex: - if r.search(line): - got_match = True - - if not got_match: - # Not ignored - output.append(line.encode('utf8')) - else: - ignored_line_numbers.append(i) + if not got_match: + for r in ignore_regex: + if r.search(line): + got_match = True + if not got_match: + # Not ignored, and should preserve "keepends" + output.append(line) + else: + ignored_line_numbers.append(i) # Used for finding out what to highlight if mode == "line numbers": return ignored_line_numbers - return "\n".encode('utf8').join(output) + return ''.join(output) def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: from xml.sax.saxutils import escape as xml_escape diff --git a/changedetectionio/model/Watch.py 
b/changedetectionio/model/Watch.py index b52d37fb..a2e38ce1 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -6,6 +6,8 @@ import re from pathlib import Path from loguru import logger +from ..html_tools import TRANSLATE_WHITESPACE_TABLE + # Allowable protocols, protects against javascript: etc # file:// is further checked by ALLOW_FILE_URI SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):' @@ -312,13 +314,13 @@ class model(watch_base): dest = os.path.join(self.watch_data_dir, snapshot_fname) if not os.path.exists(dest): with open(dest, 'wb') as f: - f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) + f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT)) else: snapshot_fname = f"{snapshot_id}.txt" dest = os.path.join(self.watch_data_dir, snapshot_fname) if not os.path.exists(dest): with open(dest, 'wb') as f: - f.write(contents) + f.write(contents.encode('utf-8')) # Append to index # @todo check last char was \n @@ -350,14 +352,32 @@ class model(watch_base): return seconds # Iterate over all history texts and see if something new exists - def lines_contain_something_unique_compared_to_history(self, lines: list): - local_lines = set([l.decode('utf-8').strip().lower() for l in lines]) + # Always applying .strip() to start/end but optionally replace any other whitespace + def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False): + local_lines = [] + if lines: + if ignore_whitespace: + if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk + local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines]) + else: + local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines]) + else: + if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk + local_lines = set([l.strip().lower() for l in lines]) + else: + local_lines = 
set([l.decode('utf-8').strip().lower() for l in lines]) + # Compare each lines (set) against each history text file (set) looking for something new.. existing_history = set({}) for k, v in self.history.items(): content = self.get_history_snapshot(k) - alist = set([line.strip().lower() for line in content.splitlines()]) + + if ignore_whitespace: + alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()]) + else: + alist = set([line.strip().lower() for line in content.splitlines()]) + existing_history = existing_history.union(alist) # Check that everything in local_lines(new stuff) already exists in existing_history - it should diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 0f490221..911e1838 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor): # Always record the new checksum update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, snapshot_content.encode('utf-8').strip() + return changed_detected, update_obj, snapshot_content.strip() diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py index f87aa350..6a5efad9 100644 --- a/changedetectionio/processors/text_json_diff/__init__.py +++ b/changedetectionio/processors/text_json_diff/__init__.py @@ -46,6 +46,9 @@ def prepare_filter_prevew(datastore, watch_uuid): text_after_filter = '' text_before_filter = '' + trigger_line_numbers = [] + ignore_line_numbers = [] + tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid)) if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir): @@ -72,7 +75,7 @@ def prepare_filter_prevew(datastore, watch_uuid): ) # Use the last loaded HTML as the input update_handler.datastore = datastore - 
update_handler.fetcher.content = decompressed_data + update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk @@ -84,9 +87,7 @@ def prepare_filter_prevew(datastore, watch_uuid): text_after_filter = future1.result() text_before_filter = future2.result() - trigger_line_numbers = [] try: - trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, wordlist=tmp_watch['trigger_text'], mode='line numbers' @@ -94,6 +95,15 @@ def prepare_filter_prevew(datastore, watch_uuid): except Exception as e: text_before_filter = f"Error: {str(e)}" + try: + text_to_ignore = tmp_watch.get('ignore_text', []) + datastore.data['settings']['application'].get('global_ignore_text', []) + ignore_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, + wordlist=text_to_ignore, + mode='line numbers' + ) + except Exception as e: + text_before_filter = f"Error: {str(e)}" + logger.trace(f"Parsed in {time.time() - now:.3f}s") return jsonify( @@ -102,6 +112,7 @@ def prepare_filter_prevew(datastore, watch_uuid): 'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter, 'duration': time.time() - now, 'trigger_line_numbers': trigger_line_numbers, + 'ignore_line_numbers': ignore_line_numbers, } ) diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 43feb05f..c3752956 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -7,7 +7,7 @@ import re import urllib3 from changedetectionio.processors import difference_detection_processor -from changedetectionio.html_tools import 
PERL_STYLE_REGEX, cdata_in_document_to_text +from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE from changedetectionio import html_tools, content_fetchers from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from loguru import logger @@ -36,7 +36,6 @@ class PDFToHTMLToolNotFound(ValueError): class perform_site_check(difference_detection_processor): def run_changedetection(self, watch, skip_when_checksum_same=True): - changed_detected = False html_content = "" screenshot = False # as bytes @@ -205,18 +204,9 @@ class perform_site_check(difference_detection_processor): if watch.get('trim_text_whitespace'): stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()) - if watch.get('remove_duplicate_lines'): - stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())) - - if watch.get('sort_text_alphabetically'): - # Note: Because a

something

will add an extra line feed to signify the paragraph gap - # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. - stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") - stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) - # Re #340 - return the content before the 'ignore text' was applied # Also used to calculate/show what was removed - text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') + text_content_before_ignored_filter = stripped_text_from_html # @todo whitespace coming from missing rtrim()? # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. @@ -236,12 +226,12 @@ class perform_site_check(difference_detection_processor): line_feed_sep="\n", include_change_type_prefix=False) - watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter) + watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8')) if not rendered_diff and stripped_text_from_html: # We had some content, but no differences were found # Store our new file as the MD5 so it will trigger in the future - c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest() + c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest() return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') else: stripped_text_from_html = rendered_diff @@ -262,14 +252,6 @@ class perform_site_check(difference_detection_processor): update_obj["last_check_status"] = self.fetcher.get_last_status_code() - # If there's text to skip - # @todo we could abstract out the get_text() to handle this cleaner - text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) - if 
len(text_to_ignore): - stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore) - else: - stripped_text_from_html = stripped_text_from_html.encode('utf8') - # 615 Extract text by regex extract_text = watch.get('extract_text', []) if len(extract_text) > 0: @@ -278,39 +260,53 @@ class perform_site_check(difference_detection_processor): # incase they specified something in '/.../x' if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE): regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) - result = re.findall(regex.encode('utf-8'), stripped_text_from_html) + result = re.findall(regex, stripped_text_from_html) for l in result: if type(l) is tuple: # @todo - some formatter option default (between groups) - regex_matched_output += list(l) + [b'\n'] + regex_matched_output += list(l) + ['\n'] else: # @todo - some formatter option default (between each ungrouped result) - regex_matched_output += [l] + [b'\n'] + regex_matched_output += [l] + ['\n'] else: # Doesnt look like regex, just hunt for plaintext and return that which matches # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes - r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE) + r = re.compile(re.escape(s_re), re.IGNORECASE) res = r.findall(stripped_text_from_html) if res: for match in res: - regex_matched_output += [match] + [b'\n'] + regex_matched_output += [match] + ['\n'] ########################################################## - stripped_text_from_html = b'' - text_content_before_ignored_filter = b'' + stripped_text_from_html = '' + if regex_matched_output: # @todo some formatter for presentation? 
- stripped_text_from_html = b''.join(regex_matched_output) - text_content_before_ignored_filter = stripped_text_from_html + stripped_text_from_html = ''.join(regex_matched_output) + if watch.get('remove_duplicate_lines'): + stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())) + if watch.get('sort_text_alphabetically'): + # Note: Because a

something

will add an extra line feed to signify the paragraph gap + # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. + stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") + stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) + +### CALCULATE MD5 + # If there's text to ignore + text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) + text_for_checksuming = stripped_text_from_html + if text_to_ignore: + text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore) + # Re #133 - if we should strip whitespaces from triggering the change detected comparison - if self.datastore.data['settings']['application'].get('ignore_whitespace', False): - fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() + if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False): + fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest() else: - fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() + fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest() ############ Blocking rules, after checksum ################# blocked = False @@ -350,7 +346,13 @@ class perform_site_check(difference_detection_processor): if changed_detected: if watch.get('check_unique_lines', False): - has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) + ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace') + + has_unique_lines = watch.lines_contain_something_unique_compared_to_history( + lines=stripped_text_from_html.splitlines(), + ignore_whitespace=ignore_whitespace + ) + # One or more lines? unsure? 
if not has_unique_lines: logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False") diff --git a/changedetectionio/static/js/watch-settings.js b/changedetectionio/static/js/watch-settings.js index f3360dbe..cb9f9c60 100644 --- a/changedetectionio/static/js/watch-settings.js +++ b/changedetectionio/static/js/watch-settings.js @@ -42,8 +42,12 @@ function request_textpreview_update() { { 'color': '#ee0000', 'lines': data['trigger_line_numbers'] + }, + { + 'color': '#757575', + 'lines': data['ignore_line_numbers'] } - ]); + ]) }).fail(function (error) { if (error.statusText === 'abort') { console.log('Request was aborted due to a new request being fired.'); @@ -76,8 +80,8 @@ $(document).ready(function () { $('body').toggleClass('preview-text-enabled') request_textpreview_update(); const method = $('body').hasClass('preview-text-enabled') ? 'on' : 'off'; - $('textarea:visible')[method]('keyup blur', request_textpreview_update.throttle(1000)); - $('input:visible')[method]('keyup blur change', request_textpreview_update.throttle(1000)); + $('#filters-and-triggers textarea')[method]('blur', request_textpreview_update.throttle(1000)); + $('#filters-and-triggers input')[method]('change', request_textpreview_update.throttle(1000)); $("#filters-and-triggers-tab")[method]('click', request_textpreview_update.throttle(1000)); }); $('.minitabs-wrapper').miniTabs({ diff --git a/changedetectionio/store.py b/changedetectionio/store.py index cc1b335f..697da5bc 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -4,6 +4,7 @@ from flask import ( flash ) +from .html_tools import TRANSLATE_WHITESPACE_TABLE from . 
model import App, Watch from copy import deepcopy, copy from os import path, unlink @@ -750,17 +751,17 @@ class ChangeDetectionStore: def update_5(self): # If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings # In other words - the watch notification_title and notification_body are not needed if they are the same as the default one - current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) - current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) + current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE) + current_system_title = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE) for uuid, watch in self.data['watching'].items(): try: watch_body = watch.get('notification_body', '') - if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body: + if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body: # Looks the same as the default one, so unset it watch['notification_body'] = None watch_title = watch.get('notification_title', '') - if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title: + if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title: # Looks the same as the default one, so unset it watch['notification_title'] = None except Exception as e: diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 9d9f48ff..5847962f 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -26,7 +26,6 @@ - {% if playwright_enabled %} @@ -330,9 +329,9 @@ nav {{ render_checkbox_field(form.filter_text_added) }} {{ 
render_checkbox_field(form.filter_text_replaced) }} {{ render_checkbox_field(form.filter_text_removed) }} - Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example. - So it's always better to select Added+Replaced when you're interested in new content.
- When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear + Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example.
+  So it's always better to select Added+Replaced when you're interested in new content.
+  When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear
{{ render_checkbox_field(form.check_unique_lines) }} @@ -371,7 +370,7 @@ nav ") }}
    -
  • Matching text will be removed from the text snapshot
  • +
  • Matching text will be ignored in the text snapshot (you can still see it but it won't trigger a change)
  • Each line processed separately, any line matching will be ignored (removed before creating the checksum)
  • Regular Expression support, wrap the entire line in forward slash /regex/
  • Changing this will affect the comparison checksum which may trigger an alert
  • diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index ad41e7b6..e39c4081 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -172,7 +172,7 @@ nav Note: This is applied globally in addition to the per-watch rules.
      -
    • Matching text will be removed from the text snapshot
    • +
    • Matching text will be ignored in the text snapshot (you can still see it but it won't trigger a change)
    • Note: This is applied globally in addition to the per-watch rules.
    • Each line processed separately, any line matching will be ignored (removed before creating the checksum)
    • Regular Expression support, wrap the entire line in forward slash /regex/
    • diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index 522cff0c..058b3411 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -71,7 +71,7 @@ def test_setup(client, live_server, measure_memory_usage): live_server_setup(live_server) def test_check_filter_multiline(client, live_server, measure_memory_usage): - #live_server_setup(live_server) + # live_server_setup(live_server) set_multiline_response() # Add our URL to the import page diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 06c60ea4..34883182 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -33,13 +33,17 @@ def test_strip_regex_text_func(): stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) - assert b"but 1 lines" in stripped_content - assert b"igNORe-cAse text" not in stripped_content - assert b"but 1234 lines" not in stripped_content - assert b"really" not in stripped_content - assert b"not this" not in stripped_content + assert "but 1 lines" in stripped_content + assert "igNORe-cAse text" not in stripped_content + assert "but 1234 lines" not in stripped_content + assert "really" not in stripped_content + assert "not this" not in stripped_content # Check line number reporting stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers") assert stripped_content == [2, 5, 6, 7, 8, 10] + # Check that linefeeds are preserved when there are is no matching ignores + content = "some text\n\nand other text\n" + stripped_content = html_tools.strip_ignore_text(content, ignore_lines) + assert content == stripped_content diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 37d21d1b..4a5c86a1 100644 --- 
a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -22,10 +22,15 @@ def test_strip_text_func(): ignore_lines = ["sometimes"] stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) + assert "sometimes" not in stripped_content + assert "Some content" in stripped_content - assert b"sometimes" not in stripped_content - assert b"Some content" in stripped_content + # Check that line feeds dont get chewed up when something is found + test_content = "Some initial text\n\nWhich is across multiple lines\n\nZZZZz\n\n\nSo let's see what happens." + ignore = ['something irrelevent but just to check', 'XXXXX', 'YYYYY', 'ZZZZZ'] + stripped_content = html_tools.strip_ignore_text(test_content, ignore) + assert stripped_content == "Some initial text\n\nWhich is across multiple lines\n\n\n\nSo let's see what happens." def set_original_ignore_response(): test_return_data = """ @@ -141,8 +146,6 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa - - # Just to be sure.. set a regular modified change.. 
set_modified_original_ignore_response() client.get(url_for("form_watch_checknow"), follow_redirects=True) @@ -153,17 +156,17 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa res = client.get(url_for("preview_page", uuid="first")) - # Should no longer be in the preview - assert b'new ignore stuff' not in res.data + # SHOULD BE be in the preview, it was added in set_modified_original_ignore_response() + # and we have "new ignore stuff" in ignore_text + # it is only ignored, it is not removed (it will be highlighted too) + assert b'new ignore stuff' in res.data res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data +# When adding some ignore text, it should not trigger a change, even if something else on that line changes def test_check_global_ignore_text_functionality(client, live_server, measure_memory_usage): - - # Give the endpoint time to spin up - time.sleep(1) - + #live_server_setup(live_server) ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ" set_original_ignore_response() @@ -172,6 +175,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem url_for("settings_page"), data={ "requests-time_between_check-minutes": 180, + "application-ignore_whitespace": "y", "application-global_ignore_text": ignore_text, 'application-fetch_backend': "html_requests" }, @@ -192,9 +196,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem # Give the thread time to pick it up wait_for_all_checks(client) - - # Goto the edit page of the item, add our ignore text - # Add our URL to the import page + #Adding some ignore text should not trigger a change res = client.post( url_for("edit_page", uuid="first"), data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"}, @@ -210,20 +212,15 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem # Trigger a check 
client.get(url_for("form_watch_checknow"), follow_redirects=True) - - # Give the thread time to pick it up wait_for_all_checks(client) - - # so that we are sure everything is viewed and in a known 'nothing changed' state - res = client.get(url_for("diff_history_page", uuid="first")) - - # It should report nothing found (no new 'unviewed' class) + # It should report nothing found (no new 'unviewed' class), adding random ignore text should not cause a change res = client.get(url_for("index")) assert b'unviewed' not in res.data assert b'/test-endpoint' in res.data +##### - - # Make a change which includes the ignore text + # Make a change which includes the ignore text, it should be ignored and no 'change' triggered + # It adds text with "ZZZZzzzz" and "ZZZZ" is in the ignore list set_modified_ignore_response() # Trigger a check @@ -233,6 +230,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem # It should report nothing found (no new 'unviewed' class) res = client.get(url_for("index")) + assert b'unviewed' not in res.data assert b'/test-endpoint' in res.data diff --git a/changedetectionio/tests/unit/test_watch_model.py b/changedetectionio/tests/unit/test_watch_model.py index a9e3df3c..a550cd0a 100644 --- a/changedetectionio/tests/unit/test_watch_model.py +++ b/changedetectionio/tests/unit/test_watch_model.py @@ -18,12 +18,13 @@ class TestDiffBuilder(unittest.TestCase): watch['last_viewed'] = 110 - watch.save_history_text(contents=b"hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4())) - 
watch.save_history_text(contents=b"hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4())) + # Contents from the browser are always returned from the browser/requests/etc as str, str is basically UTF-16 in python + watch.save_history_text(contents="hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4())) p = watch.get_next_snapshot_key_to_last_viewed assert p == "112", "Correct last-viewed timestamp was detected" diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 97e1ec27..ebb3ada7 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -286,8 +286,8 @@ class update_worker(threading.Thread): # Re #342 # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. # We then convert/.decode('utf-8') for the notification etc - if not isinstance(contents, (bytes, bytearray)): - raise Exception("Error - returned data from the fetch handler SHOULD be bytes") +# if not isinstance(contents, (bytes, bytearray)): +# raise Exception("Error - returned data from the fetch handler SHOULD be bytes") except PermissionError as e: logger.critical(f"File permission error updating file, watch: {uuid}") logger.critical(str(e))