From fabbb3733a754edfdba2abbef2e6d9f8c77420ca Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Wed, 9 Oct 2024 18:49:18 +0200
Subject: [PATCH] Stop html_tools.strip_ignore_text from chewing newlines

---
 changedetectionio/html_tools.py               | 32 +++++++++----------
 .../tests/test_ignore_regex_text.py           |  4 +++
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 646d71a1..a3832817 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 #          - "line numbers" return a list of line numbers that match (int list)
 #
 # wordlist - list of regex's (str) or words (str)
+# Preserves all linefeeds and other whitespace; it is not this function's job to remove them
 def strip_ignore_text(content, wordlist, mode="content"):
     i = 0
     output = []
@@ -341,32 +342,31 @@ def strip_ignore_text(content, wordlist, mode="content"):
         else:
             ignore_text.append(k.strip())
 
-    for line in content.splitlines():
+    for line in content.splitlines(keepends=True):
         i += 1
         # Always ignore blank lines in this mode. (when this function gets called)
         got_match = False
-        if len(line.strip()):
-            for l in ignore_text:
-                if l.lower() in line.lower():
-                    got_match = True
-
-            if not got_match:
-                for r in ignore_regex:
-                    if r.search(line):
-                        got_match = True
+        for l in ignore_text:
+            if l.lower() in line.lower():
+                got_match = True
 
-            if not got_match:
-                # Not ignored
-                output.append(line)
-            else:
-                ignored_line_numbers.append(i)
+        if not got_match:
+            for r in ignore_regex:
+                if r.search(line):
+                    got_match = True
 
+        if not got_match:
+            # Not ignored
+            # Keep the line untouched: splitlines(keepends=True) already retained its line ending
+            output.append(line)
+        else:
+            ignored_line_numbers.append(i)
 
     # Used for finding out what to highlight
     if mode == "line numbers":
         return ignored_line_numbers
 
-    return "\n".join(output)
+    return ''.join(output)
 
 def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
     from xml.sax.saxutils import escape as xml_escape
diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py
index 2268d34f..34883182 100644
--- a/changedetectionio/tests/test_ignore_regex_text.py
+++ b/changedetectionio/tests/test_ignore_regex_text.py
@@ -43,3 +43,7 @@ def test_strip_regex_text_func():
     stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
     assert stripped_content == [2, 5, 6, 7, 8, 10]
 
+    # Check that linefeeds are preserved when there are no matching ignores
+    content = "some text\n\nand other text\n"
+    stripped_content = html_tools.strip_ignore_text(content, ignore_lines)
+    assert content == stripped_content