Stop html_tools.strip_ignore_text from chewing newlines

3 months ago · fabbb3733a
parent deadf881b0
commit fabbb3733a
2 changed files with 20 additions and 16 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
 #          - "line numbers" return a list of line numbers that match (int list)
 #
 # wordlist - list of regex's (str) or words (str)
+# Preserves all linefeeds and other whitespace; it's not the job of this function to remove that
 def strip_ignore_text(content, wordlist, mode="content"):
    i = 0
    output = []
@ -341,11 +342,10 @@ def strip_ignore_text(content, wordlist, mode="content"):
        else:
            ignore_text.append(k.strip())

-    for line in content.splitlines():
+    for line in content.splitlines(keepends=True):
        i += 1
        # Always ignore blank lines in this mode. (when this function gets called)
        got_match = False
-        if len(line.strip()):
        for l in ignore_text:
            if l.lower() in line.lower():
                got_match = True
@ -357,16 +357,16 @@ def strip_ignore_text(content, wordlist, mode="content"):

        if not got_match:
            # Not ignored
+            # Line endings are already kept by splitlines(keepends=True), so append the line as-is
            output.append(line)
        else:
            ignored_line_numbers.append(i)

-
    # Used for finding out what to highlight
    if mode == "line numbers":
        return ignored_line_numbers

-    return "\n".join(output)
+    return ''.join(output)

 def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    from xml.sax.saxutils import escape as xml_escape
--- a/changedetectionio/tests/test_ignore_regex_text.py
+++ b/changedetectionio/tests/test_ignore_regex_text.py
@ -43,3 +43,7 @@ def test_strip_regex_text_func():
    stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
    assert stripped_content == [2, 5, 6, 7, 8, 10]

+    # Check that linefeeds are preserved when there are no matching ignores
+    content = "some text\n\nand other text\n"
+    stripped_content = html_tools.strip_ignore_text(content, ignore_lines)
+    assert content == stripped_content