Stop html_tools.strip_ignore_text from chewing newlines

refactor-filters
dgtlmoon 3 months ago
parent deadf881b0
commit fabbb3733a

@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
# - "line numbers" return a list of line numbers that match (int list) # - "line numbers" return a list of line numbers that match (int list)
# #
# wordlist - list of regex's (str) or words (str) # wordlist - list of regex's (str) or words (str)
# Preserves all linefeeds and other whitespace; it's not the job of this function to remove them
def strip_ignore_text(content, wordlist, mode="content"): def strip_ignore_text(content, wordlist, mode="content"):
i = 0 i = 0
output = [] output = []
@ -341,32 +342,31 @@ def strip_ignore_text(content, wordlist, mode="content"):
else: else:
ignore_text.append(k.strip()) ignore_text.append(k.strip())
for line in content.splitlines(): for line in content.splitlines(keepends=True):
i += 1 i += 1
# Always ignore blank lines in this mode. (when this function gets called) # Always ignore blank lines in this mode. (when this function gets called)
got_match = False got_match = False
if len(line.strip()): for l in ignore_text:
for l in ignore_text: if l.lower() in line.lower():
if l.lower() in line.lower(): got_match = True
got_match = True
if not got_match:
for r in ignore_regex:
if r.search(line):
got_match = True
if not got_match: if not got_match:
# Not ignored for r in ignore_regex:
output.append(line) if r.search(line):
else: got_match = True
ignored_line_numbers.append(i)
if not got_match:
# Not ignored
# No "\n" is added here: splitlines(keepends=True) leaves each line's own ending attached
output.append(line)
else:
ignored_line_numbers.append(i)
# Used for finding out what to highlight # Used for finding out what to highlight
if mode == "line numbers": if mode == "line numbers":
return ignored_line_numbers return ignored_line_numbers
return "\n".join(output) return ''.join(output)
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
from xml.sax.saxutils import escape as xml_escape from xml.sax.saxutils import escape as xml_escape

@ -43,3 +43,7 @@ def test_strip_regex_text_func():
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers") stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
assert stripped_content == [2, 5, 6, 7, 8, 10] assert stripped_content == [2, 5, 6, 7, 8, 10]
# Check that linefeeds are preserved when there are no matching ignores
content = "some text\n\nand other text\n"
stripped_content = html_tools.strip_ignore_text(content, ignore_lines)
assert content == stripped_content

Loading…
Cancel
Save