diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 0cdaeea4..f0719e81 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -191,42 +191,50 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
#
# wordlist - list of regex's (str) or words (str)
def strip_ignore_text(content, wordlist, mode="content"):
- ignore = []
+ i = 0
+ output = []
+ ignore_text = []
ignore_regex = []
- # @todo check this runs case insensitive
- for k in wordlist:
+ ignored_line_numbers = []
+ for k in wordlist:
# Is it a regex?
- if k[0] == '/':
- ignore_regex.append(k.strip(" /"))
+ x = re.search('^\/(.*)\/(.*)', k.strip())
+ if x:
+ # Looks like /pattern/flags: use the text between the first and last slash as the regex
+ p = x.group(1)
+ try:
+ # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
+ ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
+ except Exception as e:
+ # Badly formed regex, treat as text
+ ignore_text.append(k.strip())
else:
- ignore.append(k)
+ # Not in /pattern/ form (plain word, or only a leading /), so match it as plain text
+ ignore_text.append(k.strip())
- i = 0
- output = []
- ignored_line_numbers = []
for line in content.splitlines():
i += 1
# Always ignore blank lines in this mode. (when this function gets called)
+ got_match = False
if len(line.strip()):
- regex_matches = False
+ for l in ignore_text:
+ if l.lower() in line.lower():
+ got_match = True
- # if any of these match, skip
- for regex in ignore_regex:
- try:
- if re.search(regex, line, re.IGNORECASE):
- regex_matches = True
- except Exception as e:
- continue
+ if not got_match:
+ for r in ignore_regex:
+ if r.search(line):
+ got_match = True
- if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore):
+ if not got_match:
+ # Not ignored
output.append(line.encode('utf8'))
else:
ignored_line_numbers.append(i)
-
# Used for finding out what to highlight
if mode == "line numbers":
return ignored_line_numbers
diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py
index e21ff050..49901f38 100644
--- a/changedetectionio/tests/test_ignore_regex_text.py
+++ b/changedetectionio/tests/test_ignore_regex_text.py
@@ -15,11 +15,24 @@ def test_strip_regex_text_func():
but sometimes we want to remove the lines.
but 1 lines
+ skip 5 lines
+ really? yes man
+#/not this tries weirdly formed regex or just strings starting with /
+/not this
but including 1234 lines
igNORe-cAse text we dont want to keep
but not always."""
- ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"]
+
+ ignore_lines = [
+ "sometimes",
+ "/\s\d{2,3}\s/",
+ "/ignore-case text/",
+ "really?",
+ "/skip \d lines/i",
+ "/not"
+ ]
+
fetcher = fetch_site_status.perform_site_check(datastore=False)
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
@@ -27,4 +40,10 @@ def test_strip_regex_text_func():
assert b"but 1 lines" in stripped_content
assert b"igNORe-cAse text" not in stripped_content
assert b"but 1234 lines" not in stripped_content
+ assert b"really" not in stripped_content
+ assert b"not this" not in stripped_content
+
+ # Check line number reporting
+ stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
+ assert stripped_content == [2, 5, 6, 7, 8, 10]