From 40d01acde9f7002509b4b15469d4d862efe3e5b5 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 5 Sep 2023 12:58:41 +0200 Subject: [PATCH] Fix - Regular Expression text in `ignore` and `trigger` were not processing correctly, also refactored for lower CPU usage (#1747) --- changedetectionio/html_tools.py | 46 +++++++++++-------- .../tests/test_ignore_regex_text.py | 21 ++++++++- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 0cdaeea4..f0719e81 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -191,42 +191,50 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None # # wordlist - list of regex's (str) or words (str) def strip_ignore_text(content, wordlist, mode="content"): - ignore = [] + i = 0 + output = [] + ignore_text = [] ignore_regex = [] - # @todo check this runs case insensitive - for k in wordlist: + ignored_line_numbers = [] + for k in wordlist: # Is it a regex? - if k[0] == '/': - ignore_regex.append(k.strip(" /")) + x = re.search('^\/(.*)\/(.*)', k.strip()) + if x: + # Starts with / but doesn't look like a regex + p = x.group(1) + try: + # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis + ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE)) + except Exception as e: + # Badly formed regex, treat as text + ignore_text.append(k.strip()) else: - ignore.append(k) + # Had a / but doesn't work as regex + ignore_text.append(k.strip()) - i = 0 - output = [] - ignored_line_numbers = [] for line in content.splitlines(): i += 1 # Always ignore blank lines in this mode. (when this function gets called) + got_match = False if len(line.strip()): - regex_matches = False + for l in ignore_text: + if l.lower() in line.lower(): + got_match = True - # if any of these match, skip - for regex in ignore_regex: - try: - if re.search(regex, line, re.IGNORECASE): - regex_matches = True - except Exception as e: - continue + if not got_match: + for r in ignore_regex: + if r.search(line): + got_match = True - if not regex_matches and not any(skip_text.lower() in line.lower() for skip_text in ignore): + if not got_match: + # Not ignored output.append(line.encode('utf8')) else: ignored_line_numbers.append(i) - # Used for finding out what to highlight if mode == "line numbers": return ignored_line_numbers diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index e21ff050..49901f38 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -15,11 +15,24 @@ def test_strip_regex_text_func(): but sometimes we want to remove the lines. but 1 lines + skip 5 lines + really? yes man +#/not this tries weirdly formed regex or just strings starting with / +/not this but including 1234 lines igNORe-cAse text we dont want to keep but not always.""" - ignore_lines = ["sometimes", "/\s\d{2,3}\s/", "/ignore-case text/"] + + ignore_lines = [ + "sometimes", + "/\s\d{2,3}\s/", + "/ignore-case text/", + "really?", + "/skip \d lines/i", + "/not" + ] + fetcher = fetch_site_status.perform_site_check(datastore=False) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) @@ -27,4 +40,10 @@ def test_strip_regex_text_func(): assert b"but 1 lines" in stripped_content assert b"igNORe-cAse text" not in stripped_content assert b"but 1234 lines" not in stripped_content + assert b"really" not in stripped_content + assert b"not this" not in stripped_content + + # Check line number reporting + stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers") + assert stripped_content == [2, 5, 6, 7, 8, 10]