diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index f0719e81..a7b1f366 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -10,6 +10,7 @@ import re # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" +PERL_STYLE_REGEX = r'^/(.*?)/([a-z]?)?$' # 'price' , 'lowPrice', 'highPrice' are usually under here # all of those may or may not appear on different websites LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" @@ -17,7 +18,21 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" class JSONNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) - + + +# Doesn't look like python supports forward slash auto enclosure in re.findall +# So convert it to inline flag "(?i)foobar" type configuration +def perl_style_slash_enclosed_regex_to_options(regex): + res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE) + + if res: + flags = res.group(2) if res.group(2) else 'i' + regex = f"(?{flags}){res.group(1)}" + else: + regex = f"(?i){regex}" + + return regex + # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches def include_filters(include_filters, html_content, append_pretty_line_formatting=False): soup = BeautifulSoup(html_content, "html.parser") @@ -195,23 +210,14 @@ def strip_ignore_text(content, wordlist, mode="content"): output = [] ignore_text = [] ignore_regex = [] - ignored_line_numbers = [] for k in wordlist: # Is it a regex? - x = re.search('^\/(.*)\/(.*)', k.strip()) - if x: - # Starts with / but doesn't look like a regex - p = x.group(1) - try: - # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis - ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE)) - except Exception as e: - # Badly formed regex, treat as text - ignore_text.append(k.strip()) + res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE) + if res: + ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k))) else: - # Had a / but doesn't work as regex ignore_text.append(k.strip()) for line in content.splitlines(): diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 1bf31fa4..2dda0e8d 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -37,19 +37,6 @@ class perform_site_check(difference_detection_processor): super().__init__(*args, **kwargs) self.datastore = datastore - # Doesn't look like python supports forward slash auto enclosure in re.findall - # So convert it to inline flag "(?i)foobar" type configuration - def forward_slash_enclosed_regex_to_options(self, regex): - res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE) - - if res: - regex = res.group(1) - regex = f"(?{res.group(2)}){regex}" - else: - regex = f"(?i){regex}" - - return regex - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): changed_detected = False screenshot = False # as bytes @@ -340,7 +327,7 @@ class perform_site_check(difference_detection_processor): regex_matched_output = [] for s_re in extract_text: # incase they specified something in '/.../x' - regex = self.forward_slash_enclosed_regex_to_options(s_re) + regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) result = re.findall(regex.encode('utf-8'), stripped_text_from_html) for l in result: diff --git a/changedetectionio/tests/test_trigger_regex.py b/changedetectionio/tests/test_trigger_regex.py index 1462eb2a..7f070e89 100644 --- a/changedetectionio/tests/test_trigger_regex.py +++ b/changedetectionio/tests/test_trigger_regex.py @@ -2,7 +2,7 @@ import time from flask import url_for -from . util import live_server_setup +from .util import live_server_setup, wait_for_all_checks def set_original_ignore_response(): @@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server): live_server_setup(live_server) - sleep_time_for_fetch_thread = 3 - set_original_ignore_response() - # Give the endpoint time to spin up - time.sleep(1) - # Add our URL to the import page test_url = url_for('test_endpoint', _external=True) res = client.post( @@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server): assert b"1 Imported" in res.data # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (just a new one shouldnt have anything) res = client.get(url_for("index")) @@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server): "fetch_backend": "html_requests"}, follow_redirects=True ) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # so that we set the state to 'unviewed' after all the edits client.get(url_for("diff_history_page", uuid="first")) @@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server): f.write("some new noise") client.get(url_for("form_watch_checknow"), follow_redirects=True) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (nothing should match the regex) res = client.get(url_for("index")) @@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server): f.write("regex test123
\nsomething 123") client.get(url_for("form_watch_checknow"), follow_redirects=True) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) res = client.get(url_for("index")) assert b'unviewed' in res.data