diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index f0719e81..a7b1f366 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -10,6 +10,7 @@ import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "
"
+PERL_STYLE_REGEX = r'^/(.*?)/([a-z]?)?$'  # '/pattern/' plus at most one trailing flag letter; group 1 = pattern, group 2 = flag
# 'price' , 'lowPrice', 'highPrice' are usually under here
# all of those may or may not appear on different websites
LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@@ -17,7 +18,21 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
class JSONNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
-
+
+
+# Python's re has no '/pattern/flags' syntax (e.g. for re.findall), so translate
+# a Perl-style slash-enclosed regex into the inline-flag form "(?i)foobar"
+def perl_style_slash_enclosed_regex_to_options(regex):
+    """Return `regex` rewritten as '(?flags)pattern'; the flag defaults to 'i'."""
+    res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
+    if not res:
+        # Not slash-enclosed: the whole string is the pattern, case-insensitive
+        return f"(?i){regex}"
+    flags = res.group(2) or 'i'
+    # IGNORECASE lets e.g. 'I' match, but inline flags are lowercase (except 'L')
+    flags = ''.join(c if c == 'L' else c.lower() for c in flags)
+    return f"(?{flags}){res.group(1)}"
+
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
soup = BeautifulSoup(html_content, "html.parser")
@@ -195,23 +210,14 @@ def strip_ignore_text(content, wordlist, mode="content"):
output = []
ignore_text = []
ignore_regex = []
-
ignored_line_numbers = []
for k in wordlist:
# Is it a regex?
- x = re.search('^\/(.*)\/(.*)', k.strip())
- if x:
- # Starts with / but doesn't look like a regex
- p = x.group(1)
- try:
- # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
- ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
- except Exception as e:
- # Badly formed regex, treat as text
- ignore_text.append(k.strip())
+ res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
+ if res:
+ ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
else:
- # Had a / but doesn't work as regex
ignore_text.append(k.strip())
for line in content.splitlines():
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 1bf31fa4..2dda0e8d 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -37,19 +37,6 @@ class perform_site_check(difference_detection_processor):
super().__init__(*args, **kwargs)
self.datastore = datastore
- # Doesn't look like python supports forward slash auto enclosure in re.findall
- # So convert it to inline flag "(?i)foobar" type configuration
- def forward_slash_enclosed_regex_to_options(self, regex):
- res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
-
- if res:
- regex = res.group(1)
- regex = f"(?{res.group(2)}){regex}"
- else:
- regex = f"(?i){regex}"
-
- return regex
-
def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
changed_detected = False
screenshot = False # as bytes
@@ -340,7 +327,7 @@ class perform_site_check(difference_detection_processor):
regex_matched_output = []
for s_re in extract_text:
# incase they specified something in '/.../x'
- regex = self.forward_slash_enclosed_regex_to_options(s_re)
+ regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
for l in result:
diff --git a/changedetectionio/tests/test_trigger_regex.py b/changedetectionio/tests/test_trigger_regex.py
index 1462eb2a..7f070e89 100644
--- a/changedetectionio/tests/test_trigger_regex.py
+++ b/changedetectionio/tests/test_trigger_regex.py
@@ -2,7 +2,7 @@
import time
from flask import url_for
-from . util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks
def set_original_ignore_response():
@@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server):
live_server_setup(live_server)
- sleep_time_for_fetch_thread = 3
-
set_original_ignore_response()
- # Give the endpoint time to spin up
- time.sleep(1)
-
# Add our URL to the import page
test_url = url_for('test_endpoint', _external=True)
res = client.post(
@@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server):
assert b"1 Imported" in res.data
# Give the thread time to pick it up
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
# It should report nothing found (just a new one shouldnt have anything)
res = client.get(url_for("index"))
@@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server):
"fetch_backend": "html_requests"},
follow_redirects=True
)
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
# so that we set the state to 'unviewed' after all the edits
client.get(url_for("diff_history_page", uuid="first"))
@@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server):
f.write("some new noise")
client.get(url_for("form_watch_checknow"), follow_redirects=True)
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
# It should report nothing found (nothing should match the regex)
res = client.get(url_for("index"))
@@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server):
f.write("regex test123
\nsomething 123")
client.get(url_for("form_watch_checknow"), follow_redirects=True)
- time.sleep(sleep_time_for_fetch_thread)
+ wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'unviewed' in res.data