WIP

1 year ago · fc38e30989
parent 8c8f378395
commit fc38e30989
3 changed files with 25 additions and 37 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -10,6 +10,7 @@ import re
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"

+PERL_STYLE_REGEX = r'^/(.*?)/([a-z]?)?$'
 # 'price' , 'lowPrice', 'highPrice' are usually under here
 # all of those may or may not appear on different websites
 LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@ -17,7 +18,21 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
 class JSONNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)
-        
+
+
+# Python's re module has no Perl-like /pattern/flags syntax, so rewrite such
+# input into the inline-flag form instead, e.g. "/foobar/i" -> "(?i)foobar"
+def perl_style_slash_enclosed_regex_to_options(regex):
+    # The result always starts with an inline flag group; "i" is used when
+    # no flag is given or when the input is not slash-enclosed at all.
+    matched = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
+    if not matched:
+        return f"(?i){regex}"
+
+    pattern = matched.group(1)
+    flags = matched.group(2) or 'i'
+    return f"(?{flags}){pattern}"
+
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
    soup = BeautifulSoup(html_content, "html.parser")
@ -195,23 +210,14 @@ def strip_ignore_text(content, wordlist, mode="content"):
    output = []
    ignore_text = []
    ignore_regex = []
-
    ignored_line_numbers = []

    for k in wordlist:
        # Is it a regex?
-        x = re.search('^\/(.*)\/(.*)', k.strip())
-        if x:
-            # Starts with / but doesn't look like a regex
-            p = x.group(1)
-            try:
-                # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
-                ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
-            except Exception as e:
-                # Badly formed regex, treat as text
-                ignore_text.append(k.strip())
+        res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
+        if res:
+            ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
        else:
-            # Had a / but doesn't work as regex
            ignore_text.append(k.strip())

    for line in content.splitlines():
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@ -37,19 +37,6 @@ class perform_site_check(difference_detection_processor):
        super().__init__(*args, **kwargs)
        self.datastore = datastore

-    # Doesn't look like python supports forward slash auto enclosure in re.findall
-    # So convert it to inline flag "(?i)foobar" type configuration
-    def forward_slash_enclosed_regex_to_options(self, regex):
-        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
-
-        if res:
-            regex = res.group(1)
-            regex = f"(?{res.group(2)}){regex}"
-        else:
-            regex = f"(?i){regex}"
-
-        return regex
-
    def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
        changed_detected = False
        screenshot = False  # as bytes
@ -340,7 +327,7 @@ class perform_site_check(difference_detection_processor):
            regex_matched_output = []
            for s_re in extract_text:
                # incase they specified something in '/.../x'
-                regex = self.forward_slash_enclosed_regex_to_options(s_re)
+                regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)

                for l in result:
--- a/changedetectionio/tests/test_trigger_regex.py
+++ b/changedetectionio/tests/test_trigger_regex.py
@ -2,7 +2,7 @@

 import time
 from flask import url_for
-from . util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks


 def set_original_ignore_response():
@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server):

    live_server_setup(live_server)

-    sleep_time_for_fetch_thread = 3
-
    set_original_ignore_response()

-    # Give the endpoint time to spin up
-    time.sleep(1)
-
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server):
    assert b"1 Imported" in res.data

    # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)

    # It should report nothing found (just a new one shouldnt have anything)
    res = client.get(url_for("index"))
@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server):
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
    # so that we set the state to 'unviewed' after all the edits
    client.get(url_for("diff_history_page", uuid="first"))

@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server):
        f.write("some new noise")

    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)

    # It should report nothing found (nothing should match the regex)
    res = client.get(url_for("index"))
@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server):
        f.write("regex test123<br>\nsomething 123")

    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data