From 2b948c15c10a089b8a843f345c76c1ee25df7de9 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Tue, 3 Oct 2023 17:44:27 +0200
Subject: [PATCH] Backend - Regular expression / string filtering refactor for
 Python 3.11 and deprecation warnings since Python 3.6 (#1786)

---
 changedetectionio/html_tools.py               | 34 +++++----
 .../processors/text_json_diff.py              | 69 +++++++++----------
 changedetectionio/templates/edit.html         |  5 +-
 changedetectionio/tests/test_extract_regex.py | 44 +++++++-----
 changedetectionio/tests/test_trigger_regex.py | 15 ++--
 5 files changed, 86 insertions(+), 81 deletions(-)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index f0719e81..671c96c6 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -10,6 +10,7 @@ import re
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
 
+PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
 # 'price' , 'lowPrice', 'highPrice' are usually under here
 # all of those may or may not appear on different websites
 LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
@@ -17,7 +18,23 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers"
 class JSONNotFound(ValueError):
     def __init__(self, msg):
         ValueError.__init__(self, msg)
-        
+
+
+# Doesn't look like python supports forward slash auto enclosure in re.findall
+# So convert it to inline flag "(?i)foobar" type configuration
+def perl_style_slash_enclosed_regex_to_options(regex):
+    """Rewrite a Perl-style '/pattern/flags' string as a Python inline-flag regex.
+
+    '/foo/' (no flags) and anything not slash-enclosed both default to '(?i)'.
+    """
+    enclosed = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)
+    if not enclosed:
+        # Not slash-enclosed: keep the whole string as the pattern
+        return f"(?i){regex}"
+
+    pattern, flags = enclosed.groups()
+    return f"(?{flags or 'i'}){pattern}"
+
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
     soup = BeautifulSoup(html_content, "html.parser")
@@ -195,23 +212,14 @@ def strip_ignore_text(content, wordlist, mode="content"):
     output = []
     ignore_text = []
     ignore_regex = []
-
     ignored_line_numbers = []
 
     for k in wordlist:
         # Is it a regex?
-        x = re.search('^\/(.*)\/(.*)', k.strip())
-        if x:
-            # Starts with / but doesn't look like a regex
-            p = x.group(1)
-            try:
-                # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis
-                ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE))
-            except Exception as e:
-                # Badly formed regex, treat as text
-                ignore_text.append(k.strip())
+        res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
+        if res:
+            ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
         else:
-            # Had a / but doesn't work as regex
             ignore_text.append(k.strip())
 
     for line in content.splitlines():
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 5e69a591..19ef78da 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from copy import deepcopy
 from . import difference_detection_processor
+from ..html_tools import PERL_STYLE_REGEX
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
-
-name =  'Webpage Text/HTML, JSON and PDF changes'
+name = 'Webpage Text/HTML, JSON and PDF changes'
 description = 'Detects all text changes where possible'
 
+
 class FilterNotFoundInResponse(ValueError):
     def __init__(self, msg):
         ValueError.__init__(self, msg)
 
+
 class PDFToHTMLToolNotFound(ValueError):
     def __init__(self, msg):
         ValueError.__init__(self, msg)
@@ -37,19 +39,6 @@ class perform_site_check(difference_detection_processor):
         super().__init__(*args, **kwargs)
         self.datastore = datastore
 
-    # Doesn't look like python supports forward slash auto enclosure in re.findall
-    # So convert it to inline flag "foobar(?i)" type configuration
-    def forward_slash_enclosed_regex_to_options(self, regex):
-        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
-
-        if res:
-            regex = res.group(1)
-            regex += '(?{})'.format(res.group(2))
-        else:
-            regex += '(?{})'.format('i')
-
-        return regex
-
     def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
         changed_detected = False
         screenshot = False  # as bytes
@@ -135,7 +124,8 @@ class perform_site_check(difference_detection_processor):
         # requests for PDF's, images etc should be passwd the is_binary flag
         is_binary = watch.is_pdf
 
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
+                    is_binary=is_binary)
         fetcher.quit()
 
         self.screenshot = fetcher.screenshot
@@ -151,7 +141,6 @@ class perform_site_check(difference_detection_processor):
             if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                 raise content_fetcher.checksumFromPreviousCheckWasTheSame()
 
-
         # Fetching complete, now filters
         # @todo move to class / maybe inside of fetcher abstract base?
 
@@ -231,8 +220,6 @@ class perform_site_check(difference_detection_processor):
                     stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                     is_html = False
 
-
-
         if is_html or is_source:
 
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@@ -283,7 +270,6 @@ class perform_site_check(difference_detection_processor):
         # Re #340 - return the content before the 'ignore text' was applied
         text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
 
-
         # @todo whitespace coming from missing rtrim()?
         # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
         # Rewrite's the processing text based on only what diff result they want to see
@@ -293,13 +279,13 @@ class perform_site_check(difference_detection_processor):
             # needs to not include (added) etc or it may get used twice
             # Replace the processed text with the preferred result
             rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
-                                                       newest_version_file_contents=stripped_text_from_html,
-                                                       include_equal=False,  # not the same lines
-                                                       include_added=watch.get('filter_text_added', True),
-                                                       include_removed=watch.get('filter_text_removed', True),
-                                                       include_replaced=watch.get('filter_text_replaced', True),
-                                                       line_feed_sep="\n",
-                                                       include_change_type_prefix=False)
+                                             newest_version_file_contents=stripped_text_from_html,
+                                             include_equal=False,  # not the same lines
+                                             include_added=watch.get('filter_text_added', True),
+                                             include_removed=watch.get('filter_text_removed', True),
+                                             include_replaced=watch.get('filter_text_replaced', True),
+                                             line_feed_sep="\n",
+                                             include_change_type_prefix=False)
 
             watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
 
@@ -340,16 +326,25 @@ class perform_site_check(difference_detection_processor):
             regex_matched_output = []
             for s_re in extract_text:
                 # incase they specified something in '/.../x'
-                regex = self.forward_slash_enclosed_regex_to_options(s_re)
-                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
-
-                for l in result:
-                    if type(l) is tuple:
-                        # @todo - some formatter option default (between groups)
-                        regex_matched_output += list(l) + [b'\n']
-                    else:
-                        # @todo - some formatter option default (between each ungrouped result)
-                        regex_matched_output += [l] + [b'\n']
+                if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
+                    regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
+                    result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+
+                    for l in result:
+                        if type(l) is tuple:
+                            # @todo - some formatter option default (between groups)
+                            regex_matched_output += list(l) + [b'\n']
+                        else:
+                            # @todo - some formatter option default (between each ungrouped result)
+                            regex_matched_output += [l] + [b'\n']
+                else:
+                    # Doesn't look like a regex, just hunt for plaintext and return that which matches
+                    # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
+                    r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
+                    res = r.findall(stripped_text_from_html)
+                    if res:
+                        for match in res:
+                            regex_matched_output += [match] + [b'\n']
 
             # Now we will only show what the regex matched
             stripped_text_from_html = b''
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index a6302b30..73d5cac8 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -378,15 +378,16 @@ Unavailable") }}
                         {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
                         <span class="pure-form-message-inline">
                     <ul>
-                        <li>Extracts text in the final output (line by line) after other filters using regular expressions;
+                        <li>Extracts text in the final output (line by line) after other filters using regular expressions or string match;
                             <ul>
                                 <li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
+                                <li>Don't forget to consider the white-space at the start of a line <code>/.+?reports.+?2022/i</code></li>
                                 <li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br></li>
                                 <li>Keyword example &dash; example <code>Out of stock</code></li>
                                 <li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
                             </ul>
                         </li>
-                        <li>One line per regular-expression/ string match</li>
+                        <li>One line per regular-expression/string match</li>
                     </ul>
                         </span>
                     </div>
diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py
index fec939f1..a96b443f 100644
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -2,7 +2,7 @@
 
 import time
 from flask import url_for
-from .util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks
 
 from ..html_tools import *
 
@@ -55,6 +55,8 @@ def set_multiline_response():
      </p>
      
      <div>aaand something lines</div>
+     <br>
+     <div>and this should be</div>
      </body>
      </html>
     """
@@ -66,11 +68,10 @@ def set_multiline_response():
 
 
 def test_setup(client, live_server):
-
     live_server_setup(live_server)
 
 def test_check_filter_multiline(client, live_server):
-
+    #live_server_setup(live_server)
     set_multiline_response()
 
     # Add our URL to the import page
@@ -82,14 +83,15 @@ def test_check_filter_multiline(client, live_server):
     )
     assert b"1 Imported" in res.data
 
-    time.sleep(3)
+    wait_for_all_checks(client)
 
     # Goto the edit page, add our ignore text
     # Add our URL to the import page
     res = client.post(
         url_for("edit_page", uuid="first"),
         data={"include_filters": '',
-              'extract_text': '/something.+?6 billion.+?lines/si',
+              # Test a regex and a plaintext
+              'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be',
               "url": test_url,
               "tags": "",
               "headers": "",
@@ -99,13 +101,19 @@ def test_check_filter_multiline(client, live_server):
     )
 
     assert b"Updated watch." in res.data
-    time.sleep(3)
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("index"))
+
+    # Issue 1828
+    assert b'not at the start of the expression' not in res.data
 
     res = client.get(
         url_for("preview_page", uuid="first"),
         follow_redirects=True
     )
-
+    # Plaintext that doesn't look like a regex should match also
+    assert b'and this should be' in res.data
 
     assert b'<div class="">Something' in res.data
     assert b'<div class="">across 6 billion multiple' in res.data
@@ -115,14 +123,11 @@ def test_check_filter_multiline(client, live_server):
     assert b'aaand something lines' not in res.data
 
 def test_check_filter_and_regex_extract(client, live_server):
-    sleep_time_for_fetch_thread = 3
+    
     include_filters = ".changetext"
 
     set_original_response()
 
-    # Give the endpoint time to spin up
-    time.sleep(1)
-
     # Add our URL to the import page
     test_url = url_for('test_endpoint', _external=True)
     res = client.post(
@@ -132,19 +137,15 @@ def test_check_filter_and_regex_extract(client, live_server):
     )
     assert b"1 Imported" in res.data
 
-    time.sleep(1)
-    # Trigger a check
-    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # Goto the edit page, add our ignore text
     # Add our URL to the import page
     res = client.post(
         url_for("edit_page", uuid="first"),
         data={"include_filters": include_filters,
-              'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
+              'extract_text': '/\d+ online/\r\n/\d+ guests/\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i\r\n/issue1828.+?2022/i',
               "url": test_url,
               "tags": "",
               "headers": "",
@@ -155,8 +156,13 @@ def test_check_filter_and_regex_extract(client, live_server):
 
     assert b"Updated watch." in res.data
 
+
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("index"))
+    #issue 1828
+    assert b'not at the start of the expression' not in res.data
 
     #  Make a change
     set_modified_response()
@@ -164,7 +170,7 @@ def test_check_filter_and_regex_extract(client, live_server):
     # Trigger a check
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should have 'unviewed' still
     # Because it should be looking at only that 'sametext' id
diff --git a/changedetectionio/tests/test_trigger_regex.py b/changedetectionio/tests/test_trigger_regex.py
index 1462eb2a..7f070e89 100644
--- a/changedetectionio/tests/test_trigger_regex.py
+++ b/changedetectionio/tests/test_trigger_regex.py
@@ -2,7 +2,7 @@
 
 import time
 from flask import url_for
-from . util import live_server_setup
+from .util import live_server_setup, wait_for_all_checks
 
 
 def set_original_ignore_response():
@@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server):
 
     live_server_setup(live_server)
 
-    sleep_time_for_fetch_thread = 3
-
     set_original_ignore_response()
 
-    # Give the endpoint time to spin up
-    time.sleep(1)
-
     # Add our URL to the import page
     test_url = url_for('test_endpoint', _external=True)
     res = client.post(
@@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server):
     assert b"1 Imported" in res.data
 
     # Give the thread time to pick it up
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should report nothing found (just a new one shouldnt have anything)
     res = client.get(url_for("index"))
@@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server):
               "fetch_backend": "html_requests"},
         follow_redirects=True
     )
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
     # so that we set the state to 'unviewed' after all the edits
     client.get(url_for("diff_history_page", uuid="first"))
 
@@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server):
         f.write("some new noise")
 
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
 
     # It should report nothing found (nothing should match the regex)
     res = client.get(url_for("index"))
@@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server):
         f.write("regex test123<br>\nsomething 123")
 
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
-    time.sleep(sleep_time_for_fetch_thread)
+    wait_for_all_checks(client)
     res = client.get(url_for("index"))
     assert b'unviewed' in res.data