From faa42d75e067a1f3e8de8b2d812f67928b88be3f Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Tue, 26 Jul 2022 17:33:40 +0200
Subject: [PATCH] Refactor of extract text filter - Regex, support Regex
 (groups) and all python regex flags via /something/aiLmsux (#773)

---
 changedetectionio/fetch_site_status.py        | 37 ++++++-
 changedetectionio/templates/edit.html         | 11 ++-
 changedetectionio/tests/test_extract_regex.py | 98 ++++++++++++++++---
 changedetectionio/update_worker.py            |  1 -
 4 files changed, 124 insertions(+), 23 deletions(-)
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 907e0c91..48342d93 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 
 # Some common stuff here that can be moved to a base class
+# (set_proxy_from_list)
 class perform_site_check():
 
     def __init__(self, *args, datastore, **kwargs):
@@ -45,6 +46,20 @@ class perform_site_check():
 
         return proxy_args
 
+    # Python's re has no /pattern/flags syntax, so convert it to inline flags: "(?i)foobar".
+    # Flags must lead the pattern: trailing global flags are an error since Python 3.11.
+    def forward_slash_enclosed_regex_to_options(self, regex):
+        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
+
+        if res:
+            # '/pattern/flags' -> '(?flags)pattern'
+            regex = '(?{}){}'.format(res.group(2), res.group(1))
+        else:
+            regex = '(?i){}'.format(regex)
+
+        return regex
+
+
     def run(self, uuid):
         timestamp = int(time.time())  # used for storage etc too
 
@@ -215,15 +230,27 @@ class perform_site_check():
         if len(extract_text) > 0:
             regex_matched_output = []
             for s_re in extract_text:
-                result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
-                                    flags=re.MULTILINE | re.DOTALL | re.LOCALE)
-                if result:
-                    regex_matched_output = regex_matched_output + result
+                # in case they specified something in '/.../x'
+                regex = self.forward_slash_enclosed_regex_to_options(s_re)
+                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+
+                for l in result:
+                    if type(l) is tuple:
+                        #@todo - some formatter option default (between groups)
+                        regex_matched_output += list(l) + [b'\n']
+                    else:
+                        # @todo - some formatter option default (between each ungrouped result)
+                        regex_matched_output += [l] + [b'\n']
 
+            # Now we will only show what the regex matched
+            stripped_text_from_html = b''
+            text_content_before_ignored_filter = b''
             if regex_matched_output:
-                stripped_text_from_html = b'\n'.join(regex_matched_output)
+                # @todo some formatter for presentation?
+                stripped_text_from_html = b''.join(regex_matched_output)
                 text_content_before_ignored_filter = stripped_text_from_html
 
+
         # Re #133 - if we should strip whitespaces from triggering the change detected comparison
         if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
             fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index c706b0b2..7b5d0c4a 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -239,8 +239,15 @@ Unavailable") }}
                         {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
                         <span class="pure-form-message-inline">
                     <ul>
-                        <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
-                        <li>One line per regular-expression.</li>
+                        <li>Extracts text in the final output (line by line) after other filters using regular expressions;
+                            <ul>
+                                <li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
+                                <li>Use <code>/something/aiLmsux</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br/></li>
+                                <li>Keyword example &dash; example <code>Out of stock</code></li>
+                                <li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
+                            </ul>
+                        </li>
+                        <li>One line per regular expression or plain string match</li>
                     </ul>
                         </span>
                     </div>
diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py
index d51200d3..aad29d51 100644
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -15,7 +15,7 @@ def set_original_response():
      </br>
      So let's see what happens.  </br>
      <div id="sametext">Some text thats the same</div>
-     <div id="changetext">Some text that will change</div>
+     <div class="changetext">Some text that will change</div>     
      </body>
      </html>
     """
@@ -33,7 +33,8 @@ def set_modified_response():
      </br>
      So let's see what happens.  </br>
      <div id="sametext">Some text thats the same</div>
-     <div id="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/>  2000 online )</div>
+     <div class="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/>  2000 online )</div>
+     <div class="changetext">SomeCase insensitive 3456</div>
      </body>
      </html>
     """
@@ -44,11 +45,78 @@ def set_modified_response():
     return None
 
 
-def test_check_filter_and_regex_extract(client, live_server):
-    sleep_time_for_fetch_thread = 3
+def set_multiline_response():
+    test_return_data = """<html>
+       <body>
+     
+     <p>Something <br/>
+        across 6 billion multiple<br/>
+        lines
+     </p>
+     
+     <div>aaand something lines</div>
+     </body>
+     </html>
+    """
+    # Store the multi-line fixture page on disk (presumably what 'test_endpoint' serves - TODO confirm)
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None
+
+
+def test_setup(client, live_server):
 
     live_server_setup(live_server)
-    css_filter = "#changetext"
+
+def test_check_filter_multiline(client, live_server):
+    """Check that a '/.../si' extract_text regex matches text spanning several lines."""
+    set_multiline_response()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    time.sleep(3)
+
+    # Go to the edit page and set the extract_text regex
+    # (css_filter is left empty; only extract_text is set)
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": '',
+              'extract_text': '/something.+?6 billion.+?lines/si',
+              "url": test_url,
+              "tag": "",
+              "headers": "",
+              'fetch_backend': "html_requests"
+              },
+        follow_redirects=True
+    )
+
+    assert b"Updated watch." in res.data
+    time.sleep(3)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    # Every line covered by the multi-line match should appear in the preview
+    assert b'<div class="">Something' in res.data
+    assert b'<div class="">across 6 billion multiple' in res.data
+    assert b'<div class="">lines' in res.data
+
+    # but the last one, which also says 'lines', shouldn't be here (non-greedy match checking)
+    assert b'aaand something lines' not in res.data
+
+def test_check_filter_and_regex_extract(client, live_server):
+    sleep_time_for_fetch_thread = 3
+    css_filter = ".changetext"
 
     set_original_response()
 
@@ -64,6 +132,7 @@ def test_check_filter_and_regex_extract(client, live_server):
     )
     assert b"1 Imported" in res.data
 
+    time.sleep(1)
     # Trigger a check
     client.get(url_for("form_watch_checknow"), follow_redirects=True)
 
@@ -75,7 +144,7 @@ def test_check_filter_and_regex_extract(client, live_server):
     res = client.post(
         url_for("edit_page", uuid="first"),
         data={"css_filter": css_filter,
-              'extract_text': '\d+ online\n\d+ guests',
+              'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
               "url": test_url,
               "tag": "",
               "headers": "",
@@ -86,15 +155,6 @@ def test_check_filter_and_regex_extract(client, live_server):
 
     assert b"Updated watch." in res.data
 
-    # Check it saved
-    res = client.get(
-        url_for("edit_page", uuid="first"),
-    )
-    assert b'\d+ online' in res.data
-
-    # Trigger a check
-#    client.get(url_for("form_watch_checknow"), follow_redirects=True)
-
     # Give the thread time to pick it up
     time.sleep(sleep_time_for_fetch_thread)
 
@@ -126,5 +186,13 @@ def test_check_filter_and_regex_extract(client, live_server):
     # Both regexs should be here
     assert b'<div class="">80 guests' in res.data
 
+    # Regex with flag handling should be here
+    assert b'<div class="">SomeCase insensitive 3456' in res.data
+
+    # Singular group from /somecase insensitive (345\d)/i
+    assert b'<div class="">3456' in res.data
+
+    # Regex with multiline flag handling should be here
+
     # Should not be here
     assert b'Some text that did change' not in res.data
diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index a9acf502..41b1ff91 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -113,7 +113,6 @@ class update_worker(threading.Thread):
                         err_text = "Page request from server didnt respond correctly"
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                            'last_check_status': e.status_code})
-
                     except Exception as e:
                         self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})