From 2ccd0fc77b9c686f15ff3b856e139cc085720b12 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Tue, 3 Oct 2023 17:12:31 +0200
Subject: [PATCH] Should also support non-regex strings

---
 .../processors/text_json_diff.py              | 56 +++++++++++--------
 changedetectionio/tests/test_extract_regex.py | 16 ++++--
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 2dda0e8d..19ef78da 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from copy import deepcopy
 from . import difference_detection_processor
+from ..html_tools import PERL_STYLE_REGEX
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
-
-name =  'Webpage Text/HTML, JSON and PDF changes'
+name = 'Webpage Text/HTML, JSON and PDF changes'
 description = 'Detects all text changes where possible'
 
+
 class FilterNotFoundInResponse(ValueError):
     def __init__(self, msg):
         ValueError.__init__(self, msg)
 
+
 class PDFToHTMLToolNotFound(ValueError):
     def __init__(self, msg):
         ValueError.__init__(self, msg)
@@ -122,7 +124,8 @@ class perform_site_check(difference_detection_processor):
         # requests for PDF's, images etc should be passwd the is_binary flag
         is_binary = watch.is_pdf
 
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
+                    is_binary=is_binary)
         fetcher.quit()
 
         self.screenshot = fetcher.screenshot
@@ -138,7 +141,6 @@ class perform_site_check(difference_detection_processor):
             if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                 raise content_fetcher.checksumFromPreviousCheckWasTheSame()
 
-
         # Fetching complete, now filters
         # @todo move to class / maybe inside of fetcher abstract base?
 
@@ -218,8 +220,6 @@ class perform_site_check(difference_detection_processor):
                     stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                     is_html = False
 
-
-
         if is_html or is_source:
 
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@@ -270,7 +270,6 @@ class perform_site_check(difference_detection_processor):
         # Re #340 - return the content before the 'ignore text' was applied
         text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
 
-
         # @todo whitespace coming from missing rtrim()?
         # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
         # Rewrite's the processing text based on only what diff result they want to see
@@ -280,13 +279,13 @@ class perform_site_check(difference_detection_processor):
             # needs to not include (added) etc or it may get used twice
             # Replace the processed text with the preferred result
             rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
-                                                       newest_version_file_contents=stripped_text_from_html,
-                                                       include_equal=False,  # not the same lines
-                                                       include_added=watch.get('filter_text_added', True),
-                                                       include_removed=watch.get('filter_text_removed', True),
-                                                       include_replaced=watch.get('filter_text_replaced', True),
-                                                       line_feed_sep="\n",
-                                                       include_change_type_prefix=False)
+                                             newest_version_file_contents=stripped_text_from_html,
+                                             include_equal=False,  # not the same lines
+                                             include_added=watch.get('filter_text_added', True),
+                                             include_removed=watch.get('filter_text_removed', True),
+                                             include_replaced=watch.get('filter_text_replaced', True),
+                                             line_feed_sep="\n",
+                                             include_change_type_prefix=False)
 
             watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
 
@@ -327,16 +326,25 @@ class perform_site_check(difference_detection_processor):
             regex_matched_output = []
             for s_re in extract_text:
                 # incase they specified something in '/.../x'
-                regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
-                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
-
-                for l in result:
-                    if type(l) is tuple:
-                        # @todo - some formatter option default (between groups)
-                        regex_matched_output += list(l) + [b'\n']
-                    else:
-                        # @todo - some formatter option default (between each ungrouped result)
-                        regex_matched_output += [l] + [b'\n']
+                if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
+                    regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
+                    result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+
+                    for l in result:
+                        if type(l) is tuple:
+                            # @todo - some formatter option default (between groups)
+                            regex_matched_output += list(l) + [b'\n']
+                        else:
+                            # @todo - some formatter option default (between each ungrouped result)
+                            regex_matched_output += [l] + [b'\n']
+                else:
+                    # Doesn't look like a regex, so search for it as plain text and return whatever matches
+                    # `stripped_text_from_html` is bytes here, so s_re must also be encoded to bytes
+                    r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
+                    res = r.findall(stripped_text_from_html)
+                    if res:
+                        for match in res:
+                            regex_matched_output += [match] + [b'\n']
 
             # Now we will only show what the regex matched
             stripped_text_from_html = b''
diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py
index 09a25677..e5f31e6a 100644
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@@ -55,6 +55,8 @@ def set_multiline_response():
      </p>
      
      <div>aaand something lines</div>
+     <br>
+     <div>and this should be</div>
      </body>
      </html>
     """
@@ -66,11 +68,10 @@ def set_multiline_response():
 
 
 def test_setup(client, live_server):
-
     live_server_setup(live_server)
 
 def test_check_filter_multiline(client, live_server):
-
+    #live_server_setup(live_server)
     set_multiline_response()
 
     # Add our URL to the import page
@@ -89,7 +90,8 @@ def test_check_filter_multiline(client, live_server):
     res = client.post(
         url_for("edit_page", uuid="first"),
         data={"include_filters": '',
-              'extract_text': '/something.+?6 billion.+?lines/si',
+              # Test a regex and a plaintext
+              'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be',
               "url": test_url,
               "tags": "",
               "headers": "",
@@ -102,14 +104,16 @@ def test_check_filter_multiline(client, live_server):
     wait_for_all_checks(client)
 
     res = client.get(url_for("index"))
-    #issue 1828
+
+    # Issue 1828
     assert b'not at the start of the expression' not in res.data
-    
+
     res = client.get(
         url_for("preview_page", uuid="first"),
         follow_redirects=True
     )
-
+    # Plain text that doesn't look like a regex should also match
+    assert b'and this should be' in res.data
 
     assert b'<div class="">Something' in res.data
     assert b'<div class="">across 6 billion multiple' in res.data