Should also support non-regex strings

1 year ago · 2ccd0fc77b
parent 18d48bc2a0
commit 2ccd0fc77b
2 changed files with 42 additions and 30 deletions
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from copy import deepcopy
 from . import difference_detection_processor
 from ..html_tools import PERL_STYLE_REGEX
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 name = 'Webpage Text/HTML, JSON and PDF changes'
 description = 'Detects all text changes where possible'
 class FilterNotFoundInResponse(ValueError):
    # ValueError subclass whose constructor forwards `msg` unchanged to ValueError.
    # NOTE(review): presumably raised when a configured filter matches nothing in
    # the fetched response - the raise sites are not visible in this diff; confirm.
    def __init__(self, msg):
        ValueError.__init__(self, msg)
 class PDFToHTMLToolNotFound(ValueError):
    # ValueError subclass whose constructor forwards `msg` unchanged to ValueError.
    # NOTE(review): presumably raised when the external PDF-to-HTML converter is
    # unavailable - the raise sites are not visible in this diff; confirm.
    def __init__(self, msg):
        ValueError.__init__(self, msg)
@ -122,7 +124,8 @@ class perform_site_check(difference_detection_processor):
        # requests for PDFs, images etc. should be passed the is_binary flag
        is_binary = watch.is_pdf
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary)
+        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
                    is_binary=is_binary)
        fetcher.quit()
        self.screenshot = fetcher.screenshot
@ -138,7 +141,6 @@ class perform_site_check(difference_detection_processor):
            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                raise content_fetcher.checksumFromPreviousCheckWasTheSame()
        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?
@ -218,8 +220,6 @@ class perform_site_check(difference_detection_processor):
                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
                    is_html = False
        if is_html or is_source:
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
@ -270,7 +270,6 @@ class perform_site_check(difference_detection_processor):
        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
        # @todo whitespace coming from missing rtrim()?
        # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about.
        # Rewrites the processed text based only on which diff results they want to see
@ -327,6 +326,7 @@ class perform_site_check(difference_detection_processor):
            regex_matched_output = []
            for s_re in extract_text:
                # in case they specified something in '/.../x'
                if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE):
                    regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re)
                    result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
@ -337,6 +337,14 @@ class perform_site_check(difference_detection_processor):
                        else:
                            # @todo - some formatter option default (between each ungrouped result)
                            regex_matched_output += [l] + [b'\n']
                else:
                    # Doesn't look like a regex, so just search for the plaintext and return whatever matches
                    # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes
                    r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE)
                    res = r.findall(stripped_text_from_html)
                    if res:
                        for match in res:
                            regex_matched_output += [match] + [b'\n']
            # Now we will only show what the regex matched
            stripped_text_from_html = b''
--- a/changedetectionio/tests/test_extract_regex.py
+++ b/changedetectionio/tests/test_extract_regex.py
@ -55,6 +55,8 @@ def set_multiline_response():
     </p>
     <div>aaand something lines</div>
     <br>
     <div>and this should be</div>
     </body>
     </html>
    """
@ -66,11 +68,10 @@ def set_multiline_response():
 def test_setup(client, live_server):
    # Runs the live_server_setup helper on the `live_server` argument; `client` is not used here.
    live_server_setup(live_server)
 def test_check_filter_multiline(client, live_server):
-
+    #live_server_setup(live_server)
    set_multiline_response()
    # Add our URL to the import page
@ -89,7 +90,8 @@ def test_check_filter_multiline(client, live_server):
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"include_filters": '',
-              'extract_text': '/something.+?6 billion.+?lines/si',
+              # Test a regex and a plaintext
              'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be',
              "url": test_url,
              "tags": "",
              "headers": "",
@ -102,14 +104,16 @@ def test_check_filter_multiline(client, live_server):
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
-    #issue 1828
+
    # Issue 1828
    assert b'not at the start of the expression' not in res.data
    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
-
+    # Plaintext that doesn't look like a regex should also match
    assert b'and this should be' in res.data
    assert b'<div class="">Something' in res.data
    assert b'<div class="">across 6 billion multiple' in res.data