From 2ccd0fc77b9c686f15ff3b856e139cc085720b12 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 3 Oct 2023 17:12:31 +0200 Subject: [PATCH] Should also support non-regex strings --- .../processors/text_json_diff.py | 56 +++++++++++-------- changedetectionio/tests/test_extract_regex.py | 16 ++++-- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 2dda0e8d..19ef78da 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from copy import deepcopy from . import difference_detection_processor +from ..html_tools import PERL_STYLE_REGEX urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -name = 'Webpage Text/HTML, JSON and PDF changes' +name = 'Webpage Text/HTML, JSON and PDF changes' description = 'Detects all text changes where possible' + class FilterNotFoundInResponse(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) + class PDFToHTMLToolNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) @@ -122,7 +124,8 @@ class perform_site_check(difference_detection_processor): # requests for PDF's, images etc should be passwd the is_binary flag is_binary = watch.is_pdf - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary) + fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), + is_binary=is_binary) fetcher.quit() self.screenshot = fetcher.screenshot @@ -138,7 +141,6 @@ class perform_site_check(difference_detection_processor): if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'): raise content_fetcher.checksumFromPreviousCheckWasTheSame() - # Fetching complete, now filters # @todo move to class / maybe inside of fetcher abstract base? @@ -218,8 +220,6 @@ class perform_site_check(difference_detection_processor): stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) is_html = False - - if is_html or is_source: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text @@ -270,7 +270,6 @@ class perform_site_check(difference_detection_processor): # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') - # @todo whitespace coming from missing rtrim()? # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. # Rewrite's the processing text based on only what diff result they want to see @@ -280,13 +279,13 @@ class perform_site_check(difference_detection_processor): # needs to not include (added) etc or it may get used twice # Replace the processed text with the preferred result rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(), - newest_version_file_contents=stripped_text_from_html, - include_equal=False, # not the same lines - include_added=watch.get('filter_text_added', True), - include_removed=watch.get('filter_text_removed', True), - include_replaced=watch.get('filter_text_replaced', True), - line_feed_sep="\n", - include_change_type_prefix=False) + newest_version_file_contents=stripped_text_from_html, + include_equal=False, # not the same lines + include_added=watch.get('filter_text_added', True), + include_removed=watch.get('filter_text_removed', True), + include_replaced=watch.get('filter_text_replaced', True), + line_feed_sep="\n", + include_change_type_prefix=False) watch.save_last_fetched_before_filters(text_content_before_ignored_filter) @@ -327,16 +326,25 @@ class perform_site_check(difference_detection_processor): regex_matched_output = [] for s_re in extract_text: # incase they specified something in '/.../x' - regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) - result = re.findall(regex.encode('utf-8'), stripped_text_from_html) - - for l in result: - if type(l) is tuple: - # @todo - some formatter option default (between groups) - regex_matched_output += list(l) + [b'\n'] - else: - # @todo - some formatter option default (between each ungrouped result) - regex_matched_output += [l] + [b'\n'] + if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE): + regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) + result = re.findall(regex.encode('utf-8'), stripped_text_from_html) + + for l in result: + if type(l) is tuple: + # @todo - some formatter option default (between groups) + regex_matched_output += list(l) + [b'\n'] + else: + # @todo - some formatter option default (between each ungrouped result) + regex_matched_output += [l] + [b'\n'] + else: + # Doesnt look like regex, just hunt for plaintext and return that which matches + # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes + r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE) + res = r.findall(stripped_text_from_html) + if res: + for match in res: + regex_matched_output += [match] + [b'\n'] # Now we will only show what the regex matched stripped_text_from_html = b'' diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index 09a25677..e5f31e6a 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -55,6 +55,8 @@ def set_multiline_response():

aaand something lines
+
+
and this should be
""" @@ -66,11 +68,10 @@ def set_multiline_response(): def test_setup(client, live_server): - live_server_setup(live_server) def test_check_filter_multiline(client, live_server): - + #live_server_setup(live_server) set_multiline_response() # Add our URL to the import page @@ -89,7 +90,8 @@ def test_check_filter_multiline(client, live_server): res = client.post( url_for("edit_page", uuid="first"), data={"include_filters": '', - 'extract_text': '/something.+?6 billion.+?lines/si', + # Test a regex and a plaintext + 'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be', "url": test_url, "tags": "", "headers": "", @@ -102,14 +104,16 @@ def test_check_filter_multiline(client, live_server): wait_for_all_checks(client) res = client.get(url_for("index")) - #issue 1828 + + # Issue 1828 assert b'not at the start of the expression' not in res.data - + res = client.get( url_for("preview_page", uuid="first"), follow_redirects=True ) - + # Plaintext that doesnt look like a regex should match also + assert b'and this should be' in res.data assert b'
Something' in res.data assert b'
across 6 billion multiple' in res.data