From 2b948c15c10a089b8a843f345c76c1ee25df7de9 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 3 Oct 2023 17:44:27 +0200 Subject: [PATCH] Backend - Regular expression / string filtering refactor for Python 3.11 and deprecation warnings since Python 3.6 (#1786) --- changedetectionio/html_tools.py | 34 +++++---- .../processors/text_json_diff.py | 69 +++++++++---------- changedetectionio/templates/edit.html | 5 +- changedetectionio/tests/test_extract_regex.py | 44 +++++++----- changedetectionio/tests/test_trigger_regex.py | 15 ++-- 5 files changed, 86 insertions(+), 81 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index f0719e81..671c96c6 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -10,6 +10,7 @@ import re # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" +PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' # 'price' , 'lowPrice', 'highPrice' are usually under here # all of those may or may not appear on different websites LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" @@ -17,7 +18,23 @@ LD_JSON_PRODUCT_OFFER_SELECTOR = "json:$..offers" class JSONNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) - + + +# Doesn't look like python supports forward slash auto enclosure in re.findall +# So convert it to inline flag "(?i)foobar" type configuration +def perl_style_slash_enclosed_regex_to_options(regex): + + res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE) + + if res: + flags = res.group(2) if res.group(2) else 'i' + regex = f"(?{flags}){res.group(1)}" + else: + # Fall back to just ignorecase as an option + regex = f"(?i){regex}" + + return regex + # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches def include_filters(include_filters, html_content, append_pretty_line_formatting=False): soup = BeautifulSoup(html_content, "html.parser") @@ -195,23 +212,14 @@ def strip_ignore_text(content, wordlist, mode="content"): output = [] ignore_text = [] ignore_regex = [] - ignored_line_numbers = [] for k in wordlist: # Is it a regex? - x = re.search('^\/(.*)\/(.*)', k.strip()) - if x: - # Starts with / but doesn't look like a regex - p = x.group(1) - try: - # @Todo python regex options can go before the regex str, but not really many of the options apply on a per-line basis - ignore_regex.append(re.compile(rf"{p}", re.IGNORECASE)) - except Exception as e: - # Badly formed regex, treat as text - ignore_text.append(k.strip()) + res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE) + if res: + ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k))) else: - # Had a / but doesn't work as regex ignore_text.append(k.strip()) for line in content.splitlines(): diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 5e69a591..19ef78da 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -11,17 +11,19 @@ from changedetectionio import content_fetcher, html_tools from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from copy import deepcopy from . import difference_detection_processor +from ..html_tools import PERL_STYLE_REGEX urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -name = 'Webpage Text/HTML, JSON and PDF changes' +name = 'Webpage Text/HTML, JSON and PDF changes' description = 'Detects all text changes where possible' + class FilterNotFoundInResponse(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) + class PDFToHTMLToolNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) @@ -37,19 +39,6 @@ class perform_site_check(difference_detection_processor): super().__init__(*args, **kwargs) self.datastore = datastore - # Doesn't look like python supports forward slash auto enclosure in re.findall - # So convert it to inline flag "foobar(?i)" type configuration - def forward_slash_enclosed_regex_to_options(self, regex): - res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE) - - if res: - regex = res.group(1) - regex += '(?{})'.format(res.group(2)) - else: - regex += '(?{})'.format('i') - - return regex - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): changed_detected = False screenshot = False # as bytes @@ -135,7 +124,8 @@ class perform_site_check(difference_detection_processor): # requests for PDF's, images etc should be passwd the is_binary flag is_binary = watch.is_pdf - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), is_binary=is_binary) + fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), + is_binary=is_binary) fetcher.quit() self.screenshot = fetcher.screenshot @@ -151,7 +141,6 @@ class perform_site_check(difference_detection_processor): if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'): raise content_fetcher.checksumFromPreviousCheckWasTheSame() - # Fetching complete, now filters # @todo move to class / maybe inside of fetcher abstract base? @@ -231,8 +220,6 @@ class perform_site_check(difference_detection_processor): stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) is_html = False - - if is_html or is_source: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text @@ -283,7 +270,6 @@ class perform_site_check(difference_detection_processor): # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') - # @todo whitespace coming from missing rtrim()? # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. # Rewrite's the processing text based on only what diff result they want to see @@ -293,13 +279,13 @@ class perform_site_check(difference_detection_processor): # needs to not include (added) etc or it may get used twice # Replace the processed text with the preferred result rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(), - newest_version_file_contents=stripped_text_from_html, - include_equal=False, # not the same lines - include_added=watch.get('filter_text_added', True), - include_removed=watch.get('filter_text_removed', True), - include_replaced=watch.get('filter_text_replaced', True), - line_feed_sep="\n", - include_change_type_prefix=False) + newest_version_file_contents=stripped_text_from_html, + include_equal=False, # not the same lines + include_added=watch.get('filter_text_added', True), + include_removed=watch.get('filter_text_removed', True), + include_replaced=watch.get('filter_text_replaced', True), + line_feed_sep="\n", + include_change_type_prefix=False) watch.save_last_fetched_before_filters(text_content_before_ignored_filter) @@ -340,16 +326,25 @@ class perform_site_check(difference_detection_processor): regex_matched_output = [] for s_re in extract_text: # incase they specified something in '/.../x' - regex = self.forward_slash_enclosed_regex_to_options(s_re) - result = re.findall(regex.encode('utf-8'), stripped_text_from_html) - - for l in result: - if type(l) is tuple: - # @todo - some formatter option default (between groups) - regex_matched_output += list(l) + [b'\n'] - else: - # @todo - some formatter option default (between each ungrouped result) - regex_matched_output += [l] + [b'\n'] + if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE): + regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) + result = re.findall(regex.encode('utf-8'), stripped_text_from_html) + + for l in result: + if type(l) is tuple: + # @todo - some formatter option default (between groups) + regex_matched_output += list(l) + [b'\n'] + else: + # @todo - some formatter option default (between each ungrouped result) + regex_matched_output += [l] + [b'\n'] + else: + # Doesnt look like regex, just hunt for plaintext and return that which matches + # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes + r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE) + res = r.findall(stripped_text_from_html) + if res: + for match in res: + regex_matched_output += [match] + [b'\n'] # Now we will only show what the regex matched stripped_text_from_html = b'' diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index a6302b30..73d5cac8 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -378,15 +378,16 @@ Unavailable") }} {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }} diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index fec939f1..a96b443f 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -2,7 +2,7 @@ import time from flask import url_for -from .util import live_server_setup +from .util import live_server_setup, wait_for_all_checks from ..html_tools import * @@ -55,6 +55,8 @@ def set_multiline_response():

aaand something lines
+
+
and this should be
""" @@ -66,11 +68,10 @@ def set_multiline_response(): def test_setup(client, live_server): - live_server_setup(live_server) def test_check_filter_multiline(client, live_server): - + #live_server_setup(live_server) set_multiline_response() # Add our URL to the import page @@ -82,14 +83,15 @@ def test_check_filter_multiline(client, live_server): ) assert b"1 Imported" in res.data - time.sleep(3) + wait_for_all_checks(client) # Goto the edit page, add our ignore text # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), data={"include_filters": '', - 'extract_text': '/something.+?6 billion.+?lines/si', + # Test a regex and a plaintext + 'extract_text': '/something.+?6 billion.+?lines/si\r\nand this should be', "url": test_url, "tags": "", "headers": "", @@ -99,13 +101,19 @@ def test_check_filter_multiline(client, live_server): ) assert b"Updated watch." in res.data - time.sleep(3) + wait_for_all_checks(client) + + res = client.get(url_for("index")) + + # Issue 1828 + assert b'not at the start of the expression' not in res.data res = client.get( url_for("preview_page", uuid="first"), follow_redirects=True ) - + # Plaintext that doesnt look like a regex should match also + assert b'and this should be' in res.data assert b'
Something' in res.data assert b'
across 6 billion multiple' in res.data @@ -115,14 +123,11 @@ def test_check_filter_multiline(client, live_server): assert b'aaand something lines' not in res.data def test_check_filter_and_regex_extract(client, live_server): - sleep_time_for_fetch_thread = 3 + include_filters = ".changetext" set_original_response() - # Give the endpoint time to spin up - time.sleep(1) - # Add our URL to the import page test_url = url_for('test_endpoint', _external=True) res = client.post( @@ -132,19 +137,15 @@ def test_check_filter_and_regex_extract(client, live_server): ) assert b"1 Imported" in res.data - time.sleep(1) - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # Goto the edit page, add our ignore text # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), data={"include_filters": include_filters, - 'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i', + 'extract_text': '/\d+ online/\r\n/\d+ guests/\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i\r\n/issue1828.+?2022/i', "url": test_url, "tags": "", "headers": "", @@ -155,8 +156,13 @@ def test_check_filter_and_regex_extract(client, live_server): assert b"Updated watch." in res.data + # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) + + res = client.get(url_for("index")) + #issue 1828 + assert b'not at the start of the expression' not in res.data # Make a change set_modified_response() @@ -164,7 +170,7 @@ def test_check_filter_and_regex_extract(client, live_server): # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should have 'unviewed' still # Because it should be looking at only that 'sametext' id diff --git a/changedetectionio/tests/test_trigger_regex.py b/changedetectionio/tests/test_trigger_regex.py index 1462eb2a..7f070e89 100644 --- a/changedetectionio/tests/test_trigger_regex.py +++ b/changedetectionio/tests/test_trigger_regex.py @@ -2,7 +2,7 @@ import time from flask import url_for -from . util import live_server_setup +from .util import live_server_setup, wait_for_all_checks def set_original_ignore_response(): @@ -26,13 +26,8 @@ def test_trigger_regex_functionality(client, live_server): live_server_setup(live_server) - sleep_time_for_fetch_thread = 3 - set_original_ignore_response() - # Give the endpoint time to spin up - time.sleep(1) - # Add our URL to the import page test_url = url_for('test_endpoint', _external=True) res = client.post( @@ -43,7 +38,7 @@ def test_trigger_regex_functionality(client, live_server): assert b"1 Imported" in res.data # Give the thread time to pick it up - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (just a new one shouldnt have anything) res = client.get(url_for("index")) @@ -57,7 +52,7 @@ def test_trigger_regex_functionality(client, live_server): "fetch_backend": "html_requests"}, follow_redirects=True ) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # so that we set the state to 'unviewed' after all the edits client.get(url_for("diff_history_page", uuid="first")) @@ -65,7 +60,7 @@ def test_trigger_regex_functionality(client, live_server): f.write("some new noise") client.get(url_for("form_watch_checknow"), follow_redirects=True) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) # It should report nothing found (nothing should match the regex) res = client.get(url_for("index")) @@ -75,7 +70,7 @@ def test_trigger_regex_functionality(client, live_server): f.write("regex test123
\nsomething 123") client.get(url_for("form_watch_checknow"), follow_redirects=True) - time.sleep(sleep_time_for_fetch_thread) + wait_for_all_checks(client) res = client.get(url_for("index")) assert b'unviewed' in res.data