From faa42d75e067a1f3e8de8b2d812f67928b88be3f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 26 Jul 2022 17:33:40 +0200 Subject: [PATCH] Refactor of extract text filter - Regex, support Regex (groups) and all python regex flags via /something/aiLmsux (#773) --- changedetectionio/fetch_site_status.py | 37 ++++++- changedetectionio/templates/edit.html | 11 ++- changedetectionio/tests/test_extract_regex.py | 98 ++++++++++++++++--- changedetectionio/update_worker.py | 1 - 4 files changed, 124 insertions(+), 23 deletions(-) diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 907e0c91..48342d93 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Some common stuff here that can be moved to a base class +# (set_proxy_from_list) class perform_site_check(): def __init__(self, *args, datastore, **kwargs): @@ -45,6 +46,20 @@ class perform_site_check(): return proxy_args + # Doesn't look like python supports forward slash auto enclosure in re.findall + # So convert it to inline flag "foobar(?i)" type configuration + def forward_slash_enclosed_regex_to_options(self, regex): + res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE) + + if res: + regex = res.group(1) + regex += '(?{})'.format(res.group(2)) + else: + regex += '(?{})'.format('i') + + return regex + + def run(self, uuid): timestamp = int(time.time()) # used for storage etc too @@ -215,15 +230,27 @@ class perform_site_check(): if len(extract_text) > 0: regex_matched_output = [] for s_re in extract_text: - result = re.findall(s_re.encode('utf8'), stripped_text_from_html, - flags=re.MULTILINE | re.DOTALL | re.LOCALE) - if result: - regex_matched_output = regex_matched_output + result + # incase they specified something in '/.../x' + regex = self.forward_slash_enclosed_regex_to_options(s_re) + result = re.findall(regex.encode('utf-8'), stripped_text_from_html) + + for l in result: + if type(l) is tuple: + #@todo - some formatter option default (between groups) + regex_matched_output += list(l) + [b'\n'] + else: + # @todo - some formatter option default (between each ungrouped result) + regex_matched_output += [l] + [b'\n'] + # Now we will only show what the regex matched + stripped_text_from_html = b'' + text_content_before_ignored_filter = b'' if regex_matched_output: - stripped_text_from_html = b'\n'.join(regex_matched_output) + # @todo some formatter for presentation? + stripped_text_from_html = b''.join(regex_matched_output) text_content_before_ignored_filter = stripped_text_from_html + # Re #133 - if we should strip whitespaces from triggering the change detected comparison if self.datastore.data['settings']['application'].get('ignore_whitespace', False): fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index c706b0b2..7b5d0c4a 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -239,8 +239,15 @@ Unavailable") }} {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }} diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index d51200d3..aad29d51 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -15,7 +15,7 @@ def set_original_response():
So let's see what happens.
Some text thats the same
-
Some text that will change
+
Some text that will change
""" @@ -33,7 +33,8 @@ def set_modified_response():
So let's see what happens.
Some text thats the same
-
Some text that did change ( 1000 online
80 guests
2000 online )
+
Some text that did change ( 1000 online
80 guests
2000 online )
+
SomeCase insensitive 3456
""" @@ -44,11 +45,78 @@ def set_modified_response(): return None -def test_check_filter_and_regex_extract(client, live_server): - sleep_time_for_fetch_thread = 3 +def set_multiline_response(): + test_return_data = """ + + +

Something
+ across 6 billion multiple
+ lines +

+ +
aaand something lines
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + return None + + +def test_setup(client, live_server): live_server_setup(live_server) - css_filter = "#changetext" + +def test_check_filter_multiline(client, live_server): + + set_multiline_response() + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + time.sleep(3) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": '', + 'extract_text': '/something.+?6 billion.+?lines/si', + "url": test_url, + "tag": "", + "headers": "", + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + + assert b"Updated watch." in res.data + time.sleep(3) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + + assert b'
Something' in res.data + assert b'
across 6 billion multiple' in res.data + assert b'
lines' in res.data + + # but the last one, which also says 'lines' shouldnt be here (non-greedy match checking) + assert b'aaand something lines' not in res.data + +def test_check_filter_and_regex_extract(client, live_server): + sleep_time_for_fetch_thread = 3 + css_filter = ".changetext" set_original_response() @@ -64,6 +132,7 @@ def test_check_filter_and_regex_extract(client, live_server): ) assert b"1 Imported" in res.data + time.sleep(1) # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) @@ -75,7 +144,7 @@ def test_check_filter_and_regex_extract(client, live_server): res = client.post( url_for("edit_page", uuid="first"), data={"css_filter": css_filter, - 'extract_text': '\d+ online\n\d+ guests', + 'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i', "url": test_url, "tag": "", "headers": "", @@ -86,15 +155,6 @@ def test_check_filter_and_regex_extract(client, live_server): assert b"Updated watch." in res.data - # Check it saved - res = client.get( - url_for("edit_page", uuid="first"), - ) - assert b'\d+ online' in res.data - - # Trigger a check -# client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(sleep_time_for_fetch_thread) @@ -126,5 +186,13 @@ def test_check_filter_and_regex_extract(client, live_server): # Both regexs should be here assert b'
80 guests' in res.data + # Regex with flag handling should be here + assert b'
SomeCase insensitive 3456' in res.data + + # Singular group from /somecase insensitive (345\d)/i + assert b'
3456' in res.data + + # Regex with multiline flag handling should be here + # Should not be here assert b'Some text that did change' not in res.data diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index a9acf502..41b1ff91 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -113,7 +113,6 @@ class update_worker(threading.Thread): err_text = "Page request from server didnt respond correctly" self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, 'last_check_status': e.status_code}) - except Exception as e: self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e)) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})