diff --git a/.gitignore b/.gitignore index b0df9756..0655eb90 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,6 @@ __pycache__ build dist venv +test-datastore *.egg-info* .vscode/settings.json diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index c0fbf2de..18441385 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -204,6 +204,20 @@ class perform_site_check(): else: stripped_text_from_html = stripped_text_from_html.encode('utf8') + # 615 Extract text by regex + extract_text = watch.get('extract_text', []) + if len(extract_text) > 0: + regex_matched_output = [] + for s_re in extract_text: + result = re.findall(s_re.encode('utf8'), stripped_text_from_html, + flags=re.MULTILINE | re.DOTALL | re.LOCALE) + if result: + regex_matched_output.append(result[0]) + + if regex_matched_output: + stripped_text_from_html = b'\n'.join(regex_matched_output) + text_content_before_ignored_filter = stripped_text_from_html + # Re #133 - if we should strip whitespaces from triggering the change detected comparison if self.datastore.data['settings']['application'].get('ignore_whitespace', False): fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() @@ -221,6 +235,7 @@ class perform_site_check(): # Yeah, lets block first until something matches blocked_by_not_found_trigger_text = True # Filter and trigger works the same, so reuse it + # It should return the line numbers that match result = html_tools.strip_ignore_text(content=str(stripped_text_from_html), wordlist=watch['trigger_text'], mode="line numbers") diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index e6f41978..38d9f8b1 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -223,7 +223,7 @@ class validateURL(object): except validators.ValidationFailure: message = field.gettext('\'%s\' is not a valid URL.' 
% (field.data.strip())) raise ValidationError(message) - + class ValidateListRegex(object): """ Validates that anything that looks like a regex passes as a regex @@ -330,6 +330,9 @@ class watchForm(commonSettingsForm): css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='') subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) + + extract_text = StringListField('Extract text', [ValidateListRegex()]) + title = StringField('Title', default='') ignore_text = StringListField('Ignore text', [ValidateListRegex()]) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index cde2f456..56f7ca84 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -35,7 +35,8 @@ class model(dict): 'notification_title': default_notification_title, 'notification_body': default_notification_body, 'notification_format': default_notification_format, - 'css_filter': "", + 'css_filter': '', + 'extract_text': [], # Extract text by regex after filters 'subtractive_selectors': [], 'trigger_text': [], # List of text or regex to wait for until a change is detected 'fetch_backend': None, diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index bf5e7aa3..32fab8ae 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -199,6 +199,17 @@ nav +
+
+ {# fetch_site_status hands each entry to re.findall() as typed: no /.../ stripping, no IGNORECASE #} {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }} +
    +
  • Extracts text in the final output after other filters using regular expressions, for example \d+ online
  • +
  • One line per regular-expression.
  • +
+
+
+
diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py new file mode 100644 index 00000000..e59a22b6 --- /dev/null +++ b/changedetectionio/tests/test_extract_regex.py @@ -0,0 +1,127 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from .util import live_server_setup + +from ..html_tools import * + + +def set_original_response(): + test_return_data = """ + + Some initial text
+

Which is across multiple lines

+
+ So let's see what happens.
+
Some text thats the same
+
Some text that will change
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + + +def set_modified_response(): + test_return_data = """ + + Some initial text
+

which has this one new line

+
+ So let's see what happens.
+
Some text thats the same
+
Some text that did change ( 1000 online
80 guests)
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + return None + + +def test_check_filter_and_regex_extract(client, live_server): + sleep_time_for_fetch_thread = 3 + # End-to-end check of 'extract_text': only the regex matches should remain in the preview + live_server_setup(live_server) + css_filter = "#changetext" + + set_original_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Go to the edit page and set the CSS filter plus the extract_text regexes + # (one regex per line; backslashes are doubled so '\n' stays the only real escape) + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": css_filter, + 'extract_text': '\\d+ online\n\\d+ guests', + "url": test_url, + "tag": "", + "headers": "", + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + + assert b"Updated watch." 
in res.data + + # Check it saved + res = client.get( + url_for("edit_page", uuid="first"), + ) + assert b'\\d+ online' in res.data + + # NOTE(review): the explicit re-check below is commented out; confirm that saving the edit already queues one +# client.get(url_for("form_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Make a change + set_modified_response() + + # Trigger a check + client.get(url_for("form_watch_checknow"), follow_redirects=True) + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # The watch should now be flagged 'unviewed' + # NOTE(review): presumably because the modified response adds text matching the regexes - confirm + res = client.get(url_for("index")) + assert b'unviewed' in res.data + + # Check that the HTML conversion was detected and worked + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + # Class will be blank for now because the frontend didn't apply the diff + assert b'
1000 online' in res.data + + # Both regexes should be here + assert b'
80 guests' in res.data + + # Text outside the regex matches should not be here + assert b'Some text that did change' not in res.data \ No newline at end of file