From feb8e6c76cdab6b90d95cca98a1d218d0d413128 Mon Sep 17 00:00:00 2001 From: Calvin Bui <3604363+calvinbui@users.noreply.github.com> Date: Fri, 31 Dec 2021 20:26:38 +1100 Subject: [PATCH 01/19] Add socksVersion mapping (#331) --- changedetectionio/content_fetcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 69713b2c..57a71a36 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -70,7 +70,7 @@ class html_webdriver(Fetcher): # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" selenium_proxy_settings_mappings = ['ftpProxy', 'httpProxy', 'noProxy', 'proxyAutoconfigUrl', 'sslProxy', 'autodetect', - 'socksProxy', 'socksUsername', 'socksPassword'] + 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] proxy=None def __init__(self): From a504773941c90af1711f2881ef66d92839470680 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 31 Dec 2021 10:35:29 +0100 Subject: [PATCH 02/19] Bumping selenium version re https://github.com/dgtlmoon/changedetection.io/pull/331#issuecomment-1003323594 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 510dd383..2d48b62e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,4 @@ cryptography ~= 3.4 # Used for CSS filtering, replace with soupsieve and lxml for xpath bs4 -selenium ~= 3.141 +selenium ~= 3.150 From 8a825f005524e3dd9642a6b0b47dc78b32d55b3d Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 31 Dec 2021 10:44:45 +0100 Subject: [PATCH 03/19] Use selenium 4.1.0 --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2d48b62e..23583d11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,5 @@ cryptography ~= 3.4 # Used for CSS filtering, replace with soupsieve and lxml for xpath bs4 -selenium ~= 3.150 +# 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0 +selenium ~= 4.1.0 From 735fc2ac8e8de33d380ef0debcea29aa05d9f6f6 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 31 Dec 2021 10:48:11 +0100 Subject: [PATCH 04/19] Adding new proxyType to selenium mappings --- changedetectionio/content_fetcher.py | 5 ++++- docker-compose.yml | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 57a71a36..33a38427 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -68,9 +68,12 @@ class html_webdriver(Fetcher): # Configs for Proxy setup # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" - selenium_proxy_settings_mappings = ['ftpProxy', 'httpProxy', 'noProxy', + selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', 'proxyAutoconfigUrl', 'sslProxy', 'autodetect', 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] + + + proxy=None def __init__(self): diff --git a/docker-compose.yml b/docker-compose.yml index 1060ed58..1fc71bb9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,9 +17,9 @@ services: # Alternative WebDriver/selenium URL, do not use "'s or 's! # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub # - # WebDriver proxy settings webdriver_ftpProxy, webdriver_httpProxy, webdriver_noProxy, + # WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_httpProxy, webdriver_noProxy, # webdriver_proxyAutoconfigUrl, webdriver_sslProxy, webdriver_autodetect, - # webdriver_socksProxy, webdriver_socksUsername, webdriver_socksPassword + # webdriver_socksProxy, webdriver_socksUsername, webdriver_socksVersion, webdriver_socksPassword # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # From 0439acacbeef7015217a0946b2fa490021c62cad Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sat, 1 Jan 2022 14:53:08 +0100 Subject: [PATCH 05/19] Adding global ignore text (#339) --- changedetectionio/__init__.py | 4 +- changedetectionio/fetch_site_status.py | 6 +- changedetectionio/forms.py | 1 + changedetectionio/store.py | 1 + changedetectionio/templates/settings.html | 15 ++++ changedetectionio/tests/test_ignore_text.py | 85 +++++++++++++++++++++ 6 files changed, 108 insertions(+), 4 deletions(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 98f1a954..24718357 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -552,6 +552,7 @@ def changedetection_app(config=None, datastore_o=None): if request.method == 'GET': form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] + form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend'] form.notification_title.data = datastore.data['settings']['application']['notification_title'] @@ -578,7 +579,8 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['notification_format'] = form.notification_format.data datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['base_url'] = form.base_url.data - + datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data + if form.trigger_check.data: if len(form.notification_urls.data): n_object = {'watch_url': "Test from changedetection.io!", diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 69ff7de0..dec73987 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -129,11 +129,11 @@ class perform_site_check(): update_obj["last_check_status"] = fetcher.get_last_status_code() update_obj["last_error"] = False - # If there's text to skip # @todo we could abstract out the get_text() to handle this cleaner - if len(watch['ignore_text']): - stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, watch['ignore_text']) + text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) + if len(text_to_ignore): + stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, text_to_ignore) else: stripped_text_from_html = stripped_text_from_html.encode('utf8') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index dc06c67a..586a27bb 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -258,3 +258,4 @@ class globalSettingsForm(commonSettingsForm): [validators.NumberRange(min=1)]) extract_title_as_title = BooleanField('Extract from document and use as watch title') base_url = StringField('Base URL', validators=[validators.Optional()]) + global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) \ No newline at end of file diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 60f3d826..fb7cede5 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -45,6 +45,7 @@ class ChangeDetectionStore: 'base_url' : None, 'extract_title_as_title': False, 'fetch_backend': 'html_requests', + 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'notification_urls': [], # Apprise URL list # Custom notification content 'notification_title': None, diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 3a048cc4..5c031305 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -13,6 +13,7 @@ <li class="tab" id="default-tab"><a href="#general">General</a></li> <li class="tab"><a href="#notifications">Notifications</a></li> <li class="tab"><a href="#fetching">Fetching</a></li> + <li class="tab"><a href="#filters">Global Filters</a></li> </ul> </div> <div class="box-wrap inner"> @@ -65,6 +66,20 @@ </span> </div> </div> + + + <div class="tab-pane-inner" id="filters"> + <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span> + <fieldset class="pure-group"> + {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line +/some.regex\d{2}/ for case-INsensitive regex + ") }} + <span class="pure-form-message-inline"> + Each line processed separately, any line matching will be ignored.<br/> + Regular Expression support, wrap the line in forward slash <b>/regex/</b>. + </span> + </div> + <div id="actions"> <div class="pure-control-group"> <button type="submit" class="pure-button pure-button-primary">Save</button> diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 119f26eb..79aa761d 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -151,3 +151,88 @@ def test_check_ignore_text_functionality(client, live_server): res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data + +def test_check_global_ignore_text_functionality(client, live_server): + sleep_time_for_fetch_thread = 3 + + ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ" + set_original_ignore_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Goto the settings page, add our ignore text + res = client.post( + url_for("settings_page"), + data={ + "minutes_between_check": 180, + "global_ignore_text": ignore_text, + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + assert b"Settings updated." in res.data + + # Goto the edit page of the item, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + + # Check it saved + res = client.get( + url_for("settings_page"), + ) + assert bytes(ignore_text.encode('utf-8')) in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'/test-endpoint' in res.data + + # Make a change + set_modified_ignore_response() + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'/test-endpoint' in res.data + + # Just to be sure.. set a regular modified change.. + set_modified_original_ignore_response() + client.get(url_for("api_watch_checknow"), follow_redirects=True) + time.sleep(sleep_time_for_fetch_thread) + res = client.get(url_for("index")) + assert b'unviewed' in res.data + + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data \ No newline at end of file From d4dc3466dcd4d42356aa28a56b3c2e86030a2f8a Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Sat, 1 Jan 2022 18:11:54 +0100 Subject: [PATCH 06/19] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 22cf50de..4fdaf0cc 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Open source web page monitoring, notification and change detection. - University/organisation news from their website - Detect and monitor changes in JSON API responses - API monitoring and alerting +- Changes in legal and other documents - Trigger API calls via notifications when text appears on a website - Glue together APIs using the JSON filter and JSON notifications - Create RSS feeds based on changes in web content From 489671dcca1f3c1d7b08f036e3507b90466d82b7 Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Sun, 2 Jan 2022 14:11:04 +0100 Subject: [PATCH 07/19] Re #342 notification encoding (#343) * Re #342 - check for accidental python byte encoding of non-utf8/string, check return type of fetcher and fix encoding of notification content --- changedetectionio/content_fetcher.py | 5 +++-- changedetectionio/store.py | 4 ++++ changedetectionio/tests/test_notification.py | 3 +++ changedetectionio/update_worker.py | 16 ++++++++++++++-- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 33a38427..ce83ebe0 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -14,7 +14,7 @@ class EmptyReply(Exception): class Fetcher(): error = None status_code = None - content = None # Should be bytes? + content = None # Should always be bytes. fetcher_description ="No description" @@ -129,7 +129,6 @@ class html_webdriver(Fetcher): # driver.quit() seems to cause better exceptions driver.quit() - return True # "html_requests" is listed as the default fetcher in store.py! @@ -146,6 +145,8 @@ class html_requests(Fetcher): timeout=timeout, verify=False) + # https://stackoverflow.com/questions/44203397/python-requests-get-returns-improperly-decoded-text-instead-of-utf-8 + # Return bytes here html = r.text diff --git a/changedetectionio/store.py b/changedetectionio/store.py index fb7cede5..3cc049c0 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -367,6 +367,10 @@ class ChangeDetectionStore: import uuid output_path = "{}/{}".format(self.datastore_path, watch_uuid) + # Incase the operator deleted it, check and create. + if not os.path.isdir(output_path): + mkdir(output_path) + fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4()) with open(fname, 'wb') as f: f.write(contents) diff --git a/changedetectionio/tests/test_notification.py b/changedetectionio/tests/test_notification.py index 3cfeecf9..21083066 100644 --- a/changedetectionio/tests/test_notification.py +++ b/changedetectionio/tests/test_notification.py @@ -159,6 +159,9 @@ def test_check_notification(client, live_server): with open("test-datastore/notification.txt", "r") as f: notification_submission = f.read() + print ("Notification submission was:", notification_submission) + # Re #342 - check for accidental python byte encoding of non-utf8/string + assert "b'" not in notification_submission assert re.search('Watch UUID: [0-9a-f]{8}(-[0-9a-f]{4}){3}-[0-9a-f]{12}', notification_submission, re.IGNORECASE) assert "Watch title: my title" in notification_submission diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 4ab1d806..a8992d96 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -2,7 +2,12 @@ import threading import queue import time -# Requests for checking on the site use a pool of thread Workers managed by a Queue. +# A single update worker +# +# Requests for checking on a single site(watch) from a queue of watches +# (another process inserts watches into the queue that are time-ready for checking) + + class update_worker(threading.Thread): current_uuid = None @@ -39,6 +44,13 @@ class update_worker(threading.Thread): now = time.time() changed_detected, update_obj, contents = update_handler.run(uuid) + # Re #342 + # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. + # We then convert/.decode('utf-8') for the notification etc + if not isinstance(contents, (bytes, bytearray)): + raise Exception("Error - returned data from the fetch handler SHOULD be bytes") + + # Always record that we atleast tried self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)}) @@ -111,7 +123,7 @@ class update_worker(threading.Thread): n_object.update({ 'watch_url': watch['url'], 'uuid': uuid, - 'current_snapshot': str(contents), + 'current_snapshot': contents.decode('utf-8'), 'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), 'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep) }) From b5c1fce13699b7668bbc58a376b2aaa9382b11c5 Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Sun, 2 Jan 2022 22:28:34 +0100 Subject: [PATCH 08/19] Re #133 Option for ignoring whitespacing (#345) * Global setting option to ignore whitespace when detecting a change --- changedetectionio/__init__.py | 12 +- changedetectionio/fetch_site_status.py | 23 +-- changedetectionio/forms.py | 3 +- changedetectionio/store.py | 1 + changedetectionio/templates/settings.html | 19 ++- changedetectionio/tests/conftest.py | 3 +- changedetectionio/tests/test_ignore_text.py | 2 +- .../tests/test_ignorewhitespace.py | 96 ++++++++++++ changedetectionio/update_worker.py | 137 +++++++++--------- 9 files changed, 208 insertions(+), 88 deletions(-) create mode 100644 changedetectionio/tests/test_ignorewhitespace.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 24718357..be1fc6be 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -405,7 +405,7 @@ def changedetection_app(config=None, datastore_o=None): # Get the most recent one newest_history_key = datastore.get_val(uuid, 'newest_history_key') - # 0 means that theres only one, so that there should be no 'unviewed' history availabe + # 0 means that theres only one, so that there should be no 'unviewed' history available if newest_history_key == 0: newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0] @@ -418,7 +418,11 @@ def changedetection_app(config=None, datastore_o=None): stripped_content = handler.strip_ignore_text(raw_content, datastore.data['watching'][uuid]['ignore_text']) - checksum = hashlib.md5(stripped_content).hexdigest() + if datastore.data['settings']['application'].get('ignore_whitespace', False): + checksum = hashlib.md5(stripped_content.translate(None, b'\r\n\t ')).hexdigest() + else: + checksum = hashlib.md5(stripped_content).hexdigest() + return checksum return datastore.data['watching'][uuid]['previous_md5'] @@ -553,6 +557,7 @@ def changedetection_app(config=None, datastore_o=None): form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check']) form.notification_urls.data = datastore.data['settings']['application']['notification_urls'] form.global_ignore_text.data = datastore.data['settings']['application']['global_ignore_text'] + form.ignore_whitespace.data = datastore.data['settings']['application']['ignore_whitespace'] form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title'] form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend'] form.notification_title.data = datastore.data['settings']['application']['notification_title'] @@ -580,7 +585,8 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data datastore.data['settings']['application']['base_url'] = form.base_url.data datastore.data['settings']['application']['global_ignore_text'] = form.global_ignore_text.data - + datastore.data['settings']['application']['ignore_whitespace'] = form.ignore_whitespace.data + if form.trigger_check.data: if len(form.notification_urls.data): n_object = {'watch_url': "Test from changedetection.io!", diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index dec73987..0a957114 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -58,8 +58,7 @@ class perform_site_check(): watch = self.datastore.data['watching'][uuid] - update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'], - 'history': {}, + update_obj = { "last_checked": timestamp } @@ -137,8 +136,16 @@ class perform_site_check(): else: stripped_text_from_html = stripped_text_from_html.encode('utf8') + # Re #133 - if we should strip whitespaces from triggering the change detected comparison + if self.datastore.data['settings']['application'].get('ignore_whitespace', False): + fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() + else: + fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() - fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() + # On the first run of a site, watch['previous_md5'] will be an empty string, set it the current one. + if not len(watch['previous_md5']): + watch['previous_md5'] = fetched_md5 + update_obj["previous_md5"] = fetched_md5 blocked_by_not_found_trigger_text = False @@ -160,16 +167,12 @@ class perform_site_check(): break - # could be None or False depending on JSON type - # On the first run of a site, watch['previous_md5'] will be an empty string + if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5: changed_detected = True - - # Don't confuse people by updating as last-changed, when it actually just changed from None.. - if self.datastore.get_val(uuid, 'previous_md5'): - update_obj["last_changed"] = timestamp - update_obj["previous_md5"] = fetched_md5 + update_obj["last_changed"] = timestamp + # Extract title as title if is_html: diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 586a27bb..020d9fa8 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -258,4 +258,5 @@ class globalSettingsForm(commonSettingsForm): [validators.NumberRange(min=1)]) extract_title_as_title = BooleanField('Extract <title> from document and use as watch title') base_url = StringField('Base URL', validators=[validators.Optional()]) - global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) \ No newline at end of file + global_ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) + ignore_whitespace = BooleanField('Ignore whitespace') \ No newline at end of file diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 3cc049c0..d27f0476 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -46,6 +46,7 @@ class ChangeDetectionStore: 'extract_title_as_title': False, 'fetch_backend': 'html_requests', 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum + 'ignore_whitespace': False, 'notification_urls': [], # Apprise URL list # Custom notification content 'notification_title': None, diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 5c031305..69c80686 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -69,15 +69,24 @@ <div class="tab-pane-inner" id="filters"> - <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span> - <fieldset class="pure-group"> + + <fieldset class="pure-group"> + {{ render_field(form.ignore_whitespace) }} + <span class="pure-form-message-inline">Ignore whitespace, tabs and new-lines/line-feeds when considering if a change was detected.<br/> + <i>Note:</i> Changing this will change the status of your existing watches, possibily trigger alerts etc. + </span> + </fieldset> + + + <fieldset class="pure-group"> {{ render_field(form.global_ignore_text, rows=5, placeholder="Some text to ignore in a line /some.regex\d{2}/ for case-INsensitive regex ") }} - <span class="pure-form-message-inline"> - Each line processed separately, any line matching will be ignored.<br/> + <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/> + <span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/> Regular Expression support, wrap the line in forward slash <b>/regex/</b>. - </span> + </span> + </fieldset> </div> <div id="actions"> diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py index f34ed5bb..aced3075 100644 --- a/changedetectionio/tests/conftest.py +++ b/changedetectionio/tests/conftest.py @@ -18,7 +18,8 @@ def cleanup(datastore_path): 'url-watches.json', 'notification.txt', 'count.txt', - 'endpoint-content.txt'] + 'endpoint-content.txt' + ] for file in files: try: os.unlink("{}/{}".format(datastore_path, file)) diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 79aa761d..726a6f9b 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -235,4 +235,4 @@ def test_check_global_ignore_text_functionality(client, live_server): assert b'unviewed' in res.data res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) - assert b'Deleted' in res.data \ No newline at end of file + assert b'Deleted' in res.data diff --git a/changedetectionio/tests/test_ignorewhitespace.py b/changedetectionio/tests/test_ignorewhitespace.py new file mode 100644 index 00000000..062efd70 --- /dev/null +++ b/changedetectionio/tests/test_ignorewhitespace.py @@ -0,0 +1,96 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup + +def test_setup(live_server): + live_server_setup(live_server) + + +# Should be the same as set_original_ignore_response() but with a little more whitespacing +def set_original_ignore_response_but_with_whitespace(): + test_return_data = """<html> + <body> + Some initial text</br> + <p> + + + Which is across multiple lines</p> + <br> + </br> + + So let's see what happens. </br> + + + </body> + </html> + + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + +def set_original_ignore_response(): + test_return_data = """<html> + <body> + Some initial text</br> + <p>Which is across multiple lines</p> + </br> + So let's see what happens. </br> + </body> + </html> + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + + +# If there was only a change in the whitespacing, then we shouldnt have a change detected +def test_check_ignore_whitespace(client, live_server): + sleep_time_for_fetch_thread = 3 + + # Give the endpoint time to spin up + time.sleep(1) + + set_original_ignore_response() + + # Goto the settings page, add our ignore text + res = client.post( + url_for("settings_page"), + data={ + "minutes_between_check": 180, + "ignore_whitespace": "y", + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + assert b"Settings updated." in res.data + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + set_original_ignore_response_but_with_whitespace() + time.sleep(sleep_time_for_fetch_thread) + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'/test-endpoint' in res.data diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index a8992d96..a4181426 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -64,74 +64,77 @@ class update_worker(threading.Thread): self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) else: - if update_obj: - try: - self.datastore.update_watch(uuid=uuid, update_obj=update_obj) - if changed_detected: - n_object = {} - # A change was detected - fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) - - # Update history with the stripped text for future reference, this will also mean we save the first - # Should always be keyed by string(timestamp) - self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) - - watch = self.datastore.data['watching'][uuid] - - print (">> Change detected in UUID {} - {}".format(uuid, watch['url'])) - - # Notifications should only trigger on the second time (first time, we gather the initial snapshot) - if len(watch['history']) > 1: - - dates = list(watch['history'].keys()) - # Convert to int, sort and back to str again - # @todo replace datastore getter that does this automatically - dates = [int(i) for i in dates] - dates.sort(reverse=True) - dates = [str(i) for i in dates] - - prev_fname = watch['history'][dates[1]] - - - # Did it have any notification alerts to hit? - if len(watch['notification_urls']): - print(">>> Notifications queued for UUID from watch {}".format(uuid)) - n_object['notification_urls'] = watch['notification_urls'] - n_object['notification_title'] = watch['notification_title'] - n_object['notification_body'] = watch['notification_body'] - n_object['notification_format'] = watch['notification_format'] - - # No? maybe theres a global setting, queue them all - elif len(self.datastore.data['settings']['application']['notification_urls']): - print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid)) - n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] - n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title'] - n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body'] - n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format'] + try: + watch = self.datastore.data['watching'][uuid] + + # For the FIRST time we check a site, or a change detected, save the snapshot. + if changed_detected or not watch['last_checked']: + # A change was detected + fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) + # Should always be keyed by string(timestamp) + self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) + + # Generally update anything interesting returned + self.datastore.update_watch(uuid=uuid, update_obj=update_obj) + + # A change was detected + if changed_detected: + n_object = {} + print (">> Change detected in UUID {} - {}".format(uuid, watch['url'])) + + # Notifications should only trigger on the second time (first time, we gather the initial snapshot) + if len(watch['history']) > 1: + + dates = list(watch['history'].keys()) + # Convert to int, sort and back to str again + # @todo replace datastore getter that does this automatically + dates = [int(i) for i in dates] + dates.sort(reverse=True) + dates = [str(i) for i in dates] + + prev_fname = watch['history'][dates[1]] + + + # Did it have any notification alerts to hit? + if len(watch['notification_urls']): + print(">>> Notifications queued for UUID from watch {}".format(uuid)) + n_object['notification_urls'] = watch['notification_urls'] + n_object['notification_title'] = watch['notification_title'] + n_object['notification_body'] = watch['notification_body'] + n_object['notification_format'] = watch['notification_format'] + + # No? maybe theres a global setting, queue them all + elif len(self.datastore.data['settings']['application']['notification_urls']): + print(">>> Watch notification URLs were empty, using GLOBAL notifications for UUID: {}".format(uuid)) + n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls'] + n_object['notification_title'] = self.datastore.data['settings']['application']['notification_title'] + n_object['notification_body'] = self.datastore.data['settings']['application']['notification_body'] + n_object['notification_format'] = self.datastore.data['settings']['application']['notification_format'] + else: + print(">>> NO notifications queued, watch and global notification URLs were empty.") + + # Only prepare to notify if the rules above matched + if 'notification_urls' in n_object: + # HTML needs linebreak, but MarkDown and Text can use a linefeed + if n_object['notification_format'] == 'HTML': + line_feed_sep = "</br>" else: - print(">>> NO notifications queued, watch and global notification URLs were empty.") - - # Only prepare to notify if the rules above matched - if 'notification_urls' in n_object: - # HTML needs linebreak, but MarkDown and Text can use a linefeed - if n_object['notification_format'] == 'HTML': - line_feed_sep = "</br>" - else: - line_feed_sep = "\n" - - from changedetectionio import diff - n_object.update({ - 'watch_url': watch['url'], - 'uuid': uuid, - 'current_snapshot': contents.decode('utf-8'), - 'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), - 'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep) - }) - - self.notification_q.put(n_object) - - except Exception as e: - print("!!!! Exception in update_worker !!!\n", e) + line_feed_sep = "\n" + + from changedetectionio import diff + n_object.update({ + 'watch_url': watch['url'], + 'uuid': uuid, + 'current_snapshot': contents.decode('utf-8'), + 'diff_full': diff.render_diff(prev_fname, fname, line_feed_sep=line_feed_sep), + 'diff': diff.render_diff(prev_fname, fname, True, line_feed_sep=line_feed_sep) + }) + + self.notification_q.put(n_object) + + except Exception as e: + # Catch everything possible here, so that if a worker crashes, we don't lose it until restart! + print("!!!! Exception in update_worker !!!\n", e) self.current_uuid = None # Done self.q.task_done() From fbd9ecab62c1a400d2a9540ddf15e657fc0f1187 Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Sun, 2 Jan 2022 22:35:04 +0100 Subject: [PATCH 09/19] Re #340 - snapshot should not be modified by ignore text (#344) --- changedetectionio/fetch_site_status.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 0a957114..82108306 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -122,6 +122,9 @@ class perform_site_check(): # get_text() via inscriptis stripped_text_from_html = get_text(html_content) + # Re #340 - return the content before the 'ignore text' was applied + text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') + # We rely on the actual text in the html output.. many sites have random script vars etc, # in the future we'll implement other mechanisms. @@ -181,4 +184,4 @@ class perform_site_check(): update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) - return changed_detected, update_obj, stripped_text_from_html + return changed_detected, update_obj, text_content_before_ignored_filter From 023951a10e7f783ecee9b3daf0260ad940b7586c Mon Sep 17 00:00:00 2001 From: Unpublished <Unpublished@users.noreply.github.com> Date: Sun, 2 Jan 2022 22:35:33 +0100 Subject: [PATCH 10/19] Be sure that documents returned with a application/json header are not parsed with inscriptis (#337) * Auto-detect JSON by Content-Type header * Add test to not parse JSON responses with inscriptis --- changedetectionio/content_fetcher.py | 3 ++ changedetectionio/fetch_site_status.py | 13 ++++-- .../tests/test_jsonpath_selector.py | 46 +++++++++++++++++++ changedetectionio/tests/util.py | 10 ++++ 4 files changed, 69 insertions(+), 3 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index ce83ebe0..d82775b9 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -15,6 +15,7 @@ class Fetcher(): error = None status_code = None content = None # Should always be bytes. + headers = None fetcher_description ="No description" @@ -113,6 +114,7 @@ class html_webdriver(Fetcher): # @todo - dom wait loaded? time.sleep(5) self.content = driver.page_source + self.headers = {} driver.quit() @@ -156,4 +158,5 @@ class html_requests(Fetcher): self.status_code = r.status_code self.content = html + self.headers = r.headers diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 82108306..98c0be1d 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -103,9 +103,16 @@ class perform_site_check(): # https://stackoverflow.com/questions/41817578/basic-method-chaining ? # return content().textfilter().jsonextract().checksumcompare() ? - is_html = True + is_json = fetcher.headers.get('Content-Type', '') == 'application/json' + is_html = not is_json css_filter_rule = watch['css_filter'] - if css_filter_rule and len(css_filter_rule.strip()): + + has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) + if is_json and not has_filter_rule: + css_filter_rule = "json:$" + has_filter_rule = True + + if has_filter_rule: if 'json:' in css_filter_rule: stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) is_html = False @@ -116,7 +123,7 @@ class perform_site_check(): if is_html: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = fetcher.content - if css_filter_rule and len(css_filter_rule.strip()): + if has_filter_rule: html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) # get_text() via inscriptis diff --git a/changedetectionio/tests/test_jsonpath_selector.py b/changedetectionio/tests/test_jsonpath_selector.py index 39529642..5a4b7959 100644 --- a/changedetectionio/tests/test_jsonpath_selector.py +++ b/changedetectionio/tests/test_jsonpath_selector.py @@ -111,6 +111,21 @@ def set_original_response(): f.write(test_return_data) return None + +def set_response_with_html(): + test_return_data = """ + { + "test": [ + { + "html": "<b>" + } + ] + } + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + def set_modified_response(): test_return_data = """ { @@ -138,6 +153,37 @@ def set_modified_response(): return None +def test_check_json_without_filter(client, live_server): + # Request a JSON document from a application/json source containing HTML + # and be sure it doesn't get chewed up by instriptis + set_response_with_html() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint_json', _external=True) + client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(3) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + assert b'"<b>' in res.data + assert res.data.count(b'{\n') >= 2 + + def test_check_json_filter(client, live_server): json_filter = 'json:boss.name' diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 80eb9820..2e30be25 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -44,6 +44,16 @@ def live_server_setup(live_server): with open("test-datastore/endpoint-content.txt", "r") as f: return f.read() + @live_server.app.route('/test-endpoint-json') + def test_endpoint_json(): + + from flask import make_response + + with open("test-datastore/endpoint-content.txt", "r") as f: + resp = make_response(f.read()) + resp.headers['Content-Type'] = 'application/json' + return resp + # Just return the headers in the request @live_server.app.route('/test-headers') def test_headers(): From b73f5a5c88646d9ea546d966e66b036d3deb4b3e Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Mon, 3 Jan 2022 18:46:50 +0100 Subject: [PATCH 11/19] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4fdaf0cc..042f7b7a 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,8 @@ Open source web page monitoring, notification and change detection. [![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://dashboard.heroku.com/new?template=https%3A%2F%2Fgithub.com%2Fdgtlmoon%2Fchangedetection.io%2Ftree%2Fmaster) +Read the [Heroku notes and limitations wiki page first](https://github.com/dgtlmoon/changedetection.io/wiki/Heroku-notes) + #### Example use cases - Products and services have a change in pricing From b33105d5769f457d6805a544f3fdfcd8e64370f1 Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Mon, 3 Jan 2022 20:16:21 +0100 Subject: [PATCH 12/19] Re #348 - Add test for backup, use proper datastore path --- changedetectionio/__init__.py | 17 +++++++++-------- changedetectionio/tests/test_backup.py | 25 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) create mode 100644 changedetectionio/tests/test_backup.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index be1fc6be..6ef97442 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -758,7 +758,8 @@ def changedetection_app(config=None, datastore_o=None): from pathlib import Path # Remove any existing backup file, for now we just keep one file - for previous_backup_filename in Path(app.config['datastore_path']).rglob('changedetection-backup-*.zip'): + + for previous_backup_filename in Path(datastore_o.datastore_path).rglob('changedetection-backup-*.zip'): os.unlink(previous_backup_filename) # create a ZipFile object @@ -766,7 +767,7 @@ def changedetection_app(config=None, datastore_o=None): # We only care about UUIDS from the current index file uuids = list(datastore.data['watching'].keys()) - backup_filepath = os.path.join(app.config['datastore_path'], backupname) + backup_filepath = os.path.join(datastore_o.datastore_path, backupname) with zipfile.ZipFile(backup_filepath, "w", compression=zipfile.ZIP_DEFLATED, @@ -776,22 +777,22 @@ def changedetection_app(config=None, datastore_o=None): datastore.sync_to_json() # Add the index - zipObj.write(os.path.join(app.config['datastore_path'], "url-watches.json"), arcname="url-watches.json") + zipObj.write(os.path.join(datastore_o.datastore_path, "url-watches.json"), arcname="url-watches.json") # Add the flask app secret - zipObj.write(os.path.join(app.config['datastore_path'], "secret.txt"), arcname="secret.txt") + zipObj.write(os.path.join(datastore_o.datastore_path, "secret.txt"), arcname="secret.txt") # Add any snapshot data we find, use the full path to access the file, but make the file 'relative' in the Zip. - for txt_file_path in Path(app.config['datastore_path']).rglob('*.txt'): + for txt_file_path in Path(datastore_o.datastore_path).rglob('*.txt'): parent_p = txt_file_path.parent if parent_p.name in uuids: zipObj.write(txt_file_path, - arcname=str(txt_file_path).replace(app.config['datastore_path'], ''), + arcname=str(txt_file_path).replace(datastore_o.datastore_path, ''), compress_type=zipfile.ZIP_DEFLATED, compresslevel=8) # Create a list file with just the URLs, so it's easier to port somewhere else in the future - list_file = os.path.join(app.config['datastore_path'], "url-list.txt") + list_file = os.path.join(datastore_o.datastore_path, "url-list.txt") with open(list_file, "w") as f: for uuid in datastore.data['watching']: url = datastore.data['watching'][uuid]['url'] @@ -803,7 +804,7 @@ def changedetection_app(config=None, datastore_o=None): compress_type=zipfile.ZIP_DEFLATED, compresslevel=8) - return send_from_directory(app.config['datastore_path'], backupname, as_attachment=True) + return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True) @app.route("/static/<string:group>/<string:filename>", methods=['GET']) def static_content(group, filename): diff --git a/changedetectionio/tests/test_backup.py b/changedetectionio/tests/test_backup.py new file mode 100644 index 00000000..787d7fc0 --- /dev/null +++ b/changedetectionio/tests/test_backup.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from urllib.request import urlopen +from . util import set_original_response, set_modified_response, live_server_setup + + +def test_backup(client, live_server): + + live_server_setup(live_server) + + # Give the endpoint time to spin up + time.sleep(1) + + res = client.get( + url_for("get_backup"), + follow_redirects=True + ) + + # Should get the right zip content type + assert res.content_type == "application/zip" + # Should be PK/ZIP stream + assert res.data.count(b'PK') >= 2 + From 55e679e9732d12b3e4e89f966e3112758a754479 Mon Sep 17 00:00:00 2001 From: Valtteri Huuskonen <38670296+vaasuu@users.noreply.github.com> Date: Tue, 4 Jan 2022 11:55:20 +0200 Subject: [PATCH 13/19] fix typo in README.md (#350) Fix spelling of Raspberry Pi. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 042f7b7a..77a45f6d 100644 --- a/README.md +++ b/README.md @@ -144,9 +144,9 @@ When you enable a `json:` filter, you can even automatically extract and parse e See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration -### RaspberriPi support? +### Raspberry Pi support? -RaspberriPi and linux/arm/v6 linux/arm/v7 arm64 devices are supported! +Raspberry Pi and linux/arm/v6 linux/arm/v7 arm64 devices are supported! ### Windows native support? From f166ab1e303e6a5a47a1c4553ab604e4a835a7a8 Mon Sep 17 00:00:00 2001 From: revilo951 <revilo951@users.noreply.github.com> Date: Wed, 5 Jan 2022 22:20:56 +1100 Subject: [PATCH 14/19] Adding note in comments for working arm64 chrome with rPi-4 (#336) --- docker-compose.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 1fc71bb9..ef4c5699 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -43,7 +43,8 @@ services: restart: unless-stopped # Used for fetching pages via WebDriver+Chrome where you need Javascript support. - # Does not work on rPi, https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver + # Now working on arm64 (needs testing on rPi - tested on Oracle ARM instance) + # replace image with seleniarm/standalone-chromium:4.0.0-20211213 # browser-chrome: # hostname: browser-chrome From f87f7077a6da5ffd4072bc3ad91cd844b19a6fec Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Wed, 5 Jan 2022 14:13:30 +0100 Subject: [PATCH 15/19] Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error (#354) * Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error, adds test --- changedetectionio/content_fetcher.py | 12 +++++- changedetectionio/fetch_site_status.py | 4 +- changedetectionio/html_tools.py | 1 - changedetectionio/tests/test_errorhandling.py | 38 +++++++++++++++++++ changedetectionio/tests/util.py | 7 ++++ changedetectionio/update_worker.py | 20 ++++++---- 6 files changed, 69 insertions(+), 13 deletions(-) create mode 100644 changedetectionio/tests/test_errorhandling.py diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d82775b9..72491753 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -9,6 +9,12 @@ import urllib3.exceptions class EmptyReply(Exception): + def __init__(self, status_code, url): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + return + pass class Fetcher(): @@ -110,6 +116,8 @@ class html_webdriver(Fetcher): # @todo - how to check this? is it possible? self.status_code = 200 + # @todo somehow we should try to get this working for WebDriver + # raise EmptyReply(url=url, status_code=r.status_code) # @todo - dom wait loaded? time.sleep(5) @@ -151,10 +159,10 @@ class html_requests(Fetcher): # Return bytes here html = r.text - # @todo test this + # @todo maybe you really want to test zero-byte return pages? if not r or not html or not len(html): - raise EmptyReply(url) + raise EmptyReply(url=url, status_code=r.status_code) self.status_code = r.status_code self.content = html diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 98c0be1d..28c27420 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -58,9 +58,7 @@ class perform_site_check(): watch = self.datastore.data['watching'][uuid] - update_obj = { - "last_checked": timestamp - } + update_obj = {} extra_headers = self.datastore.get_val(uuid, 'headers') diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 5d34c4cd..5c795c23 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -16,7 +16,6 @@ def css_filter(css_filter, html_content): return html_block + "\n" - # Extract/find element def extract_element(find='title', html_content=''): diff --git a/changedetectionio/tests/test_errorhandling.py b/changedetectionio/tests/test_errorhandling.py new file mode 100644 index 00000000..423316d4 --- /dev/null +++ b/changedetectionio/tests/test_errorhandling.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup + +from ..html_tools import * + +def test_setup(live_server): + live_server_setup(live_server) + + +def test_error_handler(client, live_server): + + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint_403_error', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(3) + + + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'Status Code 403' in res.data + assert bytes("just now".encode('utf-8')) in res.data \ No newline at end of file diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 2e30be25..54532680 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -54,6 +54,13 @@ def live_server_setup(live_server): resp.headers['Content-Type'] = 'application/json' return resp + @live_server.app.route('/test-403') + def test_endpoint_403_error(): + + from flask import make_response + resp = make_response('', 403) + return resp + # Just return the headers in the request @live_server.app.route('/test-headers') def test_headers(): diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index a4181426..8f535829 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -39,9 +39,10 @@ class update_worker(threading.Thread): changed_detected = False contents = "" update_obj= {} + now = time.time() try: - now = time.time() + changed_detected, update_obj, contents = update_handler.run(uuid) # Re #342 @@ -51,14 +52,13 @@ class update_worker(threading.Thread): raise Exception("Error - returned data from the fetch handler SHOULD be bytes") - # Always record that we atleast tried - self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)}) - except PermissionError as e: self.app.logger.error("File permission error updating", uuid, str(e)) except content_fetcher.EmptyReply as e: - self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)}) - + # Some kind of custom to-str handler in the exception handler that does this? + err_text = "EmptyReply: Status Code {}".format(e.status_code) + self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, + 'last_check_status': e.status_code}) except Exception as e: self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e)) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) @@ -66,13 +66,14 @@ class update_worker(threading.Thread): else: try: watch = self.datastore.data['watching'][uuid] + fname = "" # Saved history text filename # For the FIRST time we check a site, or a change detected, save the snapshot. if changed_detected or not watch['last_checked']: # A change was detected fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) # Should always be keyed by string(timestamp) - self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) + self.datastore.update_watch(uuid, {"history": {str(round(time.time())): fname}}) # Generally update anything interesting returned self.datastore.update_watch(uuid=uuid, update_obj=update_obj) @@ -136,6 +137,11 @@ class update_worker(threading.Thread): # Catch everything possible here, so that if a worker crashes, we don't lose it until restart! print("!!!! Exception in update_worker !!!\n", e) + finally: + # Always record that we atleast tried + self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3), + 'last_checked': round(time.time())}) + self.current_uuid = None # Done self.q.task_done() From 59d31bf76f395f95433908829301fd317df6bc4c Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Wed, 5 Jan 2022 17:58:07 +0100 Subject: [PATCH 16/19] XPath support (#355) * XPath support and minor improvements to form validation --- README.md | 2 + changedetectionio/fetch_site_status.py | 10 +- changedetectionio/forms.py | 24 +++- changedetectionio/html_tools.py | 15 +++ changedetectionio/templates/edit.html | 4 +- .../tests/test_xpath_selector.py | 118 ++++++++++++++++++ requirements.txt | 5 +- 7 files changed, 170 insertions(+), 8 deletions(-) create mode 100644 changedetectionio/tests/test_xpath_selector.py diff --git a/README.md b/README.md index 77a45f6d..97dcc408 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,8 @@ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/dat ```bash docker-compose pull && docker-compose up -d ``` +### Filters +XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. ### Notifications diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 28c27420..7f678657 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -114,15 +114,17 @@ class perform_site_check(): if 'json:' in css_filter_rule: stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) is_html = False - else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) if is_html: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = fetcher.content if has_filter_rule: - html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) + # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." + if css_filter_rule[0] == '/': + html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content) + else: + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) # get_text() via inscriptis stripped_text_from_html = get_text(html_content) diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 020d9fa8..bd40435a 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -181,7 +181,7 @@ class ValidateListRegex(object): message = field.gettext('RegEx \'%s\' is not a valid regular expression.') raise ValidationError(message % (line)) -class ValidateCSSJSONInput(object): +class ValidateCSSJSONXPATHInput(object): """ Filter validation @todo CSS validator ;) @@ -191,6 +191,24 @@ class ValidateCSSJSONInput(object): self.message = message def __call__(self, form, field): + + # Nothing to see here + if not len(field.data.strip()): + return + + # Does it look like XPath? + if field.data.strip()[0] == '/': + from lxml import html, etree + tree = html.fromstring("<html></html>") + + try: + tree.xpath(field.data.strip()) + except etree.XPathEvalError as e: + message = field.gettext('\'%s\' is not a valid XPath expression. (%s)') + raise ValidationError(message % (field.data, str(e))) + except: + raise ValidationError("A system-error occurred when validating your XPath expression") + if 'json:' in field.data: from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError from jsonpath_ng.ext import parse @@ -202,6 +220,8 @@ class ValidateCSSJSONInput(object): except (JsonPathParserError, JsonPathLexerError) as e: message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)') raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your JSONPath expression") # Re #265 - maybe in the future fetch the page and offer a # warning/notice that its possible the rule doesnt yet match anything? @@ -229,7 +249,7 @@ class watchForm(commonSettingsForm): minutes_between_check = html5.IntegerField('Maximum time in minutes until recheck', [validators.Optional(), validators.NumberRange(min=1)]) - css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()]) + css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()]) title = StringField('Title') ignore_text = StringListField('Ignore Text', [ValidateListRegex()]) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 5c795c23..7a6b91c6 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -16,6 +16,21 @@ def css_filter(css_filter, html_content): return html_block + "\n" + +# Return str Utf-8 of matched rules +def xpath_filter(xpath_filter, html_content): + from lxml import html + from lxml import etree + + tree = html.fromstring(html_content) + html_block = "" + + for item in tree.xpath(xpath_filter.strip()): + html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>" + + return html_block + + # Extract/find element def extract_element(find='title', html_content=''): diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index f30c0705..466b7318 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -95,8 +95,10 @@ User-Agent: wonderbra 1.0") }} <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li> <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a></li> + <li>XPATH - Limit text to this XPath rule, simply start with a forward-slash, example <b>//*[contains(@class, 'sametext')]</b>, <a + href="http://xpather.com/" target="new">test your XPath here</a></li> </ul> - Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! <a + Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/> </span> </div> diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py new file mode 100644 index 00000000..c5646c81 --- /dev/null +++ b/changedetectionio/tests/test_xpath_selector.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup + +from ..html_tools import * + +def test_setup(live_server): + live_server_setup(live_server) + +def set_original_response(): + test_return_data = """<html> + <body> + Some initial text</br> + <p>Which is across multiple lines</p> + </br> + So let's see what happens. </br> + <div class="sametext">Some text thats the same</div> + <div class="changetext">Some text that will change</div> + </body> + </html> + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + return None + +def set_modified_response(): + test_return_data = """<html> + <body> + Some initial text</br> + <p>Which is across multiple lines</p> + </br> + So let's see what happens. THIS CHANGES AND SHOULDNT TRIGGER A CHANGE</br> + <div class="sametext">Some text thats the same</div> + <div class="changetext">Some new text</div> + </body> + </html> + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + return None + + +def test_check_markup_xpath_filter_restriction(client, live_server): + sleep_time_for_fetch_thread = 3 + + xpath_filter = "//*[contains(@class, 'sametext')]" + + set_original_response() + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # view it/reset state back to viewed + client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True) + + # Make a change + set_modified_response() + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + +def test_xpath_validation(client, live_server): + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"is not a valid XPath expression" in res.data \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 23583d11..688ad92b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,8 +26,11 @@ paho-mqtt # ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly cryptography ~= 3.4 -# Used for CSS filtering, replace with soupsieve and lxml for xpath +# Used for CSS filtering bs4 +# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe. +lxml + # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0 selenium ~= 4.1.0 From febb2bbf0dc9767d32cac70a7248f2865ed500ae Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Wed, 5 Jan 2022 19:12:13 +0100 Subject: [PATCH 17/19] Heroku tweaks (backup download) (#356) * use absolute path, just incase the data-dir is set relative --- changedetectionio/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 6ef97442..fcd98bad 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -804,7 +804,8 @@ def changedetection_app(config=None, datastore_o=None): compress_type=zipfile.ZIP_DEFLATED, compresslevel=8) - return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True) + # Send_from_directory needs to be the full absolute path + return send_from_directory(os.path.abspath(datastore_o.datastore_path), backupname, as_attachment=True) @app.route("/static/<string:group>/<string:filename>", methods=['GET']) def static_content(group, filename): From 7916faf58b6f79579932e9be78f42622f10021cd Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Wed, 5 Jan 2022 19:12:50 +0100 Subject: [PATCH 18/19] 0.39.6 --- changedetectionio/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 6ef97442..bbf73b7a 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -30,7 +30,7 @@ import datetime import pytz from copy import deepcopy -__version__ = '0.39.5' +__version__ = '0.39.6' datastore = None From bc7422763541c422b0585d1cddd39a78ad59344e Mon Sep 17 00:00:00 2001 From: dgtlmoon <dgtlmoon@gmail.com> Date: Wed, 5 Jan 2022 20:42:45 +0100 Subject: [PATCH 19/19] Clarify notice/messages around changing ignore text --- changedetectionio/templates/edit.html | 7 +++++-- changedetectionio/templates/settings.html | 9 +++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 466b7318..8fb2e1b6 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -109,8 +109,11 @@ User-Agent: wonderbra 1.0") }} /some.regex\d{2}/ for case-INsensitive regex ") }} <span class="pure-form-message-inline"> - Each line processed separately, any line matching will be ignored.<br/> - Regular Expression support, wrap the line in forward slash <b>/regex/</b>. + <ul> + <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li> + <li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li> + <li>Changing this will affect the comparison checksum which may trigger an alert</li> + </ul> </span> </fieldset> diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index 69c80686..385638b2 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -83,8 +83,13 @@ /some.regex\d{2}/ for case-INsensitive regex ") }} <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/> - <span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/> - Regular Expression support, wrap the line in forward slash <b>/regex/</b>. + <span class="pure-form-message-inline"> + <ul> + <li>Note: This is applied globally in addition to the per-watch rules.</li> + <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li> + <li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li> + <li>Changing this will affect the comparison checksum which may trigger an alert</li> + </ul> </span> </fieldset> </div>