From 6734fb91a2cf3563bdb36a5dd61e74a2262be3f4 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 17 May 2022 22:22:00 +0200 Subject: [PATCH] Option to control if pages with no renderable content are a change (example: JS webapps that dont render any text sometimes) (#608) --- changedetectionio/content_fetcher.py | 9 +- changedetectionio/fetch_site_status.py | 5 + changedetectionio/forms.py | 1 + changedetectionio/model/App.py | 1 + changedetectionio/templates/settings.html | 5 + .../tests/test_nonrenderable_pages.py | 102 ++++++++++++++++++ changedetectionio/update_worker.py | 4 + 7 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 changedetectionio/tests/test_nonrenderable_pages.py diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index dab97982..0deb8966 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -3,17 +3,22 @@ import chardet import os import requests import time -import urllib3.exceptions import sys - class EmptyReply(Exception): def __init__(self, status_code, url): # Set this so we can use it in other parts of the app self.status_code = status_code self.url = url return + pass +class ReplyWithContentButNoText(Exception): + def __init__(self, status_code, url): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + return pass diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 0770a362..c8b95321 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -184,6 +184,11 @@ class perform_site_check(): # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') + # Treat pages with no renderable text content as a change? 
No by default + empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) + if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: + raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=200) + # We rely on the actual text in the html output.. many sites have random script vars etc, # in the future we'll implement other mechanisms. diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 14d52cab..d99060f5 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -371,6 +371,7 @@ class globalSettingsApplicationForm(commonSettingsForm): ignore_whitespace = BooleanField('Ignore whitespace') real_browser_save_screenshot = BooleanField('Save last screenshot when using Chrome?') removepassword_button = SubmitField('Remove password', render_kw={"class": "pure-button pure-button-primary"}) + empty_pages_are_a_change = BooleanField('Treat empty pages as a change?', default=False) render_anchor_tag_content = BooleanField('Render anchor tag content', default=False) fetch_backend = RadioField('Fetch Method', default="html_requests", choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) password = SaltyPasswordField() diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index c0c7b135..cb1af56b 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -30,6 +30,7 @@ class model(dict): 'password': False, 'base_url' : None, 'extract_title_as_title': False, + 'empty_pages_are_a_change': False, 'fetch_backend': os.getenv("DEFAULT_FETCH_BACKEND", "html_requests"), 'global_ignore_text': [], # List of text to ignore when calculating the comparison checksum 'global_subtractive_selectors': [], diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index bd3a7632..4cca1da2 100644 --- 
a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -61,6 +61,11 @@ {{ render_checkbox_field(form.application.form.real_browser_save_screenshot) }} When using a Chrome browser, a screenshot from the last check will be available on the Diff page + +
+ {{ render_checkbox_field(form.application.form.empty_pages_are_a_change) }} + When a page contains HTML, but no renderable text appears (empty page), is this considered a change? +
{% if form.requests.proxy %}
{{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }} diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py new file mode 100644 index 00000000..a2490e5c --- /dev/null +++ b/changedetectionio/tests/test_nonrenderable_pages.py @@ -0,0 +1,102 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from urllib.request import urlopen +from .util import set_original_response, set_modified_response, live_server_setup + +sleep_time_for_fetch_thread = 3 + + +def set_nonrenderable_response(): + test_return_data = """ + modified head title + + + + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + return None + +def test_check_basic_change_detection_functionality(client, live_server): + set_original_response() + live_server_setup(live_server) + + # Add our URL to the import page + res = client.post( + url_for("import_page"), + data={"urls": url_for('test_endpoint', _external=True)}, + follow_redirects=True + ) + + assert b"1 Imported" in res.data + + time.sleep(sleep_time_for_fetch_thread) + + # Do this a few times.. 
ensures we don't accidentally set the status + for n in range(3): + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + + + ##################### + client.post( + url_for("settings_page"), + data={"application-empty_pages_are_a_change": "", + "requests-time_between_check-minutes": 180, + 'application-fetch_backend': "html_requests"}, + follow_redirects=True + ) + + # this should not trigger a change, because no good text could be converted from the HTML + set_nonrenderable_response() + + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report nothing found (no new 'unviewed' class) + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + + + # ok now do the opposite + + client.post( + url_for("settings_page"), + data={"application-empty_pages_are_a_change": "y", + "requests-time_between_check-minutes": 180, + 'application-fetch_backend': "html_requests"}, + follow_redirects=True + ) + set_modified_response() + + + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + # It should report a change (the 'unviewed' class is present) + res = client.get(url_for("index")) + assert b'unviewed' in res.data + + + + + # + # Cleanup everything + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 600cd232..c23ae82a 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -52,6 +52,10 @@ class update_worker(threading.Thread): raise
Exception("Error - returned data from the fetch handler SHOULD be bytes") except PermissionError as e: self.app.logger.error("File permission error updating", uuid, str(e)) + except content_fetcher.ReplyWithContentButNoText as e: + # Totally fine, it's by choice - just continue on, nothing more to care about + # Page had elements/content but no renderable text + pass except content_fetcher.EmptyReply as e: # Some kind of custom to-str handler in the exception handler that does this? err_text = "EmptyReply: Status Code {}".format(e.status_code)