From f87f7077a6da5ffd4072bc3ad91cd844b19a6fec Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 5 Jan 2022 14:13:30 +0100 Subject: [PATCH] Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error (#354) * Better handling of EmptyReply exception, always bump 'last_checked' in the case of an error, adds test --- changedetectionio/content_fetcher.py | 12 +++++- changedetectionio/fetch_site_status.py | 4 +- changedetectionio/html_tools.py | 1 - changedetectionio/tests/test_errorhandling.py | 38 +++++++++++++++++++ changedetectionio/tests/util.py | 7 ++++ changedetectionio/update_worker.py | 20 ++++++---- 6 files changed, 69 insertions(+), 13 deletions(-) create mode 100644 changedetectionio/tests/test_errorhandling.py diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d82775b9..72491753 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -9,6 +9,12 @@ import urllib3.exceptions class EmptyReply(Exception): + def __init__(self, status_code, url): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + return + pass class Fetcher(): @@ -110,6 +116,8 @@ class html_webdriver(Fetcher): # @todo - how to check this? is it possible? self.status_code = 200 + # @todo somehow we should try to get this working for WebDriver + # raise EmptyReply(url=url, status_code=r.status_code) # @todo - dom wait loaded? time.sleep(5) @@ -151,10 +159,10 @@ class html_requests(Fetcher): # Return bytes here html = r.text - # @todo test this + # @todo maybe you really want to test zero-byte return pages? if not r or not html or not len(html): - raise EmptyReply(url) + raise EmptyReply(url=url, status_code=r.status_code) self.status_code = r.status_code self.content = html diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 98c0be1d..28c27420 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -58,9 +58,7 @@ class perform_site_check(): watch = self.datastore.data['watching'][uuid] - update_obj = { - "last_checked": timestamp - } + update_obj = {} extra_headers = self.datastore.get_val(uuid, 'headers') diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 5d34c4cd..5c795c23 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -16,7 +16,6 @@ def css_filter(css_filter, html_content): return html_block + "\n" - # Extract/find element def extract_element(find='title', html_content=''): diff --git a/changedetectionio/tests/test_errorhandling.py b/changedetectionio/tests/test_errorhandling.py new file mode 100644 index 00000000..423316d4 --- /dev/null +++ b/changedetectionio/tests/test_errorhandling.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 + +import time +from flask import url_for +from . util import live_server_setup + +from ..html_tools import * + +def test_setup(live_server): + live_server_setup(live_server) + + +def test_error_handler(client, live_server): + + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint_403_error', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + + # Trigger a check + client.get(url_for("api_watch_checknow"), follow_redirects=True) + + # Give the thread time to pick it up + time.sleep(3) + + + res = client.get(url_for("index")) + assert b'unviewed' not in res.data + assert b'Status Code 403' in res.data + assert bytes("just now".encode('utf-8')) in res.data \ No newline at end of file diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 2e30be25..54532680 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -54,6 +54,13 @@ def live_server_setup(live_server): resp.headers['Content-Type'] = 'application/json' return resp + @live_server.app.route('/test-403') + def test_endpoint_403_error(): + + from flask import make_response + resp = make_response('', 403) + return resp + # Just return the headers in the request @live_server.app.route('/test-headers') def test_headers(): diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index a4181426..8f535829 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -39,9 +39,10 @@ class update_worker(threading.Thread): changed_detected = False contents = "" update_obj= {} + now = time.time() try: - now = time.time() + changed_detected, update_obj, contents = update_handler.run(uuid) # Re #342 @@ -51,14 +52,13 @@ class update_worker(threading.Thread): raise Exception("Error - returned data from the fetch handler SHOULD be bytes") - # Always record that we atleast tried - self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)}) - except PermissionError as e: self.app.logger.error("File permission error updating", uuid, str(e)) except content_fetcher.EmptyReply as e: - self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)}) - + # Some kind of custom to-str handler in the exception handler that does this? + err_text = "EmptyReply: Status Code {}".format(e.status_code) + self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, + 'last_check_status': e.status_code}) except Exception as e: self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e)) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) @@ -66,13 +66,14 @@ class update_worker(threading.Thread): else: try: watch = self.datastore.data['watching'][uuid] + fname = "" # Saved history text filename # For the FIRST time we check a site, or a change detected, save the snapshot. if changed_detected or not watch['last_checked']: # A change was detected fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents) # Should always be keyed by string(timestamp) - self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}}) + self.datastore.update_watch(uuid, {"history": {str(round(time.time())): fname}}) # Generally update anything interesting returned self.datastore.update_watch(uuid=uuid, update_obj=update_obj) @@ -136,6 +137,11 @@ class update_worker(threading.Thread): # Catch everything possible here, so that if a worker crashes, we don't lose it until restart! print("!!!! Exception in update_worker !!!\n", e) + finally: + # Always record that we atleast tried + self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3), + 'last_checked': round(time.time())}) + self.current_uuid = None # Done self.q.task_done()