From 646a54945a2df30b44eb2e50908502a9a40bc95c Mon Sep 17 00:00:00 2001 From: Leigh Morresi <275001+dgtlmoon@users.noreply.github.com> Date: Wed, 27 Jan 2021 15:56:59 +0100 Subject: [PATCH] Handle errors better, use the plaintext output --- backend/backend.py | 32 +++++++++++--------- backend/fetch_site_status.py | 42 ++++++++++++++++++++++----- backend/requirements.txt | 2 ++ backend/static/css/styles.css | 4 +++ backend/store.py | 2 ++ backend/templates/watch-overview.html | 8 +++-- 6 files changed, 66 insertions(+), 24 deletions(-) diff --git a/backend/backend.py b/backend/backend.py index 26e52ce6..dc05ed70 100644 --- a/backend/backend.py +++ b/backend/backend.py @@ -1,5 +1,8 @@ #!/usr/bin/python3 + +# @todo logging + import json import eventlet import eventlet.wsgi @@ -22,7 +25,7 @@ ticker_thread = None datastore = store.ChangeDetectionStore() messages = [] -running_update_threads={} +running_update_threads = {} app = Flask(__name__, static_url_path='/static') app.config['STATIC_RESOURCES'] = "/app/static" @@ -32,11 +35,11 @@ app.config['STATIC_RESOURCES'] = "/app/static" # Disables caching of the templates app.config['TEMPLATES_AUTO_RELOAD'] = True + # We use the whole watch object from the store/JSON so we can see if there's some related status in terms of a thread # running or something similar. @app.template_filter('format_last_checked_time') def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"): - global running_update_threads if watch_obj['uuid'] in running_update_threads: if running_update_threads[watch_obj['uuid']].is_alive(): @@ -47,17 +50,20 @@ def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"): return datetime.datetime.utcfromtimestamp(int(watch_obj['last_checked'])).strftime(format) + @app.template_filter('format_timestamp') def _jinja2_filter_datetimestamp(timestamp, format="%Y-%m-%d %H:%M:%S"): if timestamp == 0: return 'Never' return datetime.datetime.utcfromtimestamp(timestamp).strftime(format) + @app.route("/", methods=['GET']) def main_page(): global messages # Show messages but once. + # maybe if the change happened more than a few days ago.. add a class output = render_template("watch-overview.html", watches=datastore.data['watching'], messages=messages) messages = [] return output @@ -75,9 +81,9 @@ def static_content(group, filename): def api_watch_add(): global messages - #@todo add_watch should throw a custom Exception for validation etc - datastore.add_watch(url=request.form.get('url'), tag=request.form.get('tag')) - messages.append({'class':'ok', 'message': 'Saved'}) + # @todo add_watch should throw a custom Exception for validation etc + datastore.add_watch(url=request.form.get('url').strip(), tag=request.form.get('tag').strip()) + messages.append({'class': 'ok', 'message': 'Saved'}) launch_checks() return redirect(url_for('main_page')) @@ -86,14 +92,14 @@ def api_watch_add(): def api_watch_checknow(): global messages - uuid=request.args.get('uuid') + uuid = request.args.get('uuid') # dict would be better, this is a simple safety catch. for watch in datastore.data['watching']: if watch['uuid'] == uuid: # @todo cancel if already running? running_update_threads[uuid] = fetch_site_status.perform_site_check(uuid=uuid, - datastore=datastore) + datastore=datastore) running_update_threads[uuid].start() return redirect(url_for('main_page')) @@ -105,14 +111,15 @@ def launch_checks(): global running_update_threads for watch in datastore.data['watching']: - if watch['last_checked'] <= time.time() - 20: - running_update_threads[watch['uuid']] = fetch_site_status.perform_site_check(uuid = watch['uuid'], datastore=datastore) + if watch['last_checked'] <= time.time() - 86400: + running_update_threads[watch['uuid']] = fetch_site_status.perform_site_check(uuid=watch['uuid'], + datastore=datastore) running_update_threads[watch['uuid']].start() -def ticker_thread_check_time_launch_checks(): +# Thread runner to check every minute +def ticker_thread_check_time_launch_checks(): while True: - print ("lanching") launch_checks() time.sleep(60) @@ -121,10 +128,9 @@ def main(argv): ssl_mode = False port = 5000 - #@todo handle ctrl break + # @todo handle ctrl break ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() - try: opts, args = getopt.getopt(argv, "sp:") except getopt.GetoptError: diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index 9d0a4223..7dee51b8 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -1,21 +1,21 @@ from threading import Thread import time import requests - import hashlib + # Hmm Polymorphism datastore, thread, etc class perform_site_check(Thread): def __init__(self, *args, uuid=False, datastore, **kwargs): super().__init__(*args, **kwargs) - self.timestamp = int(time.time()) # used for storage etc too + self.timestamp = int(time.time()) # used for storage etc too self.uuid = uuid self.datastore = datastore self.url = datastore.get_val(uuid, 'url') self.current_md5 = datastore.get_val(uuid, 'previous_md5') def save_firefox_screenshot(self, uuid, output): - #@todo call selenium or whatever + # @todo call selenium or whatever return def save_response_output(self, output): @@ -31,19 +31,45 @@ class perform_site_check(Thread): f.write(output) f.close() - def run(self): + + headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,cs;q=0.7' + } + + + print("Checking", self.url) + import html2text + try: - r = requests.get(self.url) - except requests.exceptions.ConnectionError as e: + r = requests.get(self.url, headers=headers, timeout=15) + + stripped_text_from_html = html2text.html2text(r.content.decode('utf-8')) + # Usually from networkIO/requests level + except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e: self.datastore.update_watch(self.uuid, 'last_error', str(e)) + print(str(e)) + + + # Usually from html2text level + except UnicodeDecodeError as e: + self.datastore.update_watch(self.uuid, 'last_error', str(e)) + print(str(e)) - print (str(e)) else: + + # We rely on the actual text in the html output.. many sites have random script vars etc + + + + self.datastore.update_watch(self.uuid, 'last_error', False) self.datastore.update_watch(self.uuid, 'last_check_status', r.status_code) - fetched_md5=hashlib.md5(r.text.encode('utf-8')).hexdigest() + fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest() if self.current_md5 != fetched_md5: self.datastore.update_watch(self.uuid, 'previous_md5', fetched_md5) diff --git a/backend/requirements.txt b/backend/requirements.txt index 9d50a5f5..d48c15ce 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -14,5 +14,7 @@ validators bleach==3.2.1 html5lib==0.9999999 # via bleach +html2text + # @notes # - Dont install socketio, it interferes with flask_socketio diff --git a/backend/static/css/styles.css b/backend/static/css/styles.css index 381bbd02..9fed6971 100644 --- a/backend/static/css/styles.css +++ b/backend/static/css/styles.css @@ -24,3 +24,7 @@ flex-direction: column; align-items: center; justify-content: center; } + +.watch-table .error { +color: #aa0000; +} \ No newline at end of file diff --git a/backend/store.py b/backend/store.py index 794d0148..9eefabb8 100644 --- a/backend/store.py +++ b/backend/store.py @@ -2,6 +2,8 @@ import json import uuid import validators + +# @TODO Have a var which is the base value, this is referred to even in the templating.. merge and append,not just append # Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods? # Open a github issue if you know something :) # https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change diff --git a/backend/templates/watch-overview.html b/backend/templates/watch-overview.html index bed00980..da66c874 100644 --- a/backend/templates/watch-overview.html +++ b/backend/templates/watch-overview.html @@ -16,7 +16,7 @@ - +
@@ -30,10 +30,12 @@ {% for watch in watches %} - + - +
#
{{ loop.index }} {{ watch.url }}{{watch|format_last_checked_time}}{{watch|format_last_checked_time}} + {% if watch.last_error is defined and watch.last_error != False %} !! {% endif %} + {{watch.last_changed|format_timestamp}} Recheck