From 1968d400fe5a5fc7aa740310d43db35ad94fc12b Mon Sep 17 00:00:00 2001 From: Leigh Morresi <275001+dgtlmoon@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:04:39 +0100 Subject: [PATCH] Store the html2text version too --- backend/fetch_site_status.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index b8b7bea7..2d966bc0 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -2,7 +2,7 @@ from threading import Thread import time import requests import hashlib - +import os # Hmm Polymorphism datastore, thread, etc class perform_site_check(Thread): @@ -13,21 +13,32 @@ class perform_site_check(Thread): self.datastore = datastore self.url = datastore.get_val(uuid, 'url') self.current_md5 = datastore.get_val(uuid, 'previous_md5') + self.output_path = "/datastore/{}".format(self.uuid) def save_firefox_screenshot(self, uuid, output): # @todo call selenium or whatever return - def save_response_output(self, output): - # @todo maybe record a history.json, [timestamp, md5, filename] - import os - path = "/datastore/{}".format(self.uuid) + def ensure_output_path(self): + try: - os.stat(path) + os.stat(self.output_path) except: - os.mkdir(path) + os.mkdir(self.output_path) + + def save_response_html_output(self, output): + # @todo maybe record a history.json, [timestamp, md5, filename] + + + with open("{}/{}.txt".format(self.output_path, self.timestamp), 'w') as f: + f.write(output) + f.close() + + + def save_response_stripped_output(self, output): + # @todo maybe record a history.json, [timestamp, md5, filename] - with open("{}/{}.txt".format(path, self.timestamp), 'w') as f: + with open("{}/{}.stripped.txt".format(self.output_path, self.timestamp), 'w') as f: f.write(output) f.close() @@ -47,6 +58,7 @@ class perform_site_check(Thread): try: r = requests.get(self.url, headers=headers, timeout=15, verify=False) stripped_text_from_html = html2text.html2text(r.content.decode('utf-8')) + self.save_response_stripped_output(stripped_text_from_html) # Usually from networkIO/requests level except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout) as e: @@ -63,10 +75,6 @@ class perform_site_check(Thread): else: # We rely on the actual text in the html output.. many sites have random script vars etc - - - - self.datastore.update_watch(self.uuid, 'last_error', False) self.datastore.update_watch(self.uuid, 'last_check_status', r.status_code) @@ -79,7 +87,7 @@ class perform_site_check(Thread): self.datastore.update_watch(self.uuid, 'last_changed', self.timestamp) self.datastore.update_watch(self.uuid, 'previous_md5', fetched_md5) - self.save_response_output(r.text) + self.save_response_html_output(r.text) self.datastore.update_watch(self.uuid, 'last_checked', int(time.time()))