From 47e5a7cf0990bc958061ce57de8ae88f3fbd39aa Mon Sep 17 00:00:00 2001 From: Leigh Morresi <275001+dgtlmoon@users.noreply.github.com> Date: Fri, 5 Feb 2021 18:43:35 +0100 Subject: [PATCH] Avoid accidently using Python's objects that are copied - but land as a 'soft reference', need to use a better dict struct in the future #6 --- backend/backend.py | 16 +++++-- backend/fetch_site_status.py | 87 ++++++++++++++++-------------------- backend/store.py | 73 +++++++++++++++--------------- 3 files changed, 87 insertions(+), 89 deletions(-) diff --git a/backend/backend.py b/backend/backend.py index b3563808..d76c1618 100644 --- a/backend/backend.py +++ b/backend/backend.py @@ -102,6 +102,7 @@ def main_page(): watch['uuid'] = uuid sorted_watches.append(watch) + sorted_watches.sort(key=lambda x: x['last_changed'], reverse=True) existing_tags = datastore.get_all_tags() @@ -248,6 +249,7 @@ def diff_history_page(uuid): dates.sort(reverse=True) dates = [str(i) for i in dates] + newest_file = watch['history'][dates[0]] with open(newest_file, 'r') as f: newest_version_file_contents = f.read() @@ -392,6 +394,7 @@ class Worker(threading.Thread): current_uuid = None + def __init__(self, q, *args, **kwargs): self.q = q super().__init__(*args, **kwargs) @@ -399,14 +402,21 @@ class Worker(threading.Thread): def run(self): import fetch_site_status + from copy import deepcopy + + update_handler = fetch_site_status.perform_site_check(datastore=datastore) + try: while True: uuid = self.q.get() # Blocking self.current_uuid = uuid if uuid in list(datastore.data['watching'].keys()): - update_handler = fetch_site_status.perform_site_check(uuid=uuid, datastore=datastore) - datastore.update_watch(uuid=uuid, update_obj=update_handler.update_data) + + result = update_handler.run(uuid) + + datastore.update_watch(uuid=uuid, update_obj=result) + self.current_uuid = None # Done self.q.task_done() @@ -440,7 +450,7 @@ def save_datastore(): while True: if datastore.needs_write: datastore.sync_to_json() - time.sleep(5) + time.sleep(1) except KeyboardInterrupt: return diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index 95b75a8a..a5770ffb 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -5,30 +5,15 @@ import os import re from inscriptis import get_text -# Some common stuff here that can be moved to a base class -class perform_site_check(): +from copy import deepcopy - # New state that is set after a check - # Return value dict - update_obj = {} +# Some common stuff here that can be moved to a base class +class perform_site_check(): - def __init__(self, *args, uuid=False, datastore, **kwargs): + def __init__(self, *args, datastore, **kwargs): super().__init__(*args, **kwargs) - self.timestamp = int(time.time()) # used for storage etc too - self.uuid = uuid self.datastore = datastore - self.url = datastore.get_val(uuid, 'url') - self.current_md5 = datastore.get_val(uuid, 'previous_md5') - self.output_path = "/datastore/{}".format(self.uuid) - - self.ensure_output_path() - self.run() - - # Current state of what needs to be updated - @property - def update_data(self): - return self.update_obj def save_firefox_screenshot(self, uuid, output): # @todo call selenium or whatever @@ -41,27 +26,30 @@ class perform_site_check(): except: os.mkdir(self.output_path) - def save_response_html_output(self, output): - - # @todo Saving the original HTML can be very large, better to set as an option, these files could be important to some. - with open("{}/{}.html".format(self.output_path, self.timestamp), 'w') as f: - f.write(output) - f.close() + def save_response_stripped_output(self, output, fname): - def save_response_stripped_output(self, output): - fname = "{}/{}.stripped.txt".format(self.output_path, self.timestamp) with open(fname, 'w') as f: f.write(output) f.close() return fname - def run(self): + def run(self, uuid): + + timestamp = int(time.time()) # used for storage etc too + + update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'], + 'history': {}, + "last_checked": timestamp + } + + self.output_path = "/datastore/{}".format(uuid) + self.ensure_output_path() - extra_headers = self.datastore.get_val(self.uuid, 'headers') + extra_headers = self.datastore.get_val(uuid, 'headers') # Tweak the base config with the per-watch ones - request_headers = self.datastore.data['settings']['headers'].copy() + request_headers = self.datastore.data['settings']['headers'] request_headers.update(extra_headers) # https://github.com/psf/requests/issues/4525 @@ -77,7 +65,7 @@ class perform_site_check(): timeout = 15 try: - r = requests.get(self.url, + r = requests.get(self.datastore.get_val(uuid, 'url'), headers=request_headers, timeout=timeout, verify=False) @@ -88,17 +76,17 @@ class perform_site_check(): # Usually from networkIO/requests level except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e: - self.update_obj["last_error"] = str(e) + update_obj["last_error"] = str(e) print(str(e)) except requests.exceptions.MissingSchema: - print("Skipping {} due to missing schema/bad url".format(self.uuid)) + print("Skipping {} due to missing schema/bad url".format(uuid)) # Usually from html2text level except UnicodeDecodeError as e: - self.update_obj["last_error"] = str(e) + update_obj["last_error"] = str(e) print(str(e)) # figure out how to deal with this cleaner.. # 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte @@ -107,26 +95,29 @@ class perform_site_check(): # We rely on the actual text in the html output.. many sites have random script vars etc, # in the future we'll implement other mechanisms. - self.update_obj["last_check_status"] = r.status_code - self.update_obj["last_error"] = False + update_obj["last_check_status"] = r.status_code + update_obj["last_error"] = False - fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest() + if not len(r.text): + update_obj["last_error"] = "Empty reply" + fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest() - if self.current_md5 != fetched_md5: # could be None or False depending on JSON type + # could be None or False depending on JSON type + if self.datastore.data['watching'][uuid]['previous_md5'] != fetched_md5: # Don't confuse people by updating as last-changed, when it actually just changed from None.. - if self.datastore.get_val(self.uuid, 'previous_md5'): - self.update_obj["last_changed"] = self.timestamp + if self.datastore.get_val(uuid, 'previous_md5'): + update_obj["last_changed"] = timestamp - self.update_obj["previous_md5"] = fetched_md5 - - self.save_response_html_output(r.text) - output_filepath = self.save_response_stripped_output(stripped_text_from_html) + update_obj["previous_md5"] = fetched_md5 + fname = "{}/{}.stripped.txt".format(self.output_path, fetched_md5) + with open(fname, 'w') as f: + f.write(stripped_text_from_html) + f.close() # Update history with the stripped text for future reference, this will also mean we save the first - timestamp = str(self.timestamp) - self.update_obj.update({"history": {timestamp: output_filepath}}) - - self.update_obj["last_checked"] = self.timestamp + # Should always be keyed by string(timestamp) + update_obj.update({"history": {str(timestamp): fname}}) + return update_obj diff --git a/backend/store.py b/backend/store.py index 4e1f4a3a..4e8b044f 100644 --- a/backend/store.py +++ b/backend/store.py @@ -5,6 +5,7 @@ import os.path from os import path from threading import Lock, Thread +from copy import deepcopy # Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods? # Open a github issue if you know something :) @@ -41,7 +42,7 @@ class ChangeDetectionStore: 'last_checked': 0, 'last_changed': 0, 'title': None, - 'previous_md5': None, + 'previous_md5': "", 'uuid': str(uuid_builder.uuid4()), 'headers': {}, # Extra headers to send 'history': {} # Dict of timestamp and output stripped filename @@ -71,9 +72,9 @@ class ChangeDetectionStore: # Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future. # @todo pretty sure theres a python we todo this with an abstracted(?) object! - i = 0 + for uuid, watch in self.data['watching'].items(): - _blank = self.generic_definition.copy() + _blank = deepcopy(self.generic_definition) _blank.update(watch) self.__data['watching'].update({uuid: _blank}) print("Watching:", uuid, _blank['url']) @@ -88,19 +89,18 @@ class ChangeDetectionStore: def update_watch(self, uuid, update_obj): - self.lock.acquire() + with self.lock: - # In python 3.9 we have the |= dict operator, but that still will lose data on nested structures... - for dict_key, d in self.generic_definition.items(): - if isinstance(d, dict) and dict_key in update_obj: - self.__data['watching'][uuid][dict_key].update(update_obj[dict_key]) - del(update_obj[dict_key]) + # In python 3.9 we have the |= dict operator, but that still will lose data on nested structures... + for dict_key, d in self.generic_definition.items(): + if isinstance(d, dict): + if update_obj is not None and dict_key in update_obj: + self.__data['watching'][uuid][dict_key].update(update_obj[dict_key]) + del(update_obj[dict_key]) - # Update with the remaining values - self.__data['watching'][uuid].update(update_obj) + self.__data['watching'][uuid].update(update_obj) self.needs_write = True - self.lock.release() @property def data(self): @@ -120,12 +120,10 @@ class ChangeDetectionStore: return tags def delete(self, uuid): - - self.lock.acquire() - del (self.__data['watching'][uuid]) - self.needs_write = True - self.lock.release() - + with self.lock: + del (self.__data['watching'][uuid]) + self.needs_write = True + def url_exists(self, url): # Probably their should be dict... @@ -140,31 +138,30 @@ class ChangeDetectionStore: return self.data['watching'][uuid].get(val) def add_watch(self, url, tag): - self.lock.acquire() - #print("Adding", url, tag) - # # @todo deal with exception - # validators.url(url) - - # @todo use a common generic version of this - new_uuid = str(uuid_builder.uuid4()) - _blank = self.generic_definition.copy() - _blank.update({ - 'url': url, - 'tag': tag, - 'uuid': new_uuid - }) - - self.data['watching'][new_uuid] = _blank + with self.lock: + + # @todo use a common generic version of this + new_uuid = str(uuid_builder.uuid4()) + _blank = deepcopy(self.generic_definition) + _blank.update({ + 'url': url, + 'tag': tag, + 'uuid': new_uuid + }) + + self.data['watching'][new_uuid] = _blank + self.needs_write = True - self.lock.release() + return new_uuid def sync_to_json(self): - print("Saving index") - self.lock.acquire() + + with open('/datastore/url-watches.json', 'w') as json_file: - json.dump(self.data, json_file, indent=4) + json.dump(self.__data, json_file, indent=4) + print("Re-saved index") + self.needs_write = False - self.lock.release() # body of the constructor