From 343e359b391a76b0a5ee94567a393d1fb2b64a89 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Sun, 23 Jun 2024 09:19:32 +0200
Subject: [PATCH] Now saving last two HTML snapshots for future reference,
 refactor, dont write screenshots and xpath to disk when no change detected
 (saves disk IO) (#2431)

---
 changedetectionio/api/api_v1.py               |  20 ++-
 .../blueprint/browser_steps/__init__.py       |   6 +-
 changedetectionio/model/Watch.py              |  91 ++++++++++--
 changedetectionio/processors/__init__.py      |   3 +-
 changedetectionio/processors/restock_diff.py  |  10 +-
 .../processors/text_json_diff.py              |  25 ++--
 changedetectionio/store.py                    |  41 ------
 changedetectionio/tests/test_api.py           |   9 ++
 .../tests/test_history_consistency.py         |  28 ++--
 changedetectionio/update_worker.py            | 130 ++++++++++--------
 10 files changed, 212 insertions(+), 151 deletions(-)

diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py
index 85e2b30e..19d83612 100644
--- a/changedetectionio/api/api_v1.py
+++ b/changedetectionio/api/api_v1.py
@@ -170,23 +170,33 @@ class WatchSingleHistory(Resource):
         curl http://localhost:5000/api/v1/watch/cc0cfffa-f449-477b-83ea-0caafd1dc091/history/1677092977 -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json"
         @apiName Get single snapshot content
         @apiGroup Watch History
+        @apiParam {String} [html] Optional Set to =1 to return the last HTML (only stores last 2 snapshots, use `latest` as timestamp)
         @apiSuccess (200) {String} OK
         @apiSuccess (404) {String} ERR Not found
         """
         watch = self.datastore.data['watching'].get(uuid)
         if not watch:
-            abort(404, message='No watch exists with the UUID of {}'.format(uuid))
+            abort(404, message=f"No watch exists with the UUID of {uuid}")
 
         if not len(watch.history):
-            abort(404, message='Watch found but no history exists for the UUID {}'.format(uuid))
+            abort(404, message=f"Watch found but no history exists for the UUID {uuid}")
 
         if timestamp == 'latest':
             timestamp = list(watch.history.keys())[-1]
 
-        content = watch.get_history_snapshot(timestamp)
+        if request.args.get('html'):
+            content = watch.get_fetched_html(timestamp)
+            if content:
+                response = make_response(content, 200)
+                response.mimetype = "text/html"
+            else:
+                response = make_response("No content found", 404)
+                response.mimetype = "text/plain"
+        else:
+            content = watch.get_history_snapshot(timestamp)
+            response = make_response(content, 200)
+            response.mimetype = "text/plain"
 
-        response = make_response(content, 200)
-        response.mimetype = "text/plain"
         return response

diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py
index 30797099..f92bf9f8 100644
--- a/changedetectionio/blueprint/browser_steps/__init__.py
+++ b/changedetectionio/blueprint/browser_steps/__init__.py
@@ -187,8 +187,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
             u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
             if is_last_step and u:
                 (screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data()
-                datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
-                datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
+                watch = datastore.data['watching'].get(uuid)
+                if watch:
+                    watch.save_screenshot(screenshot=screenshot)
+                    watch.save_xpath_data(data=xpath_data)
 
 #            if not this_session.page:
 #                cleanup_playwright_session()
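
The new `html` query parameter above changes what the single-history endpoint can return. A minimal client sketch, assuming a default local instance and reusing the placeholder API key and UUID from the apidoc example; everything here apart from the route and the `html`/`latest` semantics is illustrative:

    import requests

    BASE = "http://localhost:5000/api/v1"
    HEADERS = {"x-api-key": "813031b16330fe25e3780cf0325daa45"}  # placeholder key from the apidoc example
    uuid = "cc0cfffa-f449-477b-83ea-0caafd1dc091"                # placeholder watch UUID

    # Default behaviour: the processed text snapshot, served as text/plain
    r = requests.get(f"{BASE}/watch/{uuid}/history/latest", headers=HEADERS)
    print(r.headers.get("content-type"))

    # New behaviour: the raw fetched HTML, served as text/html. Only the last
    # two HTML snapshots are kept on disk, so 'latest' is the safe timestamp.
    r = requests.get(f"{BASE}/watch/{uuid}/history/latest", params={"html": "1"}, headers=HEADERS)
    print(r.text if r.ok else "no stored HTML for that timestamp")  # 404 when pruned or never saved
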
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index 83bfb2e3..44157268 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -328,14 +328,9 @@ class model(dict):
     def save_history_text(self, contents, timestamp, snapshot_id):
         import brotli
 
-        self.ensure_data_dir_exists()
+        logger.trace(f"{self.get('uuid')} - Updating history.txt with timestamp {timestamp}")
 
-        # Small hack so that we sleep just enough to allow 1 second between history snapshots
-        # this is because history.txt indexes/keys snapshots by epoch seconds and we dont want dupe keys
-        if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key):
-            logger.warning(f"Timestamp {timestamp} already exists, waiting 1 seconds so we have a unique key in history.txt")
-            timestamp = str(int(timestamp) + 1)
-            time.sleep(1)
+        self.ensure_data_dir_exists()
 
         threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
         skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False'))
@@ -528,8 +523,42 @@ class model(dict):
         # None is set
         return False
 
+    def save_error_text(self, contents):
+        self.ensure_data_dir_exists()
+        target_path = os.path.join(self.watch_data_dir, "last-error.txt")
+        with open(target_path, 'w') as f:
+            f.write(contents)
+
+    def save_xpath_data(self, data, as_error=False):
+        import json
+
+        if as_error:
+            target_path = os.path.join(self.watch_data_dir, "elements-error.json")
+        else:
+            target_path = os.path.join(self.watch_data_dir, "elements.json")
+
+        self.ensure_data_dir_exists()
+
+        with open(target_path, 'w') as f:
+            f.write(json.dumps(data))
+            f.close()
+
+    # Save as PNG, PNG is larger but better for doing visual diff in the future
+    def save_screenshot(self, screenshot: bytes, as_error=False):
+
+        if as_error:
+            target_path = os.path.join(self.watch_data_dir, "last-error-screenshot.png")
+        else:
+            target_path = os.path.join(self.watch_data_dir, "last-screenshot.png")
+
+        self.ensure_data_dir_exists()
+
+        with open(target_path, 'wb') as f:
+            f.write(screenshot)
+            f.close()
+
 
-    def get_last_fetched_before_filters(self):
+    def get_last_fetched_text_before_filters(self):
         import brotli
 
         filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
@@ -544,12 +573,56 @@ class model(dict):
         with open(filepath, 'rb') as f:
             return(brotli.decompress(f.read()).decode('utf-8'))
 
-    def save_last_fetched_before_filters(self, contents):
+    def save_last_text_fetched_before_filters(self, contents):
         import brotli
         filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
         with open(filepath, 'wb') as f:
             f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
 
+    def save_last_fetched_html(self, timestamp, contents):
+        import brotli
+
+        self.ensure_data_dir_exists()
+        snapshot_fname = f"{timestamp}.html.br"
+        filepath = os.path.join(self.watch_data_dir, snapshot_fname)
+
+        with open(filepath, 'wb') as f:
+            contents = contents.encode('utf-8') if isinstance(contents, str) else contents
+            try:
+                f.write(brotli.compress(contents))
+            except Exception as e:
+                logger.warning(f"{self.get('uuid')} - Unable to compress snapshot, saving as raw data to {filepath}")
+                logger.warning(e)
+                f.write(contents)
+
+        self._prune_last_fetched_html_snapshots()
+
+    def get_fetched_html(self, timestamp):
+        import brotli
+
+        snapshot_fname = f"{timestamp}.html.br"
+        filepath = os.path.join(self.watch_data_dir, snapshot_fname)
+        if os.path.isfile(filepath):
+            with open(filepath, 'rb') as f:
+                return (brotli.decompress(f.read()).decode('utf-8'))
+
+        return False
+
+
+    def _prune_last_fetched_html_snapshots(self):
+
+        dates = list(self.history.keys())
+        dates.reverse()
+
+        for index, timestamp in enumerate(dates):
+            snapshot_fname = f"{timestamp}.html.br"
+            filepath = os.path.join(self.watch_data_dir, snapshot_fname)
+
+            # Keep only the first 2
+            if index > 1 and os.path.isfile(filepath):
+                os.remove(filepath)
+
 
     @property
     def get_browsersteps_available_screenshots(self):
         "For knowing which screenshots are available to show the user in BrowserSteps UI"
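
Taken together, save_last_fetched_html(), get_fetched_html() and _prune_last_fetched_html_snapshots() form a tiny two-deep snapshot cache: one brotli-compressed `<epoch>.html.br` file per check, with everything older than the newest two removed. A standalone sketch of the same pattern; the class name and the directory-scan pruning are illustrative, the committed code walks the watch's history index rather than the directory listing:

    import os
    import brotli  # pip install Brotli

    class HtmlSnapshotCache:
        def __init__(self, data_dir, keep=2):
            self.data_dir = data_dir
            self.keep = keep
            os.makedirs(data_dir, exist_ok=True)

        def save(self, timestamp, html: str):
            path = os.path.join(self.data_dir, f"{timestamp}.html.br")
            with open(path, 'wb') as f:
                f.write(brotli.compress(html.encode('utf-8')))
            self._prune()

        def _prune(self):
            # Newest first by the epoch prefix in the filename, drop the rest
            snapshots = sorted((f for f in os.listdir(self.data_dir) if f.endswith('.html.br')),
                               key=lambda f: int(f.split('.')[0]), reverse=True)
            for fname in snapshots[self.keep:]:
                os.remove(os.path.join(self.data_dir, fname))

        def load(self, timestamp):
            path = os.path.join(self.data_dir, f"{timestamp}.html.br")
            if not os.path.isfile(path):
                return None
            with open(path, 'rb') as f:
                return brotli.decompress(f.read()).decode('utf-8')

Keeping only two snapshots bounds disk usage per watch while still leaving the previous and current raw HTML available for the API and for future visual/diff work.
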
diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py
index 9e4ce6b1..d24c9a9d 100644
--- a/changedetectionio/processors/__init__.py
+++ b/changedetectionio/processors/__init__.py
@@ -1,5 +1,6 @@
 from abc import abstractmethod
 from changedetectionio.strtobool import strtobool
+from changedetectionio.model import Watch
 from copy import deepcopy
 from loguru import logger
 import hashlib
@@ -138,7 +139,7 @@ class difference_detection_processor():
 
     # After init, call run_changedetection() which will do the actual change-detection
     @abstractmethod
-    def run_changedetection(self, uuid, skip_when_checksum_same=True):
+    def run_changedetection(self, watch: Watch, skip_when_checksum_same=True):
         update_obj = {'last_notification_error': False, 'last_error': False}
         some_data = 'xxxxx'
         update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()

diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py
index e692e7cb..a948eb0d 100644
--- a/changedetectionio/processors/restock_diff.py
+++ b/changedetectionio/processors/restock_diff.py
@@ -1,6 +1,5 @@
 
 from . import difference_detection_processor
-from copy import deepcopy
 from loguru import logger
 import hashlib
 import urllib3
@@ -20,10 +19,7 @@ class perform_site_check(difference_detection_processor):
     screenshot = None
     xpath_data = None
 
-    def run_changedetection(self, uuid, skip_when_checksum_same=True):
-
-        # DeepCopy so we can be sure we don't accidently change anything by reference
-        watch = deepcopy(self.datastore.data['watching'].get(uuid))
+    def run_changedetection(self, watch, skip_when_checksum_same=True):
 
         if not watch:
             raise Exception("Watch no longer exists.")
@@ -44,13 +40,13 @@ class perform_site_check(difference_detection_processor):
             fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
             # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
             update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
-            logger.debug(f"Watch UUID {uuid} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
+            logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
         else:
             raise UnableToExtractRestockData(status_code=self.fetcher.status_code)
 
         # The main thing that all this at the moment comes down to :)
         changed_detected = False
-        logger.debug(f"Watch UUID {uuid} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
+        logger.debug(f"Watch UUID {watch.get('uuid')} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
 
         if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5:
             # Yes if we only care about it going to instock, AND we are in stock
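
The signature change from run_changedetection(uuid) to run_changedetection(watch) removes a datastore lookup plus a deepcopy from every processor. A stripped-down, illustrative-only sketch of the shape of the new call path; the names here are hypothetical, not the project's API:

    class Processor:
        def run_changedetection(self, watch, skip_when_checksum_same=True):
            if not watch:
                raise Exception("Watch no longer exists.")
            # The processor reads settings straight off the model it was handed
            return {'previous_md5': watch.get('previous_md5')}

    class Worker:
        def __init__(self, watching):
            self.watching = watching  # uuid -> watch dict/model

        def check(self, uuid):
            # Resolve the watch once, up front; the same object is then shared
            # by the processor and by artifact writers like save_screenshot()
            watch = self.watching.get(uuid)
            return Processor().run_changedetection(watch=watch)

    print(Worker({'abc': {'uuid': 'abc', 'previous_md5': None}}).check('abc'))

Because the worker and processor now share one object, per-watch artifacts (screenshots, xpath data, HTML snapshots) can live as methods on the model rather than on the store.
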
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 1d60be63..e793de89 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -10,8 +10,6 @@ from . import difference_detection_processor
 from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
 from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
-import changedetectionio.content_fetchers
-from copy import deepcopy
 from loguru import logger
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -21,7 +19,8 @@ description = 'Detects all text changes where possible'
 
 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
 
 class FilterNotFoundInResponse(ValueError):
-    def __init__(self, msg):
+    def __init__(self, msg, screenshot=None):
+        self.screenshot = screenshot
         ValueError.__init__(self, msg)
 
@@ -34,14 +33,12 @@ class PDFToHTMLToolNotFound(ValueError):
 # (set_proxy_from_list)
 class perform_site_check(difference_detection_processor):
 
-    def run_changedetection(self, uuid, skip_when_checksum_same=True):
+    def run_changedetection(self, watch, skip_when_checksum_same=True):
         changed_detected = False
         html_content = ""
         screenshot = False  # as bytes
         stripped_text_from_html = ""
 
-        # DeepCopy so we can be sure we don't accidently change anything by reference
-        watch = deepcopy(self.datastore.data['watching'].get(uuid))
         if not watch:
             raise Exception("Watch no longer exists.")
 
@@ -116,12 +113,12 @@ class perform_site_check(difference_detection_processor):
         # Better would be if Watch.model could access the global data also
         # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__
         # https://realpython.com/inherit-python-dict/ instead of doing it procedurely
-        include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='include_filters')
+        include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='include_filters')
 
         # 1845 - remove duplicated filters in both group and watch include filter
         include_filters_rule = list(dict.fromkeys(watch.get('include_filters', []) + include_filters_from_tags))
 
-        subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='subtractive_selectors'),
+        subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='subtractive_selectors'),
                                  *watch.get("subtractive_selectors", []),
                                  *self.datastore.data["settings"]["application"].get("global_subtractive_selectors", [])
                                  ]
@@ -188,7 +185,7 @@ class perform_site_check(difference_detection_processor):
                                                                 append_pretty_line_formatting=not watch.is_source_type_url)
 
                 if not html_content.strip():
-                    raise FilterNotFoundInResponse(include_filters_rule)
+                    raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot)
 
             if has_subtractive_selectors:
                 html_content = html_tools.element_removal(subtractive_selectors, html_content)
@@ -222,7 +219,7 @@ class perform_site_check(difference_detection_processor):
             from .. import diff
             # needs to not include (added) etc or it may get used twice
             # Replace the processed text with the preferred result
-            rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(),
+            rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_text_before_filters(),
                                              newest_version_file_contents=stripped_text_from_html,
                                              include_equal=False,  # not the same lines
                                              include_added=watch.get('filter_text_added', True),
@@ -231,7 +228,7 @@ class perform_site_check(difference_detection_processor):
                                              line_feed_sep="\n",
                                              include_change_type_prefix=False)
 
-            watch.save_last_fetched_before_filters(text_content_before_ignored_filter)
+            watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter)
 
             if not rendered_diff and stripped_text_from_html:
                 # We had some content, but no differences were found
@@ -344,17 +341,17 @@ class perform_site_check(difference_detection_processor):
             if not watch['title'] or not len(watch['title']):
                 update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content)
 
-        logger.debug(f"Watch UUID {uuid} content check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
+        logger.debug(f"Watch UUID {watch.get('uuid')} content check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
 
         if changed_detected:
            if watch.get('check_unique_lines', False):
                has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines())
                # One or more lines? unsure?
                if not has_unique_lines:
-                   logger.debug(f"check_unique_lines: UUID {uuid} didnt have anything new setting change_detected=False")
+                   logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")
                    changed_detected = False
                else:
-                   logger.debug(f"check_unique_lines: UUID {uuid} had unique content")
+                   logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} had unique content")
 
         # Always record the new checksum
         update_obj["previous_md5"] = fetched_md5
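
FilterNotFoundInResponse gaining a screenshot argument is the piece that lets the worker persist an error screenshot even though the fetch itself succeeded: the exception carries fetch-time context up to the handler. A minimal reproduction of the pattern, with hypothetical names:

    class FilterNotFound(ValueError):
        def __init__(self, msg, screenshot=None):
            self.screenshot = screenshot  # bytes captured before the filter failed
            super().__init__(msg)

    def apply_filters(html, selectors, screenshot=None):
        matched = ""  # pretend none of the CSS/xpath selectors matched anything
        if not matched.strip():
            raise FilterNotFound(msg=selectors, screenshot=screenshot)
        return matched

    try:
        apply_filters("<html></html>", ["#price"], screenshot=b"\x89PNG...")
    except FilterNotFound as e:
        if e.screenshot:
            print(f"persisting {len(e.screenshot)} bytes as last-error-screenshot.png")
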
"elements-error.json") - else: - target_path = os.path.join(self.datastore_path, watch_uuid, "elements.json") - self.data['watching'][watch_uuid].ensure_data_dir_exists() - with open(target_path, 'w') as f: - f.write(json.dumps(data)) - f.close() - - def sync_to_json(self): logger.info("Saving JSON..") try: diff --git a/changedetectionio/tests/test_api.py b/changedetectionio/tests/test_api.py index 5be55ec2..2f66489a 100644 --- a/changedetectionio/tests/test_api.py +++ b/changedetectionio/tests/test_api.py @@ -149,6 +149,15 @@ def test_api_simple(client, live_server): headers={'x-api-key': api_key}, ) assert b'which has this one new line' in res.data + assert b'
diff --git a/changedetectionio/tests/test_api.py b/changedetectionio/tests/test_api.py
index 5be55ec2..2f66489a 100644
--- a/changedetectionio/tests/test_api.py
+++ b/changedetectionio/tests/test_api.py
@@ -149,6 +149,15 @@ def test_api_simple(client, live_server):
        headers={'x-api-key': api_key},
    )
    assert b'which has this one new line' in res.data
+    assert b'
0 and c >= threshold:
-                                if not self.datastore.data['watching'][uuid].get('notification_muted'):
+                                if not watch.get('notification_muted'):
                                     self.send_filter_failure_notification(uuid)
                                 c = 0
 
@@ -400,15 +407,15 @@ class update_worker(threading.Thread):
                                 }
                             )
 
-                        if self.datastore.data['watching'][uuid].get('filter_failure_notification_send', False):
-                            c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5)
+                        if watch.get('filter_failure_notification_send', False):
+                            c = watch.get('consecutive_filter_failures', 5)
                             c += 1
                             # Send notification if we reached the threshold?
                             threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
                             logger.error(f"Step for {uuid} not found, consecutive_filter_failures: {c}")
                             if threshold > 0 and c >= threshold:
-                                if not self.datastore.data['watching'][uuid].get('notification_muted'):
+                                if not watch.get('notification_muted'):
                                     self.send_step_failure_notification(watch_uuid=uuid, step_n=e.step_n)
                                 c = 0
 
@@ -430,7 +437,7 @@ class update_worker(threading.Thread):
                     except content_fetchers.exceptions.JSActionExceptions as e:
                         err_text = "Error running JS Actions - Page request - "+e.message
                         if e.screenshot:
-                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
+                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                            'last_check_status': e.status_code})
                         process_changedetection_results = False
@@ -440,7 +447,7 @@ class update_worker(threading.Thread):
                             err_text = "{} - {}".format(err_text, e.message)
 
                         if e.screenshot:
-                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
+                            watch.save_screenshot(screenshot=e.screenshot, as_error=True)
 
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
                                                                            'last_check_status': e.status_code,
@@ -464,8 +471,6 @@ class update_worker(threading.Thread):
                         self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
                         # Other serious error
                         process_changedetection_results = False
-#                        import traceback
-#                        print(traceback.format_exc())
 
                     else:
                         # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc)
@@ -473,7 +478,7 @@ class update_worker(threading.Thread):
                             continue
 
                         # Mark that we never had any failures
-                        if not self.datastore.data['watching'][uuid].get('ignore_status_codes'):
+                        if not watch.get('ignore_status_codes'):
                             update_obj['consecutive_filter_failures'] = 0
 
                         # Everything ran OK, clean off any previous error
@@ -481,25 +486,48 @@ class update_worker(threading.Thread):
 
                         self.cleanup_error_artifacts(uuid)
 
+                    if not self.datastore.data['watching'].get(uuid):
+                        continue
+
                     #
                     # Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc
                     if process_changedetection_results:
+                        # Always save the screenshot if it's available
+
+                        if update_handler.screenshot:
+                            watch.save_screenshot(screenshot=update_handler.screenshot)
+
+                        if update_handler.xpath_data:
+                            watch.save_xpath_data(data=update_handler.xpath_data)
+
                         try:
-                            watch = self.datastore.data['watching'].get(uuid)
                             self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
 
                             # Also save the snapshot on the first time checked
-                            if changed_detected or not watch['last_checked']:
+                            if changed_detected or not watch.get('last_checked'):
+                                timestamp = round(time.time())
+
+                                # Small hack so that we sleep just enough to allow 1 second between history snapshots
+                                # this is because history.txt indexes/keys snapshots by epoch seconds and we dont want dupe keys
+
+                                if watch.newest_history_key and int(timestamp) == int(watch.newest_history_key):
+                                    logger.warning(
+                                        f"Timestamp {timestamp} already exists, waiting 1 seconds so we have a unique key in history.txt")
+                                    timestamp = str(int(timestamp) + 1)
+                                    time.sleep(1)
+
                                 watch.save_history_text(contents=contents,
-                                                        timestamp=str(round(time.time())),
+                                                        timestamp=timestamp,
                                                         snapshot_id=update_obj.get('previous_md5', 'none'))
 
+                                if update_handler.fetcher.content:
+                                    watch.save_last_fetched_html(contents=update_handler.fetcher.content, timestamp=timestamp)
+
                             # A change was detected
                             if changed_detected:
                                 # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
                                 if watch.history_n >= 2:
                                     logger.info(f"Change detected in UUID {uuid} - {watch['url']}")
-                                    if not self.datastore.data['watching'][uuid].get('notification_muted'):
+                                    if not watch.get('notification_muted'):
                                         self.send_content_changed_notification(watch_uuid=uuid)
                                 else:
                                     logger.info(f"Change triggered in UUID {uuid} due to first history saving (no notifications sent) - {watch['url']}")
 
@@ -510,29 +538,23 @@ class update_worker(threading.Thread):
                             logger.critical(str(e))
                             self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
 
-                    if self.datastore.data['watching'].get(uuid):
-                        # Always record that we atleast tried
-                        count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
+                    # Always record that we atleast tried
+                    count = watch.get('check_count', 0) + 1
 
-                        # Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds
-                        try:
-                            server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
-                            self.datastore.update_watch(uuid=uuid,
-                                                        update_obj={'remote_server_reply': server_header}
-                                                        )
-                        except Exception as e:
-                            pass
+                    # Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds
+                    try:
+                        server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
+                        self.datastore.update_watch(uuid=uuid,
+                                                    update_obj={'remote_server_reply': server_header}
+                                                    )
+                    except Exception as e:
+                        pass
 
-                        # Always save the screenshot if it's available
-                        if update_handler.screenshot:
-                            self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
-                        if update_handler.xpath_data:
-                            self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
+                    self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
+                                                                       'last_checked': round(time.time()),
+                                                                       'check_count': count
+                                                                       })
 
             self.current_uuid = None  # Done
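
The relocated "small hack" above is what keeps history.txt keys unique: snapshots are keyed by epoch seconds, so a second check landing inside the same second must bump the key and wait out the remainder of that second. Extracted as a self-contained sketch; the helper name is illustrative:

    import time

    def unique_history_timestamp(newest_history_key):
        timestamp = round(time.time())
        if newest_history_key and int(timestamp) == int(newest_history_key):
            # Same second as the newest snapshot: take the next key and sleep so
            # the following snapshot cannot collide either
            timestamp = int(timestamp) + 1
            time.sleep(1)
        return str(timestamp)

    print(unique_history_timestamp(None))                     # normal path
    print(unique_history_timestamp(str(round(time.time()))))  # collision path, returns the bumped key

Moving this guard into the worker, next to the new save_last_fetched_html() call, also means screenshots, xpath data and HTML snapshots are written only inside the process_changedetection_results branch rather than on every loop, which is the disk-IO saving named in the subject line.
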