From 51cb83a20a5429d5fb7f550bbf3e9d760f4eea59 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 8 Oct 2024 10:52:01 +0200 Subject: [PATCH] Refactor preview code to its own module --- changedetectionio/flask_app.py | 77 +------------ changedetectionio/model/Watch.py | 13 ++- .../processors/text_json_diff/__init__.py | 107 ++++++++++++++++++ 3 files changed, 118 insertions(+), 79 deletions(-) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index 395bc3c5..26080c4f 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import datetime -import importlib import flask_login import locale @@ -12,9 +11,7 @@ import threading import time import timeago -from .content_fetchers.exceptions import ReplyWithContentButNoText from .processors import find_processors, get_parent_module, get_custom_watch_obj_for_processor -from .processors.text_json_diff.processor import FilterNotFoundInResponse from .safe_jinja import render as jinja_render from changedetectionio.strtobool import strtobool from copy import deepcopy @@ -1381,79 +1378,9 @@ def changedetection_app(config=None, datastore_o=None): @app.route("/edit//preview-rendered", methods=['POST']) @login_optionally_required def watch_get_preview_rendered(uuid): - from flask import jsonify '''For when viewing the "preview" of the rendered text from inside of Edit''' - now = time.time() - import brotli - from . import forms - - text_after_filter = '' - tmp_watch = deepcopy(datastore.data['watching'].get(uuid)) - - if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir): - # Splice in the temporary stuff from the form - form = forms.processor_text_json_diff_form(formdata=request.form if request.method == 'POST' else None, - data=request.form - ) - # Only update vars that came in via the AJAX post - p = {k: v for k, v in form.data.items() if k in request.form.keys()} - tmp_watch.update(p) - - latest_filename = next(reversed(tmp_watch.history)) - html_fname = os.path.join(tmp_watch.watch_data_dir, f"{latest_filename}.html.br") - with open(html_fname, 'rb') as f: - decompressed_data = brotli.decompress(f.read()).decode('utf-8') if html_fname.endswith('.br') else f.read().decode('utf-8') - - # Just like a normal change detection except provide a fake "watch" object and dont call .call_browser() - processor_module = importlib.import_module("changedetectionio.processors.text_json_diff.processor") - update_handler = processor_module.perform_site_check(datastore=datastore, - watch_uuid=uuid # probably not needed anymore anyway? - ) - # Use the last loaded HTML as the input - update_handler.fetcher.content = decompressed_data - update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') - try: - changed_detected, update_obj, text_after_filter = update_handler.run_changedetection( - watch=tmp_watch, - skip_when_checksum_same=False, - ) - except FilterNotFoundInResponse as e: - text_after_filter = f"Filter not found in HTML: {str(e)}" - except ReplyWithContentButNoText as e: - text_after_filter = f"Filter found but no text (empty result)" - except Exception as e: - text_after_filter = f"Error: {str(e)}" - - if not text_after_filter.strip(): - text_after_filter = 'Empty content' - - # because run_changedetection always returns bytes due to saving the snapshots etc - text_after_filter = text_after_filter.decode('utf-8') if isinstance(text_after_filter, bytes) else text_after_filter - - do_anchor = datastore.data["settings"]["application"].get("render_anchor_tag_content", False) - - trigger_line_numbers = [] - try: - text_before_filter = html_tools.html_to_text(html_content=decompressed_data, - render_anchor_tag_content=do_anchor) - - trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, - wordlist=tmp_watch['trigger_text'], - mode='line numbers' - ) - except Exception as e: - text_before_filter = f"Error: {str(e)}" - - logger.trace(f"Parsed in {time.time() - now:.3f}s") - - return jsonify( - { - 'after_filter': text_after_filter, - 'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter, - 'duration': time.time() - now, - 'trigger_line_numbers': trigger_line_numbers, - } - ) + from .processors.text_json_diff import prepare_filter_prevew + return prepare_filter_prevew(watch_uuid=uuid, datastore=datastore) @app.route("/form/add/quickwatch", methods=['POST']) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index d3167bf9..b52d37fb 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -36,8 +36,9 @@ class model(watch_base): jitter_seconds = 0 def __init__(self, *arg, **kw): - self.__datastore_path = kw['datastore_path'] - del kw['datastore_path'] + self.__datastore_path = kw.get('datastore_path') + if kw.get('datastore_path'): + del kw['datastore_path'] super(model, self).__init__(*arg, **kw) if kw.get('default'): self.update(kw['default']) @@ -171,6 +172,10 @@ class model(watch_base): """ tmp_history = {} + # In the case we are only using the watch for processing without history + if not self.watch_data_dir: + return [] + # Read the history file as a dict fname = os.path.join(self.watch_data_dir, "history.txt") if os.path.isfile(fname): @@ -396,8 +401,8 @@ class model(watch_base): @property def watch_data_dir(self): # The base dir of the watch data - return os.path.join(self.__datastore_path, self['uuid']) - + return os.path.join(self.__datastore_path, self['uuid']) if self.__datastore_path else None + def get_error_text(self): """Return the text saved from a previous request that resulted in a non-200 error""" fname = os.path.join(self.watch_data_dir, "last-error.txt") diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py index e69de29b..f87aa350 100644 --- a/changedetectionio/processors/text_json_diff/__init__.py +++ b/changedetectionio/processors/text_json_diff/__init__.py @@ -0,0 +1,107 @@ + +from loguru import logger + + + +def _task(watch, update_handler): + from changedetectionio.content_fetchers.exceptions import ReplyWithContentButNoText + from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse + + text_after_filter = '' + + try: + # The slow process (we run 2 of these in parallel) + changed_detected, update_obj, text_after_filter = update_handler.run_changedetection( + watch=watch, + skip_when_checksum_same=False, + ) + except FilterNotFoundInResponse as e: + text_after_filter = f"Filter not found in HTML: {str(e)}" + except ReplyWithContentButNoText as e: + text_after_filter = f"Filter found but no text (empty result)" + except Exception as e: + text_after_filter = f"Error: {str(e)}" + + if not text_after_filter.strip(): + text_after_filter = 'Empty content' + + # because run_changedetection always returns bytes due to saving the snapshots etc + text_after_filter = text_after_filter.decode('utf-8') if isinstance(text_after_filter, bytes) else text_after_filter + + return text_after_filter + + +def prepare_filter_prevew(datastore, watch_uuid): + '''Used by @app.route("/edit//preview-rendered", methods=['POST'])''' + from changedetectionio import forms, html_tools + from changedetectionio.model.Watch import model as watch_model + from concurrent.futures import ProcessPoolExecutor + from copy import deepcopy + from flask import request, jsonify + import brotli + import importlib + import os + import time + now = time.time() + + text_after_filter = '' + text_before_filter = '' + tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid)) + + if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir): + # Splice in the temporary stuff from the form + form = forms.processor_text_json_diff_form(formdata=request.form if request.method == 'POST' else None, + data=request.form + ) + + # Only update vars that came in via the AJAX post + p = {k: v for k, v in form.data.items() if k in request.form.keys()} + tmp_watch.update(p) + blank_watch_no_filters = watch_model() + blank_watch_no_filters['url'] = tmp_watch.get('url') + + latest_filename = next(reversed(tmp_watch.history)) + html_fname = os.path.join(tmp_watch.watch_data_dir, f"{latest_filename}.html.br") + with open(html_fname, 'rb') as f: + decompressed_data = brotli.decompress(f.read()).decode('utf-8') if html_fname.endswith('.br') else f.read().decode('utf-8') + + # Just like a normal change detection except provide a fake "watch" object and dont call .call_browser() + processor_module = importlib.import_module("changedetectionio.processors.text_json_diff.processor") + update_handler = processor_module.perform_site_check(datastore=datastore, + watch_uuid=tmp_watch.get('uuid') # probably not needed anymore anyway? + ) + # Use the last loaded HTML as the input + update_handler.datastore = datastore + update_handler.fetcher.content = decompressed_data + update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') + + # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk + # Do this as a parallel process because it could take some time + with ProcessPoolExecutor(max_workers=2) as executor: + future1 = executor.submit(_task, tmp_watch, update_handler) + future2 = executor.submit(_task, blank_watch_no_filters, update_handler) + + text_after_filter = future1.result() + text_before_filter = future2.result() + + trigger_line_numbers = [] + try: + + trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, + wordlist=tmp_watch['trigger_text'], + mode='line numbers' + ) + except Exception as e: + text_before_filter = f"Error: {str(e)}" + + logger.trace(f"Parsed in {time.time() - now:.3f}s") + + return jsonify( + { + 'after_filter': text_after_filter, + 'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter, + 'duration': time.time() - now, + 'trigger_line_numbers': trigger_line_numbers, + } + ) +