Refactor preview code to its own module

preview-refactor
dgtlmoon 3 months ago
parent 54a4970a4c
commit 51cb83a20a

@ -1,7 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import datetime import datetime
import importlib
import flask_login import flask_login
import locale import locale
@ -12,9 +11,7 @@ import threading
import time import time
import timeago import timeago
from .content_fetchers.exceptions import ReplyWithContentButNoText
from .processors import find_processors, get_parent_module, get_custom_watch_obj_for_processor from .processors import find_processors, get_parent_module, get_custom_watch_obj_for_processor
from .processors.text_json_diff.processor import FilterNotFoundInResponse
from .safe_jinja import render as jinja_render from .safe_jinja import render as jinja_render
from changedetectionio.strtobool import strtobool from changedetectionio.strtobool import strtobool
from copy import deepcopy from copy import deepcopy
@ -1381,79 +1378,9 @@ def changedetection_app(config=None, datastore_o=None):
@app.route("/edit/<string:uuid>/preview-rendered", methods=['POST']) @app.route("/edit/<string:uuid>/preview-rendered", methods=['POST'])
@login_optionally_required @login_optionally_required
def watch_get_preview_rendered(uuid): def watch_get_preview_rendered(uuid):
from flask import jsonify
'''For when viewing the "preview" of the rendered text from inside of Edit''' '''For when viewing the "preview" of the rendered text from inside of Edit'''
now = time.time() from .processors.text_json_diff import prepare_filter_prevew
import brotli return prepare_filter_prevew(watch_uuid=uuid, datastore=datastore)
from . import forms
text_after_filter = ''
tmp_watch = deepcopy(datastore.data['watching'].get(uuid))
if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir):
# Splice in the temporary stuff from the form
form = forms.processor_text_json_diff_form(formdata=request.form if request.method == 'POST' else None,
data=request.form
)
# Only update vars that came in via the AJAX post
p = {k: v for k, v in form.data.items() if k in request.form.keys()}
tmp_watch.update(p)
latest_filename = next(reversed(tmp_watch.history))
html_fname = os.path.join(tmp_watch.watch_data_dir, f"{latest_filename}.html.br")
with open(html_fname, 'rb') as f:
decompressed_data = brotli.decompress(f.read()).decode('utf-8') if html_fname.endswith('.br') else f.read().decode('utf-8')
# Just like a normal change detection except provide a fake "watch" object and dont call .call_browser()
processor_module = importlib.import_module("changedetectionio.processors.text_json_diff.processor")
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=uuid # probably not needed anymore anyway?
)
# Use the last loaded HTML as the input
update_handler.fetcher.content = decompressed_data
update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')
try:
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(
watch=tmp_watch,
skip_when_checksum_same=False,
)
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:
text_after_filter = f"Filter found but no text (empty result)"
except Exception as e:
text_after_filter = f"Error: {str(e)}"
if not text_after_filter.strip():
text_after_filter = 'Empty content'
# because run_changedetection always returns bytes due to saving the snapshots etc
text_after_filter = text_after_filter.decode('utf-8') if isinstance(text_after_filter, bytes) else text_after_filter
do_anchor = datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
trigger_line_numbers = []
try:
text_before_filter = html_tools.html_to_text(html_content=decompressed_data,
render_anchor_tag_content=do_anchor)
trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter,
wordlist=tmp_watch['trigger_text'],
mode='line numbers'
)
except Exception as e:
text_before_filter = f"Error: {str(e)}"
logger.trace(f"Parsed in {time.time() - now:.3f}s")
return jsonify(
{
'after_filter': text_after_filter,
'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter,
'duration': time.time() - now,
'trigger_line_numbers': trigger_line_numbers,
}
)
@app.route("/form/add/quickwatch", methods=['POST']) @app.route("/form/add/quickwatch", methods=['POST'])

@ -36,8 +36,9 @@ class model(watch_base):
jitter_seconds = 0 jitter_seconds = 0
def __init__(self, *arg, **kw): def __init__(self, *arg, **kw):
self.__datastore_path = kw['datastore_path'] self.__datastore_path = kw.get('datastore_path')
del kw['datastore_path'] if kw.get('datastore_path'):
del kw['datastore_path']
super(model, self).__init__(*arg, **kw) super(model, self).__init__(*arg, **kw)
if kw.get('default'): if kw.get('default'):
self.update(kw['default']) self.update(kw['default'])
@ -171,6 +172,10 @@ class model(watch_base):
""" """
tmp_history = {} tmp_history = {}
# In the case we are only using the watch for processing without history
if not self.watch_data_dir:
return []
# Read the history file as a dict # Read the history file as a dict
fname = os.path.join(self.watch_data_dir, "history.txt") fname = os.path.join(self.watch_data_dir, "history.txt")
if os.path.isfile(fname): if os.path.isfile(fname):
@ -396,7 +401,7 @@ class model(watch_base):
@property @property
def watch_data_dir(self): def watch_data_dir(self):
# The base dir of the watch data # The base dir of the watch data
return os.path.join(self.__datastore_path, self['uuid']) return os.path.join(self.__datastore_path, self['uuid']) if self.__datastore_path else None
def get_error_text(self): def get_error_text(self):
"""Return the text saved from a previous request that resulted in a non-200 error""" """Return the text saved from a previous request that resulted in a non-200 error"""

@ -0,0 +1,107 @@
from loguru import logger
def _task(watch, update_handler):
from changedetectionio.content_fetchers.exceptions import ReplyWithContentButNoText
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
text_after_filter = ''
try:
# The slow process (we run 2 of these in parallel)
changed_detected, update_obj, text_after_filter = update_handler.run_changedetection(
watch=watch,
skip_when_checksum_same=False,
)
except FilterNotFoundInResponse as e:
text_after_filter = f"Filter not found in HTML: {str(e)}"
except ReplyWithContentButNoText as e:
text_after_filter = f"Filter found but no text (empty result)"
except Exception as e:
text_after_filter = f"Error: {str(e)}"
if not text_after_filter.strip():
text_after_filter = 'Empty content'
# because run_changedetection always returns bytes due to saving the snapshots etc
text_after_filter = text_after_filter.decode('utf-8') if isinstance(text_after_filter, bytes) else text_after_filter
return text_after_filter
def prepare_filter_prevew(datastore, watch_uuid):
'''Used by @app.route("/edit/<string:uuid>/preview-rendered", methods=['POST'])'''
from changedetectionio import forms, html_tools
from changedetectionio.model.Watch import model as watch_model
from concurrent.futures import ProcessPoolExecutor
from copy import deepcopy
from flask import request, jsonify
import brotli
import importlib
import os
import time
now = time.time()
text_after_filter = ''
text_before_filter = ''
tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid))
if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir):
# Splice in the temporary stuff from the form
form = forms.processor_text_json_diff_form(formdata=request.form if request.method == 'POST' else None,
data=request.form
)
# Only update vars that came in via the AJAX post
p = {k: v for k, v in form.data.items() if k in request.form.keys()}
tmp_watch.update(p)
blank_watch_no_filters = watch_model()
blank_watch_no_filters['url'] = tmp_watch.get('url')
latest_filename = next(reversed(tmp_watch.history))
html_fname = os.path.join(tmp_watch.watch_data_dir, f"{latest_filename}.html.br")
with open(html_fname, 'rb') as f:
decompressed_data = brotli.decompress(f.read()).decode('utf-8') if html_fname.endswith('.br') else f.read().decode('utf-8')
# Just like a normal change detection except provide a fake "watch" object and dont call .call_browser()
processor_module = importlib.import_module("changedetectionio.processors.text_json_diff.processor")
update_handler = processor_module.perform_site_check(datastore=datastore,
watch_uuid=tmp_watch.get('uuid') # probably not needed anymore anyway?
)
# Use the last loaded HTML as the input
update_handler.datastore = datastore
update_handler.fetcher.content = decompressed_data
update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type')
# Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk
# Do this as a parallel process because it could take some time
with ProcessPoolExecutor(max_workers=2) as executor:
future1 = executor.submit(_task, tmp_watch, update_handler)
future2 = executor.submit(_task, blank_watch_no_filters, update_handler)
text_after_filter = future1.result()
text_before_filter = future2.result()
trigger_line_numbers = []
try:
trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter,
wordlist=tmp_watch['trigger_text'],
mode='line numbers'
)
except Exception as e:
text_before_filter = f"Error: {str(e)}"
logger.trace(f"Parsed in {time.time() - now:.3f}s")
return jsonify(
{
'after_filter': text_after_filter,
'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter,
'duration': time.time() - now,
'trigger_line_numbers': trigger_line_numbers,
}
)
Loading…
Cancel
Save