New feature - Simple extract data by regex from all historical watch text into CSV (#1191)

pull/1192/head
dgtlmoon 2 years ago committed by GitHub
parent b8d5a12ad0
commit 2345b6b558
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -810,7 +810,7 @@ def changedetection_app(config=None, datastore_o=None):
return redirect(url_for('index'))
@app.route("/diff/<string:uuid>", methods=['GET'])
@app.route("/diff/<string:uuid>", methods=['GET', 'POST'])
@login_required
def diff_history_page(uuid):
@ -818,6 +818,7 @@ def changedetection_app(config=None, datastore_o=None):
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
extra_stylesheets = [url_for('static_content', group='styles', filename='diff.css')]
try:
watch = datastore.data['watching'][uuid]
@ -825,6 +826,23 @@ def changedetection_app(config=None, datastore_o=None):
flash("No history found for the specified link, bad link?", "error")
return redirect(url_for('index'))
# For submission of requesting an extract
if request.method == 'POST':
extract_regex = request.form.get('extract_regex').strip()
output = watch.extract_regex_from_all_history(extract_regex)
if output:
watch_dir = os.path.join(datastore_o.datastore_path, uuid)
response = make_response(send_from_directory(directory=watch_dir, path=output, as_attachment=True))
response.headers['Content-type'] = 'text/csv'
response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
response.headers['Pragma'] = 'no-cache'
response.headers['Expires'] = 0
return response
flash('Nothing matches that RegEx', 'error')
redirect(url_for('diff_history_page', uuid=uuid)+'#extract')
history = watch.history
dates = list(history.keys())
@ -866,24 +884,28 @@ def changedetection_app(config=None, datastore_o=None):
is_html_webdriver = True if watch.get('fetch_backend') == 'html_webdriver' or (
watch.get('fetch_backend', None) is None and system_uses_webdriver) else False
from changedetectionio import forms
extract_form = forms.extractDataForm(request.form)
output = render_template("diff.html",
watch_a=watch,
newest=newest_version_file_contents,
previous=previous_version_file_contents,
extra_stylesheets=extra_stylesheets,
dark_mode=getDarkModeSetting(),
versions=dates[:-1], # All except current/last
uuid=uuid,
newest_version_timestamp=dates[-1],
current_previous_version=str(previous_version),
current_diff_url=watch['url'],
current_previous_version=str(previous_version),
dark_mode=getDarkModeSetting(),
extra_stylesheets=extra_stylesheets,
extra_title=" - Diff - {}".format(watch['title'] if watch['title'] else watch['url']),
left_sticky=True,
screenshot=screenshot_url,
extract_form=extract_form,
is_html_webdriver=is_html_webdriver,
last_error=watch['last_error'],
last_error_screenshot=watch.get_error_snapshot(),
last_error_text=watch.get_error_text(),
last_error_screenshot=watch.get_error_snapshot()
left_sticky=True,
newest=newest_version_file_contents,
newest_version_timestamp=dates[-1],
previous=previous_version_file_contents,
screenshot=screenshot_url,
uuid=uuid,
versions=dates[:-1], # All except current/last
watch_a=watch
)
return output

@ -448,3 +448,9 @@ class globalSettingsForm(Form):
requests = FormField(globalSettingsRequestForm)
application = FormField(globalSettingsApplicationForm)
save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
class extractDataForm(Form):
extract_regex = StringField('RegEx to extract')
extract_submit_button = SubmitField('Extract as CSV', render_kw={"class": "pure-button pure-button-primary"})

@ -318,3 +318,47 @@ class model(dict):
if os.path.isfile(fname):
return fname
return False
def extract_regex_from_all_history(self, regex):
import csv
import re
import datetime
csv_output_filename = False
csv_writer = False
f = None
# self.history will be keyed with the full path
for k, fname in self.history.items():
if os.path.isfile(fname):
with open(fname, "r") as f:
contents = f.read()
res = re.findall(regex, contents, re.MULTILINE)
if res:
if not csv_writer:
# A file on the disk can be transferred much faster via flask than a string reply
csv_output_filename = 'report.csv'
f = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w')
# @todo some headers in the future
#fieldnames = ['Epoch seconds', 'Date']
csv_writer = csv.writer(f,
delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL,
#fieldnames=fieldnames
)
csv_writer.writerow(['Epoch seconds', 'Date'])
# csv_writer.writeheader()
date_str = datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S')
for r in res:
row = [k, date_str]
if isinstance(r, str):
row.append(r)
else:
row+=r
csv_writer.writerow(row)
if f:
f.close()
return csv_output_filename

@ -13,6 +13,8 @@ $(document).ready(function () {
} else if (hash_name === '#error-screenshot') {
$("img#error-screenshot-img").attr('src', error_screenshot_url);
$("#settings").hide();
} else if (hash_name === '#extract') {
$("#settings").hide();
}

@ -132,8 +132,9 @@ html[data-darkmode="true"] {
padding: 2em;
margin-left: 1em;
margin-right: 1em;
border-radius: 5px;
font-size: 11px; }
border-radius: 5px; }
#diff-ui #text {
font-size: 11px; }
#diff-ui table {
table-layout: fixed;
width: 100%; }

@ -7,7 +7,11 @@
margin-left: 1em;
margin-right: 1em;
border-radius: 5px;
font-size: 11px;
// The first tab 'text' diff
#text {
font-size: 11px;
}
table {
table-layout: fixed;

@ -1,5 +1,5 @@
{% extends 'base.html' %}
{% from '_helpers.jinja' import render_field, render_checkbox_field, render_button %}
{% block content %}
<script>
const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
@ -58,6 +58,7 @@
{% if last_error_screenshot %}<li class="tab" id="error-screenshot-tab"><a href="#error-screenshot">Error Screenshot</a></li> {% endif %}
<li class="tab" id=""><a href="#text">Text</a></li>
<li class="tab" id="screenshot-tab"><a href="#screenshot">Screenshot</a></li>
<li class="tab" id="extract-tab"><a href="#extract">Extract Data</a></li>
</ul>
</div>
@ -108,6 +109,37 @@
<strong>Screenshot requires Playwright/WebDriver enabled</strong>
{% endif %}
</div>
<div class="tab-pane-inner" id="extract">
<form id="extract-data-form" class="pure-form pure-form-stacked edit-form"
action="{{ url_for('diff_history_page', uuid=uuid) }}#extract"
method="POST">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<p>This tool will extract text data from all of the watch history.</p>
<div class="pure-control-group">
{{ render_field(extract_form.extract_regex) }}
<span class="pure-form-message-inline">
A <strong>RegEx</strong> is a pattern that identifies exactly which part inside of the text that you want to extract.<br/>
<p>
For example, to extract only the numbers from text &dash;</br>
<strong>Raw text</strong>: <code>Temperature <span style="color: red">5.5</span>°C in Sydney</code></br>
<strong>RegEx to extract:</strong> <code>Temperature ([0-9\.]+)</code><br/>
</p>
<p>
<a href="https://RegExr.com/">Be sure to test your RegEx here.</a>
</p>
<p>
Each RegEx group bracket <code>()</code> will be in its own column, the first column value is always the date.
</p>
</span>
</div>
<div class="pure-control-group">
{{ render_button(extract_form.extract_submit_button) }}
</div>
</form>
</div>
</div>
<script>

@ -244,6 +244,7 @@ xpath://body/div/span[contains(@class, 'example-class')]",
<span class="pure-form-message-inline"><strong>Note!: //text() function does not work where the &lt;element&gt; contains &lt;![CDATA[]]&gt;</strong></span><br/>
{% endif %}
<span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/>
<ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed).

@ -0,0 +1,70 @@
#!/usr/bin/python3
import time
from flask import url_for
from urllib.request import urlopen
from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
sleep_time_for_fetch_thread = 3
def test_check_extract_text_from_diff(client, live_server):
import time
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("Now it's {} seconds since epoch, time flies!".format(str(time.time())))
live_server_setup(live_server)
# Add our URL to the import page
res = client.post(
url_for("import_page"),
data={"urls": url_for('test_endpoint', _external=True)},
follow_redirects=True
)
assert b"1 Imported" in res.data
time.sleep(1)
# Load in 5 different numbers/changes
last_date=""
for n in range(5):
# Give the thread time to pick it up
print("Bumping snapshot and checking.. ", n)
last_date = str(time.time())
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write("Now it's {} seconds since epoch, time flies!".format(last_date))
client.get(url_for("form_watch_checknow"), follow_redirects=True)
wait_for_all_checks(client)
res = client.post(
url_for("diff_history_page", uuid="first"),
data={"extract_regex": "Now it's ([0-9\.]+)",
"extract_submit_button": "Extract as CSV"},
follow_redirects=False
)
assert b'Nothing matches that RegEx' not in res.data
assert res.content_type == 'text/csv'
# Read the csv reply as stringio
from io import StringIO
import csv
f = StringIO(res.data.decode('utf-8'))
reader = csv.reader(f, delimiter=',')
output=[]
for row in reader:
output.append(row)
assert output[0][0] == 'Epoch seconds'
# Header line + 1 origin/first + 5 changes
assert(len(output) == 7)
# We expect to find the last bumped date in the changes in the last field of the spreadsheet
assert(output[6][2] == last_date)
# And nothing else, only that group () of the decimal and .
assert "time flies" not in output[6][2]
Loading…
Cancel
Save