Distill.io JSON export file importer (#592)
parent
18f0b63b7d
commit
f28c260576
@ -0,0 +1,133 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
import time
|
||||||
|
import validators
|
||||||
|
|
||||||
|
|
||||||
|
class Importer(ABC):
    """Common base for the watch importers.

    Every importer exposes the same three result attributes, which are
    filled in by ``run()``:

    * ``new_uuids``      - identifiers returned by ``datastore.add_watch`` for
                           the entries that were imported.
    * ``good``           - number of entries imported.
    * ``remaining_data`` - input entries that were not imported.

    Inheriting from ``ABC`` makes the ``@abstractmethod`` marker on ``run``
    effective: a subclass that forgets to implement ``run`` (or this base
    class itself) cannot be instantiated.
    """

    # Declared here for readers and tooling only. The list attributes are
    # deliberately NOT given class-level values: a class-level list would be
    # a single object shared by every importer instance.
    remaining_data: list
    new_uuids: list
    good: int = 0

    def __init__(self):
        # Fresh, per-instance result containers.
        self.new_uuids = []
        self.good = 0
        self.remaining_data = []

    @abstractmethod
    def run(self,
            data,
            flash,
            datastore):
        """Import watches described by ``data``.

        :param data: raw text submitted by the user; its format is defined by
            the concrete importer.
        :param flash: callable used to report messages to the user; called as
            ``flash(message)`` or ``flash(message, category)``.
        :param datastore: object providing ``add_watch(...)``, used to create
            one watch per imported entry.
        """
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class import_url_list(Importer):
    """
    Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format

    One entry per line. Everything after the first space on a line is passed
    to the datastore as that URL's tag string.
    """

    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create one watch per valid URL line found in ``data``.

        At most 5000 watches are created per call so the server is not
        flooded. Lines that fail validation, that the datastore declines, or
        that are beyond the limit are collected in ``self.remaining_data``
        exactly as entered (tags included) so they can be submitted again.

        :param data: newline separated text, one ``URL [tags]`` entry per line.
        :param flash: callable used to report progress/result messages.
        :param datastore: object providing
            ``add_watch(url=..., tag=..., write_to_disk_now=...)``; a truthy
            return value is treated as the new watch identifier.
        """
        lines = data.split("\n")
        good = 0
        now = time.time()

        if len(lines) > 5000:
            flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # 'tags' should be a csv list after the URL; with no space in the
            # line, partition() leaves tags as "".
            url, _, tags = line.partition(" ")

            # Flask wtform validators wont work with basic auth, use validators package
            # Up to 5000 per batch so we dont flood the server.
            # The limit is tested first so nothing is validated needlessly
            # once the batch is full.
            if good < 5000 and validators.url(url.replace('source:', '')):
                new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
                if new_uuid:
                    # Straight into the queue.
                    self.new_uuids.append(new_uuid)
                    good += 1
                    continue

            # Not imported: keep the whole original line (URL *and* tags) so
            # re-submitting the leftovers does not silently drop the tags.
            if self.remaining_data is None:
                self.remaining_data = []
            self.remaining_data.append(line)

        # Publish the count on the instance as well as in the message.
        self.good = good
        flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
|
||||||
|
|
||||||
|
|
||||||
|
class import_distill_io_json(Importer):
    """Imports watches from a Distill.io JSON export.

    Fields read from each entry of the top-level ``data`` list: ``uri``,
    ``name``, ``tags`` and ``config`` (a JSON-encoded string, of which only
    the first include/exclude of the first frame of the first selection is
    used). Everything else in the export is ignored.
    """

    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create one watch per usable entry in the Distill.io export.

        At most 5000 watches are created per call. A malformed entry (not an
        object, no usable ``uri``, unreadable ``config``...) is counted as
        skipped instead of aborting the whole import.

        :param data: text of the Distill.io export file (JSON).
        :param flash: callable used to report result/error messages; errors
            are sent as ``flash(message, 'error')``.
        :param datastore: object providing
            ``add_watch(url=..., extras=..., write_to_disk_now=...)``; a truthy
            return value is treated as the new watch identifier.
        """
        import json

        good = 0
        skipped = 0
        now = time.time()
        self.new_uuids = []

        try:
            data = json.loads(data.strip())
        except json.decoder.JSONDecodeError:
            flash("Unable to read JSON file, was it broken?", 'error')
            return

        # The export must be an object holding a non-empty 'data' list; any
        # other valid JSON value (list, string, number...) is rejected here
        # rather than failing later with an AttributeError/TypeError.
        entries = data.get('data') if isinstance(data, dict) else None
        if not entries or not isinstance(entries, list):
            flash("JSON structure looks invalid, was it broken?", 'error')
            return

        for d in entries:
            uri = d.get('uri') if isinstance(d, dict) else None
            if not isinstance(uri, str) or not uri.strip() or good >= 5000:
                skipped += 1
                continue

            extras = {'title': d['name']} if 'name' in d else {}

            # 'config' is itself a JSON document stored as a string; a missing
            # or unreadable one just means no selectors are imported.
            try:
                d_config = json.loads(d['config'])
            except (KeyError, TypeError, ValueError):
                d_config = {}

            try:
                frame = d_config['selections'][0]['frames'][0]
            except (KeyError, IndexError, TypeError):
                frame = {}

            try:
                # @todo we only support CSS ones at the moment
                exclude = frame['excludes'][0]
                if exclude['type'] == 'css':
                    extras['subtractive_selectors'] = exclude['expr']
            except (KeyError, IndexError, TypeError):
                pass

            try:
                include = frame['includes'][0]
                extras['css_filter'] = include['expr']
                if include['type'] == 'xpath':
                    extras['css_filter'] = 'xpath:' + extras['css_filter']
            except (KeyError, IndexError, TypeError):
                pass

            try:
                extras['tag'] = ", ".join(d['tags'])
            except (KeyError, TypeError):
                pass

            new_uuid = datastore.add_watch(url=uri.strip(),
                                           extras=extras,
                                           write_to_disk_now=False)

            if new_uuid:
                # Straight into the queue.
                self.new_uuids.append(new_uuid)
                good += 1
            else:
                skipped += 1

        # Publish the count on the instance as well as in the message.
        self.good = good
        flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, skipped))
|
@ -1,30 +1,86 @@
|
|||||||
{% extends 'base.html' %}
|
{% extends 'base.html' %}
|
||||||
|
|
||||||
{% block content %}
|
{% block content %}
|
||||||
<div class="edit-form">
|
<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
|
||||||
<div class="inner">
|
<div class="edit-form monospaced-textarea">
|
||||||
|
|
||||||
|
<div class="tabs collapsable">
|
||||||
|
<ul>
|
||||||
|
<li class="tab" id="default-tab"><a href="#url-list">URL List</a></li>
|
||||||
|
<li class="tab"><a href="#distill-io">Distill.io</a></li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="box-wrap inner">
|
||||||
<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
|
<form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
|
||||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
|
||||||
<fieldset class="pure-group">
|
<div class="tab-pane-inner" id="url-list">
|
||||||
<legend>
|
<fieldset class="pure-group">
|
||||||
Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
|
<legend>
|
||||||
<br>
|
Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma
|
||||||
<code>https://example.com tag1, tag2, last tag</code>
|
(,):
|
||||||
<br>
|
<br>
|
||||||
URLs which do not pass validation will stay in the textarea.
|
<code>https://example.com tag1, tag2, last tag</code>
|
||||||
</legend>
|
<br>
|
||||||
|
URLs which do not pass validation will stay in the textarea.
|
||||||
|
</legend>
|
||||||
<textarea name="urls" class="pure-input-1-2" placeholder="https://"
|
|
||||||
style="width: 100%;
|
|
||||||
|
<textarea name="urls" class="pure-input-1-2" placeholder="https://"
|
||||||
|
style="width: 100%;
|
||||||
font-family:monospace;
|
font-family:monospace;
|
||||||
white-space: pre;
|
white-space: pre;
|
||||||
overflow-wrap: normal;
|
overflow-wrap: normal;
|
||||||
overflow-x: scroll;" rows="25">{{ remaining }}</textarea>
|
overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-pane-inner" id="distill-io">
|
||||||
|
|
||||||
|
|
||||||
|
<fieldset class="pure-group">
|
||||||
|
<legend>
|
||||||
|
Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.<br/>
|
||||||
|
This is <i>experimental</i>, supported fields are <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, the rest (including <code>schedule</code>) are ignored.
|
||||||
|
<br/>
|
||||||
|
<p>
|
||||||
|
How to export? <a href="https://distill.io/docs/web-monitor/how-export-and-import-monitors/">https://distill.io/docs/web-monitor/how-export-and-import-monitors/</a><br/>
|
||||||
|
Be sure to set your default fetcher to Chrome if required.<br/>
|
||||||
|
</p>
|
||||||
|
</legend>
|
||||||
|
|
||||||
|
|
||||||
|
<textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
|
||||||
|
font-family:monospace;
|
||||||
|
white-space: pre;
|
||||||
|
overflow-wrap: normal;
|
||||||
|
overflow-x: scroll;" placeholder="Example Distill.io JSON export file
|
||||||
|
|
||||||
|
{
|
||||||
|
"client": {
|
||||||
|
"local": 1
|
||||||
|
},
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"name": "Unraid | News",
|
||||||
|
"uri": "https://unraid.net/blog",
|
||||||
|
"config": "{\"selections\":[{\"frames\":[{\"index\":0,\"excludes\":[],\"includes\":[{\"type\":\"xpath\",\"expr\":\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\"}]}],\"dynamic\":true,\"delay\":2}],\"ignoreEmptyText\":true,\"includeStyle\":false,\"dataAttr\":\"text\"}",
|
||||||
|
"tags": [],
|
||||||
|
"content_type": 2,
|
||||||
|
"state": 40,
|
||||||
|
"schedule": "{\"type\":\"INTERVAL\",\"params\":{\"interval\":4447}}",
|
||||||
|
"ts": "2022-03-27T15:51:15.667Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
" rows="25">{{ original_distill_json }}</textarea>
|
||||||
|
</fieldset>
|
||||||
|
</div>
|
||||||
<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
|
<button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
Loading…
Reference in new issue