Distill.io JSON export file importer (#592)
parent
18f0b63b7d
commit
f28c260576
@ -0,0 +1,133 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
import time
|
||||||
|
import validators
|
||||||
|
|
||||||
|
|
||||||
|
class Importer():
|
||||||
|
remaining_data = []
|
||||||
|
new_uuids = []
|
||||||
|
good = 0
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.new_uuids = []
|
||||||
|
self.good = 0
|
||||||
|
self.remaining_data = []
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def run(self,
|
||||||
|
data,
|
||||||
|
flash,
|
||||||
|
datastore):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class import_url_list(Importer):
|
||||||
|
"""
|
||||||
|
Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format
|
||||||
|
"""
|
||||||
|
def run(self,
|
||||||
|
data,
|
||||||
|
flash,
|
||||||
|
datastore,
|
||||||
|
):
|
||||||
|
|
||||||
|
urls = data.split("\n")
|
||||||
|
good = 0
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
if (len(urls) > 5000):
|
||||||
|
flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
url = url.strip()
|
||||||
|
if not len(url):
|
||||||
|
continue
|
||||||
|
|
||||||
|
tags = ""
|
||||||
|
|
||||||
|
# 'tags' should be a csv list after the URL
|
||||||
|
if ' ' in url:
|
||||||
|
url, tags = url.split(" ", 1)
|
||||||
|
|
||||||
|
# Flask wtform validators wont work with basic auth, use validators package
|
||||||
|
# Up to 5000 per batch so we dont flood the server
|
||||||
|
if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
|
||||||
|
new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
|
||||||
|
if new_uuid:
|
||||||
|
# Straight into the queue.
|
||||||
|
self.new_uuids.append(new_uuid)
|
||||||
|
good += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Worked past the 'continue' above, append it to the bad list
|
||||||
|
if self.remaining_data is None:
|
||||||
|
self.remaining_data = []
|
||||||
|
self.remaining_data.append(url)
|
||||||
|
|
||||||
|
flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
|
||||||
|
|
||||||
|
|
||||||
|
class import_distill_io_json(Importer):
|
||||||
|
def run(self,
|
||||||
|
data,
|
||||||
|
flash,
|
||||||
|
datastore,
|
||||||
|
):
|
||||||
|
|
||||||
|
import json
|
||||||
|
good = 0
|
||||||
|
now = time.time()
|
||||||
|
self.new_uuids=[]
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(data.strip())
|
||||||
|
except json.decoder.JSONDecodeError:
|
||||||
|
flash("Unable to read JSON file, was it broken?", 'error')
|
||||||
|
return
|
||||||
|
|
||||||
|
if not data.get('data'):
|
||||||
|
flash("JSON structure looks invalid, was it broken?", 'error')
|
||||||
|
return
|
||||||
|
|
||||||
|
for d in data.get('data'):
|
||||||
|
d_config = json.loads(d['config'])
|
||||||
|
extras = {'title': d['name']}
|
||||||
|
|
||||||
|
if len(d['uri']) and good < 5000:
|
||||||
|
try:
|
||||||
|
# @todo we only support CSS ones at the moment
|
||||||
|
if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css':
|
||||||
|
extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr']
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
except IndexError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr']
|
||||||
|
if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath':
|
||||||
|
extras['css_filter'] = 'xpath:' + extras['css_filter']
|
||||||
|
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
except IndexError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
extras['tag'] = ", ".join(d['tags'])
|
||||||
|
except KeyError:
|
||||||
|
pass
|
||||||
|
except IndexError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
new_uuid = datastore.add_watch(url=d['uri'].strip(),
|
||||||
|
extras=extras,
|
||||||
|
write_to_disk_now=False)
|
||||||
|
|
||||||
|
if new_uuid:
|
||||||
|
# Straight into the queue.
|
||||||
|
self.new_uuids.append(new_uuid)
|
||||||
|
good += 1
|
||||||
|
|
||||||
|
flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))
|
Loading…
Reference in new issue