From f28c2605765505d4ad2c94d6ca7117292b92fe30 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Tue, 10 May 2022 17:15:41 +0200
Subject: [PATCH] Distill.io JSON export file importer (#592)

---
 changedetectionio/__init__.py           |  57 ++++----
 changedetectionio/fetch_site_status.py  |  17 +--
 changedetectionio/importer.py           | 133 ++++++++++++++++++
 changedetectionio/templates/edit.html   |   2 +-
 changedetectionio/templates/import.html |  92 +++++++++---
 changedetectionio/tests/test_import.py  | 100 ++++++++++++-
 .../tests/test_xpath_selector.py         |  44 +++++-
 7 files changed, 381 insertions(+), 64 deletions(-)
 create mode 100644 changedetectionio/importer.py

diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 707de4f8..5cc58e02 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -683,44 +683,37 @@ def changedetection_app(config=None, datastore_o=None):
     @app.route("/import", methods=['GET', "POST"])
     @login_required
     def import_page():
-        import validators
         remaining_urls = []
+        if request.method == 'POST':
+            from .importer import import_url_list, import_distill_io_json
+
+            # URL List import
+            if request.values.get('urls') and len(request.values.get('urls').strip()):
+                # Import and push into the queue for immediate update check
+                importer = import_url_list()
+                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
+                for uuid in importer.new_uuids:
+                    update_q.put(uuid)
+
+                if len(importer.remaining_data) == 0:
+                    return redirect(url_for('index'))
+                else:
+                    remaining_urls = importer.remaining_data
+
+            # Distill.io import
+            if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
+                # Import and push into the queue for immediate update check
+                d_importer = import_distill_io_json()
+                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
+                for uuid in d_importer.new_uuids:
+                    update_q.put(uuid)

-        good = 0
-        if request.method == 'POST':
-            now=time.time()
-            urls = request.values.get('urls').split("\n")
-
-            if (len(urls) > 5000):
-                flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
-
-            for url in urls:
-                url = url.strip()
-                url, *tags = url.split(" ")
-                # Flask wtform validators wont work with basic auth, use validators package
-                # Up to 5000 per batch so we dont flood the server
-                if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
-                    new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
-                    if new_uuid:
-                        # Straight into the queue.
-                        update_q.put(new_uuid)
-                        good += 1
-                        continue
-
-                if len(url.strip()):
-                    remaining_urls.append(url)
-
-            flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
-            datastore.needs_write = True
-
-            if len(remaining_urls) == 0:
-                # Looking good, redirect to index.
-                return redirect(url_for('index'))

         # Could be some remaining, or we could be on GET
         output = render_template("import.html",
-                                 remaining="\n".join(remaining_urls)
+                                 import_url_list_remaining="\n".join(remaining_urls),
+                                 original_distill_json=''
                                  )
         return output
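The route now only wires HTTP input to the importer classes and queues the results. A minimal sketch of driving an importer outside the Flask route, assuming the class names from importer.py below; bulk_import is a hypothetical helper, and 'datastore' and 'update_q' stand in for the app's live objects:

    # Hedged sketch: 'datastore' and 'update_q' are the application's real objects
    # at runtime; 'flash' can be any callable that accepts a message string.
    from changedetectionio.importer import import_url_list

    def bulk_import(url_list_text, datastore, update_q):
        importer = import_url_list()
        importer.run(data=url_list_text, flash=print, datastore=datastore)
        for uuid in importer.new_uuids:
            update_q.put(uuid)  # queue each new watch for an immediate recheck
        # Anything that failed validation stays behind for the user to fix
        return importer.remaining_data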
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 36ead8ec..93e21663 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -17,10 +17,10 @@ class perform_site_check():
         self.datastore = datastore

     # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
-        # if watch.proxy use that
-        # fetcher.proxy_override = watch.proxy or main config proxy
-        # Allows override the proxy on a per-request basis
-        # ALWAYS use the first one is nothing selected
+    # if watch.proxy use that
+    # fetcher.proxy_override = watch.proxy or main config proxy
+    # Allows overriding the proxy on a per-request basis
+    # ALWAYS use the first one if nothing is selected
     def set_proxy_from_list(self, watch):
         proxy_args = None

@@ -149,11 +149,13 @@ class perform_site_check():
             # Then we assume HTML
             if has_filter_rule:
                 # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                if css_filter_rule[0] == '/':
-                    html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
+                    html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
+                                                           html_content=fetcher.content)
                 else:
                     # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                     html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+
             if has_subtractive_selectors:
                 html_content = html_tools.element_removal(subtractive_selectors, html_content)

@@ -173,7 +175,6 @@ class perform_site_check():
             # Re #340 - return the content before the 'ignore text' was applied
             text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

-            # Re #340 - return the content before the 'ignore text' was applied
             text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

@@ -224,4 +225,4 @@ class perform_site_check():
         if not watch['title'] or not len(watch['title']):
             update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

-        return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
\ No newline at end of file
+        return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
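Both spellings of an XPath rule now reach html_tools.xpath_filter(). A small illustration of the normalisation (the rule value is an example only); note that str.replace() strips the 'xpath:' marker anywhere in the string, not just as a prefix, which is harmless for well-formed rules:

    # Illustrative rule; the branch mirrors the dispatch added above.
    rule = "xpath://*[contains(@class, 'sametext')]"
    if rule[0] == '/' or rule.startswith('xpath:'):
        expr = rule.replace('xpath:', '')  # -> "//*[contains(@class, 'sametext')]"
        # i.e. equivalent to entering the bare forward-slash form directly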
tags = "" + + # 'tags' should be a csv list after the URL + if ' ' in url: + url, tags = url.split(" ", 1) + + # Flask wtform validators wont work with basic auth, use validators package + # Up to 5000 per batch so we dont flood the server + if len(url) and validators.url(url.replace('source:', '')) and good < 5000: + new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False) + if new_uuid: + # Straight into the queue. + self.new_uuids.append(new_uuid) + good += 1 + continue + + # Worked past the 'continue' above, append it to the bad list + if self.remaining_data is None: + self.remaining_data = [] + self.remaining_data.append(url) + + flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data))) + + +class import_distill_io_json(Importer): + def run(self, + data, + flash, + datastore, + ): + + import json + good = 0 + now = time.time() + self.new_uuids=[] + + + try: + data = json.loads(data.strip()) + except json.decoder.JSONDecodeError: + flash("Unable to read JSON file, was it broken?", 'error') + return + + if not data.get('data'): + flash("JSON structure looks invalid, was it broken?", 'error') + return + + for d in data.get('data'): + d_config = json.loads(d['config']) + extras = {'title': d['name']} + + if len(d['uri']) and good < 5000: + try: + # @todo we only support CSS ones at the moment + if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css': + extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr'] + except KeyError: + pass + except IndexError: + pass + + try: + extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr'] + if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath': + extras['css_filter'] = 'xpath:' + extras['css_filter'] + + except KeyError: + pass + except IndexError: + pass + + try: + extras['tag'] = ", ".join(d['tags']) + except KeyError: + pass + except IndexError: + pass + + new_uuid = datastore.add_watch(url=d['uri'].strip(), + extras=extras, + write_to_disk_now=False) + + if new_uuid: + # Straight into the queue. + self.new_uuids.append(new_uuid) + good += 1 + + flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data))) diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 5a09cd35..98dedfb4 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -131,7 +131,7 @@ User-Agent: wonderbra 1.0") }}
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 5a09cd35..98dedfb4 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -131,7 +131,7 @@ User-Agent: wonderbra 1.0") }}
   • CSS - Limit text to this CSS rule, only text matching this CSS rule is included.
   • JSON - Limit text to this JSON rule, using JSONPath, prefix with "json:", use json:$ to force re-formatting if required, test your JSONPath here
-  • XPath - Limit text to this XPath rule, simply start with a forward-slash, example //*[contains(@class, 'sametext')], test your XPath here
+  • XPath - Limit text to this XPath rule, simply start with a forward-slash, example //*[contains(@class, 'sametext')] or xpath://*[contains(@class, 'sametext')], test your XPath here
   • Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub!
diff --git a/changedetectionio/templates/import.html b/changedetectionio/templates/import.html
--- a/changedetectionio/templates/import.html
+++ b/changedetectionio/templates/import.html
@@ ... @@
-    Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
-    https://example.com tag1, tag2, last tag
-    URLs which do not pass validation will stay in the textarea.
-    <textarea name="urls" rows="25">{{ remaining }}</textarea>
+    <div class="tab-pane-inner" id="url-list">
+        Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
+        https://example.com tag1, tag2, last tag
+        URLs which do not pass validation will stay in the textarea.
+        <textarea name="urls" style="width: 100%;
+                  font-family: monospace;
+                  white-space: pre;
+                  overflow-wrap: normal;
+                  overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
+    </div>
+    <div class="tab-pane-inner" id="distill-io">
+        Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.
+        This is experimental, supported fields are name, uri, tags, config:selections, the rest (including schedule) are ignored.
+        How to export? https://distill.io/docs/web-monitor/how-export-and-import-monitors/
+        Be sure to set your default fetcher to Chrome if required.
+        <textarea name="distill-io" rows="25">{{ original_distill_json }}</textarea>
+    </div>
 {% endblock %}
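Both panes post to the same /import endpoint, and whichever textarea is non-empty selects the importer that runs. Assuming a local instance on the default port with no password set (otherwise session handling gets in the way and the UI or Flask test client is the easier route), an equivalent raw POST would look roughly like:

    curl http://localhost:5000/import \
         --data-urlencode 'urls=https://example.com tag1, tag2' \
         --data-urlencode 'distill-io='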
diff --git a/changedetectionio/tests/test_import.py b/changedetectionio/tests/test_import.py
index 07676023..c4edad5c 100644
--- a/changedetectionio/tests/test_import.py
+++ b/changedetectionio/tests/test_import.py
@@ -5,18 +5,17 @@ import time
 from flask import url_for
 from .util import live_server_setup

-
-def test_import(client, live_server):
-
+def test_setup(client, live_server):
     live_server_setup(live_server)

+def test_import(client, live_server):
     # Give the endpoint time to spin up
     time.sleep(1)

     res = client.post(
         url_for("import_page"),
         data={
+            "distill-io": "",
             "urls": """https://example.com
 https://example.com tag1
 https://example.com tag1, other tag"""
@@ -26,3 +25,96 @@ https://example.com tag1, other tag"""
     assert b"3 Imported" in res.data
     assert b"tag1" in res.data
     assert b"other tag" in res.data
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+
+    # Clear flask alerts
+    res = client.get(url_for("index"))
+    res = client.get(url_for("index"))
+
+
+def xtest_import_skip_url(client, live_server):
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    res = client.post(
+        url_for("import_page"),
+        data={
+            "distill-io": "",
+            "urls": """https://example.com
+:ht000000broken
+"""
+        },
+        follow_redirects=True,
+    )
+    assert b"1 Imported" in res.data
+    assert b"ht000000broken" in res.data
+    assert b"1 Skipped" in res.data
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    # Clear flask alerts
+    res = client.get(url_for("index"))
+
+
+def test_import_distillio(client, live_server):
+    distill_data = '''
+{
+    "client": {
+        "local": 1
+    },
+    "data": [
+        {
+            "name": "Unraid | News",
+            "uri": "https://unraid.net/blog",
+            "config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
+            "tags": ["nice stuff", "nerd-news"],
+            "content_type": 2,
+            "state": 40,
+            "schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
+            "ts": "2022-03-27T15:51:15.667Z"
+        }
+    ]
+}
+'''
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    res = client.post(
+        url_for("import_page"),
+        data={
+            "distill-io": distill_data,
+            "urls": ''
+        },
+        follow_redirects=True,
+    )
+
+    assert b"Unable to read JSON file, was it broken?" not in res.data
+    assert b"1 Imported from Distill.io" in res.data
+
+    res = client.get(url_for("edit_page", uuid="first"))
+    assert b"https://unraid.net/blog" in res.data
+    assert b"Unraid | News" in res.data
+
+    # flask/wtforms should re-encode this, check we see it
+    # wtforms encodes the quote like id=&#39; but html.escape makes it id=&#x27;
+    # - so just check it manually :(
+    assert b"xpath:(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]" in res.data
+
+    # did the tags work?
+    res = client.get(url_for("index"))
+    assert b"nice stuff" in res.data
+    assert b"nerd-news" in res.data
+
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    # Clear flask alerts
+    res = client.get(url_for("index"))
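The backslash-escaped quotes in the fixture above come from the same JSON-in-JSON nesting the importer decodes. A short sketch of how such an export snippet is produced (illustrative values, not the Distill.io tool itself):

    import json

    # Build the inner config first, then serialise it to a string before
    # embedding it, which is what yields the \" escaping in the fixture.
    config = {"selections": [{"frames": [{"index": 0,
                                          "excludes": [],
                                          "includes": [{"type": "xpath", "expr": "//h1"}]}],
                              "dynamic": True, "delay": 2}]}
    entry = {"name": "Example", "uri": "https://example.com",
             "config": json.dumps(config)}
    print(json.dumps({"data": [entry]}, indent=2))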
diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py
index d1374834..7a0ba0dc 100644
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -116,4 +116,46 @@ def test_xpath_validation(client, live_server):
         data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
         follow_redirects=True
     )
-    assert b"is not a valid XPath expression" in res.data
\ No newline at end of file
+    assert b"is not a valid XPath expression" in res.data
+
+
+# actually only really used by the distill.io importer, but could be handy too
+def test_check_with_prefix_css_filter(client, live_server):
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    set_original_response()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+
+    assert b"Updated watch." in res.data
+    time.sleep(3)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b"Some text thats the same" in res.data  # in selector
+    assert b"Some text that will change" not in res.data  # not in selector
+
+    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)