Distill.io JSON export file importer (#592)

3 years ago · f28c260576
parent 18f0b63b7d
commit f28c260576
7 changed files with 381 additions and 64 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -683,44 +683,37 @@ def changedetection_app(config=None, datastore_o=None):
    @app.route("/import", methods=['GET', "POST"])
    @login_required
    def import_page():
        import validators
        remaining_urls = []
        if request.method == 'POST':
            from .importer import import_url_list, import_distill_io_json
            # URL List import
            if request.values.get('urls') and len(request.values.get('urls').strip()):
                # Import and push into the queue for immediate update check
                importer = import_url_list()
                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
                for uuid in importer.new_uuids:
                    update_q.put(uuid)
                if len(importer.remaining_data) == 0:
                    return redirect(url_for('index'))
                else:
                    remaining_urls = importer.remaining_data
            # Distill.io import
            if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
                # Import and push into the queue for immediate update check
                d_importer = import_distill_io_json()
                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
                for uuid in d_importer.new_uuids:
                    update_q.put(uuid)
        good = 0
        if request.method == 'POST':
            now=time.time()
            urls = request.values.get('urls').split("\n")
            if (len(urls) > 5000):
                flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
            for url in urls:
                url = url.strip()
                url, *tags = url.split(" ")
                # Flask wtform validators wont work with basic auth, use validators package
                # Up to 5000 per batch so we dont flood the server
                if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
                    new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
                    if new_uuid:
                        # Straight into the queue.
                        update_q.put(new_uuid)
                        good += 1
                        continue
                if len(url.strip()):
                    remaining_urls.append(url)
            flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
            datastore.needs_write = True
            if len(remaining_urls) == 0:
                # Looking good, redirect to index.
                return redirect(url_for('index'))
        # Could be some remaining, or we could be on GET
        output = render_template("import.html",
-                                 remaining="\n".join(remaining_urls)
+                                 import_url_list_remaining="\n".join(remaining_urls),
                                 original_distill_json=''
                                 )
        return output
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -17,10 +17,10 @@ class perform_site_check():
        self.datastore = datastore
    # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
-        # if watch.proxy use that
+    # if watch.proxy use that
-        # fetcher.proxy_override = watch.proxy or main config proxy
+    # fetcher.proxy_override = watch.proxy or main config proxy
-        # Allows override the proxy on a per-request basis
+    # Allows override the proxy on a per-request basis
-        # ALWAYS use the first one is nothing selected
+    # ALWAYS use the first one is nothing selected
    def set_proxy_from_list(self, watch):
        proxy_args = None
@ -149,11 +149,13 @@ class perform_site_check():
                # Then we assume HTML
                if has_filter_rule:
                    # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                    if css_filter_rule[0] == '/':
+                    if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
-                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
                                                               html_content=fetcher.content)
                    else:
                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
                if has_subtractive_selectors:
                    html_content = html_tools.element_removal(subtractive_selectors, html_content)
@ -173,7 +175,6 @@ class perform_site_check():
            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
--- a/changedetectionio/importer.py
+++ b/changedetectionio/importer.py
@ -0,0 +1,133 @@
 from abc import ABC, abstractmethod
 import time
 import validators
 class Importer(ABC):
    """Common base for the importers used by the import page.

    An importer consumes raw text submitted by the user, asks the datastore
    to create a watch for every usable entry and records the outcome on the
    instance:

    * ``new_uuids``      - UUIDs returned by the datastore for created watches
    * ``remaining_data`` - entries that were not imported
    * ``good``           - number of entries imported

    Deriving from ``ABC`` makes ``@abstractmethod`` effective: a subclass
    that forgets to implement ``run`` can no longer be instantiated.
    """
    # Class-level fallbacks only. __init__ rebinds all three on every
    # instance so results are never shared between importer objects.
    remaining_data = []
    new_uuids = []
    good = 0

    def __init__(self):
        self.new_uuids = []
        self.good = 0
        self.remaining_data = []

    @abstractmethod
    def run(self,
            data,
            flash,
            datastore):
        """Import ``data`` into ``datastore``.

        :param data: raw text to import (format depends on the subclass)
        :param flash: callable used to report messages back to the user
        :param datastore: object the watches are added to
        """
 class import_url_list(Importer):
    """
    Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format
    """
    def run(self,
            data,
            flash,
            datastore,
            ):
        """Import a newline separated list of URLs, each optionally followed by tags.

        Everything after the first space on a line is passed to the datastore
        as the tag string. At most 5000 watches are created per call.

        Lines that fail validation, exceed the batch limit or are refused by
        the datastore are stored unmodified (URL *and* tags) in
        ``self.remaining_data`` so they can be corrected and resubmitted
        without losing their tags.

        :param data: text containing one ``URL [tags]`` entry per line
        :param flash: callable used to report messages back to the user
        :param datastore: object providing ``add_watch(url=, tag=, write_to_disk_now=)``
        """
        now = time.time()
        good = 0
        urls = data.split("\n")

        if len(urls) > 5000:
            flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")

        for line in urls:
            line = line.strip()
            if not line:
                continue

            # 'tags' should be a csv list after the URL
            url, _, tags = line.partition(" ")

            # Flask wtform validators wont work with basic auth, use validators package
            # Up to 5000 per batch so we dont flood the server
            if good < 5000 and validators.url(url.replace('source:', '')):
                new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
                if new_uuid:
                    # Straight into the queue.
                    self.new_uuids.append(new_uuid)
                    good += 1
                    continue

            # Worked past the 'continue' above, keep the whole line (with its tags) in the bad list
            self.remaining_data.append(line)

        self.good = good
        flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
 class import_distill_io_json(Importer):
    """Imports watches from the text of a Distill.io JSON export.

    Per entry the fields ``name``, ``uri``, ``tags`` and the first
    include/exclude selector found in ``config`` are used; everything else
    is ignored.
    """
    def run(self,
            data,
            flash,
            datastore,
            ):
        """Create a watch for every usable entry of the export.

        Malformed input never raises: an unreadable document is reported via
        ``flash`` and the call returns. An entry without a usable ``uri``,
        beyond the 5000-per-call limit or refused by the datastore is kept in
        ``self.remaining_data`` so the "Skipped" count is accurate. An entry
        whose ``config`` cannot be parsed is still imported, without filters.

        :param data: JSON text of the export
        :param flash: callable used to report messages back to the user
        :param datastore: object providing ``add_watch(url=, extras=, write_to_disk_now=)``
        """
        import json
        good = 0
        now = time.time()
        self.new_uuids = []
        self.remaining_data = []

        try:
            data = json.loads(data.strip())
        except json.decoder.JSONDecodeError:
            flash("Unable to read JSON file, was it broken?", 'error')
            return

        # Valid JSON is not necessarily an object holding a list of entries
        entries = data.get('data') if isinstance(data, dict) else None
        if not entries or not isinstance(entries, list):
            flash("JSON structure looks invalid, was it broken?", 'error')
            return

        for d in entries:
            uri = d.get('uri') if isinstance(d, dict) else None
            if not isinstance(uri, str) or not uri.strip() or good >= 5000:
                self.remaining_data.append(d)
                continue

            # 'config' is itself a JSON document stored as a string
            try:
                d_config = json.loads(d['config'])
            except (KeyError, TypeError, ValueError):
                d_config = {}

            new_uuid = datastore.add_watch(url=uri.strip(),
                                           extras=self._build_extras(d, d_config),
                                           write_to_disk_now=False)
            if new_uuid:
                # Straight into the queue.
                self.new_uuids.append(new_uuid)
                good += 1
            else:
                self.remaining_data.append(d)

        self.good = good
        flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))

    @staticmethod
    def _build_extras(d, d_config):
        """Translate one export entry and its parsed config into add_watch extras."""
        extras = {'title': d.get('name')}

        # Only the first frame of the first selection is considered
        try:
            frame = d_config['selections'][0]['frames'][0]
        except (KeyError, IndexError, TypeError):
            frame = {}

        try:
            exclude = frame['excludes'][0]
            # @todo we only support CSS ones at the moment
            if exclude['type'] == 'css':
                extras['subtractive_selectors'] = exclude['expr']
        except (KeyError, IndexError, TypeError):
            pass

        try:
            include = frame['includes'][0]
            expr = include['expr']
            if isinstance(expr, str):
                # XPath rules are marked with a prefix, anything else is used as-is
                extras['css_filter'] = 'xpath:' + expr if include.get('type') == 'xpath' else expr
        except (KeyError, IndexError, TypeError, AttributeError):
            pass

        tags = d.get('tags')
        if isinstance(tags, (list, tuple)):
            extras['tag'] = ", ".join(str(t) for t in tags)

        return extras
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -131,7 +131,7 @@ User-Agent: wonderbra 1.0") }}
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required,  <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
-                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code>, <a
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                    </ul>
                    Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
--- a/changedetectionio/templates/import.html
+++ b/changedetectionio/templates/import.html
@ -1,30 +1,86 @@
 {% extends 'base.html' %}
 {% block content %}
-<div class="edit-form">
+<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
-     <div class="inner">
+<div class="edit-form monospaced-textarea">
    <div class="tabs collapsable">
        <ul>
            <li class="tab" id="default-tab"><a href="#url-list">URL List</a></li>
            <li class="tab"><a href="#distill-io">Distill.io</a></li>
        </ul>
    </div>
    <div class="box-wrap inner">
        <form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
-            <fieldset class="pure-group">
+            <div class="tab-pane-inner" id="url-list">
-              <legend>
+                <fieldset class="pure-group">
-                Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
+                    <legend>
-                <br>
+                        Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma
-                <code>https://example.com tag1, tag2, last tag</code>
+                        (,):
-                <br>
+                        <br>
-                URLs which do not pass validation will stay in the textarea.
+                        <code>https://example.com tag1, tag2, last tag</code>
-              </legend>
+                        <br>
-              
+                        URLs which do not pass validation will stay in the textarea.
-
+                    </legend>
-                <textarea name="urls" class="pure-input-1-2" placeholder="https://"
+
-                          style="width: 100%;
+
                    <textarea name="urls" class="pure-input-1-2" placeholder="https://"
                              style="width: 100%;
                                font-family:monospace;
                                white-space: pre;
                                overflow-wrap: normal;
-                                overflow-x: scroll;" rows="25">{{ remaining }}</textarea>
+                                overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
-            </fieldset>
+                </fieldset>
            </div>
            <div class="tab-pane-inner" id="distill-io">
                <fieldset class="pure-group">
                    <legend>
                        Copy and paste your Distill.io watch 'export' file, this should be a JSON file.<br/>
                        This is <i>experimental</i>, supported fields are <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, the rest (including <code>schedule</code>) are ignored.
                        <br/>
                        <p>
                        How to export? <a href="https://distill.io/docs/web-monitor/how-export-and-import-monitors/">https://distill.io/docs/web-monitor/how-export-and-import-monitors/</a><br/>
                        Be sure to set your default fetcher to Chrome if required.<br/>
                        </p>
                    </legend>
                    <textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
                                font-family:monospace;
                                white-space: pre;
                                overflow-wrap: normal;
                                overflow-x: scroll;" placeholder="Example Distill.io JSON export file
 {
    &quot;client&quot;: {
        &quot;local&quot;: 1
    },
    &quot;data&quot;: [
        {
            &quot;name&quot;: &quot;Unraid | News&quot;,
            &quot;uri&quot;: &quot;https://unraid.net/blog&quot;,
            &quot;config&quot;: &quot;{\&quot;selections\&quot;:[{\&quot;frames\&quot;:[{\&quot;index\&quot;:0,\&quot;excludes\&quot;:[],\&quot;includes\&quot;:[{\&quot;type\&quot;:\&quot;xpath\&quot;,\&quot;expr\&quot;:\&quot;(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\&quot;}]}],\&quot;dynamic\&quot;:true,\&quot;delay\&quot;:2}],\&quot;ignoreEmptyText\&quot;:true,\&quot;includeStyle\&quot;:false,\&quot;dataAttr\&quot;:\&quot;text\&quot;}&quot;,
            &quot;tags&quot;: [],
            &quot;content_type&quot;: 2,
            &quot;state&quot;: 40,
            &quot;schedule&quot;: &quot;{\&quot;type\&quot;:\&quot;INTERVAL\&quot;,\&quot;params\&quot;:{\&quot;interval\&quot;:4447}}&quot;,
            &quot;ts&quot;: &quot;2022-03-27T15:51:15.667Z&quot;
        }
    ]
 }
 " rows="25">{{ original_distill_json }}</textarea>
                </fieldset>
            </div>
            <button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
        </form>
-     </div>
+
    </div>
 </div>
 {% endblock %}
--- a/changedetectionio/tests/test_import.py
+++ b/changedetectionio/tests/test_import.py
@ -5,18 +5,17 @@ import time
 from flask import url_for
 from .util import live_server_setup
-
+def test_setup(client, live_server):
 def test_import(client, live_server):
    live_server_setup(live_server)
 def test_import(client, live_server):
    # Give the endpoint time to spin up
    time.sleep(1)
    res = client.post(
        url_for("import_page"),
        data={
            "distill-io": "",
            "urls": """https://example.com
 https://example.com tag1
 https://example.com tag1, other tag"""
@ -26,3 +25,96 @@ https://example.com tag1, other tag"""
    assert b"3 Imported" in res.data
    assert b"tag1" in res.data
    assert b"other tag" in res.data
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    # Clear flask alerts
    res = client.get( url_for("index"))
    res = client.get( url_for("index"))
 def xtest_import_skip_url(client, live_server):
    """Submit one valid and one malformed URL to the import page.

    Expects the valid URL to be imported, and the malformed line to be
    reported as skipped and echoed back in the response.

    NOTE(review): the ``x`` prefix means the name does not match the usual
    ``test_*`` pattern, so this is presumably disabled on purpose - confirm.
    """
    # Give the endpoint time to spin up
    time.sleep(1)
    res = client.post(
        url_for("import_page"),
        data={
            "distill-io": "",
            "urls": """https://example.com
 :ht000000broken
 """
        },
        follow_redirects=True,
    )
    # One good line, one bad line; the bad one must come back in the page
    assert b"1 Imported" in res.data
    assert b"ht000000broken" in res.data
    assert b"1 Skipped" in res.data
    # Remove every watch so later tests start from a clean state
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    # Clear flask alerts
    res = client.get( url_for("index"))
 def test_import_distillio(client, live_server):
    """Import a one-entry Distill.io export and check the resulting watch.

    Verifies that the import is reported as successful, that URL and title
    show up on the edit page, that the XPath include rule is stored with the
    ``xpath:`` prefix and that both tags are listed on the index page.
    """
    # Sample export; 'config' and 'schedule' are JSON documents embedded as strings
    distill_data='''
 {
    "client": {
        "local": 1
    },
    "data": [
        {
            "name": "Unraid | News",
            "uri": "https://unraid.net/blog",
            "config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
            "tags": ["nice stuff", "nerd-news"],
            "content_type": 2,
            "state": 40,
            "schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
            "ts": "2022-03-27T15:51:15.667Z"
        }
    ]
 }		   
 '''
    # Give the endpoint time to spin up
    time.sleep(1)
    # Start from an empty watch list so uuid="first" below refers to the imported watch
    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    res = client.post(
        url_for("import_page"),
        data={
            "distill-io": distill_data,
            "urls" : ''
        },
        follow_redirects=True,
    )
    assert b"Unable to read JSON file, was it broken?" not in res.data
    assert b"1 Imported from Distill.io" in res.data
    res = client.get( url_for("edit_page", uuid="first"))
    assert b"https://unraid.net/blog" in res.data
    assert b"Unraid | News" in res.data
    # flask/wtforms should recode this, check we see it
    # wtforms encodes it like id=&#39 ,but html.escape makes it like id=&#x27
    # - so just check it manually :(
    #import json
    #import html
    #d = json.loads(distill_data)
    # embedded_d=json.loads(d['data'][0]['config'])
    # x=html.escape(embedded_d['selections'][0]['frames'][0]['includes'][0]['expr']).encode('utf-8')
    assert b"xpath:(//div[@id=&#39;App&#39;]/div[contains(@class,&#39;flex&#39;)]/main[contains(@class,&#39;relative&#39;)]/section[contains(@class,&#39;relative&#39;)]/div[@class=&#39;container&#39;]/div[contains(@class,&#39;flex&#39;)]/div[contains(@class,&#39;w-full&#39;)])[1]" in res.data
    # did the tags work?
    res = client.get( url_for("index"))
    assert b"nice stuff" in res.data
    assert b"nerd-news" in res.data
    # Remove every watch so later tests start from a clean state
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    # Clear flask alerts
    res = client.get(url_for("index"))
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@ -117,3 +117,45 @@ def test_xpath_validation(client, live_server):
        follow_redirects=True
    )
    assert b"is not a valid XPath expression" in res.data
 # Actually only really used by the Distill.io importer, but could be handy too
 def test_check_with_prefix_css_filter(client, live_server):
    """Check that a filter written with the explicit ``xpath:`` prefix is applied.

    Imports the local test endpoint as a watch, sets an ``xpath:`` prefixed
    rule as its filter and expects the preview to contain only the text
    matched by that rule.
    """
    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data

    # Give the endpoint time to spin up
    time.sleep(1)
    set_original_response()

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    # Wait for the first check of the new watch
    time.sleep(3)

    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter":  "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    # Wait for the re-check with the filter applied
    time.sleep(3)

    res = client.get(
        url_for("preview_page", uuid="first"),
        follow_redirects=True
    )
    assert b"Some text thats the same" in res.data #in selector
    assert b"Some text that will change" not in res.data #not in selector

    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)