From f28c2605765505d4ad2c94d6ca7117292b92fe30 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Tue, 10 May 2022 17:15:41 +0200
Subject: [PATCH] Distill.io JSON export file importer (#592)

---
 changedetectionio/__init__.py           |  57 ++++----
 changedetectionio/fetch_site_status.py  |  17 +--
 changedetectionio/importer.py           | 133 ++++++++++++++++++
 changedetectionio/templates/edit.html   |   2 +-
 changedetectionio/templates/import.html |  92 +++++++++---
 changedetectionio/tests/test_import.py  | 100 ++++++++++++-
 .../tests/test_xpath_selector.py         |  44 +++++-
 7 files changed, 381 insertions(+), 64 deletions(-)
 create mode 100644 changedetectionio/importer.py

diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 707de4f8..5cc58e02 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -683,44 +683,37 @@ def changedetection_app(config=None, datastore_o=None):
     @app.route("/import", methods=['GET', "POST"])
     @login_required
     def import_page():
-        import validators
         remaining_urls = []
+        if request.method == 'POST':
+            from .importer import import_url_list, import_distill_io_json
+
+            # URL List import
+            if request.values.get('urls') and len(request.values.get('urls').strip()):
+                # Import and push into the queue for immediate update check
+                importer = import_url_list()
+                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
+                for uuid in importer.new_uuids:
+                    update_q.put(uuid)
+
+                if len(importer.remaining_data) == 0:
+                    return redirect(url_for('index'))
+                else:
+                    remaining_urls = importer.remaining_data
+
+            # Distill.io import
+            if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
+                # Import and push into the queue for immediate update check
+                d_importer = import_distill_io_json()
+                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
+                for uuid in d_importer.new_uuids:
+                    update_q.put(uuid)

-        good = 0
-        if request.method == 'POST':
-            now=time.time()
-            urls = request.values.get('urls').split("\n")
-
-            if (len(urls) > 5000):
-                flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
-
-            for url in urls:
-                url = url.strip()
-                url, *tags = url.split(" ")
-                # Flask wtform validators wont work with basic auth, use validators package
-                # Up to 5000 per batch so we dont flood the server
-                if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
-                    new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
-                    if new_uuid:
-                        # Straight into the queue.
-                        update_q.put(new_uuid)
-                        good += 1
-                        continue
-
-                if len(url.strip()):
-                    remaining_urls.append(url)
-
-            flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
-            datastore.needs_write = True
-
-            if len(remaining_urls) == 0:
-                # Looking good, redirect to index.
-                return redirect(url_for('index'))

         # Could be some remaining, or we could be on GET
         output = render_template("import.html",
-                                 remaining="\n".join(remaining_urls)
+                                 import_url_list_remaining="\n".join(remaining_urls),
+                                 original_distill_json=''
                                  )
         return output
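The route now only wires HTTP input to the importer classes and queues the results. A minimal sketch of driving an importer outside the Flask route, assuming the class names from importer.py below; bulk_import is a hypothetical helper, and 'datastore' and 'update_q' stand in for the app's live objects:

    # Hedged sketch: 'datastore' and 'update_q' are the application's real objects
    # at runtime; 'flash' can be any callable that accepts a message string.
    from changedetectionio.importer import import_url_list

    def bulk_import(url_list_text, datastore, update_q):
        importer = import_url_list()
        importer.run(data=url_list_text, flash=print, datastore=datastore)
        for uuid in importer.new_uuids:
            update_q.put(uuid)  # queue each new watch for an immediate recheck
        # Anything that failed validation stays behind for the user to fix
        return importer.remaining_data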
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 36ead8ec..93e21663 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -17,10 +17,10 @@ class perform_site_check():
         self.datastore = datastore

     # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
-        # if watch.proxy use that
-        # fetcher.proxy_override = watch.proxy or main config proxy
-        # Allows override the proxy on a per-request basis
-        # ALWAYS use the first one is nothing selected
+    # if watch.proxy use that
+    # fetcher.proxy_override = watch.proxy or main config proxy
+    # Allows overriding the proxy on a per-request basis
+    # ALWAYS use the first one if nothing is selected
     def set_proxy_from_list(self, watch):
         proxy_args = None

@@ -149,11 +149,13 @@ class perform_site_check():
             # Then we assume HTML
             if has_filter_rule:
                 # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                if css_filter_rule[0] == '/':
-                    html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
+                    html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
+                                                           html_content=fetcher.content)
                 else:
                     # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                     html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+
             if has_subtractive_selectors:
                 html_content = html_tools.element_removal(subtractive_selectors, html_content)

@@ -173,7 +175,6 @@ class perform_site_check():
             # Re #340 - return the content before the 'ignore text' was applied
             text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

-            # Re #340 - return the content before the 'ignore text' was applied
             text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

@@ -224,4 +225,4 @@ class perform_site_check():
         if not watch['title'] or not len(watch['title']):
             update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

-        return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
\ No newline at end of file
+        return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
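Both spellings of an XPath rule now reach html_tools.xpath_filter(). A small illustration of the normalisation (the rule value is an example only); note that str.replace() strips the 'xpath:' marker anywhere in the string, not just as a prefix, which is harmless for well-formed rules:

    # Illustrative rule; the branch mirrors the dispatch added above.
    rule = "xpath://*[contains(@class, 'sametext')]"
    if rule[0] == '/' or rule.startswith('xpath:'):
        expr = rule.replace('xpath:', '')  # -> "//*[contains(@class, 'sametext')]"
        # i.e. equivalent to entering the bare forward-slash form directly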
tags = "" + + # 'tags' should be a csv list after the URL + if ' ' in url: + url, tags = url.split(" ", 1) + + # Flask wtform validators wont work with basic auth, use validators package + # Up to 5000 per batch so we dont flood the server + if len(url) and validators.url(url.replace('source:', '')) and good < 5000: + new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False) + if new_uuid: + # Straight into the queue. + self.new_uuids.append(new_uuid) + good += 1 + continue + + # Worked past the 'continue' above, append it to the bad list + if self.remaining_data is None: + self.remaining_data = [] + self.remaining_data.append(url) + + flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data))) + + +class import_distill_io_json(Importer): + def run(self, + data, + flash, + datastore, + ): + + import json + good = 0 + now = time.time() + self.new_uuids=[] + + + try: + data = json.loads(data.strip()) + except json.decoder.JSONDecodeError: + flash("Unable to read JSON file, was it broken?", 'error') + return + + if not data.get('data'): + flash("JSON structure looks invalid, was it broken?", 'error') + return + + for d in data.get('data'): + d_config = json.loads(d['config']) + extras = {'title': d['name']} + + if len(d['uri']) and good < 5000: + try: + # @todo we only support CSS ones at the moment + if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css': + extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr'] + except KeyError: + pass + except IndexError: + pass + + try: + extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr'] + if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath': + extras['css_filter'] = 'xpath:' + extras['css_filter'] + + except KeyError: + pass + except IndexError: + pass + + try: + extras['tag'] = ", ".join(d['tags']) + except KeyError: + pass + except IndexError: + pass + + new_uuid = datastore.add_watch(url=d['uri'].strip(), + extras=extras, + write_to_disk_now=False) + + if new_uuid: + # Straight into the queue. + self.new_uuids.append(new_uuid) + good += 1 + + flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data))) diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 5a09cd35..98dedfb4 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -131,7 +131,7 @@ User-Agent: wonderbra 1.0") }}
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
index 5a09cd35..98dedfb4 100644
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@@ -131,7 +131,7 @@ User-Agent: wonderbra 1.0") }}
   • CSS - Limit text to this CSS rule, only text matching this CSS rule is included.
   • JSON - Limit text to this JSON rule, using JSONPath, prefix with "json:", use json:$ to force re-formatting if required, test your JSONPath here
-  • XPath - Limit text to this XPath rule, simply start with a forward-slash, example //*[contains(@class, 'sametext')], test your XPath here
+  • XPath - Limit text to this XPath rule, simply start with a forward-slash, example //*[contains(@class, 'sametext')] or xpath://*[contains(@class, 'sametext')], test your XPath here
   • Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub!
diff --git a/changedetectionio/templates/import.html b/changedetectionio/templates/import.html
--- a/changedetectionio/templates/import.html
+++ b/changedetectionio/templates/import.html
@@ ... @@
-    Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
-    https://example.com tag1, tag2, last tag
-    URLs which do not pass validation will stay in the textarea.
-    <textarea name="urls" rows="25">{{ remaining }}</textarea>
+    <div class="tab-pane-inner" id="url-list">
+        Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
+        https://example.com tag1, tag2, last tag
+        URLs which do not pass validation will stay in the textarea.
+        <textarea name="urls" style="width: 100%;
+                  font-family: monospace;
+                  white-space: pre;
+                  overflow-wrap: normal;
+                  overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
+    </div>
+    <div class="tab-pane-inner" id="distill-io">
+        Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.
+        This is experimental, supported fields are name, uri, tags, config:selections, the rest (including schedule) are ignored.
+        How to export? https://distill.io/docs/web-monitor/how-export-and-import-monitors/
+        Be sure to set your default fetcher to Chrome if required.
+        <textarea name="distill-io" rows="25">{{ original_distill_json }}</textarea>
+    </div>
 {% endblock %}
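Both panes post to the same /import endpoint, and whichever textarea is non-empty selects the importer that runs. Assuming a local instance on the default port with no password set (otherwise session handling gets in the way and the UI or Flask test client is the easier route), an equivalent raw POST would look roughly like:

    curl http://localhost:5000/import \
         --data-urlencode 'urls=https://example.com tag1, tag2' \
         --data-urlencode 'distill-io='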
diff --git a/changedetectionio/tests/test_import.py b/changedetectionio/tests/test_import.py
index 07676023..c4edad5c 100644
--- a/changedetectionio/tests/test_import.py
+++ b/changedetectionio/tests/test_import.py
@@ -5,18 +5,17 @@ import time
 from flask import url_for
 from .util import live_server_setup

-
-def test_import(client, live_server):
-
+def test_setup(client, live_server):
     live_server_setup(live_server)

+def test_import(client, live_server):
     # Give the endpoint time to spin up
     time.sleep(1)

     res = client.post(
         url_for("import_page"),
         data={
+            "distill-io": "",
             "urls": """https://example.com
 https://example.com tag1
 https://example.com tag1, other tag"""
@@ -26,3 +25,96 @@ https://example.com tag1, other tag"""
     assert b"3 Imported" in res.data
     assert b"tag1" in res.data
     assert b"other tag" in res.data
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+
+    # Clear flask alerts
+    res = client.get(url_for("index"))
+    res = client.get(url_for("index"))
+
+
+def xtest_import_skip_url(client, live_server):
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    res = client.post(
+        url_for("import_page"),
+        data={
+            "distill-io": "",
+            "urls": """https://example.com
+:ht000000broken
+"""
+        },
+        follow_redirects=True,
+    )
+    assert b"1 Imported" in res.data
+    assert b"ht000000broken" in res.data
+    assert b"1 Skipped" in res.data
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    # Clear flask alerts
+    res = client.get(url_for("index"))
+
+
+def test_import_distillio(client, live_server):
+    distill_data = '''
+{
+    "client": {
+        "local": 1
+    },
+    "data": [
+        {
+            "name": "Unraid | News",
+            "uri": "https://unraid.net/blog",
+            "config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
+            "tags": ["nice stuff", "nerd-news"],
+            "content_type": 2,
+            "state": 40,
+            "schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
+            "ts": "2022-03-27T15:51:15.667Z"
+        }
+    ]
+}
+'''
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    res = client.post(
+        url_for("import_page"),
+        data={
+            "distill-io": distill_data,
+            "urls": ''
+        },
+        follow_redirects=True,
+    )
+
+    assert b"Unable to read JSON file, was it broken?" not in res.data
+    assert b"1 Imported from Distill.io" in res.data
+
+    res = client.get(url_for("edit_page", uuid="first"))
+    assert b"https://unraid.net/blog" in res.data
+    assert b"Unraid | News" in res.data
+
+    # flask/wtforms should re-encode this, check we see it
+    # wtforms encodes the quote like id=&#39; but html.escape makes it id=&#x27;
+    # - so just check it manually :(
+    assert b"xpath:(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]" in res.data
+
+    # did the tags work?
+    res = client.get(url_for("index"))
+    assert b"nice stuff" in res.data
+    assert b"nerd-news" in res.data
+
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    # Clear flask alerts
+    res = client.get(url_for("index"))
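The backslash-escaped quotes in the fixture above come from the same JSON-in-JSON nesting the importer decodes. A short sketch of how such an export snippet is produced (illustrative values, not the Distill.io tool itself):

    import json

    # Build the inner config first, then serialise it to a string before
    # embedding it, which is what yields the \" escaping in the fixture.
    config = {"selections": [{"frames": [{"index": 0,
                                          "excludes": [],
                                          "includes": [{"type": "xpath", "expr": "//h1"}]}],
                              "dynamic": True, "delay": 2}]}
    entry = {"name": "Example", "uri": "https://example.com",
             "config": json.dumps(config)}
    print(json.dumps({"data": [entry]}, indent=2))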
diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py
index d1374834..7a0ba0dc 100644
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -116,4 +116,46 @@ def test_xpath_validation(client, live_server):
         data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
         follow_redirects=True
     )
-    assert b"is not a valid XPath expression" in res.data
\ No newline at end of file
+    assert b"is not a valid XPath expression" in res.data
+
+
+# actually only really used by the distill.io importer, but could be handy too
+def test_check_with_prefix_css_filter(client, live_server):
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    set_original_response()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+
+    assert b"Updated watch." in res.data
+    time.sleep(3)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b"Some text thats the same" in res.data  # in selector
+    assert b"Some text that will change" not in res.data  # not in selector
+
+    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)