Merge branch 'master' into 550-visual-selector

3 years ago · 695fcc4566
parent d7c5a53315 0e385b1c22
commit 695fcc4566
19 changed files with 582 additions and 117 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -40,7 +40,7 @@ from flask_wtf import CSRFProtect

 from changedetectionio import html_tools

-__version__ = '0.39.12'
+__version__ = '0.39.13'

 datastore = None

@ -518,10 +518,31 @@ def changedetection_app(config=None, datastore_o=None):
        if all(value == 0 or value == None for value in datastore.data['watching'][uuid]['time_between_check'].values()):
            default['time_between_check'] = deepcopy(datastore.data['settings']['requests']['time_between_check'])

+        # Defaults for proxy choice
+        if datastore.proxy_list is not None:  # When enabled
+            system_proxy = datastore.data['settings']['requests']['proxy']
+            if default['proxy'] is None:
+                default['proxy'] = system_proxy
+            else:
+                # Does the chosen one exist?
+                if not any(default['proxy'] in tup for tup in datastore.proxy_list):
+                    default['proxy'] = datastore.proxy_list[0][0]
+
+            # Used by the form handler to keep or remove the proxy settings
+            default['proxy_list'] = datastore.proxy_list
+
+        # proxy_override set to the json/text list of the items
        form = forms.watchForm(formdata=request.form if request.method == 'POST' else None,
-                                        data=default
-                                        )
+                               data=default,
+                               )

+        if datastore.proxy_list is None:
+            # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead
+            del form.proxy
+        else:
+            form.proxy.choices = datastore.proxy_list
+            if default['proxy'] is None:
+                form.proxy.default='http://hello'

        if request.method == 'POST' and form.validate():
            extra_update_obj = {}
@ -601,10 +622,28 @@ def changedetection_app(config=None, datastore_o=None):
    def settings_page():
        from changedetectionio import content_fetcher, forms

+        default = deepcopy(datastore.data['settings'])
+        if datastore.proxy_list is not None:
+            # When enabled
+            system_proxy = datastore.data['settings']['requests']['proxy']
+            # In the case it doesn't exist anymore
+            if not any([system_proxy in tup for tup in datastore.proxy_list]):
+                system_proxy = None
+
+            default['requests']['proxy'] = system_proxy if system_proxy is not None else datastore.proxy_list[0][0]
+            # Used by the form handler to keep or remove the proxy settings
+            default['proxy_list'] = datastore.proxy_list
+
+
        # Don't use form.data on POST so that it doesnt overrid the checkbox status from the POST status
        form = forms.globalSettingsForm(formdata=request.form if request.method == 'POST' else None,
-                                        data=datastore.data['settings']
+                                        data=default
                                        )
+        if datastore.proxy_list is None:
+            # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead
+            del form.requests.form.proxy
+        else:
+            form.requests.form.proxy.choices = datastore.proxy_list

        if request.method == 'POST':
            # Password unset is a GET, but we can lock the session to a salted env password to always need the password
@ -644,44 +683,37 @@ def changedetection_app(config=None, datastore_o=None):
    @app.route("/import", methods=['GET', "POST"])
    @login_required
    def import_page():
-        import validators
        remaining_urls = []
+        if request.method == 'POST':
+            from .importer import import_url_list, import_distill_io_json
+
+            # URL List import
+            if request.values.get('urls') and len(request.values.get('urls').strip()):
+                # Import and push into the queue for immediate update check
+                importer = import_url_list()
+                importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore)
+                for uuid in importer.new_uuids:
+                    update_q.put(uuid)
+
+                if len(importer.remaining_data) == 0:
+                    return redirect(url_for('index'))
+                else:
+                    remaining_urls = importer.remaining_data
+
+            # Distill.io import
+            if request.values.get('distill-io') and len(request.values.get('distill-io').strip()):
+                # Import and push into the queue for immediate update check
+                d_importer = import_distill_io_json()
+                d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore)
+                for uuid in d_importer.new_uuids:
+                    update_q.put(uuid)

-        good = 0

-        if request.method == 'POST':
-            now=time.time()
-            urls = request.values.get('urls').split("\n")
-
-            if (len(urls) > 5000):
-                flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
-
-            for url in urls:
-                url = url.strip()
-                url, *tags = url.split(" ")
-                # Flask wtform validators wont work with basic auth, use validators package
-                # Up to 5000 per batch so we dont flood the server
-                if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
-                    new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False)
-                    if new_uuid:
-                        # Straight into the queue.
-                        update_q.put(new_uuid)
-                        good += 1
-                        continue
-
-                if len(url.strip()):
-                    remaining_urls.append(url)
-
-            flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls)))
-            datastore.needs_write = True
-
-            if len(remaining_urls) == 0:
-                # Looking good, redirect to index.
-                return redirect(url_for('index'))

        # Could be some remaining, or we could be on GET
        output = render_template("import.html",
-                                 remaining="\n".join(remaining_urls)
+                                 import_url_list_remaining="\n".join(remaining_urls),
+                                 original_distill_json=''
                                 )
        return output

--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -128,6 +128,9 @@ class Fetcher():

    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
+    fetcher_description = "No description"
+    system_http_proxy = os.getenv('HTTP_PROXY')
+    system_https_proxy = os.getenv('HTTPS_PROXY')

    @abstractmethod
    def get_error(self):
@ -184,21 +187,17 @@ class base_html_playwright(Fetcher):
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

-    #    try:
-    #        from playwright.sync_api import sync_playwright
-    #    except ModuleNotFoundError:
-    #        fetcher_enabled = False
-
    browser_type = ''
    command_executor = ''

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
-    playwright_proxy_settings_mappings = ['server', 'bypass', 'username', 'password']
+    playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']

    proxy = None

-    def __init__(self):
+    def __init__(self, proxy_override=None):
+
        # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
        self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
        self.command_executor = os.getenv(
@ -216,6 +215,10 @@ class base_html_playwright(Fetcher):
        if proxy_args:
            self.proxy = proxy_args

+        # allow per-watch proxy selection override
+        if proxy_override:
+            self.proxy = {'server': proxy_override}
+
    def run(self,
            url,
            timeout,
@ -226,6 +229,8 @@ class base_html_playwright(Fetcher):
            current_css_filter=None):

        from playwright.sync_api import sync_playwright
+        import playwright._impl._api_types
+        from playwright._impl._api_types import Error, TimeoutError

        with sync_playwright() as p:
            browser_type = getattr(p, self.browser_type)
@ -235,17 +240,23 @@ class base_html_playwright(Fetcher):
            browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000)

            # Set user agent to prevent Cloudflare from blocking the browser
+            # Use the default one configured in the App.py model that's passed from fetch_site_status.py
            context = browser.new_context(
-                user_agent="Mozilla/5.0",
+                user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
                proxy=self.proxy
            )
            page = context.new_page()
-            response = page.goto(url, timeout=timeout * 1000)
-            # set size after visiting page, otherwise it wont work (seems to default to 800x)
            page.set_viewport_size({"width": 1280, "height": 1024})
-
-            extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
-            page.wait_for_timeout(extra_wait * 1000)
+            try:
+                response = page.goto(url, timeout=timeout * 1000, wait_until='commit')
+                # Wait_until = commit
+                # - `'commit'` - consider operation to be finished when network response is received and the document started loading.
+                # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds
+                # This seemed to solve nearly all 'TimeoutErrors'
+                extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
+                page.wait_for_timeout(extra_wait * 1000)
+            except playwright._impl._api_types.TimeoutError as e:
+                raise EmptyReply(url=url, status_code=None)

            if response is None:
                raise EmptyReply(url=url, status_code=None)
@ -283,7 +294,7 @@ class base_html_webdriver(Fetcher):
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None

-    def __init__(self):
+    def __init__(self, proxy_override=None):
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidently wrap the env value
@ -296,6 +307,16 @@ class base_html_webdriver(Fetcher):
            if v:
                proxy_args[k] = v.strip('"')

+        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy
+        if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy:
+            proxy_args['httpProxy'] = self.system_http_proxy
+        if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy:
+            proxy_args['httpsProxy'] = self.system_https_proxy
+
+        # Allows overriding the proxy on a per-request basis
+        if proxy_override is not None:
+            proxy_args['httpProxy'] = proxy_override
+
        if proxy_args:
            self.proxy = SeleniumProxy(raw=proxy_args)

@ -366,6 +387,9 @@ class base_html_webdriver(Fetcher):
 class html_requests(Fetcher):
    fetcher_description = "Basic fast Plaintext/HTTP Client"

+    def __init__(self, proxy_override=None):
+        self.proxy_override = proxy_override
+
    def run(self,
            url,
            timeout,
@ -375,11 +399,23 @@ class html_requests(Fetcher):
            ignore_status_codes=False,
            current_css_filter=None):

+        proxies={}
+
+        # Allows overriding the proxy on a per-request basis
+        if self.proxy_override:
+            proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
+        else:
+            if self.system_http_proxy:
+                proxies['http'] = self.system_http_proxy
+            if self.system_https_proxy:
+                proxies['https'] = self.system_https_proxy
+
        r = requests.request(method=request_method,
                             data=request_body,
                             url=url,
                             headers=request_headers,
                             timeout=timeout,
+                             proxies=proxies,
                             verify=False)

        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -16,6 +16,34 @@ class perform_site_check():
        super().__init__(*args, **kwargs)
        self.datastore = datastore

+    # If there was a proxy list enabled, figure out what proxy_args/which proxy to use
+    # if watch.proxy use that
+    # fetcher.proxy_override = watch.proxy or main config proxy
+    # Allows overriding the proxy on a per-request basis
+    # ALWAYS use the first one if nothing is selected
+
+    def set_proxy_from_list(self, watch):
+        proxy_args = None
+        if self.datastore.proxy_list is None:
+            return None
+
+        # If it's a valid one
+        if any([watch['proxy'] in p for p in self.datastore.proxy_list]):
+            proxy_args = watch['proxy']
+
+        # not valid (including None), try the system one
+        else:
+            system_proxy = self.datastore.data['settings']['requests']['proxy']
+            # Is not None and exists
+            if any([system_proxy in p for p in self.datastore.proxy_list]):
+                proxy_args = system_proxy
+
+        # Fallback - Did not resolve anything, use the first available
+        if proxy_args is None:
+            proxy_args = self.datastore.proxy_list[0][0]
+
+        return proxy_args
+
    def run(self, uuid):
        timestamp = int(time.time())  # used for storage etc too

@ -66,10 +94,15 @@ class perform_site_check():
            # If the klass doesnt exist, just use a default
            klass = getattr(content_fetcher, "html_requests")

-        fetcher = klass()
+
+        proxy_args = self.set_proxy_from_list(watch)
+        fetcher = klass(proxy_override=proxy_args)
+
+        # Proxy List support
        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code, watch['css_filter'])
        fetcher.quit()

+
        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?

@ -119,11 +152,13 @@ class perform_site_check():
                # Then we assume HTML
                if has_filter_rule:
                    # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                    if css_filter_rule[0] == '/':
-                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                    if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
+                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
+                                                               html_content=fetcher.content)
                    else:
                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+
                if has_subtractive_selectors:
                    html_content = html_tools.element_removal(subtractive_selectors, html_content)

@ -143,7 +178,6 @@ class perform_site_check():
            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

-
        # Re #340 - return the content before the 'ignore text' was applied
        text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -337,9 +337,9 @@ class watchForm(commonSettingsForm):
    method = SelectField('Request method', choices=valid_method, default=default_method)
    ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False)
    trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])
-
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
    save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"})
+    proxy = RadioField('Proxy')

    def validate(self, **kwargs):
        if not super().validate():
@ -358,6 +358,7 @@ class watchForm(commonSettingsForm):
 # datastore.data['settings']['requests']..
 class globalSettingsRequestForm(Form):
    time_between_check = FormField(TimeBetweenCheckForm)
+    proxy = RadioField('Proxy')


 # datastore.data['settings']['application']..
@ -382,4 +383,3 @@ class globalSettingsForm(Form):
    requests = FormField(globalSettingsRequestForm)
    application = FormField(globalSettingsApplicationForm)
    save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"})
-
--- a/changedetectionio/importer.py
+++ b/changedetectionio/importer.py
@ -0,0 +1,133 @@
+from abc import ABC, abstractmethod
+import time
+import validators
+
+
+class Importer():
+    remaining_data = []
+    new_uuids = []
+    good = 0
+
+    def __init__(self):
+        self.new_uuids = []
+        self.good = 0
+        self.remaining_data = []
+
+    @abstractmethod
+    def run(self,
+            data,
+            flash,
+            datastore):
+        pass
+
+
+class import_url_list(Importer):
+    """
+    Imports a list, can be in <code>https://example.com tag1, tag2, last tag</code> format
+    """
+    def run(self,
+            data,
+            flash,
+            datastore,
+            ):
+
+        urls = data.split("\n")
+        good = 0
+        now = time.time()
+
+        if (len(urls) > 5000):
+            flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.")
+
+        for url in urls:
+            url = url.strip()
+            if not len(url):
+                continue
+
+            tags = ""
+
+            # 'tags' should be a csv list after the URL
+            if ' ' in url:
+                url, tags = url.split(" ", 1)
+
+            # Flask wtform validators wont work with basic auth, use validators package
+            # Up to 5000 per batch so we dont flood the server
+            if len(url) and validators.url(url.replace('source:', '')) and good < 5000:
+                new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False)
+                if new_uuid:
+                    # Straight into the queue.
+                    self.new_uuids.append(new_uuid)
+                    good += 1
+                    continue
+
+            # Worked past the 'continue' above, append it to the bad list
+            if self.remaining_data is None:
+                self.remaining_data = []
+            self.remaining_data.append(url)
+
+        flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data)))
+
+
+class import_distill_io_json(Importer):
+    def run(self,
+            data,
+            flash,
+            datastore,
+            ):
+
+        import json
+        good = 0
+        now = time.time()
+        self.new_uuids=[]
+
+
+        try:
+            data = json.loads(data.strip())
+        except json.decoder.JSONDecodeError:
+            flash("Unable to read JSON file, was it broken?", 'error')
+            return
+
+        if not data.get('data'):
+            flash("JSON structure looks invalid, was it broken?", 'error')
+            return
+
+        for d in data.get('data'):
+            d_config = json.loads(d['config'])
+            extras = {'title': d['name']}
+
+            if len(d['uri']) and good < 5000:
+                try:
+                    # @todo we only support CSS ones at the moment
+                    if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css':
+                        extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr']
+                except KeyError:
+                    pass
+                except IndexError:
+                    pass
+
+                try:
+                    extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr']
+                    if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath':
+                        extras['css_filter'] = 'xpath:' + extras['css_filter']
+
+                except KeyError:
+                    pass
+                except IndexError:
+                    pass
+
+                try:
+                    extras['tag'] = ", ".join(d['tags'])
+                except KeyError:
+                    pass
+                except IndexError:
+                    pass
+
+                new_uuid = datastore.add_watch(url=d['uri'].strip(),
+                                               extras=extras,
+                                               write_to_disk_now=False)
+
+                if new_uuid:
+                    # Straight into the queue.
+                    self.new_uuids.append(new_uuid)
+                    good += 1
+
+        flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data)))
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@ -23,7 +23,8 @@ class model(dict):
                'requests': {
                    'timeout': 15,  # Default 15 seconds
                    'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None},
-                    'workers': 10  # Number of threads, lower is better for slow connections
+                    'workers': 10,  # Number of threads, lower is better for slow connections
+                    'proxy': None # Preferred proxy connection
                },
                'application': {
                    'password': False,
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -39,6 +39,7 @@ class model(dict):
            'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'fetch_backend': None,
            'extract_title_as_title': False,
+            'proxy': None, # Preferred proxy connection
            # Re #110, so then if this is set to None, we know to use the default value instead
            # Requires setting to None on submit if it's the same as the default
            # Should be all None by default, so we use the system default in this case.
--- a/changedetectionio/static/js/settings.js
+++ b/changedetectionio/static/js/settings.js
@ -1,13 +0,0 @@
-window.addEventListener("load", (event) => {
-  // just an example for now
-  function toggleVisible(elem) {
-    // theres better ways todo this
-    var x = document.getElementById(elem);
-    if (x.style.display === "block") {
-      x.style.display = "none";
-    } else {
-      x.style.display = "block";
-    }
-  }
-});
-
--- a/changedetectionio/static/js/watch-settings.js
+++ b/changedetectionio/static/js/watch-settings.js
@ -0,0 +1,14 @@
+$(document).ready(function() {
+    function toggle() {
+        if ($('input[name="fetch_backend"]:checked').val() != 'html_requests') {
+            $('#requests-override-options').hide();
+        } else {
+            $('#requests-override-options').show();
+        }
+    }
+    $('input[name="fetch_backend"]').click(function (e) {
+        toggle();
+    });
+    toggle();
+
+});
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
@ -309,10 +309,10 @@ footer {
    font-weight: bold; }
  .pure-form textarea {
    width: 100%; }
-  .pure-form ul.fetch-backend {
+  .pure-form .inline-radio ul {
    margin: 0px;
    list-style: none; }
-    .pure-form ul.fetch-backend li > * {
+    .pure-form .inline-radio ul li > * {
      display: inline-block; }

@media only screen and (max-width: 760px), (min-device-width: 768px) and (max-device-width: 1024px) {
--- a/changedetectionio/static/styles/styles.scss
+++ b/changedetectionio/static/styles/styles.scss
@ -418,14 +418,16 @@ footer {
  textarea {
    width: 100%;
  }
-  ul.fetch-backend {
-    margin: 0px;
-    list-style: none;
-    li {
-        > * {
-            display: inline-block;
+  .inline-radio {
+      ul {
+        margin: 0px;
+        list-style: none;
+        li {
+            > * {
+                display: inline-block;
+            }
        }
-    }
+      }
  }
 }

--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -33,6 +33,7 @@ class ChangeDetectionStore:
        self.needs_write = False
        self.datastore_path = datastore_path
        self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
+        self.proxy_list = None
        self.stop_thread = False

        self.__data = App.model()
@ -111,6 +112,14 @@ class ChangeDetectionStore:
            secret = secrets.token_hex(16)
            self.__data['settings']['application']['rss_access_token'] = secret

+
+        # Proxy list support - available as a selection in settings when text file is imported
+        # CSV list
+        # "name, address", or just "name"
+        proxy_list_file = "{}/proxies.txt".format(self.datastore_path)
+        if path.isfile(proxy_list_file):
+            self.import_proxy_list(proxy_list_file)
+
        # Bump the update version by running updates
        self.run_updates()

@ -435,6 +444,21 @@ class ChangeDetectionStore:
                    print ("Removing",item)
                    unlink(item)

+    def import_proxy_list(self, filename):
+        import csv
+        with open(filename, newline='') as f:
+            reader = csv.reader(f, skipinitialspace=True)
+            # @todo This loop could be improved
+            l = []
+            for row in reader:
+                if len(row):
+                    if len(row)>=2:
+                        l.append(tuple(row[:2]))
+                    else:
+                        l.append(tuple([row[0], row[0]]))
+            self.proxy_list = l if len(l) else None
+
+
    # Run all updates
    # IMPORTANT - Each update could be run even when they have a new install and the schema is correct
    #             So therefor - each `update_n` should be very careful about checking if it needs to actually run
--- a/changedetectionio/templates/_common_fields.jinja
+++ b/changedetectionio/templates/_common_fields.jinja
@ -2,7 +2,6 @@
 {% from '_helpers.jinja' import render_field %}

 {% macro render_common_settings_form(form, current_base_url, emailprefix) %}
-
                        <div class="pure-control-group">
                            {{ render_field(form.notification_urls, rows=5, placeholder="Examples:
    Gitter - gitter://token/room
--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -11,6 +11,7 @@
 {% endif %}
    const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}";
 </script>
+<script type="text/javascript" src="{{url_for('static_content', group='js', filename='watch-settings.js')}}" defer></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='visual-selector.js')}}" defer></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='limit.js')}}" defer></script>
@ -62,20 +63,25 @@
            </div>

            <div class="tab-pane-inner" id="request">
-                    <div class="pure-control-group">
+                    <div class="pure-control-group inline-radio">
                        {{ render_field(form.fetch_backend, class="fetch-backend") }}
                        <span class="pure-form-message-inline">
                            <p>Use the <strong>Basic</strong> method (default) where your watched site doesn't need Javascript to render.</p>
                            <p>The <strong>Chrome/Javascript</strong> method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'. </p>
                        </span>
                    </div>
-
-                <hr/>
-                <fieldset class="pure-group">
-
-                    <span class="pure-form-message-inline">
+                {% if form.proxy %}
+                    <div class="pure-control-group inline-radio">
+                        {{ render_field(form.proxy, class="fetch-backend-proxy") }}
+                        <span class="pure-form-message-inline">
+                        Choose a proxy for this watch
+                        </span>
+                    </div>
+                {% endif %}
+                <fieldset class="pure-group" id="requests-override-options">
+                    <div class="pure-form-message-inline">
                        <strong>Request override is currently only used by the <i>Basic fast Plaintext/HTTP Client</i> method.</strong>
-                    </span>
+                    </div>
                    <div class="pure-control-group">
                        {{ render_field(form.method) }}
                    </div>
@ -130,7 +136,7 @@ User-Agent: wonderbra 1.0") }}
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required,  <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
-                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code>, <a
+                        <li>XPath - Limit text to this XPath rule, simply start with a forward-slash, example  <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                    </ul>
                    Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
--- a/changedetectionio/templates/import.html
+++ b/changedetectionio/templates/import.html
@ -1,30 +1,86 @@
 {% extends 'base.html' %}
-
 {% block content %}
-<div class="edit-form">
-     <div class="inner">
+<script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
+<div class="edit-form monospaced-textarea">
+
+    <div class="tabs collapsable">
+        <ul>
+            <li class="tab" id="default-tab"><a href="#url-list">URL List</a></li>
+            <li class="tab"><a href="#distill-io">Distill.io</a></li>
+        </ul>
+    </div>
+
+    <div class="box-wrap inner">
        <form class="pure-form pure-form-aligned" action="{{url_for('import_page')}}" method="POST">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
-            <fieldset class="pure-group">
-              <legend>
-                Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,):
-                <br>
-                <code>https://example.com tag1, tag2, last tag</code>
-                <br>
-                URLs which do not pass validation will stay in the textarea.
-              </legend>
-              
-
-                <textarea name="urls" class="pure-input-1-2" placeholder="https://"
-                          style="width: 100%;
+            <div class="tab-pane-inner" id="url-list">
+                <fieldset class="pure-group">
+                    <legend>
+                        Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma
+                        (,):
+                        <br>
+                        <code>https://example.com tag1, tag2, last tag</code>
+                        <br>
+                        URLs which do not pass validation will stay in the textarea.
+                    </legend>
+
+
+                    <textarea name="urls" class="pure-input-1-2" placeholder="https://"
+                              style="width: 100%;
                                font-family:monospace;
                                white-space: pre;
                                overflow-wrap: normal;
-                                overflow-x: scroll;" rows="25">{{ remaining }}</textarea>
-            </fieldset>
+                                overflow-x: scroll;" rows="25">{{ import_url_list_remaining }}</textarea>
+                </fieldset>
+
+
+            </div>
+
+            <div class="tab-pane-inner" id="distill-io">
+
+
+                <fieldset class="pure-group">
+                    <legend>
+                        Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.<br/>
+                        This is <i>experimental</i>, supported fields are <code>name</code>, <code>uri</code>, <code>tags</code>, <code>config:selections</code>, the rest (including <code>schedule</code>) are ignored.
+                        <br/>
+                        <p>
+                        How to export? <a href="https://distill.io/docs/web-monitor/how-export-and-import-monitors/">https://distill.io/docs/web-monitor/how-export-and-import-monitors/</a><br/>
+                        Be sure to set your default fetcher to Chrome if required.<br/>
+                        </p>
+                    </legend>
+
+
+                    <textarea name="distill-io" class="pure-input-1-2" style="width: 100%;
+                                font-family:monospace;
+                                white-space: pre;
+                                overflow-wrap: normal;
+                                overflow-x: scroll;" placeholder="Example Distill.io JSON export file
+
+{
+    &quot;client&quot;: {
+        &quot;local&quot;: 1
+    },
+    &quot;data&quot;: [
+        {
+            &quot;name&quot;: &quot;Unraid | News&quot;,
+            &quot;uri&quot;: &quot;https://unraid.net/blog&quot;,
+            &quot;config&quot;: &quot;{\&quot;selections\&quot;:[{\&quot;frames\&quot;:[{\&quot;index\&quot;:0,\&quot;excludes\&quot;:[],\&quot;includes\&quot;:[{\&quot;type\&quot;:\&quot;xpath\&quot;,\&quot;expr\&quot;:\&quot;(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\&quot;}]}],\&quot;dynamic\&quot;:true,\&quot;delay\&quot;:2}],\&quot;ignoreEmptyText\&quot;:true,\&quot;includeStyle\&quot;:false,\&quot;dataAttr\&quot;:\&quot;text\&quot;}&quot;,
+            &quot;tags&quot;: [],
+            &quot;content_type&quot;: 2,
+            &quot;state&quot;: 40,
+            &quot;schedule&quot;: &quot;{\&quot;type\&quot;:\&quot;INTERVAL\&quot;,\&quot;params\&quot;:{\&quot;interval\&quot;:4447}}&quot;,
+            &quot;ts&quot;: &quot;2022-03-27T15:51:15.667Z&quot;
+        }
+    ]
+}
+" rows="25">{{ original_distill_json }}</textarea>
+                </fieldset>
+            </div>
            <button type="submit" class="pure-button pure-input-1-2 pure-button-primary">Import</button>
        </form>
-     </div>
+
+    </div>
 </div>

 {% endblock %}
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@ -9,7 +9,6 @@
    const email_notification_prefix=JSON.parse('{{emailprefix|tojson}}');
 {% endif %}
 </script>
-<script type="text/javascript" src="{{url_for('static_content', group='js', filename='settings.js')}}" defer></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='tabs.js')}}" defer></script>
 <script type="text/javascript" src="{{url_for('static_content', group='js', filename='notifications.js')}}" defer></script>

@ -61,7 +60,14 @@
                        {{ render_checkbox_field(form.application.form.real_browser_save_screenshot) }}
                        <span class="pure-form-message-inline">When using a Chrome browser, a screenshot from the last check will be available on the Diff page</span>
                    </div>
-
+                {% if form.requests.proxy %}
+                    <div class="pure-control-group inline-radio">
+                        {{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }}
+                        <span class="pure-form-message-inline">
+                        Choose a default proxy for all watches
+                        </span>
+                    </div>
+                {% endif %}
                </fieldset>
            </div>

@ -74,7 +80,7 @@
            </div>

            <div class="tab-pane-inner" id="fetching">
-                <div class="pure-control-group">
+                <div class="pure-control-group inline-radio">
                    {{ render_field(form.application.form.fetch_backend, class="fetch-backend") }}
                    <span class="pure-form-message-inline">
                        <p>Use the <strong>Basic</strong> method (default) where your watched sites don't need Javascript to render.</p>
--- a/changedetectionio/tests/test_import.py
+++ b/changedetectionio/tests/test_import.py
@ -5,18 +5,17 @@ import time
 from flask import url_for

 from .util import live_server_setup
-
-
-def test_import(client, live_server):
-
+def test_setup(client, live_server):
    live_server_setup(live_server)

+def test_import(client, live_server):
    # Give the endpoint time to spin up
    time.sleep(1)

    res = client.post(
        url_for("import_page"),
        data={
+            "distill-io": "",
            "urls": """https://example.com
 https://example.com tag1
 https://example.com tag1, other tag"""
@ -26,3 +25,96 @@ https://example.com tag1, other tag"""
    assert b"3 Imported" in res.data
    assert b"tag1" in res.data
    assert b"other tag" in res.data
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+
+    # Clear flask alerts
+    res = client.get( url_for("index"))
+    res = client.get( url_for("index"))
+
+def xtest_import_skip_url(client, live_server):
+
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    res = client.post(
+        url_for("import_page"),
+        data={
+            "distill-io": "",
+            "urls": """https://example.com
+:ht000000broken
+"""
+        },
+        follow_redirects=True,
+    )
+    assert b"1 Imported" in res.data
+    assert b"ht000000broken" in res.data
+    assert b"1 Skipped" in res.data
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    # Clear flask alerts
+    res = client.get( url_for("index"))
+
+def test_import_distillio(client, live_server):
+
+    distill_data='''
+{
+    "client": {
+        "local": 1
+    },
+    "data": [
+        {
+            "name": "Unraid | News",
+            "uri": "https://unraid.net/blog",
+            "config": "{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}",
+            "tags": ["nice stuff", "nerd-news"],
+            "content_type": 2,
+            "state": 40,
+            "schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}",
+            "ts": "2022-03-27T15:51:15.667Z"
+        }
+    ]
+}		   
+
+'''
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    res = client.post(
+        url_for("import_page"),
+        data={
+            "distill-io": distill_data,
+            "urls" : ''
+        },
+        follow_redirects=True,
+    )
+
+
+    assert b"Unable to read JSON file, was it broken?" not in res.data
+    assert b"1 Imported from Distill.io" in res.data
+
+    res = client.get( url_for("edit_page", uuid="first"))
+
+    assert b"https://unraid.net/blog" in res.data
+    assert b"Unraid | News" in res.data
+
+
+    # flask/wtforms should recode this, check we see it
+    # wtforms encodes it like id=&#39, but html.escape makes it like id=&#x27
+    # - so just check it manually :(
+    #import json
+    #import html
+    #d = json.loads(distill_data)
+    # embedded_d=json.loads(d['data'][0]['config'])
+    # x=html.escape(embedded_d['selections'][0]['frames'][0]['includes'][0]['expr']).encode('utf-8')
+    assert b"xpath:(//div[@id=&#39;App&#39;]/div[contains(@class,&#39;flex&#39;)]/main[contains(@class,&#39;relative&#39;)]/section[contains(@class,&#39;relative&#39;)]/div[@class=&#39;container&#39;]/div[contains(@class,&#39;flex&#39;)]/div[contains(@class,&#39;w-full&#39;)])[1]" in res.data
+
+    # did the tags work?
+    res = client.get( url_for("index"))
+
+    assert b"nice stuff" in res.data
+    assert b"nerd-news" in res.data
+
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    # Clear flask alerts
+    res = client.get(url_for("index"))
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@ -116,4 +116,46 @@ def test_xpath_validation(client, live_server):
        data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
        follow_redirects=True
    )
-    assert b"is not a valid XPath expression" in res.data
+    assert b"is not a valid XPath expression" in res.data
+
+
+# Actually only really used by the distill.io importer, but could be handy elsewhere too
+def test_check_with_prefix_css_filter(client, live_server):
+    res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    set_original_response()
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+    time.sleep(3)
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter":  "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+
+    assert b"Updated watch." in res.data
+    time.sleep(3)
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    with open('/tmp/fuck.html', 'wb') as f:
+        f.write(res.data)
+    assert b"Some text thats the same" in res.data #in selector
+    assert b"Some text that will change" not in res.data #not in selector
+
+    client.get(url_for("api_delete", uuid="all"), follow_redirects=True)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -17,14 +17,14 @@ services:
  #       Alternative WebDriver/selenium URL, do not use "'s or 's!
  #      - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
  #
-  #       WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_httpProxy, webdriver_noProxy,
-  #                                webdriver_proxyAutoconfigUrl, webdriver_sslProxy, webdriver_autodetect,
+  #       WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_noProxy,
+  #                                webdriver_proxyAutoconfigUrl, webdriver_autodetect,
  #                                webdriver_socksProxy, webdriver_socksUsername, webdriver_socksVersion, webdriver_socksPassword
  #
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
  #       Alternative Playwright URL, do not use "'s or 's!
-  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/playwright
+  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #