diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 789c72a6..154162a0 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -40,7 +40,7 @@ from flask_wtf import CSRFProtect from changedetectionio import html_tools -__version__ = '0.39.12' +__version__ = '0.39.13' datastore = None @@ -518,10 +518,31 @@ def changedetection_app(config=None, datastore_o=None): if all(value == 0 or value == None for value in datastore.data['watching'][uuid]['time_between_check'].values()): default['time_between_check'] = deepcopy(datastore.data['settings']['requests']['time_between_check']) + # Defaults for proxy choice + if datastore.proxy_list is not None: # When enabled + system_proxy = datastore.data['settings']['requests']['proxy'] + if default['proxy'] is None: + default['proxy'] = system_proxy + else: + # Does the chosen one exist? + if not any(default['proxy'] in tup for tup in datastore.proxy_list): + default['proxy'] = datastore.proxy_list[0][0] + + # Used by the form handler to keep or remove the proxy settings + default['proxy_list'] = datastore.proxy_list + + # proxy_override set to the json/text list of the items form = forms.watchForm(formdata=request.form if request.method == 'POST' else None, - data=default - ) + data=default, + ) + if datastore.proxy_list is None: + # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead + del form.proxy + else: + form.proxy.choices = datastore.proxy_list + if default['proxy'] is None: + form.proxy.default='http://hello' if request.method == 'POST' and form.validate(): extra_update_obj = {} @@ -601,10 +622,28 @@ def changedetection_app(config=None, datastore_o=None): def settings_page(): from changedetectionio import content_fetcher, forms + default = deepcopy(datastore.data['settings']) + if datastore.proxy_list is not None: + # When enabled + system_proxy = datastore.data['settings']['requests']['proxy'] + # In the case it doesnt exist anymore + if 
not any([system_proxy in tup for tup in datastore.proxy_list]): + system_proxy = None + + default['requests']['proxy'] = system_proxy if system_proxy is not None else datastore.proxy_list[0][0] + # Used by the form handler to keep or remove the proxy settings + default['proxy_list'] = datastore.proxy_list + + # Don't use form.data on POST so that it doesnt overrid the checkbox status from the POST status form = forms.globalSettingsForm(formdata=request.form if request.method == 'POST' else None, - data=datastore.data['settings'] + data=default ) + if datastore.proxy_list is None: + # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead + del form.requests.form.proxy + else: + form.requests.form.proxy.choices = datastore.proxy_list if request.method == 'POST': # Password unset is a GET, but we can lock the session to a salted env password to always need the password @@ -644,44 +683,37 @@ def changedetection_app(config=None, datastore_o=None): @app.route("/import", methods=['GET', "POST"]) @login_required def import_page(): - import validators remaining_urls = [] + if request.method == 'POST': + from .importer import import_url_list, import_distill_io_json + + # URL List import + if request.values.get('urls') and len(request.values.get('urls').strip()): + # Import and push into the queue for immediate update check + importer = import_url_list() + importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore) + for uuid in importer.new_uuids: + update_q.put(uuid) + + if len(importer.remaining_data) == 0: + return redirect(url_for('index')) + else: + remaining_urls = importer.remaining_data + + # Distill.io import + if request.values.get('distill-io') and len(request.values.get('distill-io').strip()): + # Import and push into the queue for immediate update check + d_importer = import_distill_io_json() + d_importer.run(data=request.values.get('distill-io'), flash=flash, datastore=datastore) + for uuid in 
d_importer.new_uuids: + update_q.put(uuid) - good = 0 - if request.method == 'POST': - now=time.time() - urls = request.values.get('urls').split("\n") - - if (len(urls) > 5000): - flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.") - - for url in urls: - url = url.strip() - url, *tags = url.split(" ") - # Flask wtform validators wont work with basic auth, use validators package - # Up to 5000 per batch so we dont flood the server - if len(url) and validators.url(url.replace('source:', '')) and good < 5000: - new_uuid = datastore.add_watch(url=url.strip(), tag=" ".join(tags), write_to_disk_now=False) - if new_uuid: - # Straight into the queue. - update_q.put(new_uuid) - good += 1 - continue - - if len(url.strip()): - remaining_urls.append(url) - - flash("{} Imported in {:.2f}s, {} Skipped.".format(good, time.time()-now,len(remaining_urls))) - datastore.needs_write = True - - if len(remaining_urls) == 0: - # Looking good, redirect to index. - return redirect(url_for('index')) # Could be some remaining, or we could be on GET output = render_template("import.html", - remaining="\n".join(remaining_urls) + import_url_list_remaining="\n".join(remaining_urls), + original_distill_json='' ) return output diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index f6a4a154..a4668c17 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -128,6 +128,9 @@ class Fetcher(): # Will be needed in the future by the VisualSelector, always get this where possible. 
screenshot = False + fetcher_description = "No description" + system_http_proxy = os.getenv('HTTP_PROXY') + system_https_proxy = os.getenv('HTTPS_PROXY') @abstractmethod def get_error(self): @@ -184,21 +187,17 @@ class base_html_playwright(Fetcher): if os.getenv("PLAYWRIGHT_DRIVER_URL"): fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) - # try: - # from playwright.sync_api import sync_playwright - # except ModuleNotFoundError: - # fetcher_enabled = False - browser_type = '' command_executor = '' # Configs for Proxy setup # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" - playwright_proxy_settings_mappings = ['server', 'bypass', 'username', 'password'] + playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] proxy = None - def __init__(self): + def __init__(self, proxy_override=None): + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') self.command_executor = os.getenv( @@ -216,6 +215,10 @@ class base_html_playwright(Fetcher): if proxy_args: self.proxy = proxy_args + # allow per-watch proxy selection override + if proxy_override: + self.proxy = {'server': proxy_override} + def run(self, url, timeout, @@ -226,6 +229,8 @@ class base_html_playwright(Fetcher): current_css_filter=None): from playwright.sync_api import sync_playwright + import playwright._impl._api_types + from playwright._impl._api_types import Error, TimeoutError with sync_playwright() as p: browser_type = getattr(p, self.browser_type) @@ -235,17 +240,23 @@ class base_html_playwright(Fetcher): browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000) # Set user agent to prevent Cloudflare from blocking the browser + # Use the default one configured in the App.py model that's passed from fetch_site_status.py context = browser.new_context( - 
user_agent="Mozilla/5.0", +                user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0', proxy=self.proxy ) page = context.new_page() - response = page.goto(url, timeout=timeout * 1000) - # set size after visiting page, otherwise it wont work (seems to default to 800x) page.set_viewport_size({"width": 1280, "height": 1024}) - - extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) - page.wait_for_timeout(extra_wait * 1000) + try: + response = page.goto(url, timeout=timeout * 1000, wait_until='commit') + # Wait_until = commit + # - `'commit'` - consider operation to be finished when network response is received and the document started loading. + # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds + # This seemed to solve nearly all 'TimeoutErrors' + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + page.wait_for_timeout(extra_wait * 1000) + except playwright._impl._api_types.TimeoutError as e: + raise EmptyReply(url=url, status_code=None) if response is None: raise EmptyReply(url=url, status_code=None) @@ -283,7 +294,7 @@ class base_html_webdriver(Fetcher): 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] proxy = None - def __init__(self): + def __init__(self, proxy_override=None): from selenium.webdriver.common.proxy import Proxy as SeleniumProxy # .strip('"') is going to save someone a lot of time when they accidently wrap the env value @@ -296,6 +307,16 @@ class base_html_webdriver(Fetcher): if v: proxy_args[k] = v.strip('"') + # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy + if not proxy_args.get('httpProxy') and self.system_http_proxy: + proxy_args['httpProxy'] = self.system_http_proxy + if not proxy_args.get('sslProxy') and self.system_https_proxy: + proxy_args['sslProxy'] = self.system_https_proxy + + # Allows override the proxy on a per-request basis + if proxy_override is not 
None: + proxy_args['httpProxy'] = proxy_override + if proxy_args: self.proxy = SeleniumProxy(raw=proxy_args) @@ -366,6 +387,9 @@ class base_html_webdriver(Fetcher): class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" + def __init__(self, proxy_override=None): + self.proxy_override = proxy_override + def run(self, url, timeout, @@ -375,11 +399,23 @@ class html_requests(Fetcher): ignore_status_codes=False, current_css_filter=None): + proxies={} + + # Allows override the proxy on a per-request basis + if self.proxy_override: + proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override} + else: + if self.system_http_proxy: + proxies['http'] = self.system_http_proxy + if self.system_https_proxy: + proxies['https'] = self.system_https_proxy + r = requests.request(method=request_method, data=request_body, url=url, headers=request_headers, timeout=timeout, + proxies=proxies, verify=False) # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks. 
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index f7253440..16790c10 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -16,6 +16,34 @@ class perform_site_check(): super().__init__(*args, **kwargs) self.datastore = datastore + # If there was a proxy list enabled, figure out what proxy_args/which proxy to use + # if watch.proxy use that + # fetcher.proxy_override = watch.proxy or main config proxy + # Allows override the proxy on a per-request basis + # ALWAYS use the first one is nothing selected + + def set_proxy_from_list(self, watch): + proxy_args = None + if self.datastore.proxy_list is None: + return None + + # If its a valid one + if any([watch['proxy'] in p for p in self.datastore.proxy_list]): + proxy_args = watch['proxy'] + + # not valid (including None), try the system one + else: + system_proxy = self.datastore.data['settings']['requests']['proxy'] + # Is not None and exists + if any([system_proxy in p for p in self.datastore.proxy_list]): + proxy_args = system_proxy + + # Fallback - Did not resolve anything, use the first available + if proxy_args is None: + proxy_args = self.datastore.proxy_list[0][0] + + return proxy_args + def run(self, uuid): timestamp = int(time.time()) # used for storage etc too @@ -66,10 +94,15 @@ class perform_site_check(): # If the klass doesnt exist, just use a default klass = getattr(content_fetcher, "html_requests") - fetcher = klass() + + proxy_args = self.set_proxy_from_list(watch) + fetcher = klass(proxy_override=proxy_args) + + # Proxy List support fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code, watch['css_filter']) fetcher.quit() + # Fetching complete, now filters # @todo move to class / maybe inside of fetcher abstract base? 
@@ -119,11 +152,13 @@ class perform_site_check(): # Then we assume HTML if has_filter_rule: # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." - if css_filter_rule[0] == '/': - html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content) + if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'): + html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''), + html_content=fetcher.content) else: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) + if has_subtractive_selectors: html_content = html_tools.element_removal(subtractive_selectors, html_content) @@ -143,7 +178,6 @@ class perform_site_check(): # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') - # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 72dae639..6d12267e 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -337,9 +337,9 @@ class watchForm(commonSettingsForm): method = SelectField('Request method', choices=valid_method, default=default_method) ignore_status_codes = BooleanField('Ignore status codes (process non-2xx status codes as normal)', default=False) trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()]) - save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) save_and_preview_button = SubmitField('Save & Preview', render_kw={"class": "pure-button pure-button-primary"}) + proxy = RadioField('Proxy') def validate(self, **kwargs): if not super().validate(): @@ -358,6 +358,7 @@ class 
watchForm(commonSettingsForm): # datastore.data['settings']['requests'].. class globalSettingsRequestForm(Form): time_between_check = FormField(TimeBetweenCheckForm) + proxy = RadioField('Proxy') # datastore.data['settings']['application'].. @@ -382,4 +383,3 @@ class globalSettingsForm(Form): requests = FormField(globalSettingsRequestForm) application = FormField(globalSettingsApplicationForm) save_button = SubmitField('Save', render_kw={"class": "pure-button pure-button-primary"}) - diff --git a/changedetectionio/importer.py b/changedetectionio/importer.py new file mode 100644 index 00000000..30d349e6 --- /dev/null +++ b/changedetectionio/importer.py @@ -0,0 +1,133 @@ +from abc import ABC, abstractmethod +import time +import validators + + +class Importer(): + remaining_data = [] + new_uuids = [] + good = 0 + + def __init__(self): + self.new_uuids = [] + self.good = 0 + self.remaining_data = [] + + @abstractmethod + def run(self, + data, + flash, + datastore): + pass + + +class import_url_list(Importer): + """ + Imports a list, can be in https://example.com tag1, tag2, last tag format + """ + def run(self, + data, + flash, + datastore, + ): + + urls = data.split("\n") + good = 0 + now = time.time() + + if (len(urls) > 5000): + flash("Importing 5,000 of the first URLs from your list, the rest can be imported again.") + + for url in urls: + url = url.strip() + if not len(url): + continue + + tags = "" + + # 'tags' should be a csv list after the URL + if ' ' in url: + url, tags = url.split(" ", 1) + + # Flask wtform validators wont work with basic auth, use validators package + # Up to 5000 per batch so we dont flood the server + if len(url) and validators.url(url.replace('source:', '')) and good < 5000: + new_uuid = datastore.add_watch(url=url.strip(), tag=tags, write_to_disk_now=False) + if new_uuid: + # Straight into the queue. 
+ self.new_uuids.append(new_uuid) + good += 1 + continue + + # Worked past the 'continue' above, append it to the bad list + if self.remaining_data is None: + self.remaining_data = [] + self.remaining_data.append(url) + + flash("{} Imported from list in {:.2f}s, {} Skipped.".format(good, time.time() - now, len(self.remaining_data))) + + +class import_distill_io_json(Importer): + def run(self, + data, + flash, + datastore, + ): + + import json + good = 0 + now = time.time() + self.new_uuids=[] + + + try: + data = json.loads(data.strip()) + except json.decoder.JSONDecodeError: + flash("Unable to read JSON file, was it broken?", 'error') + return + + if not data.get('data'): + flash("JSON structure looks invalid, was it broken?", 'error') + return + + for d in data.get('data'): + d_config = json.loads(d['config']) + extras = {'title': d['name']} + + if len(d['uri']) and good < 5000: + try: + # @todo we only support CSS ones at the moment + if d_config['selections'][0]['frames'][0]['excludes'][0]['type'] == 'css': + extras['subtractive_selectors'] = d_config['selections'][0]['frames'][0]['excludes'][0]['expr'] + except KeyError: + pass + except IndexError: + pass + + try: + extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr'] + if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath': + extras['css_filter'] = 'xpath:' + extras['css_filter'] + + except KeyError: + pass + except IndexError: + pass + + try: + extras['tag'] = ", ".join(d['tags']) + except KeyError: + pass + except IndexError: + pass + + new_uuid = datastore.add_watch(url=d['uri'].strip(), + extras=extras, + write_to_disk_now=False) + + if new_uuid: + # Straight into the queue. 
+ self.new_uuids.append(new_uuid) + good += 1 + + flash("{} Imported from Distill.io in {:.2f}s, {} Skipped.".format(len(self.new_uuids), time.time() - now, len(self.remaining_data))) diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index ebd5731a..21d53f7d 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -23,7 +23,8 @@ class model(dict): 'requests': { 'timeout': 15, # Default 15 seconds 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, - 'workers': 10 # Number of threads, lower is better for slow connections + 'workers': 10, # Number of threads, lower is better for slow connections + 'proxy': None # Preferred proxy connection }, 'application': { 'password': False, diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index c0313868..43d6b979 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -39,6 +39,7 @@ class model(dict): 'trigger_text': [], # List of text or regex to wait for until a change is detected 'fetch_backend': None, 'extract_title_as_title': False, + 'proxy': None, # Preferred proxy connection # Re #110, so then if this is set to None, we know to use the default value instead # Requires setting to None on submit if it's the same as the default # Should be all None by default, so we use the system default in this case. 
diff --git a/changedetectionio/static/js/settings.js b/changedetectionio/static/js/settings.js deleted file mode 100644 index 9ce51d07..00000000 --- a/changedetectionio/static/js/settings.js +++ /dev/null @@ -1,13 +0,0 @@ -window.addEventListener("load", (event) => { - // just an example for now - function toggleVisible(elem) { - // theres better ways todo this - var x = document.getElementById(elem); - if (x.style.display === "block") { - x.style.display = "none"; - } else { - x.style.display = "block"; - } - } -}); - diff --git a/changedetectionio/static/js/watch-settings.js b/changedetectionio/static/js/watch-settings.js new file mode 100644 index 00000000..c7f070fe --- /dev/null +++ b/changedetectionio/static/js/watch-settings.js @@ -0,0 +1,14 @@ +$(document).ready(function() { + function toggle() { + if ($('input[name="fetch_backend"]:checked').val() != 'html_requests') { + $('#requests-override-options').hide(); + } else { + $('#requests-override-options').show(); + } + } + $('input[name="fetch_backend"]').click(function (e) { + toggle(); + }); + toggle(); + +}); diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index a10bab9b..fbe3c63e 100644 --- a/changedetectionio/static/styles/styles.css +++ b/changedetectionio/static/styles/styles.css @@ -309,10 +309,10 @@ footer { font-weight: bold; } .pure-form textarea { width: 100%; } - .pure-form ul.fetch-backend { + .pure-form .inline-radio ul { margin: 0px; list-style: none; } - .pure-form ul.fetch-backend li > * { + .pure-form .inline-radio ul li > * { display: inline-block; } @media only screen and (max-width: 760px), (min-device-width: 768px) and (max-device-width: 1024px) { diff --git a/changedetectionio/static/styles/styles.scss b/changedetectionio/static/styles/styles.scss index a7e47a7c..0a51f215 100644 --- a/changedetectionio/static/styles/styles.scss +++ b/changedetectionio/static/styles/styles.scss @@ -418,14 +418,16 @@ footer { textarea { width: 100%; } 
- ul.fetch-backend { - margin: 0px; - list-style: none; - li { - > * { - display: inline-block; + .inline-radio { + ul { + margin: 0px; + list-style: none; + li { + > * { + display: inline-block; + } } - } + } } } diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 5e139294..7266aa24 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -33,6 +33,7 @@ class ChangeDetectionStore: self.needs_write = False self.datastore_path = datastore_path self.json_store_path = "{}/url-watches.json".format(self.datastore_path) + self.proxy_list = None self.stop_thread = False self.__data = App.model() @@ -111,6 +112,14 @@ class ChangeDetectionStore: secret = secrets.token_hex(16) self.__data['settings']['application']['rss_access_token'] = secret + + # Proxy list support - available as a selection in settings when text file is imported + # CSV list + # "name, address", or just "name" + proxy_list_file = "{}/proxies.txt".format(self.datastore_path) + if path.isfile(proxy_list_file): + self.import_proxy_list(proxy_list_file) + # Bump the update version by running updates self.run_updates() @@ -435,6 +444,21 @@ class ChangeDetectionStore: print ("Removing",item) unlink(item) + def import_proxy_list(self, filename): + import csv + with open(filename, newline='') as f: + reader = csv.reader(f, skipinitialspace=True) + # @todo This loop can could be improved + l = [] + for row in reader: + if len(row): + if len(row)>=2: + l.append(tuple(row[:2])) + else: + l.append(tuple([row[0], row[0]])) + self.proxy_list = l if len(l) else None + + # Run all updates # IMPORTANT - Each update could be run even when they have a new install and the schema is correct # So therefor - each `update_n` should be very careful about checking if it needs to actually run diff --git a/changedetectionio/templates/_common_fields.jinja b/changedetectionio/templates/_common_fields.jinja index 30ada5c0..961ff1db 100644 --- a/changedetectionio/templates/_common_fields.jinja 
+++ b/changedetectionio/templates/_common_fields.jinja @@ -2,7 +2,6 @@ {% from '_helpers.jinja' import render_field %} {% macro render_common_settings_form(form, current_base_url, emailprefix) %} -
{{ render_field(form.notification_urls, rows=5, placeholder="Examples: Gitter - gitter://token/room diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index b6e2c471..0c9a53f7 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -11,6 +11,7 @@ {% endif %} const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}"; + @@ -62,20 +63,25 @@
-
+
{{ render_field(form.fetch_backend, class="fetch-backend") }}

Use the Basic method (default) where your watched site doesn't need Javascript to render.

The Chrome/Javascript method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'.

- -
-
- - + {% if form.proxy %} +
+ {{ render_field(form.proxy, class="fetch-backend-proxy") }} + + Choose a proxy for this watch + +
+ {% endif %} +
+
Request override is currently only used by the Basic fast Plaintext/HTTP Client method. - +
{{ render_field(form.method) }}
@@ -130,7 +136,7 @@ User-Agent: wonderbra 1.0") }}
  • CSS - Limit text to this CSS rule, only text matching this CSS rule is included.
  • JSON - Limit text to this JSON rule, using JSONPath, prefix with "json:", use json:$ to force re-formatting if required, test your JSONPath here
  • -
  • XPath - Limit text to this XPath rule, simply start with a forward-slash, example //*[contains(@class, 'sametext')], XPath - Limit text to this XPath rule, simply start with a forward-slash, example //*[contains(@class, 'sametext')] or xpath://*[contains(@class, 'sametext')], test your XPath here
  • Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! -
    + +
    + + + +
    -
    - - Enter one URL per line, and optionally add tags for each URL after a space, delineated by comma (,): -
    - https://example.com tag1, tag2, last tag -
    - URLs which do not pass validation will stay in the textarea. -
    - - - -
    + overflow-x: scroll;" rows="25">{{ import_url_list_remaining }} +
    + + +
    + +
    + + +
    + + Copy and Paste your Distill.io watch 'export' file, this should be a JSON file.
    + This is experimental, supported fields are name, uri, tags, config:selections, the rest (including schedule) are ignored. +
    +

    + How to export? https://distill.io/docs/web-monitor/how-export-and-import-monitors/
    + Be sure to set your default fetcher to Chrome if required.
    +

    +
    + + + +
    +
    -
    + + {% endblock %} diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index fcac7097..2b052985 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -9,7 +9,6 @@ const email_notification_prefix=JSON.parse('{{emailprefix|tojson}}'); {% endif %} - @@ -61,7 +60,14 @@ {{ render_checkbox_field(form.application.form.real_browser_save_screenshot) }} When using a Chrome browser, a screenshot from the last check will be available on the Diff page - + {% if form.requests.proxy %} +
    + {{ render_field(form.requests.form.proxy, class="fetch-backend-proxy") }} + + Choose a default proxy for all watches + +
    + {% endif %} @@ -74,7 +80,7 @@
    -
    +
    {{ render_field(form.application.form.fetch_backend, class="fetch-backend") }}

    Use the Basic method (default) where your watched sites don't need Javascript to render.

    diff --git a/changedetectionio/tests/test_import.py b/changedetectionio/tests/test_import.py index 07676023..c4edad5c 100644 --- a/changedetectionio/tests/test_import.py +++ b/changedetectionio/tests/test_import.py @@ -5,18 +5,17 @@ import time from flask import url_for from .util import live_server_setup - - -def test_import(client, live_server): - +def test_setup(client, live_server): live_server_setup(live_server) +def test_import(client, live_server): # Give the endpoint time to spin up time.sleep(1) res = client.post( url_for("import_page"), data={ + "distill-io": "", "urls": """https://example.com https://example.com tag1 https://example.com tag1, other tag""" @@ -26,3 +25,96 @@ https://example.com tag1, other tag""" assert b"3 Imported" in res.data assert b"tag1" in res.data assert b"other tag" in res.data + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + + # Clear flask alerts + res = client.get( url_for("index")) + res = client.get( url_for("index")) + +def xtest_import_skip_url(client, live_server): + + + # Give the endpoint time to spin up + time.sleep(1) + + res = client.post( + url_for("import_page"), + data={ + "distill-io": "", + "urls": """https://example.com +:ht000000broken +""" + }, + follow_redirects=True, + ) + assert b"1 Imported" in res.data + assert b"ht000000broken" in res.data + assert b"1 Skipped" in res.data + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + # Clear flask alerts + res = client.get( url_for("index")) + +def test_import_distillio(client, live_server): + + distill_data=''' +{ + "client": { + "local": 1 + }, + "data": [ + { + "name": "Unraid | News", + "uri": "https://unraid.net/blog", + "config": 
"{\\"selections\\":[{\\"frames\\":[{\\"index\\":0,\\"excludes\\":[],\\"includes\\":[{\\"type\\":\\"xpath\\",\\"expr\\":\\"(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]\\"}]}],\\"dynamic\\":true,\\"delay\\":2}],\\"ignoreEmptyText\\":true,\\"includeStyle\\":false,\\"dataAttr\\":\\"text\\"}", + "tags": ["nice stuff", "nerd-news"], + "content_type": 2, + "state": 40, + "schedule": "{\\"type\\":\\"INTERVAL\\",\\"params\\":{\\"interval\\":4447}}", + "ts": "2022-03-27T15:51:15.667Z" + } + ] +} + +''' + + # Give the endpoint time to spin up + time.sleep(1) + client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + res = client.post( + url_for("import_page"), + data={ + "distill-io": distill_data, + "urls" : '' + }, + follow_redirects=True, + ) + + + assert b"Unable to read JSON file, was it broken?" not in res.data + assert b"1 Imported from Distill.io" in res.data + + res = client.get( url_for("edit_page", uuid="first")) + + assert b"https://unraid.net/blog" in res.data + assert b"Unraid | News" in res.data + + + # flask/wtforms should recode this, check we see it + # wtforms encodes it like id=' ,but html.escape makes it like id=' + # - so just check it manually :( + #import json + #import html + #d = json.loads(distill_data) + # embedded_d=json.loads(d['data'][0]['config']) + # x=html.escape(embedded_d['selections'][0]['frames'][0]['includes'][0]['expr']).encode('utf-8') + assert b"xpath:(//div[@id='App']/div[contains(@class,'flex')]/main[contains(@class,'relative')]/section[contains(@class,'relative')]/div[@class='container']/div[contains(@class,'flex')]/div[contains(@class,'w-full')])[1]" in res.data + + # did the tags work? 
+ res = client.get( url_for("index")) + + assert b"nice stuff" in res.data + assert b"nerd-news" in res.data + + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + # Clear flask alerts + res = client.get(url_for("index")) diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py index d1374834..7a0ba0dc 100644 --- a/changedetectionio/tests/test_xpath_selector.py +++ b/changedetectionio/tests/test_xpath_selector.py @@ -116,4 +116,46 @@ def test_xpath_validation(client, live_server): data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) - assert b"is not a valid XPath expression" in res.data \ No newline at end of file + assert b"is not a valid XPath expression" in res.data + + +# actually only really used by the distll.io importer, but could be handy too +def test_check_with_prefix_css_filter(client, live_server): + res = client.get(url_for("api_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + # Give the endpoint time to spin up + time.sleep(1) + + set_original_response() + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + time.sleep(3) + + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + + assert b"Updated watch." 
 in res.data + time.sleep(3) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + with open('/tmp/preview-debug.html', 'wb') as f: + f.write(res.data) + assert b"Some text thats the same" in res.data #in selector + assert b"Some text that will change" not in res.data #not in selector + + client.get(url_for("api_delete", uuid="all"), follow_redirects=True) diff --git a/docker-compose.yml b/docker-compose.yml index 88ee8a76..3dd5ebe6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,14 +17,14 @@ services: # Alternative WebDriver/selenium URL, do not use "'s or 's! # - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub # - # WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_httpProxy, webdriver_noProxy, - # webdriver_proxyAutoconfigUrl, webdriver_sslProxy, webdriver_autodetect, + # WebDriver proxy settings webdriver_proxyType, webdriver_ftpProxy, webdriver_noProxy, + # webdriver_proxyAutoconfigUrl, webdriver_autodetect, + # webdriver_socksProxy, webdriver_socksUsername, webdriver_socksVersion, webdriver_socksPassword # # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # # Alternative Playwright URL, do not use "'s or 's! - # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/playwright + # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/ # # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password #