From 0e0bd932342e2c29ecaf850a04cdd71c9e7c272b Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 13 Sep 2022 09:52:29 +0200 Subject: [PATCH] WIP --- changedetectionio/__init__.py | 28 +++-- changedetectionio/fetch_processor/__init__.py | 2 +- changedetectionio/fetch_processor/image.py | 13 ++- .../fetch_processor/rendered_webpage.py | 105 ------------------ .../templates/watch-overview.html | 2 + 5 files changed, 33 insertions(+), 117 deletions(-) delete mode 100644 changedetectionio/fetch_processor/rendered_webpage.py diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 84fef256..060df7ac 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -396,18 +396,20 @@ def changedetection_app(config=None, datastore_o=None): existing_tags = datastore.get_all_tags() form = forms.quickWatchForm(request.form) + webdriver_enabled = True if os.getenv('WEBDRIVER_URL', False) or os.getenv('PLAYWRIGHT_DRIVER_URL', False) else False + output = render_template("watch-overview.html", - form=form, - watches=sorted_watches, - tags=existing_tags, active_tag=limit_tag, app_rss_token=datastore.data['settings']['application']['rss_access_token'], + form=form, + guid=datastore.data['app_guid'], has_unviewed=datastore.has_unviewed, - # Don't link to hosting when we're on the hosting environment hosted_sticky=os.getenv("SALTED_PASS", False) == False, - guid=datastore.data['app_guid'], - queued_uuids=[uuid for p,uuid in update_q.queue]) - + queued_uuids=[uuid for p, uuid in update_q.queue], + tags=existing_tags, + watches=sorted_watches, + webdriver_enabled=webdriver_enabled + ) if session.get('share-link'): del(session['share-link']) @@ -1228,15 +1230,23 @@ def changedetection_app(config=None, datastore_o=None): return redirect(url_for('index')) url = request.form.get('url').strip() - fetch_processor =request.form.get('fetch_processor').strip() + if datastore.url_exists(url): flash('The URL {} already exists'.format(url), "error")
return redirect(url_for('index')) add_paused = request.form.get('edit_and_watch_submit_button') != None + fetch_processor = request.form.get('fetch_processor') + + extras = {'paused': add_paused} + if fetch_processor: + extras['fetch_processor']=fetch_processor + if fetch_processor == 'image': + extras['fetch_backend'] = 'html_webdriver' + new_uuid = datastore.add_watch(url=url, tag=request.form.get('tag').strip(), - extras={'paused': add_paused, 'fetch_processor': fetch_processor} + extras=extras ) diff --git a/changedetectionio/fetch_processor/__init__.py b/changedetectionio/fetch_processor/__init__.py index 5bec8922..9c82cb9b 100644 --- a/changedetectionio/fetch_processor/__init__.py +++ b/changedetectionio/fetch_processor/__init__.py @@ -1,4 +1,4 @@ -available_fetchers = [('json_html_plaintext', 'JSON/HTML/Text'), ('image', 'Static Image'), ('rendered_webpage', 'Screenshot of page or element')] +available_fetchers = [('json_html_plaintext', 'JSON/HTML/Text'), ('image', 'Graphically by image or web-page')] class fetch_processor(): contents = b'' diff --git a/changedetectionio/fetch_processor/image.py b/changedetectionio/fetch_processor/image.py index ed006601..8f1f1472 100644 --- a/changedetectionio/fetch_processor/image.py +++ b/changedetectionio/fetch_processor/image.py @@ -34,6 +34,12 @@ class perform_site_check(fetch_processor): watch = self.datastore.data['watching'].get(uuid) + + if watch.get('fetch_backend') != 'html_webdriver': + raise Exception( + "Requires a Chrome compatible fetcher enabled." 
+ ) + # Protect against file:// access if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): raise Exception( @@ -80,9 +86,12 @@ class perform_site_check(fetch_processor): update_obj["last_check_status"] = fetcher.get_last_status_code() - self.contents = fetcher.raw_content + if 'image' in fetcher.headers.get('content-type', ''): + self.contents = fetcher.raw_content + else: + self.contents = fetcher.screenshot - image = Image.open(io.BytesIO(fetcher.raw_content)) + image = Image.open(io.BytesIO(self.contents)) # @todo different choice? # https://github.com/JohannesBuchner/imagehash#references diff --git a/changedetectionio/fetch_processor/rendered_webpage.py b/changedetectionio/fetch_processor/rendered_webpage.py deleted file mode 100644 index 037b85c3..00000000 --- a/changedetectionio/fetch_processor/rendered_webpage.py +++ /dev/null @@ -1,105 +0,0 @@ -import hashlib -import imagehash -from PIL import Image -import io -import logging -import os -import re -import time -import urllib3 - -# fetch processor for requesting and comparing a single image -# can use both requests and playwright/selenium - -# - imagehash for change detection (or https://github.com/dgtlmoon/changedetection.io/pull/419/files#diff-7d3854710a6c0faead783f75850100a4c4b69409309200d3a83692dc9783bf6eR17 ?) -# - skimage.metrics import structural_similarity for viewing the diff - - -from changedetectionio import content_fetcher, html_tools - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -from .
import fetch_processor - - -# Some common stuff here that can be moved to a base class -# (set_proxy_from_list) -class perform_site_check(fetch_processor): - xpath_data = None - - def run(self, uuid): - changed_detected = False - - watch = self.datastore.data['watching'].get(uuid) - - # Protect against file:// access - if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): - raise Exception( - "file:// type access is denied for security reasons." - ) - - if watch.get('fetch_backend') != 'html_webdriver': - raise Exception( - "Requires a Chrome compatible fetcher enabled." - ) - - # Unset any existing notification error - update_obj = {'last_notification_error': False, 'last_error': False} - - extra_headers = self.datastore.data['watching'][uuid].get('headers') - - # Tweak the base config with the per-watch ones - request_headers = self.datastore.data['settings']['headers'].copy() - request_headers.update(extra_headers) - - # https://github.com/psf/requests/issues/4525 - # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot - # do this by accident. 
- if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: - request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') - - timeout = self.datastore.data['settings']['requests']['timeout'] - url = watch.get('url') - request_body = self.datastore.data['watching'][uuid].get('body') - request_method = self.datastore.data['watching'][uuid].get('method') - ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False) - - prefer_backend = watch['fetch_backend'] - if hasattr(content_fetcher, prefer_backend): - klass = getattr(content_fetcher, prefer_backend) - else: - # If the klass doesnt exist, just use a default - klass = getattr(content_fetcher, "html_requests") - - proxy_args = self.set_proxy_from_list(watch) - fetcher = klass(proxy_override=proxy_args) - - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes) - fetcher.quit() - - # if not image/foobar in mimetype - # raise content_fecther.NotAnImage(mimetype) ? - # or better to try load with PIL and catch exception? - - update_obj["last_check_status"] = fetcher.get_last_status_code() - - self.contents = fetcher.screenshot - - image = Image.open(io.BytesIO(fetcher.screenshot)) - - # @todo different choice? - # https://github.com/JohannesBuchner/imagehash#references - fetched_hash = str(imagehash.average_hash(image)) - - # The main thing that all this at the moment comes down to :) - if watch['previous_md5'] != fetched_hash: - changed_detected = True - - # Always record the new checksum - update_obj["previous_md5"] = fetched_hash - - # On the first run of a site, watch['previous_md5'] will be None, set it the current one. 
- if not watch.get('previous_md5'): - watch['previous_md5'] = fetched_hash - - return changed_detected, update_obj diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 11dd25fa..817f8ff4 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -15,8 +15,10 @@
{{ render_simple_field(form.url, placeholder="https://...", required=true) }} {{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }} + {% if webdriver_enabled %}
{{ render_field(form.fetch_processor) }} + {% endif %}
{{ render_simple_field(form.watch_submit_button, title="Watch this URL!" ) }}