diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index c9177403..5340ad9e 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -632,11 +632,19 @@ def changedetection_app(config=None, datastore_o=None): # Only works reliably with Playwright visualselector_enabled = os.getenv('PLAYWRIGHT_DRIVER_URL', False) and default['fetch_backend'] == 'html_webdriver' + watch = datastore.data['watching'].get(uuid) + + # Which tabs to show/hide ? + enabled_tabs = [] + if watch.get('fetch_processor') == 'json_html_plaintext' or not watch.get('fetch_processor'): + enabled_tabs.append('visual-selector') + enabled_tabs.append('text-filters-and-triggers') output = render_template("edit.html", uuid=uuid, - watch=datastore.data['watching'][uuid], + watch=watch, form=form, + enabled_tabs = enabled_tabs, has_empty_checktime=using_default_check_time, has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False, using_global_webdriver_wait=default['webdriver_delay'] is None, diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d831ce84..d1c38074 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -66,13 +66,14 @@ class ReplyWithContentButNoText(Exception): return class Fetcher(): - error = None - status_code = None content = None - headers = None - + error = None fetcher_description = "No description" + headers = None + raw_content = None + status_code = None webdriver_js_execute_code = None + xpath_element_js = """ // Include the getXpath script directly, easier than fetching !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var 
i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}}); @@ -399,6 +400,8 @@ class base_html_playwright(Fetcher): raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url) self.content = page.content() + self.raw_content = page.content() + self.status_code = response.status self.headers = response.all_headers() @@ -524,6 +527,7 @@ class base_html_webdriver(Fetcher): # @todo - dom wait loaded? time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) self.content = self.driver.page_source + self.raw_content = self.driver.page_source self.headers = {} # Does the connection to the webdriver work? run a test connection. @@ -603,6 +607,7 @@ class html_requests(Fetcher): self.status_code = r.status_code self.content = r.text + self.raw_content = r.content self.headers = r.headers diff --git a/changedetectionio/fetch_processor/__init__.py b/changedetectionio/fetch_processor/__init__.py index b3f5c0b5..74ae2c87 100644 --- a/changedetectionio/fetch_processor/__init__.py +++ b/changedetectionio/fetch_processor/__init__.py @@ -1,3 +1,5 @@ +available_fetchers = [('json_html_plaintext', 'JSON/HTML/Text'), ('image', 'Static Image')] + class fetch_processor(): contents = b'' screenshot = None diff --git a/changedetectionio/fetch_processor/image.py b/changedetectionio/fetch_processor/image.py new file mode 100644 index 00000000..ffe1589f --- /dev/null +++ b/changedetectionio/fetch_processor/image.py @@ -0,0 +1,102 @@ +import hashlib +import imagehash +from PIL import Image +import io +import logging +import os +import re +import time +import urllib3 + +# fetch processor for requesting and comparing a single image +# can 
use both requests and playwright/selenium + +# - imagehash for change detection (or https://github.com/dgtlmoon/changedetection.io/pull/419/files#diff-7d3854710a6c0faead783f75850100a4c4b69409309200d3a83692dc9783bf6eR17 ?) +# - skimage.metrics import structural_similarity for viewing the diff + + +from changedetectionio import content_fetcher, html_tools + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +from . import fetch_processor + + +# Some common stuff here that can be moved to a base class +# (set_proxy_from_list) +class perform_site_check(fetch_processor): + xpath_data = None + + def run(self, uuid): + changed_detected = False + screenshot = False # as bytes + stripped_text_from_html = "" + + watch = self.datastore.data['watching'].get(uuid) + + # Protect against file:// access + if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): + raise Exception( + "file:// type access is denied for security reasons." + ) + + # Unset any existing notification error + update_obj = {'last_notification_error': False, 'last_error': False} + + extra_headers = self.datastore.data['watching'][uuid].get('headers') + + # Tweak the base config with the per-watch ones + request_headers = self.datastore.data['settings']['headers'].copy() + request_headers.update(extra_headers) + + # https://github.com/psf/requests/issues/4525 + # Requests doesn't yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot + # do this by accident. 
+ if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: + request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') + + timeout = self.datastore.data['settings']['requests']['timeout'] + url = watch.get('url') + request_body = self.datastore.data['watching'][uuid].get('body') + request_method = self.datastore.data['watching'][uuid].get('method') + ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False) + + prefer_backend = watch['fetch_backend'] + if hasattr(content_fetcher, prefer_backend): + klass = getattr(content_fetcher, prefer_backend) + else: + # If the klass doesn't exist, just use a default + klass = getattr(content_fetcher, "html_requests") + + proxy_args = self.set_proxy_from_list(watch) + fetcher = klass(proxy_override=proxy_args) + + fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes) + fetcher.quit() + + # if not image/foobar in mimetype + # raise content_fetcher.NotAnImage(mimetype) ? + # or better to try load with PIL and catch exception? + + update_obj["last_check_status"] = fetcher.get_last_status_code() + + image = Image.open(io.BytesIO(fetcher.raw_content)) + + # @todo different choice? + # https://github.com/JohannesBuchner/imagehash#references + fetched_hash = str(imagehash.average_hash(image)) + + # The main thing that all this at the moment comes down to :) + if watch['previous_md5'] != fetched_hash: + changed_detected = True + + # Always record the new checksum + update_obj["previous_md5"] = fetched_hash + + # On the first run of a site, watch['previous_md5'] will be None, set it to the current one. 
+ if not watch.get('previous_md5'): + watch['previous_md5'] = fetched_hash + + #self.contents = fetcher.screenshot + + return changed_detected, update_obj diff --git a/changedetectionio/fetch_processor/rendered_webpage.py b/changedetectionio/fetch_processor/rendered_webpage.py new file mode 100644 index 00000000..e69de29b diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 279f7c7f..a3d6731f 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -306,8 +306,11 @@ class ValidateCSSJSONXPATHInput(object): class quickWatchForm(Form): + from . import fetch_processor + url = fields.URLField('URL', validators=[validateURL()]) tag = StringField('Group tag', [validators.Optional()]) + fetch_processor = RadioField(u'Compare as', choices=fetch_processor.available_fetchers, default=fetch_processor.available_fetchers[0][0]) watch_submit_button = SubmitField('Watch', render_kw={"class": "pure-button pure-button-primary"}) edit_and_watch_submit_button = SubmitField('Edit > Watch', render_kw={"class": "pure-button pure-button-primary"}) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 9c12283b..d624bff0 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -27,6 +27,7 @@ class model(dict): 'extract_text': [], # Extract text by regex after filters 'extract_title_as_title': False, 'fetch_backend': None, + 'fetch_processor': None, # default None, json_html_plaintext, image 'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), 'headers': {}, # Extra headers to send 'ignore_text': [], # List of text to ignore when calculating the comparison checksum diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 231c2016..d3bbf221 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -25,8 +25,13 @@