pull/938/head
dgtlmoon 2 years ago
parent 9244e2fb9c
commit 97c2cd633d

@ -632,11 +632,19 @@ def changedetection_app(config=None, datastore_o=None):
# Only works reliably with Playwright
visualselector_enabled = os.getenv('PLAYWRIGHT_DRIVER_URL', False) and default['fetch_backend'] == 'html_webdriver'
watch = datastore.data['watching'].get(uuid)
# Which tabs to show/hide ?
enabled_tabs = []
if watch.get('fetch_processor') == 'json_html_plaintext' or not watch.get('fetch_processor'):
enabled_tabs.append('visual-selector')
enabled_tabs.append('text-filters-and-triggers')
output = render_template("edit.html",
uuid=uuid,
watch=datastore.data['watching'][uuid],
watch=watch,
form=form,
enabled_tabs = enabled_tabs,
has_empty_checktime=using_default_check_time,
has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False,
using_global_webdriver_wait=default['webdriver_delay'] is None,

@ -66,13 +66,14 @@ class ReplyWithContentButNoText(Exception):
return
class Fetcher():
error = None
status_code = None
content = None
headers = None
error = None
fetcher_description = "No description"
headers = None
raw_content = None
status_code = None
webdriver_js_execute_code = None
xpath_element_js = """
// Include the getXpath script directly, easier than fetching
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
@ -399,6 +400,8 @@ class base_html_playwright(Fetcher):
raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url)
self.content = page.content()
self.raw_content = page.content()
self.status_code = response.status
self.headers = response.all_headers()
@ -524,6 +527,7 @@ class base_html_webdriver(Fetcher):
# @todo - dom wait loaded?
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
self.content = self.driver.page_source
self.raw_content = self.driver.page_source
self.headers = {}
# Does the connection to the webdriver work? run a test connection.
@ -603,6 +607,7 @@ class html_requests(Fetcher):
self.status_code = r.status_code
self.content = r.text
self.raw_content = r.content
self.headers = r.headers

@ -1,3 +1,5 @@
# (value, label) pairs for the selectable fetch processors; the value of the
# first entry is what the quick-watch form uses as its default choice.
available_fetchers = [('json_html_plaintext', 'JSON/HTML/Text'), ('image', 'Static Image')]
class fetch_processor():
contents = b''
screenshot = None

@ -0,0 +1,102 @@
import hashlib
import imagehash
from PIL import Image
import io
import logging
import os
import re
import time
import urllib3
# Fetch processor for requesting and comparing a single image.
# Works with both the requests-based fetcher and the Playwright/Selenium fetchers.
# - imagehash is used for change detection (or the approach in https://github.com/dgtlmoon/changedetection.io/pull/419/files#diff-7d3854710a6c0faead783f75850100a4c4b69409309200d3a83692dc9783bf6eR17 ?)
# - skimage.metrics.structural_similarity for viewing the diff (not imported here yet)
from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from . import fetch_processor
# Some common stuff here that can be moved to a base class
# (set_proxy_from_list)
class perform_site_check(fetch_processor):
    """Change-detection processor that treats the watched URL as a single image.

    The resource is downloaded with the fetcher configured on the watch, decoded
    with Pillow and reduced to a perceptual hash (``imagehash.average_hash``).
    A change is reported when that hash differs from the one stored on the watch.
    """

    xpath_data = None

    @staticmethod
    def _strip_brotli(accept_encoding):
        """Return ``accept_encoding`` with every ``br`` coding removed.

        The value is returned untouched when it contains no ``br`` coding, so
        values such as ``gzip, brotli-x`` are never mangled. When removing
        ``br`` leaves nothing, ``identity`` is returned so the header still
        tells the server not to compress.
        """
        codings = [c.strip() for c in accept_encoding.split(',') if c.strip()]
        # Compare only the coding name; a quality value ("br;q=0.8") may follow it.
        kept = [c for c in codings if c.split(';', 1)[0].strip().lower() != 'br']
        if len(kept) == len(codings):
            return accept_encoding
        return ', '.join(kept) or 'identity'

    def run(self, uuid):
        """Fetch the watched image and compare its hash with the stored one.

        Args:
            uuid: key of the watch inside ``self.datastore.data['watching']``.

        Returns:
            A ``(changed_detected, update_obj)`` tuple. ``changed_detected`` is
            True when the freshly computed hash differs from
            ``watch['previous_md5']``. ``update_obj`` carries the fields to
            store back on the watch (``last_notification_error``,
            ``last_error``, ``last_check_status`` and ``previous_md5``).

        Raises:
            Exception: when the URL starts with ``file`` and the
                ``ALLOW_FILE_URI`` environment variable is not set.
        """
        changed_detected = False
        watch = self.datastore.data['watching'].get(uuid)

        # Protect against file:// access
        if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
            raise Exception(
                "file:// type access is denied for security reasons."
            )

        # Unset any existing notification error
        update_obj = {'last_notification_error': False, 'last_error': False}

        # Tweak the base config with the per-watch ones.
        # A watch may carry no extra headers at all, so never pass None to update().
        request_headers = self.datastore.data['settings']['headers'].copy()
        request_headers.update(watch.get('headers') or {})

        # https://github.com/psf/requests/issues/4525
        # Requests doesn't yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
        # do this by accident.
        if 'Accept-Encoding' in request_headers:
            request_headers['Accept-Encoding'] = self._strip_brotli(request_headers['Accept-Encoding'])

        timeout = self.datastore.data['settings']['requests']['timeout']
        url = watch.get('url')
        request_body = watch.get('body')
        request_method = watch.get('method')
        ignore_status_codes = watch.get('ignore_status_codes', False)

        # 'fetch_backend' may be None (nothing chosen for this watch); hasattr()
        # rejects non-string names, so only look it up when it is a real name.
        prefer_backend = watch['fetch_backend']
        if isinstance(prefer_backend, str) and hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
            # If the klass doesn't exist, just use a default
            klass = getattr(content_fetcher, "html_requests")

        # NOTE(review): set_proxy_from_list is not defined in this class; it is
        # presumably inherited from fetch_processor - confirm.
        proxy_args = self.set_proxy_from_list(watch)
        fetcher = klass(proxy_override=proxy_args)
        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes)
        fetcher.quit()

        # if not image/foobar in mimetype
        # raise content_fetcher.NotAnImage(mimetype) ?
        # or better to try load with PIL and catch exception?
        update_obj["last_check_status"] = fetcher.get_last_status_code()

        image = Image.open(io.BytesIO(fetcher.raw_content))

        # @todo different choice?
        # https://github.com/JohannesBuchner/imagehash#references
        fetched_hash = str(imagehash.average_hash(image))

        # The main thing that all this at the moment comes down to :)
        # (the 'previous_md5' field holds the image hash here, not an MD5 digest)
        if watch['previous_md5'] != fetched_hash:
            changed_detected = True

        # Always record the new checksum
        update_obj["previous_md5"] = fetched_hash

        # On the first run of a site, watch['previous_md5'] will be None, set it the current one.
        if not watch.get('previous_md5'):
            watch['previous_md5'] = fetched_hash

        #self.contents = fetcher.screenshot
        return changed_detected, update_obj

@ -306,8 +306,11 @@ class ValidateCSSJSONXPATHInput(object):
# Quick-add form: URL, optional group tag, the processor used to compare the
# watch ("Compare as"), and the two submit buttons.
class quickWatchForm(Form):
# Imported inside the class body; the name `fetch_processor` is rebound to the
# RadioField below, after the module has been used to build the field's arguments.
from . import fetch_processor
url = fields.URLField('URL', validators=[validateURL()])
tag = StringField('Group tag', [validators.Optional()])
# Choices come from the module's available_fetchers list; the default is the value of its first entry.
fetch_processor = RadioField(u'Compare as', choices=fetch_processor.available_fetchers, default=fetch_processor.available_fetchers[0][0])
watch_submit_button = SubmitField('Watch', render_kw={"class": "pure-button pure-button-primary"})
edit_and_watch_submit_button = SubmitField('Edit > Watch', render_kw={"class": "pure-button pure-button-primary"})

@ -27,6 +27,7 @@ class model(dict):
'extract_text': [], # Extract text by regex after filters
'extract_title_as_title': False,
'fetch_backend': None,
'fetch_processor': None, # default None, json_html_plaintext, image
'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')),
'headers': {}, # Extra headers to send
'ignore_text': [], # List of text to ignore when calculating the comparison checksum

@ -25,8 +25,13 @@
<ul>
<li class="tab" id=""><a href="#general">General</a></li>
<li class="tab"><a href="#request">Request</a></li>
{% if 'visual-selector' in enabled_tabs %}
<li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Filter Selector</a></li>
{%endif%}
{% if 'text-filters-and-triggers' in enabled_tabs %}
<li class="tab"><a href="#filters-and-triggers">Filters &amp; Triggers</a></li>
{%endif%}
<li class="tab"><a href="#notifications">Notifications</a></li>
</ul>
</div>

@ -15,6 +15,8 @@
<div>
{{ render_simple_field(form.url, placeholder="https://...", required=true) }}
{{ render_simple_field(form.tag, value=active_tag if active_tag else '', placeholder="watch group") }}
<br/>
{{ render_field(form.fetch_processor) }}
</div>
<div>
{{ render_simple_field(form.watch_submit_button, title="Watch this URL!" ) }}

@ -120,9 +120,10 @@ class update_worker(threading.Thread):
os.unlink(full_path)
def run(self):
from .fetch_processor import json_html_plaintext
from .fetch_processor import json_html_plaintext as processor_json_html_plaintext, image as processor_image
while not self.app.config.exit.is_set():
@ -139,12 +140,21 @@ class update_worker(threading.Thread):
changed_detected = False
update_obj = {}
process_changedetection_results = True
print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, self.datastore.data['watching'][uuid]['url']))
watch = self.datastore.data['watching'].get(uuid)
print("> Processing UUID {} Priority {} URL {}".format(uuid, priority, watch.get('url')))
now = time.time()
try:
update_handler = json_html_plaintext.perform_site_check(datastore=self.datastore)
update_handler = None
if watch.get('fetch_processor') == 'image':
update_handler = processor_image.perform_site_check(datastore=self.datastore)
else:
# Anything else for now will be `json_html_plaintext`
update_handler = processor_json_html_plaintext.perform_site_check(datastore=self.datastore)
changed_detected, update_obj = update_handler.run(uuid)
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc

@ -42,3 +42,7 @@ selenium ~= 4.1.0
werkzeug ~= 2.0.0
# playwright is installed at Dockerfile build time because it's not available on all platforms
imagehash ~= 4.3.0
pillow

Loading…
Cancel
Save