diff --git a/.github/workflows/image-javascript.yml b/.github/workflows/image-javascript.yml
new file mode 100644
index 00000000..a6b4d8ee
--- /dev/null
+++ b/.github/workflows/image-javascript.yml
@@ -0,0 +1,87 @@
+name: Javascript/Webdriver support - Test, build and push to Docker Hub :javascript tag
+# Lints and tests the javascript-browser branch, then pushes a multi-arch :javascript-dev image
+on:
+  push:
+    branches: [ javascript-browser ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.9
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 pytest
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+
+      - name: Create release metadata
+        run: |
+          # COPY'ed by Dockerfile into backend/ of the image, then read by the server in store.py
+          echo ${{ github.sha }} > backend/source.txt
+          echo ${{ github.ref }} > backend/tag.txt
+
+      - name: Test with pytest
+        run: |
+          # Each test is totally isolated and performs its own cleanup/reset
+          cd backend; ./run_all_tests.sh
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+        with:
+          image: tonistiigi/binfmt:latest
+          platforms: all
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKER_HUB_USERNAME }}
+          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v1
+        with:
+          install: true
+          version: latest
+          driver-opts: image=moby/buildkit:master
+
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          context: ./
+          file: ./Dockerfile
+          push: true
+          tags: |
+            ${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:javascript-dev
+          platforms: linux/amd64,linux/arm64,linux/arm/v6,linux/arm/v7
+          cache-from: type=local,src=/tmp/.buildx-cache
+          cache-to: type=local,dest=/tmp/.buildx-cache
+      # This workflow has no step with id "vars", so report commit/ref from the github context
+      - name: Image digest
+        run: echo "sha ${{ github.sha }} ref ${{ github.ref }} digest ${{ steps.docker_build.outputs.digest }}"
+
+# failed: Cache service responded with 503
+# - name: Cache Docker layers
+# uses: actions/cache@v2
+# with:
+# path: /tmp/.buildx-cache
+# key: ${{ runner.os }}-buildx-${{ github.sha }}
+# restore-keys: |
+# ${{ runner.os }}-buildx-
+
+
diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
index 3150c60e..7abf493c 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -44,6 +44,7 @@ jobs:
with:
image: tonistiigi/binfmt:latest
platforms: all
+
- name: Login to Docker Hub
uses: docker/login-action@v1
with:
@@ -66,10 +67,8 @@ jobs:
file: ./Dockerfile
push: true
tags: |
- ${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io:latest
- # ${{ secrets.DOCKER_HUB_USERNAME }}:/changedetection.io:${{ env.RELEASE_VERSION }}
+ ${{ secrets.DOCKER_HUB_USERNAME }}/changedetection.io
platforms: linux/amd64,linux/arm64,linux/arm/v6,linux/arm/v7
-# platforms: linux/amd64
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
diff --git a/README.md b/README.md
index 601bb0c4..e9ca8dcc 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Know when ...
_Need an actual Chrome runner with Javascript support? see the experimental Javascript/Chrome support changedetection.io branch!_
**Get monitoring now! super simple, one command!**
+
Run the python code on your own machine by cloning this repository, or with docker and/or docker-compose
With one docker-compose command
@@ -40,24 +41,18 @@ With one docker-compose command
docker-compose up -d
```
-or
-
+Then visit http://127.0.0.1:5000 - you should now be able to access the UI.
-```bash
-docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/datastore --name changedetection.io dgtlmoon/changedetection.io
-```
+_Now with per-site configurable support for using either a fast built-in HTTP fetcher or a Chrome-based fetcher for monitoring JavaScript websites!_
-Now visit http://127.0.0.1:5000 , You should now be able to access the UI.
-#### Updating to latest version
+#### Updating to the latest version
Highly recommended :)
```bash
docker pull dgtlmoon/changedetection.io
-docker kill $(docker ps -a|grep changedetection.io|awk '{print $1}')
-docker rm $(docker ps -a|grep changedetection.io|awk '{print $1}')
-docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/datastore --name changedetection.io dgtlmoon/changedetection.io
+docker-compose up -d
```
### Screenshots
@@ -135,6 +130,7 @@ For more information see https://docs.python-requests.org/en/master/user/advance
This proxy support also extends to the notifications https://github.com/caronc/apprise/issues/387#issuecomment-841718867
+
### Notes
- ~~Does not yet support Javascript~~
@@ -143,6 +139,7 @@ This proxy support also extends to the notifications https://github.com/caronc/a
See the experimental Javascript/Chrome browser support!
+
### RaspberriPi support?
RaspberriPi and linux/arm/v6 linux/arm/v7 arm64 devices are supported!
diff --git a/backend/__init__.py b/backend/__init__.py
index bee1bc13..91608993 100644
--- a/backend/__init__.py
+++ b/backend/__init__.py
@@ -378,6 +378,7 @@ def changedetection_app(config=None, datastore_o=None):
if uuid == 'first':
uuid = list(datastore.data['watching'].keys()).pop()
+
if request.method == 'GET':
if not uuid in datastore.data['watching']:
flash("No watch with the UUID %s found." % (uuid), "error")
@@ -385,17 +386,25 @@ def changedetection_app(config=None, datastore_o=None):
populate_form_from_watch(form, datastore.data['watching'][uuid])
+ if datastore.data['watching'][uuid]['fetch_backend'] is None:
+ form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
+
if request.method == 'POST' and form.validate():
# Re #110, if they submit the same as the default value, set it to None, so we continue to follow the default
if form.minutes_between_check.data == datastore.data['settings']['requests']['minutes_between_check']:
form.minutes_between_check.data = None
+ if form.fetch_backend.data == datastore.data['settings']['application']['fetch_backend']:
+ form.fetch_backend.data = None
+
+
update_obj = {'url': form.url.data.strip(),
'minutes_between_check': form.minutes_between_check.data,
'tag': form.tag.data.strip(),
'title': form.title.data.strip(),
- 'headers': form.headers.data
+ 'headers': form.headers.data,
+ 'fetch_backend': form.fetch_backend.data
}
# Notification URLs
@@ -428,8 +437,8 @@ def changedetection_app(config=None, datastore_o=None):
if form.trigger_check.data:
n_object = {'watch_url': form.url.data.strip(),
- 'notification_urls': form.notification_urls.data,
- 'uuid': uuid}
+ 'notification_urls': form.notification_urls.data
+ }
notification_q.put(n_object)
flash('Notifications queued.')
@@ -464,12 +473,15 @@ def changedetection_app(config=None, datastore_o=None):
def settings_page():
from backend import forms
+ from backend import content_fetcher
+
form = forms.globalSettingsForm(request.form)
if request.method == 'GET':
form.minutes_between_check.data = int(datastore.data['settings']['requests']['minutes_between_check'])
form.notification_urls.data = datastore.data['settings']['application']['notification_urls']
form.extract_title_as_title.data = datastore.data['settings']['application']['extract_title_as_title']
+ form.fetch_backend.data = datastore.data['settings']['application']['fetch_backend']
form.notification_title.data = datastore.data['settings']['application']['notification_title']
form.notification_body.data = datastore.data['settings']['application']['notification_body']
@@ -486,6 +498,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['settings']['application']['notification_urls'] = form.notification_urls.data
datastore.data['settings']['requests']['minutes_between_check'] = form.minutes_between_check.data
datastore.data['settings']['application']['extract_title_as_title'] = form.extract_title_as_title.data
+ datastore.data['settings']['application']['fetch_backend'] = form.fetch_backend.data
datastore.data['settings']['application']['notification_title'] = form.notification_title.data
datastore.data['settings']['application']['notification_body'] = form.notification_body.data
diff --git a/backend/content_fetcher.py b/backend/content_fetcher.py
new file mode 100644
index 00000000..40ff7327
--- /dev/null
+++ b/backend/content_fetcher.py
@@ -0,0 +1,137 @@
+import os
+import time
+from abc import ABC, abstractmethod
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.common.exceptions import WebDriverException
+import urllib3.exceptions
+
+
+class EmptyReply(Exception):
+    """Signals that a fetch completed but produced no content (raised by html_requests.run)."""
+
+class Fetcher(ABC):
+    """Base class for content fetchers; run() records error, status_code and content."""
+    error = None
+    status_code = None
+    content = None  # both fetchers in this module store text (r.text / driver.page_source)
+
+    fetcher_description = "No description"
+
+    # Only run() is abstract; the accessors below are inherited as working defaults.
+    def get_error(self):
+        """Return the error recorded by run(), or None."""
+        return self.error
+
+    @abstractmethod
+    def run(self, url, timeout, request_headers):
+        """Fetch url; should set self.error, self.status_code and self.content."""
+
+    def get_last_status_code(self):
+        """Return the status code recorded by run(), or None."""
+        return self.status_code
+
+    def is_ready(self):
+        """Return True when this fetcher can run; override for special config checks."""
+        return True
+
+# Maybe for the future, each fetcher provides its own diff output, could be used for text, image
+# the current one would return javascript output (as we use JS to generate the diff)
+#
+# Returns tuple(mime_type, stream)
+# @abstractmethod
+# def return_diff(self, stream_a, stream_b):
+# return
+
+def available_fetchers():
+    """Return (class name, fetcher_description) tuples for every "html_" class in this module."""
+    import inspect
+    from backend import content_fetcher
+
+    # @todo html_ is maybe better as fetcher_ or something
+    # In this case, make sure to edit the default one in store.py and fetch_site_status.py
+    # inspect.getmembers() yields members sorted by name, so the result order is stable
+    return [
+        (cls_name, cls.fetcher_description)
+        for cls_name, cls in inspect.getmembers(content_fetcher)
+        if inspect.isclass(cls) and "html_" in cls_name
+    ]
+
+class html_webdriver(Fetcher):
+    """Fetcher that renders pages in Chrome through a remote Selenium WebDriver hub."""
+    fetcher_description = "WebDriver Chrome/Javascript"
+    command_executor = ''
+
+    def __init__(self):
+        # WEBDRIVER_URL overrides the default hub address
+        self.command_executor = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub')
+
+    def _connect(self):
+        """Open a Chrome session on the configured hub (self.command_executor)."""
+        return webdriver.Remote(
+            command_executor=self.command_executor,
+            desired_capabilities=DesiredCapabilities.CHROME)
+
+    def run(self, url, timeout, request_headers):
+        """Load url in the remote browser and keep the page source in self.content.
+
+        timeout and request_headers are accepted for interface parity but are not
+        used by this fetcher. Driver exceptions propagate to the caller.
+        """
+        driver = self._connect()
+        try:
+            driver.get(url)
+
+            # @todo - how to check this? is it possible?
+            self.status_code = 200
+
+            # @todo - dom wait loaded?
+            time.sleep(5)
+            self.content = driver.page_source
+        finally:
+            # Be sure we close the session window, whichever step failed
+            driver.quit()
+
+    def is_ready(self):
+        """Return True once a session could be opened on the hub; connection errors propagate."""
+        # Probe the same hub that run() uses, so that a WEBDRIVER_URL override
+        # is honoured by the readiness check as well
+        driver = self._connect()
+
+        # driver.quit() seems to cause better exceptions
+        driver.quit()
+
+        return True
+
+# "html_requests" is listed as the default fetcher in store.py!
+class html_requests(Fetcher):
+    """Plain HTTP fetcher built on the requests library."""
+    fetcher_description = "Basic fast Plaintext/HTTP Client"
+
+    def run(self, url, timeout, request_headers):
+        """GET url into self.content; request errors are stored in self.error, an empty body raises EmptyReply."""
+        import requests
+        try:
+            # NOTE(review): verify=False turns off TLS certificate verification
+            r = requests.get(url,
+                             headers=request_headers,
+                             timeout=timeout,
+                             verify=False)
+            html = r.text
+        # Usually from networkIO/requests level
+        except (
+                requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout,
+                requests.exceptions.MissingSchema) as e:
+            self.error = str(e)
+            return None
+        except Exception as e:
+            self.error = "Other exception" + str(e)
+            return None
+
+        # Record the status before the emptiness check so it is kept for empty replies too
+        self.status_code = r.status_code
+        # Test the text only: a requests Response is falsy for every 4xx/5xx status
+        if not html:
+            raise EmptyReply(url)
+        self.content = html
+
diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index 870a6515..242a46b1 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -1,11 +1,13 @@
import time
-import requests
+from backend import content_fetcher
import hashlib
from inscriptis import get_text
import urllib3
from . import html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Some common stuff here that can be moved to a base class
@@ -52,8 +54,8 @@ class perform_site_check():
def run(self, uuid):
timestamp = int(time.time()) # used for storage etc too
- stripped_text_from_html = False
changed_detected = False
+ stripped_text_from_html = ""
update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'],
'history': {},
@@ -72,71 +74,63 @@ class perform_site_check():
if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
- try:
- timeout = self.datastore.data['settings']['requests']['timeout']
- except KeyError:
- # @todo yeah this should go back to the default value in store.py, but this whole object should abstract off it
- timeout = 15
+ # @todo check the failures are really handled how we expect
- try:
+ else:
+ timeout = self.datastore.data['settings']['requests']['timeout']
url = self.datastore.get_val(uuid, 'url')
- r = requests.get(url,
- headers=request_headers,
- timeout=timeout,
- verify=False)
+ # Pluggable content fetcher
+ # A watch stores None when it follows the application-wide fetch_backend setting,
+ # and hasattr()/getattr() raise TypeError for a non-string name, so resolve that first
+ prefer_backend = self.datastore.data['watching'][uuid]['fetch_backend'] or self.datastore.data['settings']['application']['fetch_backend']
+ # If the klass doesnt exist, just use a default
+ klass = getattr(content_fetcher, prefer_backend or "html_requests", None)
+ klass = klass or getattr(content_fetcher, "html_requests")
+
- html = r.text
+ fetcher = klass()
+ fetcher.run(url, timeout, request_headers)
+ # Fetching complete, now filters
+ # @todo move to class / maybe inside of fetcher abstract base?
is_html = True
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
if css_filter_rule and len(css_filter_rule.strip()):
if 'json:' in css_filter_rule:
- stripped_text_from_html = html_tools.extract_json_as_string(html, css_filter_rule)
+ stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
is_html = False
else:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
- html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
+ stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
if is_html:
- stripped_text_from_html = get_text(html)
+ # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+ html_content = fetcher.content
+ css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
+ if css_filter_rule and len(css_filter_rule.strip()):
+ html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
- # Usually from networkIO/requests level
- except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
- update_obj["last_error"] = str(e)
- print(str(e))
+ # get_text() via inscriptis
+ stripped_text_from_html = get_text(html_content)
- except requests.exceptions.MissingSchema:
- print("Skipping {} due to missing schema/bad url".format(uuid))
-
- # Usually from html2text level
- except Exception as e:
- # except UnicodeDecodeError as e:
- update_obj["last_error"] = str(e)
- print(str(e))
- # figure out how to deal with this cleaner..
- # 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
-
-
- else:
# We rely on the actual text in the html output.. many sites have random script vars etc,
# in the future we'll implement other mechanisms.
- update_obj["last_check_status"] = r.status_code
+ update_obj["last_check_status"] = fetcher.get_last_status_code()
update_obj["last_error"] = False
- if not len(r.text):
- update_obj["last_error"] = "Empty reply"
# If there's text to skip
# @todo we could abstract out the get_text() to handle this cleaner
if len(self.datastore.data['watching'][uuid]['ignore_text']):
- content = self.strip_ignore_text(stripped_text_from_html,
+ stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html,
self.datastore.data['watching'][uuid]['ignore_text'])
else:
- content = stripped_text_from_html.encode('utf8')
+ stripped_text_from_html = stripped_text_from_html.encode('utf8')
+
- fetched_md5 = hashlib.md5(content).hexdigest()
+ fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()
# could be None or False depending on JSON type
if self.datastore.data['watching'][uuid]['previous_md5'] != fetched_md5:
@@ -149,9 +143,9 @@ class perform_site_check():
update_obj["previous_md5"] = fetched_md5
# Extract title as title
- if self.datastore.data['settings']['application']['extract_title_as_title']:
+ if is_html and self.datastore.data['settings']['application']['extract_title_as_title']:
if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
- update_obj['title'] = html_tools.extract_element(find='title', html_content=html)
+ update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
return changed_detected, update_obj, stripped_text_from_html
diff --git a/backend/forms.py b/backend/forms.py
index 3abb2e55..28d74224 100644
--- a/backend/forms.py
+++ b/backend/forms.py
@@ -1,9 +1,9 @@
-from wtforms import Form, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \
+from wtforms import Form, SelectField, RadioField, BooleanField, StringField, PasswordField, validators, IntegerField, fields, TextAreaField, \
Field
from wtforms import widgets
from wtforms.validators import ValidationError
from wtforms.fields import html5
-
+from backend import content_fetcher
class StringListField(StringField):
widget = widgets.TextArea()
@@ -82,6 +82,40 @@ class StringDictKeyValue(StringField):
else:
self.data = {}
+class ValidateContentFetcherIsReady(object):
+    """
+    Validates that the selected content fetcher exists and reports itself ready (is_ready())
+    """
+    def __init__(self, message=None):
+        self.message = message
+
+    def __call__(self, form, field):
+        from backend import content_fetcher
+        import urllib3.exceptions
+
+        # Better would be a radiohandler that keeps a reference to each class
+        if field.data is not None:
+            # The value is user input: only instantiate names offered by available_fetchers()
+            if field.data not in [name for name, _ in content_fetcher.available_fetchers()]:
+                raise ValidationError(field.gettext('Unknown content fetcher \'%s\'.') % (field.data))
+            some_object = getattr(content_fetcher, field.data)()
+            try:
+                ready = some_object.is_ready()
+            except urllib3.exceptions.MaxRetryError as e:
+                driver_url = some_object.command_executor
+                message = field.gettext('Content fetcher \'%s\' did not respond.' % (field.data))
+                message += '
'+field.gettext('Be sure that the selenium/webdriver runner is running and accessible via network from this container/host.')
+                message += '
' + field.gettext('Did you follow the instructions in the wiki?')
+                message += '
' + field.gettext('WebDriver Host: %s' % (driver_url))
+                message += '
Go here for more information'
+                raise ValidationError(message)
+
+            except Exception as e:
+                message = field.gettext('Content fetcher \'%s\' did not respond properly, unable to use it.\n %s')
+                raise ValidationError(message % (field.data, e))
+
+
+
class ValidateListRegex(object):
"""
Validates that anything that looks like a regex passes as a regex
@@ -138,6 +172,8 @@ class watchForm(quickWatchForm):
css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
title = StringField('Title')
+ fetch_backend = RadioField(u'Fetch Method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
+
ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
notification_urls = StringListField('Notification URL List')
headers = StringDictKeyValue('Request Headers')
@@ -152,6 +188,9 @@ class globalSettingsForm(Form):
[validators.NumberRange(min=1)])
notification_urls = StringListField('Notification URL List')
+
+ fetch_backend = RadioField(u'Fetch Method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()])
+
extract_title_as_title = BooleanField('Extract