Merge pull request #11 from dgtlmoon/pytest

Separate flask from eventlet runtime and get pytest working
pull/19/head
dgtlmoon 4 years ago committed by GitHub
commit 0a08616c87
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,7 +1,7 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python # This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: changedetection.io Python application name: changedetection.io
on: on:

2
.gitignore vendored

@ -3,3 +3,5 @@ __pycache__
*.pyc *.pyc
datastore/url-watches.json datastore/url-watches.json
datastore/* datastore/*
__pycache__
.pytest_cache

@ -18,7 +18,7 @@ RUN echo "commit: $SOURCE_COMMIT branch: $SOURCE_BRANCH" >/source.txt
RUN [ ! -d "/datastore" ] && mkdir /datastore RUN [ ! -d "/datastore" ] && mkdir /datastore
CMD [ "python", "./backend.py" ] CMD [ "python", "./backend.py" , "-d", "/datastore"]

@ -1,4 +1,5 @@
# changedetection.io # changedetection.io
![changedetection.io](https://github.com/dgtlmoon/changedetection.io/actions/workflows/python-app.yml/badge.svg?branch=master)
## Self-hosted change monitoring of web pages. ## Self-hosted change monitoring of web pages.
@ -11,7 +12,7 @@ Know when ...
- Government department updates (changes are often only on their websites) - Government department updates (changes are often only on their websites)
- Local government news (changes are often only on their websites) - Local government news (changes are often only on their websites)
- New software releases - New software releases, security advisories when you're not on their mailing list.
- Festivals with changes - Festivals with changes
- Realestate listing changes - Realestate listing changes

@ -0,0 +1,65 @@
#!/usr/bin/python3
# Launch as a eventlet.wsgi server instance.
import getopt
import sys
import eventlet
import eventlet.wsgi
import backend
from backend import store
def main(argv):
ssl_mode = False
port = 5000
datastore_path = "./datastore"
try:
opts, args = getopt.getopt(argv, "sd:p:", "purge")
except getopt.GetoptError:
print('backend.py -s SSL enable -p [port] -d [datastore path]')
sys.exit(2)
for opt, arg in opts:
# if opt == '--purge':
# Remove history, the actual files you need to delete manually.
# for uuid, watch in datastore.data['watching'].items():
# watch.update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'previous_md5': None})
if opt == '-s':
ssl_mode = True
if opt == '-p':
port = int(arg)
if opt == '-d':
datastore_path = arg
# threads can read from disk every x seconds right?
# front end can just save
# We just need to know which threads are looking at which UUIDs
# isnt there some @thingy to attach to each route to tell it, that this route needs a datastore
app_config = {'datastore_path': datastore_path}
datastore = store.ChangeDetectionStore(datastore_path=app_config['datastore_path'])
app = backend.changedetection_app(app_config, datastore)
if ssl_mode:
# @todo finalise SSL config, but this should get you in the right direction if you need it.
eventlet.wsgi.server(eventlet.wrap_ssl(eventlet.listen(('', port)),
certfile='cert.pem',
keyfile='privkey.pem',
server_side=True), app)
else:
eventlet.wsgi.server(eventlet.listen(('', port)), app)
if __name__ == '__main__':
main(sys.argv[1:])

@ -0,0 +1 @@
Note: run `pytest` from this directory.

@ -0,0 +1,478 @@
#!/usr/bin/python3
# @todo logging
# @todo sort by last_changed
# @todo extra options for url like , verify=False etc.
# @todo enable https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl as option?
# @todo maybe a button to reset all 'last-changed'.. so you can see it clearly when something happens since your last visit
# @todo option for interval day/6 hour/etc
# @todo on change detected, config for calling some API
# @todo make tables responsive!
# @todo fetch title into json
# https://distill.io/features
# proxy per check
# - flask_cors, itsdangerous,MarkupSafe
import time
import os
import timeago
import threading
import queue
from flask import Flask, render_template, request, send_file, send_from_directory, abort, redirect, url_for
datastore = None
# Local
running_update_threads = []
ticker_thread = None
messages = []
extra_stylesheets = []
update_q = queue.Queue()
app = Flask(__name__, static_url_path="/var/www/change-detection/backen/static")
# Stop browser caching of assets
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
app.config['STOP_THREADS'] = False
# Disables caching of the templates
app.config['TEMPLATES_AUTO_RELOAD'] = True
# We use the whole watch object from the store/JSON so we can see if there's some related status in terms of a thread
# running or something similar.
@app.template_filter('format_last_checked_time')
def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
# Worker thread tells us which UUID it is currently processing.
for t in running_update_threads:
if t.current_uuid == watch_obj['uuid']:
return "Checking now.."
if watch_obj['last_checked'] == 0:
return 'Not yet'
return timeago.format(int(watch_obj['last_checked']), time.time())
# @app.context_processor
# def timeago():
# def _timeago(lower_time, now):
# return timeago.format(lower_time, now)
# return dict(timeago=_timeago)
@app.template_filter('format_timestamp_timeago')
def _jinja2_filter_datetimestamp(timestamp, format="%Y-%m-%d %H:%M:%S"):
return timeago.format(timestamp, time.time())
# return timeago.format(timestamp, time.time())
# return datetime.datetime.utcfromtimestamp(timestamp).strftime(format)
def changedetection_app(config=None, datastore_o=None):
global datastore
datastore = datastore_o
# Hmm
app.config.update(dict(DEBUG=True))
app.config.update(config or {})
# Setup cors headers to allow all domains
# https://flask-cors.readthedocs.io/en/latest/
# CORS(app)
# https://github.com/pallets/flask/blob/93dd1709d05a1cf0e886df6223377bdab3b077fb/examples/tutorial/flaskr/__init__.py#L39
# You can divide up the stuff like this
@app.route("/", methods=['GET'])
def index():
global messages
limit_tag = request.args.get('tag')
# Sort by last_changed and add the uuid which is usually the key..
sorted_watches = []
for uuid, watch in datastore.data['watching'].items():
if limit_tag != None:
# Support for comma separated list of tags.
for tag_in_watch in watch['tag'].split(','):
tag_in_watch = tag_in_watch.strip()
if tag_in_watch == limit_tag:
watch['uuid'] = uuid
sorted_watches.append(watch)
else:
watch['uuid'] = uuid
sorted_watches.append(watch)
sorted_watches.sort(key=lambda x: x['last_changed'], reverse=True)
existing_tags = datastore.get_all_tags()
output = render_template("watch-overview.html",
watches=sorted_watches,
messages=messages,
tags=existing_tags,
active_tag=limit_tag)
# Show messages but once.
messages = []
return output
@app.route("/scrub", methods=['GET', 'POST'])
def scrub_page():
from pathlib import Path
global messages
if request.method == 'POST':
confirmtext = request.form.get('confirmtext')
if confirmtext == 'scrub':
for txt_file_path in Path(app.config['datastore_path']).rglob('*.txt'):
os.unlink(txt_file_path)
for uuid, watch in datastore.data['watching'].items():
watch['last_checked'] = 0
watch['last_changed'] = 0
watch['previous_md5'] = None
watch['history'] = {}
datastore.needs_write = True
messages.append({'class': 'ok', 'message': 'Cleaned all version history.'})
else:
messages.append({'class': 'error', 'message': 'Wrong confirm text.'})
return redirect(url_for('index'))
return render_template("scrub.html")
@app.route("/edit", methods=['GET', 'POST'])
def edit_page():
global messages
import validators
if request.method == 'POST':
uuid = request.args.get('uuid')
url = request.form.get('url').strip()
tag = request.form.get('tag').strip()
form_headers = request.form.get('headers').strip().split("\n")
extra_headers = {}
if form_headers:
for header in form_headers:
if len(header):
parts = header.split(':', 1)
extra_headers.update({parts[0].strip(): parts[1].strip()})
validators.url(url) # @todo switch to prop/attr/observer
datastore.data['watching'][uuid].update({'url': url,
'tag': tag,
'headers': extra_headers})
datastore.needs_write = True
messages.append({'class': 'ok', 'message': 'Updated watch.'})
return redirect(url_for('index'))
else:
uuid = request.args.get('uuid')
output = render_template("edit.html", uuid=uuid, watch=datastore.data['watching'][uuid], messages=messages)
return output
@app.route("/settings", methods=['GET', "POST"])
def settings_page():
global messages
if request.method == 'POST':
try:
minutes = int(request.values.get('minutes').strip())
except ValueError:
messages.append({'class': 'error', 'message': "Invalid value given, use an integer."})
else:
if minutes >= 5 and minutes <= 600:
datastore.data['settings']['requests']['minutes_between_check'] = minutes
datastore.needs_write = True
messages.append({'class': 'ok', 'message': "Updated"})
else:
messages.append(
{'class': 'error', 'message': "Must be equal to or greater than 5 and less than 600 minutes"})
output = render_template("settings.html", messages=messages,
minutes=datastore.data['settings']['requests']['minutes_between_check'])
messages = []
return output
@app.route("/import", methods=['GET', "POST"])
def import_page():
import validators
global messages
remaining_urls = []
good = 0
if request.method == 'POST':
urls = request.values.get('urls').split("\n")
for url in urls:
url = url.strip()
if len(url) and validators.url(url):
new_uuid = datastore.add_watch(url=url.strip(), tag="")
# Straight into the queue.
update_q.put(new_uuid)
good += 1
else:
if len(url):
remaining_urls.append(url)
messages.append({'class': 'ok', 'message': "{} Imported, {} Skipped.".format(good, len(remaining_urls))})
if len(remaining_urls) == 0:
return redirect(url_for('index'))
else:
output = render_template("import.html",
messages=messages,
remaining="\n".join(remaining_urls)
)
messages = []
return output
@app.route("/diff/<string:uuid>", methods=['GET'])
def diff_history_page(uuid):
global messages
extra_stylesheets = ['/static/css/diff.css']
watch = datastore.data['watching'][uuid]
dates = list(watch['history'].keys())
# Convert to int, sort and back to str again
dates = [int(i) for i in dates]
dates.sort(reverse=True)
dates = [str(i) for i in dates]
# Save the current newest history as the most recently viewed
datastore.set_last_viewed(uuid, dates[0])
newest_file = watch['history'][dates[0]]
with open(newest_file, 'r') as f:
newest_version_file_contents = f.read()
previous_version = request.args.get('previous_version')
try:
previous_file = watch['history'][previous_version]
except KeyError:
# Not present, use a default value, the second one in the sorted list.
previous_file = watch['history'][dates[1]]
with open(previous_file, 'r') as f:
previous_version_file_contents = f.read()
output = render_template("diff.html", watch_a=watch,
messages=messages,
newest=newest_version_file_contents,
previous=previous_version_file_contents,
extra_stylesheets=extra_stylesheets,
versions=dates[1:],
newest_version_timestamp=dates[0],
current_previous_version=str(previous_version),
current_diff_url=watch['url'])
return output
@app.route("/favicon.ico", methods=['GET'])
def favicon():
return send_from_directory("/app/static/images", filename="favicon.ico")
# We're good but backups are even better!
@app.route("/backup", methods=['GET'])
def get_backup():
import zipfile
from pathlib import Path
# create a ZipFile object
backupname = "changedetection-backup-{}.zip".format(int(time.time()))
# We only care about UUIDS from the current index file
uuids = list(datastore.data['watching'].keys())
with zipfile.ZipFile(os.path.join(app.config['datastore_path'], backupname), 'w',
compression=zipfile.ZIP_DEFLATED,
compresslevel=6) as zipObj:
# Be sure we're written fresh
datastore.sync_to_json()
# Add the index
zipObj.write(os.path.join(app.config['datastore_path'], "url-watches.json"))
# Add any snapshot data we find
for txt_file_path in Path(app.config['datastore_path']).rglob('*.txt'):
parent_p = txt_file_path.parent
if parent_p.name in uuids:
zipObj.write(txt_file_path)
return send_file(os.path.join(app.config['datastore_path'], backupname),
as_attachment=True,
mimetype="application/zip",
attachment_filename=backupname)
@app.route("/static/<string:group>/<string:filename>", methods=['GET'])
def static_content(group, filename):
# These files should be in our subdirectory
full_path = os.path.realpath(__file__)
p = os.path.dirname(full_path)
try:
return send_from_directory("{}/static/{}".format(p, group), filename=filename)
except FileNotFoundError:
abort(404)
@app.route("/api/add", methods=['POST'])
def api_watch_add():
global messages
# @todo add_watch should throw a custom Exception for validation etc
new_uuid = datastore.add_watch(url=request.form.get('url').strip(), tag=request.form.get('tag').strip())
# Straight into the queue.
update_q.put(new_uuid)
messages.append({'class': 'ok', 'message': 'Watch added.'})
return redirect(url_for('index'))
@app.route("/api/delete", methods=['GET'])
def api_delete():
global messages
uuid = request.args.get('uuid')
datastore.delete(uuid)
messages.append({'class': 'ok', 'message': 'Deleted.'})
return redirect(url_for('index'))
@app.route("/api/checknow", methods=['GET'])
def api_watch_checknow():
global messages
tag = request.args.get('tag')
uuid = request.args.get('uuid')
i = 0
running_uuids = []
for t in running_update_threads:
running_uuids.append(t.current_uuid)
# @todo check thread is running and skip
if uuid:
if uuid not in running_uuids:
update_q.put(uuid)
i = 1
elif tag != None:
# Items that have this current tag
for watch_uuid, watch in datastore.data['watching'].items():
if (tag != None and tag in watch['tag']):
i += 1
if watch_uuid not in running_uuids:
update_q.put(watch_uuid)
else:
# No tag, no uuid, add everything.
for watch_uuid, watch in datastore.data['watching'].items():
i += 1
if watch_uuid not in running_uuids:
update_q.put(watch_uuid)
messages.append({'class': 'ok', 'message': "{} watches are rechecking.".format(i)})
return redirect(url_for('index', tag=tag))
# @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
return app
# Requests for checking on the site use a pool of thread Workers managed by a Queue.
class Worker(threading.Thread):
current_uuid = None
def __init__(self, q, *args, **kwargs):
self.q = q
super().__init__(*args, **kwargs)
def run(self):
from backend import fetch_site_status
update_handler = fetch_site_status.perform_site_check(datastore=datastore)
while True:
try:
uuid = self.q.get(block=True, timeout=1)
except queue.Empty:
# We have a chance to kill this thread that needs to monitor for new jobs..
# Delays here would be caused by a current response object pending
# @todo switch to threaded response handler
if app.config['STOP_THREADS']:
return
else:
self.current_uuid = uuid
if uuid in list(datastore.data['watching'].keys()):
try:
result, contents = update_handler.run(uuid)
except PermissionError as s:
app.logger.error("File permission error updating", uuid, str(s))
else:
if result:
datastore.update_watch(uuid=uuid, update_obj=result)
if contents:
# A change was detected
datastore.save_history_text(uuid=uuid, contents=contents, result_obj=result)
self.current_uuid = None # Done
self.q.task_done()
# Thread runner to check every minute, look for new watches to feed into the Queue.
def ticker_thread_check_time_launch_checks():
# Spin up Workers.
for _ in range(datastore.data['settings']['requests']['workers']):
new_worker = Worker(update_q)
running_update_threads.append(new_worker)
new_worker.start()
# Every minute check for new UUIDs to follow up on
while True:
if app.config['STOP_THREADS']:
return
running_uuids = []
for t in running_update_threads:
running_uuids.append(t.current_uuid)
# Look at the dataset, find a stale watch to process
minutes = datastore.data['settings']['requests']['minutes_between_check']
for uuid, watch in datastore.data['watching'].items():
if watch['last_checked'] <= time.time() - (minutes * 60):
# @todo maybe update_q.queue is enough?
if not uuid in running_uuids and uuid not in update_q.queue:
update_q.put(uuid)
# Should be low so we can break this out in testing
time.sleep(1)

@ -1,501 +0,0 @@
#!/usr/bin/python3
# @todo logging
# @todo sort by last_changed
# @todo extra options for url like , verify=False etc.
# @todo enable https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl as option?
# @todo maybe a button to reset all 'last-changed'.. so you can see it clearly when something happens since your last visit
# @todo option for interval day/6 hour/etc
# @todo on change detected, config for calling some API
# @todo make tables responsive!
# @todo fetch title into json
# https://distill.io/features
# proxy per check
#i
import json
import eventlet
import eventlet.wsgi
import time
import os
import getopt
import sys
import datetime
import timeago
import threading
import queue
from flask import Flask, render_template, request, send_file, send_from_directory, safe_join, abort, redirect, url_for
# Local
import store
running_update_threads = []
ticker_thread = None
datastore = store.ChangeDetectionStore()
messages = []
extra_stylesheets = []
update_q = queue.Queue()
app = Flask(__name__, static_url_path='/static')
app.config['STATIC_RESOURCES'] = "/app/static"
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
# app.config['SECRET_KEY'] = 'secret!'
# Disables caching of the templates
app.config['TEMPLATES_AUTO_RELOAD'] = True
# We use the whole watch object from the store/JSON so we can see if there's some related status in terms of a thread
# running or something similar.
@app.template_filter('format_last_checked_time')
def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"):
# Worker thread tells us which UUID it is currently processing.
for t in running_update_threads:
if t.current_uuid == watch_obj['uuid']:
return "Checking now.."
if watch_obj['last_checked'] == 0:
return 'Not yet'
return timeago.format(int(watch_obj['last_checked']), time.time())
# @app.context_processor
# def timeago():
# def _timeago(lower_time, now):
# return timeago.format(lower_time, now)
# return dict(timeago=_timeago)
@app.template_filter('format_timestamp_timeago')
def _jinja2_filter_datetimestamp(timestamp, format="%Y-%m-%d %H:%M:%S"):
return timeago.format(timestamp, time.time())
# return timeago.format(timestamp, time.time())
# return datetime.datetime.utcfromtimestamp(timestamp).strftime(format)
@app.route("/", methods=['GET'])
def main_page():
global messages
limit_tag = request.args.get('tag')
# Sort by last_changed and add the uuid which is usually the key..
sorted_watches = []
for uuid, watch in datastore.data['watching'].items():
if limit_tag != None:
# Support for comma separated list of tags.
for tag_in_watch in watch['tag'].split(','):
tag_in_watch = tag_in_watch.strip()
if tag_in_watch == limit_tag:
watch['uuid'] = uuid
sorted_watches.append(watch)
else:
watch['uuid'] = uuid
sorted_watches.append(watch)
sorted_watches.sort(key=lambda x: x['last_changed'], reverse=True)
existing_tags = datastore.get_all_tags()
output = render_template("watch-overview.html",
watches=sorted_watches,
messages=messages,
tags=existing_tags,
active_tag=limit_tag)
# Show messages but once.
messages = []
return output
@app.route("/scrub", methods=['GET', 'POST'])
def scrub_page():
from pathlib import Path
global messages
if request.method == 'POST':
confirmtext = request.form.get('confirmtext')
if confirmtext == 'scrub':
for txt_file_path in Path('/datastore').rglob('*.txt'):
os.unlink(txt_file_path)
for uuid, watch in datastore.data['watching'].items():
watch['last_checked'] = 0
watch['last_changed'] = 0
watch['previous_md5'] = None
watch['history'] = {}
datastore.needs_write = True
messages.append({'class': 'ok', 'message': 'Cleaned all version history.'})
else:
messages.append({'class': 'error', 'message': 'Wrong confirm text.'})
return redirect(url_for('main_page'))
return render_template("scrub.html")
@app.route("/edit", methods=['GET', 'POST'])
def edit_page():
global messages
import validators
if request.method == 'POST':
uuid = request.args.get('uuid')
url = request.form.get('url').strip()
tag = request.form.get('tag').strip()
form_headers = request.form.get('headers').strip().split("\n")
extra_headers = {}
if form_headers:
for header in form_headers:
if len(header):
parts = header.split(':', 1)
extra_headers.update({parts[0].strip(): parts[1].strip()})
validators.url(url) # @todo switch to prop/attr/observer
datastore.data['watching'][uuid].update({'url': url,
'tag': tag,
'headers': extra_headers})
datastore.needs_write = True
messages.append({'class': 'ok', 'message': 'Updated watch.'})
return redirect(url_for('main_page'))
else:
uuid = request.args.get('uuid')
output = render_template("edit.html", uuid=uuid, watch=datastore.data['watching'][uuid], messages=messages)
return output
@app.route("/settings", methods=['GET', "POST"])
def settings_page():
global messages
if request.method == 'POST':
try:
minutes = int(request.values.get('minutes').strip())
except ValueError:
messages.append({'class': 'error', 'message': "Invalid value given, use an integer."})
else:
if minutes >= 5 and minutes <= 600:
datastore.data['settings']['requests']['minutes_between_check'] = minutes
datastore.needs_write = True
messages.append({'class': 'ok', 'message': "Updated"})
else:
messages.append({'class': 'error', 'message': "Must be equal to or greater than 5 and less than 600 minutes"})
output = render_template("settings.html", messages=messages, minutes=datastore.data['settings']['requests']['minutes_between_check'])
messages =[]
return output
@app.route("/import", methods=['GET', "POST"])
def import_page():
import validators
global messages
remaining_urls=[]
good = 0
if request.method == 'POST':
urls = request.values.get('urls').split("\n")
for url in urls:
url = url.strip()
if len(url) and validators.url(url):
datastore.add_watch(url=url.strip(), tag="")
good += 1
else:
if len(url):
remaining_urls.append(url)
messages.append({'class': 'ok', 'message': "{} Imported, {} Skipped.".format(good, len(remaining_urls))})
output = render_template("import.html",
messages=messages,
remaining="\n".join(remaining_urls)
)
messages = []
return output
@app.route("/diff/<string:uuid>", methods=['GET'])
def diff_history_page(uuid):
global messages
extra_stylesheets=['/static/css/diff.css']
watch = datastore.data['watching'][uuid]
dates = list(watch['history'].keys())
# Convert to int, sort and back to str again
dates = [int(i) for i in dates]
dates.sort(reverse=True)
dates = [str(i) for i in dates]
# Save the current newest history as the most recently viewed
datastore.set_last_viewed(uuid, dates[0])
newest_file = watch['history'][dates[0]]
with open(newest_file, 'r') as f:
newest_version_file_contents = f.read()
previous_version = request.args.get('previous_version')
try:
previous_file = watch['history'][previous_version]
except KeyError:
# Not present, use a default value, the second one in the sorted list.
previous_file = watch['history'][dates[1]]
with open(previous_file, 'r') as f:
previous_version_file_contents = f.read()
output = render_template("diff.html", watch_a=watch,
messages=messages,
newest=newest_version_file_contents,
previous=previous_version_file_contents,
extra_stylesheets=extra_stylesheets,
versions=dates[1:],
newest_version_timestamp=dates[0],
current_previous_version=str(previous_version),
current_diff_url=watch['url'])
return output
@app.route("/favicon.ico", methods=['GET'])
def favicon():
return send_from_directory("/app/static/images", filename="favicon.ico")
# We're good but backups are even better!
@app.route("/backup", methods=['GET'])
def get_backup():
import zipfile
from pathlib import Path
import zlib
# create a ZipFile object
backupname = "changedetection-backup-{}.zip".format(int(time.time()))
# We only care about UUIDS from the current index file
uuids = list(datastore.data['watching'].keys())
with zipfile.ZipFile(os.path.join("/datastore", backupname), 'w', compression=zipfile.ZIP_DEFLATED,
compresslevel=6) as zipObj:
# Be sure we're written fresh
datastore.sync_to_json()
# Add the index
zipObj.write(os.path.join("/datastore", "url-watches.json"))
# Add any snapshot data we find
for txt_file_path in Path('/datastore').rglob('*.txt'):
parent_p = txt_file_path.parent
if parent_p.name in uuids:
zipObj.write(txt_file_path)
return send_file(os.path.join("/datastore", backupname),
as_attachment=True,
mimetype="application/zip",
attachment_filename=backupname)
# A few self sanity checks, mostly for developer/bug check
@app.route("/self-check", methods=['GET'])
def selfcheck():
output = "All fine"
# In earlier versions before a single threaded write of the JSON store, sometimes histories could get mixed.
# Could also maybe affect people who manually fiddle with their JSON store?
for uuid, watch in datastore.data['watching'].items():
for timestamp, path in watch['history'].items():
# Each history snapshot should include a full path, which contains the {uuid}
if not uuid in path:
output = "Something weird in {}, suspected incorrect snapshot path.".format(uuid)
return output
@app.route("/static/<string:group>/<string:filename>", methods=['GET'])
def static_content(group, filename):
try:
return send_from_directory("/app/static/{}".format(group), filename=filename)
except FileNotFoundError:
abort(404)
@app.route("/api/add", methods=['POST'])
def api_watch_add():
global messages
# @todo add_watch should throw a custom Exception for validation etc
new_uuid = datastore.add_watch(url=request.form.get('url').strip(), tag=request.form.get('tag').strip())
# Straight into the queue.
update_q.put(new_uuid)
messages.append({'class': 'ok', 'message': 'Watch added.'})
return redirect(url_for('main_page'))
@app.route("/api/delete", methods=['GET'])
def api_delete():
global messages
uuid = request.args.get('uuid')
datastore.delete(uuid)
messages.append({'class': 'ok', 'message': 'Deleted.'})
return redirect(url_for('main_page'))
@app.route("/api/checknow", methods=['GET'])
def api_watch_checknow():
global messages
tag = request.args.get('tag')
uuid = request.args.get('uuid')
i=0
if uuid:
update_q.put(uuid)
i = 1
elif tag != None:
for watch_uuid, watch in datastore.data['watching'].items():
if (tag != None and tag in watch['tag']):
i += 1
update_q.put(watch_uuid)
else:
# No tag, no uuid, add everything.
for watch_uuid, watch in datastore.data['watching'].items():
i += 1
update_q.put(watch_uuid)
messages.append({'class': 'ok', 'message': "{} watches are rechecking.".format(i)})
return redirect(url_for('main_page', tag=tag))
# Requests for checking on the site use a pool of thread Workers managed by a Queue.
class Worker(threading.Thread):
current_uuid = None
def __init__(self, q, *args, **kwargs):
self.q = q
super().__init__(*args, **kwargs)
def run(self):
import fetch_site_status
from copy import deepcopy
update_handler = fetch_site_status.perform_site_check(datastore=datastore)
try:
while True:
uuid = self.q.get() # Blocking
self.current_uuid = uuid
if uuid in list(datastore.data['watching'].keys()):
result = update_handler.run(uuid)
datastore.update_watch(uuid=uuid, update_obj=result)
self.current_uuid = None # Done
self.q.task_done()
except KeyboardInterrupt:
return
# Thread runner to check every minute, look for new watches to feed into the Queue.
def ticker_thread_check_time_launch_checks():
# Spin up Workers.
for _ in range(datastore.data['settings']['requests']['workers']):
new_worker = Worker(update_q)
running_update_threads.append(new_worker)
new_worker.start()
# Every minute check for new UUIDs to follow up on
while True:
minutes = datastore.data['settings']['requests']['minutes_between_check']
for uuid, watch in datastore.data['watching'].items():
if watch['last_checked'] <= time.time() - (minutes * 60):
update_q.put(uuid)
time.sleep(60)
# Thread runner, this helps with thread/write issues when there are many operations that want to update the JSON
# by just running periodically in one thread, according to python, dict updates are threadsafe.
def save_datastore():
try:
while True:
if datastore.needs_write:
datastore.sync_to_json()
time.sleep(1)
except KeyboardInterrupt:
return
def main(argv):
ssl_mode = False
port = 5000
try:
opts, args = getopt.getopt(argv, "sp:", "purge")
except getopt.GetoptError:
print('backend.py -s SSL enable -p [port]')
sys.exit(2)
for opt, arg in opts:
if opt == '--purge':
# Remove history, the actual files you need to delete manually.
for uuid, watch in datastore.data['watching'].items():
watch.update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'previous_md5': None})
if opt == '-s':
ssl_mode = True
if opt == '-p':
port = arg
# @todo handle ctrl break
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start()
save_data_thread = threading.Thread(target=save_datastore).start()
# @todo finalise SSL config, but this should get you in the right direction if you need it.
if ssl_mode:
eventlet.wsgi.server(eventlet.wrap_ssl(eventlet.listen(('', port)),
certfile='cert.pem',
keyfile='privkey.pem',
server_side=True), app)
else:
eventlet.wsgi.server(eventlet.listen(('', port)), app)
if __name__ == '__main__':
main(sys.argv[1:])

@ -1,9 +1,7 @@
import time import time
import sys
print ("Sleep loop, you should run your script from the console") print ("Sleep loop, you should run your script from the console")
while True: while True:
# Wait for 5 seconds # Wait for 5 seconds
time.sleep(2) time.sleep(2)

@ -1,12 +1,8 @@
import time import time
import requests import requests
import hashlib import hashlib
import os
import re
from inscriptis import get_text from inscriptis import get_text
from copy import deepcopy
# Some common stuff here that can be moved to a base class # Some common stuff here that can be moved to a base class
class perform_site_check(): class perform_site_check():
@ -15,37 +11,15 @@ class perform_site_check():
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.datastore = datastore self.datastore = datastore
def save_firefox_screenshot(self, uuid, output):
# @todo call selenium or whatever
return
def ensure_output_path(self):
try:
os.stat(self.output_path)
except:
os.mkdir(self.output_path)
def save_response_stripped_output(self, output, fname):
with open(fname, 'w') as f:
f.write(output)
f.close()
return fname
def run(self, uuid): def run(self, uuid):
timestamp = int(time.time()) # used for storage etc too timestamp = int(time.time()) # used for storage etc too
stripped_text_from_html = False
update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'], update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'],
'history': {}, 'history': {},
"last_checked": timestamp "last_checked": timestamp
} }
self.output_path = "/datastore/{}".format(uuid)
self.ensure_output_path()
extra_headers = self.datastore.get_val(uuid, 'headers') extra_headers = self.datastore.get_val(uuid, 'headers')
# Tweak the base config with the per-watch ones # Tweak the base config with the per-watch ones
@ -65,15 +39,15 @@ class perform_site_check():
timeout = 15 timeout = 15
try: try:
r = requests.get(self.datastore.get_val(uuid, 'url'), url = self.datastore.get_val(uuid, 'url')
r = requests.get(url,
headers=request_headers, headers=request_headers,
timeout=timeout, timeout=timeout,
verify=False) verify=False)
stripped_text_from_html = get_text(r.text) stripped_text_from_html = get_text(r.text)
# Usually from networkIO/requests level # Usually from networkIO/requests level
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e: except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
update_obj["last_error"] = str(e) update_obj["last_error"] = str(e)
@ -111,13 +85,5 @@ class perform_site_check():
update_obj["last_changed"] = timestamp update_obj["last_changed"] = timestamp
update_obj["previous_md5"] = fetched_md5 update_obj["previous_md5"] = fetched_md5
fname = "{}/{}.stripped.txt".format(self.output_path, fetched_md5)
with open(fname, 'w') as f:
f.write(stripped_text_from_html)
f.close()
# Update history with the stripped text for future reference, this will also mean we save the first
# Should always be keyed by string(timestamp)
update_obj.update({"history": {str(timestamp): fname}})
return update_obj return update_obj, stripped_text_from_html

@ -1,14 +0,0 @@
from flask import make_response
from functools import wraps, update_wrapper
from datetime import datetime
def nocache(view):
@wraps(view)
def no_cache(*args, **kwargs):
response = make_response(view(*args, **kwargs))
response.headers['hmm'] = datetime.now()
return response
return update_wrapper(no_cache, view)

@ -0,0 +1,2 @@
[pytest]
addopts = --no-start-live-server --live-server-port=5005

@ -1,21 +1,27 @@
import json import json
import uuid as uuid_builder import uuid as uuid_builder
import validators
import os.path import os.path
from os import path from os import path
from threading import Lock, Thread from threading import Lock
from copy import deepcopy from copy import deepcopy
import logging
import time
import threading
# Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods? # Is there an existing library to ensure some data store (JSON etc) is in sync with CRUD methods?
# Open a github issue if you know something :) # Open a github issue if you know something :)
# https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change # https://stackoverflow.com/questions/6190468/how-to-trigger-function-on-value-change
class ChangeDetectionStore: class ChangeDetectionStore:
lock = Lock() lock = Lock()
def __init__(self): def __init__(self, datastore_path="/datastore", include_default_watches=True):
self.needs_write = False self.needs_write = False
self.datastore_path = datastore_path
self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
self.stop_thread = False
self.__data = { self.__data = {
'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!", 'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!",
'watching': {}, 'watching': {},
@ -41,7 +47,7 @@ class ChangeDetectionStore:
'tag': None, 'tag': None,
'last_checked': 0, 'last_checked': 0,
'last_changed': 0, 'last_changed': 0,
'last_viewed': 0, # history key value of the last viewed via the [diff] link 'last_viewed': 0, # history key value of the last viewed via the [diff] link
'newest_history_key': "", 'newest_history_key': "",
'title': None, 'title': None,
'previous_md5': "", 'previous_md5': "",
@ -57,7 +63,7 @@ class ChangeDetectionStore:
self.__data['build_sha'] = f.read() self.__data['build_sha'] = f.read()
try: try:
with open('/datastore/url-watches.json') as json_file: with open(self.json_store_path) as json_file:
from_disk = json.load(json_file) from_disk = json.load(json_file)
# @todo isnt there a way todo this dict.update recursively? # @todo isnt there a way todo this dict.update recursively?
@ -84,11 +90,16 @@ class ChangeDetectionStore:
# First time ran, doesnt exist. # First time ran, doesnt exist.
except (FileNotFoundError, json.decoder.JSONDecodeError): except (FileNotFoundError, json.decoder.JSONDecodeError):
print("Creating JSON store") if include_default_watches:
self.add_watch(url='http://www.quotationspage.com/random.php', tag='test') print("Creating JSON store at", self.datastore_path)
self.add_watch(url='https://news.ycombinator.com/', tag='Tech news')
self.add_watch(url='https://www.gov.uk/coronavirus', tag='Covid') self.add_watch(url='http://www.quotationspage.com/random.php', tag='test')
self.add_watch(url='https://changedetection.io', tag='Tech news') self.add_watch(url='https://news.ycombinator.com/', tag='Tech news')
self.add_watch(url='https://www.gov.uk/coronavirus', tag='Covid')
self.add_watch(url='https://changedetection.io', tag='Tech news')
# Finally start the thread that will manage periodic data saves to JSON
save_data_thread = threading.Thread(target=self.save_datastore).start()
# Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0.
def get_newest_history_key(self, uuid): def get_newest_history_key(self, uuid):
@ -105,9 +116,6 @@ class ChangeDetectionStore:
return 0 return 0
def set_last_viewed(self, uuid, timestamp): def set_last_viewed(self, uuid, timestamp):
self.data['watching'][uuid].update({'last_viewed': str(timestamp)}) self.data['watching'][uuid].update({'last_viewed': str(timestamp)})
self.needs_write = True self.needs_write = True
@ -121,7 +129,7 @@ class ChangeDetectionStore:
if isinstance(d, dict): if isinstance(d, dict):
if update_obj is not None and dict_key in update_obj: if update_obj is not None and dict_key in update_obj:
self.__data['watching'][uuid][dict_key].update(update_obj[dict_key]) self.__data['watching'][uuid][dict_key].update(update_obj[dict_key])
del(update_obj[dict_key]) del (update_obj[dict_key])
self.__data['watching'][uuid].update(update_obj) self.__data['watching'][uuid].update(update_obj)
self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid) self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid)
@ -140,7 +148,7 @@ class ChangeDetectionStore:
# Support for comma separated list of tags. # Support for comma separated list of tags.
for tag in watch['tag'].split(','): for tag in watch['tag'].split(','):
tag = tag.strip() tag = tag.strip()
if not tag in tags: if tag not in tags:
tags.append(tag) tags.append(tag)
tags.sort() tags.sort()
@ -166,7 +174,6 @@ class ChangeDetectionStore:
def add_watch(self, url, tag): def add_watch(self, url, tag):
with self.lock: with self.lock:
# @todo use a common generic version of this # @todo use a common generic version of this
new_uuid = str(uuid_builder.uuid4()) new_uuid = str(uuid_builder.uuid4())
_blank = deepcopy(self.generic_definition) _blank = deepcopy(self.generic_definition)
@ -178,17 +185,50 @@ class ChangeDetectionStore:
self.data['watching'][new_uuid] = _blank self.data['watching'][new_uuid] = _blank
self.needs_write = True # Get the directory ready
output_path = "{}/{}".format(self.datastore_path, new_uuid)
try:
os.mkdir(output_path)
except FileExistsError:
print(output_path, "already exists.")
self.sync_to_json()
return new_uuid return new_uuid
def sync_to_json(self): # Save some text file to the appropriate path and bump the history
# result_obj from fetch_site_status.run()
def save_history_text(self, uuid, result_obj, contents):
output_path = "{}/{}".format(self.datastore_path, uuid)
fname = "{}/{}-{}.stripped.txt".format(output_path, result_obj['previous_md5'], str(time.time()))
with open(fname, 'w') as f:
f.write(contents)
f.close()
with open('/datastore/url-watches.json', 'w') as json_file: # Update history with the stripped text for future reference, this will also mean we save the first
# Should always be keyed by string(timestamp)
self.update_watch(uuid, {"history": {str(result_obj["last_checked"]): fname}})
return fname
def sync_to_json(self):
print("Saving..")
with open(self.json_store_path, 'w') as json_file:
json.dump(self.__data, json_file, indent=4) json.dump(self.__data, json_file, indent=4)
print("Re-saved index") logging.info("Re-saved index")
self.needs_write = False self.needs_write = False
# Thread runner, this helps with thread/write issues when there are many operations that want to update the JSON
# by just running periodically in one thread, according to python, dict updates are threadsafe.
def save_datastore(self):
while True:
if self.stop_thread:
print("Shutting down datastore thread")
return
if self.needs_write:
self.sync_to_json()
time.sleep(1)
# body of the constructor # body of the constructor

@ -0,0 +1,2 @@
"""Tests for the app."""

@ -0,0 +1,43 @@
#!/usr/bin/python3
import pytest
from backend import changedetection_app
from backend import store
import os
# https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py
# Much better boilerplate than the docs
# https://www.python-boilerplate.com/py3+flask+pytest/
global app
@pytest.fixture(scope='session')
def app(request):
"""Create application for the tests."""
datastore_path = "./test-datastore"
try:
os.mkdir(datastore_path)
except FileExistsError:
pass
try:
os.unlink("{}/url-watches.json".format(datastore_path))
except FileNotFoundError:
pass
app_config = {'datastore_path': datastore_path}
datastore = store.ChangeDetectionStore(datastore_path=app_config['datastore_path'], include_default_watches=False)
app = changedetection_app(app_config, datastore)
def teardown():
datastore.stop_thread = True
app.config['STOP_THREADS'] = True
request.addfinalizer(teardown)
return app

@ -0,0 +1,93 @@
#!/usr/bin/python3
import time
from flask import url_for
from urllib.request import urlopen
def set_original_response():
test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
with open("test-datastore/output.txt", "w") as f:
f.write(test_return_data)
def set_modified_response():
test_return_data = """<html>
<body>
Some initial text</br>
<p>which has this one new line</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
with open("test-datastore/output.txt", "w") as f:
f.write(test_return_data)
def test_check_basic_change_detection_functionality(client, live_server):
sleep_time_for_fetch_thread = 3
@live_server.app.route('/test-endpoint')
def test_endpoint():
# Tried using a global var here but didn't seem to work, so reading from a file instead.
with open("test-datastore/output.txt", "r") as f:
return f.read()
set_original_response()
live_server.start()
# Add our URL to the import page
res = client.post(
url_for("import_page"),
data={"urls": url_for('test_endpoint', _external=True)},
follow_redirects=True
)
assert b"1 Imported" in res.data
client.get(url_for("api_watch_checknow"), follow_redirects=True)
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
# It should report nothing found (no new 'unviewed' class)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
assert b'test-endpoint' in res.data
# Give the thread time to pick it up
time.sleep(sleep_time_for_fetch_thread)
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
#####################
# Make a change
set_modified_response()
res = urlopen(url_for('test_endpoint', _external=True))
assert b'which has this one new line' in res.read()
# Force recheck
res = client.get(url_for("api_watch_checknow"), follow_redirects=True)
assert b'1 watches are rechecking.' in res.data
time.sleep(sleep_time_for_fetch_thread)
# Now something should be ready, indicated by having a 'unviewed' class
res = client.get(url_for("index"))
assert b'unviewed' in res.data

@ -7,6 +7,9 @@ six==1.10.0
yarl yarl
flask flask
pytest
pytest-flask # for live_server
eventlet eventlet
requests requests
validators validators

Loading…
Cancel
Save