diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index ae14bba8..ee021775 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -4,11 +4,7 @@ name: changedetection.io -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] +on: [push, pull_request] jobs: build: diff --git a/README.md b/README.md index 3dc4736d..c76996d9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ - + ## Self-hosted change monitoring of web pages. diff --git a/backend/__init__.py b/backend/__init__.py index 101e0094..00b0ba81 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -2,10 +2,8 @@ # @todo logging -# @todo sort by last_changed # @todo extra options for url like , verify=False etc. # @todo enable https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl as option? -# @todo maybe a button to reset all 'last-changed'.. so you can see it clearly when something happens since your last visit # @todo option for interval day/6 hour/etc # @todo on change detected, config for calling some API # @todo make tables responsive! @@ -19,9 +17,16 @@ import os import timeago import threading +from threading import Event + import queue -from flask import Flask, render_template, request, send_file, send_from_directory, abort, redirect, url_for +from flask import Flask, render_template, request, send_file, send_from_directory, abort, redirect, url_for + +from feedgen.feed import FeedGenerator +from flask import make_response +import datetime +import pytz datastore = None @@ -39,7 +44,9 @@ app = Flask(__name__, static_url_path="/var/www/change-detection/backen/static") # Stop browser caching of assets app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0 -app.config['STOP_THREADS'] = False +app.config.exit = Event() + +app.config['NEW_VERSION_AVAILABLE'] = False # Disables caching of the templates app.config['TEMPLATES_AUTO_RELOAD'] = True @@ -76,7 +83,7 @@ def _jinja2_filter_datetimestamp(timestamp, format="%Y-%m-%d %H:%M:%S"): def changedetection_app(config=None, datastore_o=None): global datastore datastore = datastore_o - # Hmm + app.config.update(dict(DEBUG=True)) app.config.update(config or {}) @@ -112,14 +119,40 @@ def changedetection_app(config=None, datastore_o=None): sorted_watches.sort(key=lambda x: x['last_changed'], reverse=True) existing_tags = datastore.get_all_tags() - output = render_template("watch-overview.html", - watches=sorted_watches, - messages=messages, - tags=existing_tags, - active_tag=limit_tag) + rss = request.args.get('rss') + + if rss: + fg = FeedGenerator() + fg.title('changedetection.io') + fg.description('Feed description') + fg.link(href='https://changedetection.io') + + for watch in sorted_watches: + if not watch['viewed']: + fe = fg.add_entry() + fe.title(watch['url']) + fe.link(href=watch['url']) + fe.description(watch['url']) + fe.guid(watch['uuid'], permalink=False) + dt = datetime.datetime.fromtimestamp(int(watch['newest_history_key'])) + dt = dt.replace(tzinfo=pytz.UTC) + fe.pubDate(dt) + + response = make_response(fg.rss_str()) + response.headers.set('Content-Type', 'application/rss+xml') + return response + + else: + output = render_template("watch-overview.html", + watches=sorted_watches, + messages=messages, + tags=existing_tags, + active_tag=limit_tag, + has_unviewed=datastore.data['has_unviewed']) + + # Show messages but once. + messages = [] - # Show messages but once. - messages = [] return output @app.route("/scrub", methods=['GET', 'POST']) @@ -151,29 +184,80 @@ def changedetection_app(config=None, datastore_o=None): return render_template("scrub.html") - @app.route("/edit", methods=['GET', 'POST']) - def edit_page(): + # If they edited an existing watch, we need to know to reset the current/previous md5 to include + # the excluded text. + def get_current_checksum_include_ignore_text(uuid): + + import hashlib + from backend import fetch_site_status + + # Get the most recent one + newest_history_key = datastore.get_val(uuid, 'newest_history_key') + + # 0 means that theres only one, so that there should be no 'unviewed' history availabe + if newest_history_key == 0: + newest_history_key = list(datastore.data['watching'][uuid]['history'].keys())[0] + + if newest_history_key: + with open(datastore.data['watching'][uuid]['history'][newest_history_key], + encoding='utf-8') as file: + raw_content = file.read() + + handler = fetch_site_status.perform_site_check(datastore=datastore) + stripped_content = handler.strip_ignore_text(raw_content, + datastore.data['watching'][uuid]['ignore_text']) + + checksum = hashlib.md5(stripped_content).hexdigest() + return checksum + + return datastore.data['watching'][uuid]['previous_md5'] + + @app.route("/edit/", methods=['GET', 'POST']) + def edit_page(uuid): global messages import validators + # More for testing, possible to return the first/only + if uuid == 'first': + uuid = list(datastore.data['watching'].keys()).pop() + if request.method == 'POST': - uuid = request.args.get('uuid') url = request.form.get('url').strip() tag = request.form.get('tag').strip() + # Extra headers form_headers = request.form.get('headers').strip().split("\n") extra_headers = {} if form_headers: for header in form_headers: if len(header): parts = header.split(':', 1) - extra_headers.update({parts[0].strip(): parts[1].strip()}) + if len(parts) == 2: + extra_headers.update({parts[0].strip(): parts[1].strip()}) + + update_obj = {'url': url, + 'tag': tag, + 'headers': extra_headers + } + + # Ignore text + form_ignore_text = request.form.get('ignore-text').strip() + ignore_text = [] + if len(form_ignore_text): + for text in form_ignore_text.split("\n"): + text = text.strip() + if len(text): + ignore_text.append(text) + + datastore.data['watching'][uuid]['ignore_text'] = ignore_text + + # Reset the previous_md5 so we process a new snapshot including stripping ignore text. + if len(datastore.data['watching'][uuid]['history']): + update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid) validators.url(url) # @todo switch to prop/attr/observer - datastore.data['watching'][uuid].update({'url': url, - 'tag': tag, - 'headers': extra_headers}) + datastore.data['watching'][uuid].update(update_obj) datastore.needs_write = True messages.append({'class': 'ok', 'message': 'Updated watch.'}) @@ -181,8 +265,6 @@ def changedetection_app(config=None, datastore_o=None): return redirect(url_for('index')) else: - - uuid = request.args.get('uuid') output = render_template("edit.html", uuid=uuid, watch=datastore.data['watching'][uuid], messages=messages) return output @@ -235,23 +317,37 @@ def changedetection_app(config=None, datastore_o=None): messages.append({'class': 'ok', 'message': "{} Imported, {} Skipped.".format(good, len(remaining_urls))}) - if len(remaining_urls) == 0: - return redirect(url_for('index')) - else: - output = render_template("import.html", - messages=messages, - remaining="\n".join(remaining_urls) - ) - messages = [] + if len(remaining_urls) == 0: + # Looking good, redirect to index. + return redirect(url_for('index')) + + # Could be some remaining, or we could be on GET + output = render_template("import.html", + messages=messages, + remaining="\n".join(remaining_urls) + ) + messages = [] + return output + # Clear all statuses, so we do not see the 'unviewed' class + @app.route("/api/mark-all-viewed", methods=['GET']) + def mark_all_viewed(): + + # Save the current newest history as the most recently viewed + for watch_uuid, watch in datastore.data['watching'].items(): + datastore.set_last_viewed(watch_uuid, watch['newest_history_key']) + + messages.append({'class': 'ok', 'message': "Cleared all statuses."}) + return redirect(url_for('index')) + @app.route("/diff/", methods=['GET']) def diff_history_page(uuid): global messages # More for testing, possible to return the first/only if uuid == 'first': - uuid= list(datastore.data['watching'].keys()).pop() + uuid = list(datastore.data['watching'].keys()).pop() extra_stylesheets = ['/static/css/diff.css'] try: @@ -266,9 +362,9 @@ def changedetection_app(config=None, datastore_o=None): dates.sort(reverse=True) dates = [str(i) for i in dates] - if len(dates) < 2: - messages.append({'class': 'error', 'message': "Not enough saved change detection snapshots to produce a report."}) + messages.append( + {'class': 'error', 'message': "Not enough saved change detection snapshots to produce a report."}) return redirect(url_for('index')) # Save the current newest history as the most recently viewed @@ -409,9 +505,38 @@ def changedetection_app(config=None, datastore_o=None): # @todo handle ctrl break ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() + # Check for new release version + threading.Thread(target=check_for_new_version).start() return app +# Check for new version and anonymous stats +def check_for_new_version(): + import requests + + import urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + while not app.config.exit.is_set(): + try: + r = requests.post("https://changedetection.io/check-ver.php", + data={'version': datastore.data['version_tag'], + 'app_guid': datastore.data['app_guid']}, + + verify=False) + except: + pass + + try: + if "new_version" in r.text: + app.config['NEW_VERSION_AVAILABLE'] = True + except: + pass + + # Check daily + app.config.exit.wait(86400) + + # Requests for checking on the site use a pool of thread Workers managed by a Queue. class Worker(threading.Thread): current_uuid = None @@ -425,16 +550,13 @@ class Worker(threading.Thread): update_handler = fetch_site_status.perform_site_check(datastore=datastore) - while True: + while not app.config.exit.is_set(): try: - uuid = self.q.get(block=True, timeout=1) + uuid = self.q.get(block=False) except queue.Empty: - # We have a chance to kill this thread that needs to monitor for new jobs.. - # Delays here would be caused by a current response object pending - # @todo switch to threaded response handler - if app.config['STOP_THREADS']: - return + pass + else: self.current_uuid = uuid @@ -453,10 +575,11 @@ class Worker(threading.Thread): # A change was detected datastore.save_history_text(uuid=uuid, contents=contents, result_obj=result) - self.current_uuid = None # Done self.q.task_done() + app.config.exit.wait(1) + # Thread runner to check every minute, look for new watches to feed into the Queue. def ticker_thread_check_time_launch_checks(): @@ -467,23 +590,19 @@ def ticker_thread_check_time_launch_checks(): new_worker.start() # Every minute check for new UUIDs to follow up on - while True: - - if app.config['STOP_THREADS']: - return + minutes = datastore.data['settings']['requests']['minutes_between_check'] + while not app.config.exit.is_set(): running_uuids = [] for t in running_update_threads: running_uuids.append(t.current_uuid) # Look at the dataset, find a stale watch to process - minutes = datastore.data['settings']['requests']['minutes_between_check'] + threshold = time.time() - (minutes * 60) for uuid, watch in datastore.data['watching'].items(): - if watch['last_checked'] <= time.time() - (minutes * 60): - - # @todo maybe update_q.queue is enough? + if watch['last_checked'] <= threshold: if not uuid in running_uuids and uuid not in update_q.queue: update_q.put(uuid) # Should be low so we can break this out in testing - time.sleep(1) + app.config.exit.wait(1) diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py index c3e557e0..9fb52c75 100644 --- a/backend/fetch_site_status.py +++ b/backend/fetch_site_status.py @@ -2,7 +2,8 @@ import time import requests import hashlib from inscriptis import get_text - +import urllib3 +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Some common stuff here that can be moved to a base class class perform_site_check(): @@ -11,6 +12,24 @@ class perform_site_check(): super().__init__(*args, **kwargs) self.datastore = datastore + def strip_ignore_text(self, content, list_ignore_text): + ignore = [] + for k in list_ignore_text: + ignore.append(k.encode('utf8')) + + output = [] + for line in content.splitlines(): + line = line.encode('utf8') + + # Always ignore blank lines in this mode. (when this function gets called) + if len(line.strip()): + if not any(skip_text in line for skip_text in ignore): + output.append(line) + + return "\n".encode('utf8').join(output) + + + def run(self, uuid): timestamp = int(time.time()) # used for storage etc too stripped_text_from_html = False @@ -76,7 +95,15 @@ class perform_site_check(): if not len(r.text): update_obj["last_error"] = "Empty reply" - fetched_md5 = hashlib.md5(stripped_text_from_html.encode('utf-8')).hexdigest() + # If there's text to skip + # @todo we could abstract out the get_text() to handle this cleaner + if len(self.datastore.data['watching'][uuid]['ignore_text']): + content = self.strip_ignore_text(stripped_text_from_html, + self.datastore.data['watching'][uuid]['ignore_text']) + else: + content = stripped_text_from_html.encode('utf8') + + fetched_md5 = hashlib.md5(content).hexdigest() # could be None or False depending on JSON type if self.datastore.data['watching'][uuid]['previous_md5'] != fetched_md5: diff --git a/backend/pytest.ini b/backend/pytest.ini index 8b9ccf85..883439b1 100644 --- a/backend/pytest.ini +++ b/backend/pytest.ini @@ -1,2 +1,12 @@ [pytest] -addopts = --no-start-live-server --live-server-port=5005 \ No newline at end of file +addopts = --no-start-live-server --live-server-port=5005 +#testpaths = tests pytest_invenio +#live_server_scope = session + +filterwarnings = + ignore::DeprecationWarning:urllib3.*: + +; logging options +log_cli = 1 +log_cli_level = DEBUG +log_cli_format = %(asctime)s %(name)s: %(levelname)s %(message)s \ No newline at end of file diff --git a/backend/static/css/styles.css b/backend/static/css/styles.css index 8f3c0dbf..da504eef 100644 --- a/backend/static/css/styles.css +++ b/backend/static/css/styles.css @@ -88,11 +88,16 @@ section.content { margin: 0 3px 0 5px; } -#check-all-button { - text-align:right; +#post-list-buttons { + text-align: right; + padding: 0px; + margin: 0px; +} +#post-list-buttons li { + display: inline-block; } -#check-all-button a { +#post-list-buttons a { border-top-left-radius: initial; border-top-right-radius: initial; border-bottom-left-radius: 5px; @@ -243,4 +248,21 @@ footer { background: #fff; color: #444; text-align: center; -} \ No newline at end of file +} + +#feed-icon { + vertical-align: middle; +} + +#version { + position: absolute; + top: 80px; + right: 0px; + font-size: 8px; + background: #fff; + padding: 10px; +} + +#new-version-text a{ + color: #e07171; +} diff --git a/backend/static/images/Generic_Feed-icon.svg b/backend/static/images/Generic_Feed-icon.svg new file mode 100644 index 00000000..a7f9cf19 --- /dev/null +++ b/backend/static/images/Generic_Feed-icon.svg @@ -0,0 +1,18 @@ + + + + RSS feed icon + + + + + + + + + \ No newline at end of file diff --git a/backend/store.py b/backend/store.py index d36f4b25..499f2bce 100644 --- a/backend/store.py +++ b/backend/store.py @@ -22,10 +22,10 @@ class ChangeDetectionStore: self.datastore_path = datastore_path self.json_store_path = "{}/url-watches.json".format(self.datastore_path) self.stop_thread = False + self.__data = { 'note': "Hello! If you change this file manually, please be sure to restart your changedetection.io instance!", 'watching': {}, - 'tag': '0.261', 'settings': { 'headers': { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36', @@ -53,7 +53,8 @@ class ChangeDetectionStore: 'previous_md5': "", 'uuid': str(uuid_builder.uuid4()), 'headers': {}, # Extra headers to send - 'history': {} # Dict of timestamp and output stripped filename + 'history': {}, # Dict of timestamp and output stripped filename + 'ignore_text': [] # List of text to ignore when calculating the comparison checksum } if path.isfile('/source.txt'): @@ -63,6 +64,7 @@ class ChangeDetectionStore: self.__data['build_sha'] = f.read() try: + # @todo retest with ", encoding='utf-8'" with open(self.json_store_path) as json_file: from_disk = json.load(json_file) @@ -80,8 +82,7 @@ class ChangeDetectionStore: # Reinitialise each `watching` with our generic_definition in the case that we add a new var in the future. # @todo pretty sure theres a python we todo this with an abstracted(?) object! - - for uuid, watch in self.data['watching'].items(): + for uuid, watch in self.__data['watching'].items(): _blank = deepcopy(self.generic_definition) _blank.update(watch) self.__data['watching'].update({uuid: _blank}) @@ -98,6 +99,14 @@ class ChangeDetectionStore: self.add_watch(url='https://www.gov.uk/coronavirus', tag='Covid') self.add_watch(url='https://changedetection.io', tag='Tech news') + + self.__data['version_tag'] = "0.27" + + if not 'app_guid' in self.__data: + self.__data['app_guid'] = str(uuid_builder.uuid4()) + + self.needs_write = True + # Finally start the thread that will manage periodic data saves to JSON save_data_thread = threading.Thread(target=self.save_datastore).start() @@ -117,7 +126,7 @@ class ChangeDetectionStore: return 0 def set_last_viewed(self, uuid, timestamp): - self.data['watching'][uuid].update({'last_viewed': str(timestamp)}) + self.data['watching'][uuid].update({'last_viewed': int(timestamp)}) self.needs_write = True def update_watch(self, uuid, update_obj): @@ -139,6 +148,19 @@ class ChangeDetectionStore: @property def data(self): + has_unviewed = False + + for uuid, v in self.__data['watching'].items(): + self.__data['watching'][uuid]['newest_history_key'] = self.get_newest_history_key(uuid) + if int(v['newest_history_key']) <= int(v['last_viewed']): + self.__data['watching'][uuid]['viewed'] = True + + else: + self.__data['watching'][uuid]['viewed'] = False + has_unviewed = True + + self.__data['has_unviewed'] = has_unviewed + return self.__data def get_all_tags(self): @@ -156,7 +178,11 @@ class ChangeDetectionStore: def delete(self, uuid): with self.lock: - del (self.__data['watching'][uuid]) + if uuid == 'all': + self.__data['watching'] = {} + else: + del (self.__data['watching'][uuid]) + self.needs_write = True def url_exists(self, url): diff --git a/backend/templates/base.html b/backend/templates/base.html index e5d258c7..03afb75a 100644 --- a/backend/templates/base.html +++ b/backend/templates/base.html @@ -19,9 +19,11 @@ ChangeDetection.io {% if current_diff_url %} - {{ current_diff_url }} + {{ current_diff_url }} {% else %} - Version {{ version }} + {% if new_version_available %} + A new version is available + {% endif %} {% endif %} @@ -36,7 +38,8 @@ SETTINGS - @@ -49,7 +52,7 @@ - +v{{ version }} {% block header %}{% endblock %} diff --git a/backend/templates/edit.html b/backend/templates/edit.html index 89c2bfd6..3b032acb 100644 --- a/backend/templates/edit.html +++ b/backend/templates/edit.html @@ -4,7 +4,7 @@ - + URL @@ -18,10 +18,26 @@ Grouping tags, can be a comma separated list. + + + Ignore text + + {% for value in watch.ignore_text %}{{ value }} +{% endfor %} + Each line will be processed separately as an ignore rule. + + + + Extra request headers - + + Save diff --git a/backend/templates/watch-overview.html b/backend/templates/watch-overview.html index 568e9ae5..e17c4ffa 100644 --- a/backend/templates/watch-overview.html +++ b/backend/templates/watch-overview.html @@ -15,13 +15,11 @@ - + All {% for tag in tags %} - {% if tag == "" %} - All - {% else %} - {{ tag }} - {% endif %} + {% if tag != "" %} + {{ tag }} + {% endif %} {% endfor %} @@ -64,22 +62,29 @@ Recheck - Edit + Edit {% if watch.history|length >= 2 %} Diff {% endif %} {% endfor %} - - - - - Recheck + + {% if has_unviewed %} + + Mark all viewed + + {% endif %} + + Recheck all {% if active_tag%}in "{{active_tag}}"{%endif%} - + + + + + {% endblock %} \ No newline at end of file diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 2c2fa334..836e0fd1 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -7,17 +7,14 @@ import os # https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py - # Much better boilerplate than the docs # https://www.python-boilerplate.com/py3+flask+pytest/ global app - @pytest.fixture(scope='session') def app(request): """Create application for the tests.""" - datastore_path = "./test-datastore" try: @@ -33,11 +30,19 @@ def app(request): app_config = {'datastore_path': datastore_path} datastore = store.ChangeDetectionStore(datastore_path=app_config['datastore_path'], include_default_watches=False) app = changedetection_app(app_config, datastore) + app.config['STOP_THREADS'] = True def teardown(): datastore.stop_thread = True - app.config['STOP_THREADS'] = True + app.config.exit.set() + try: + os.unlink("{}/url-watches.json".format(datastore_path)) + except FileNotFoundError: + # This is fine in the case of a failure. + pass + + assert 1 == 1 request.addfinalizer(teardown) + yield app - return app diff --git a/backend/tests/test_backend.py b/backend/tests/test_backend.py index e2040da0..c5fd46a9 100644 --- a/backend/tests/test_backend.py +++ b/backend/tests/test_backend.py @@ -3,6 +3,21 @@ import time from flask import url_for from urllib.request import urlopen +import pytest + +sleep_time_for_fetch_thread = 3 + + +def test_setup_liveserver(live_server): + @live_server.app.route('/test-endpoint') + def test_endpoint(): + # Tried using a global var here but didn't seem to work, so reading from a file instead. + with open("test-datastore/output.txt", "r") as f: + return f.read() + + live_server.start() + + assert 1 == 1 def set_original_response(): @@ -14,7 +29,6 @@ def set_original_response(): So let's see what happens.