Merge branch 'master' into diff-filters

2 years ago · 62b6645810
parent e5e8b3bbbd 4eb4b401a1
commit 62b6645810
14 changed files with 103 additions and 58 deletions
--- a/.github/workflows/test-container-build.yml
+++ b/.github/workflows/test-container-build.yml
@ -1,12 +1,21 @@
 name: ChangeDetection.io Container Build Test

 # Triggers the workflow on push or pull request events
+
+# This line doesn't work, even though it is the documented one
+#on: [push, pull_request]
+
 on:
  push:
    paths:
      - requirements.txt
      - Dockerfile

+  pull_request:
+    paths:
+      - requirements.txt
+      - Dockerfile
+
  # Changes to requirements.txt packages and Dockerfile may or may not always be compatible with arm etc, so worth testing
  # @todo: some kind of path filter for requirements.txt and Dockerfile
 jobs:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -6,7 +6,7 @@ Otherwise, it's always best to PR into the `dev` branch.

 Please be sure that all new functionality has a matching test!

-Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notifications.py` for example
+Use `pytest` to validate/test, you can run the existing tests as `pytest tests/test_notification.py` for example

 ```
 pip3 install -r requirements-dev
--- a/Dockerfile
+++ b/Dockerfile
@ -64,6 +64,7 @@ EXPOSE 5000

 # The actual flask app
 COPY changedetectionio /app/changedetectionio
+
 # The eventlet server wrapper
 COPY changedetection.py /app/changedetection.py

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -2,6 +2,7 @@ recursive-include changedetectionio/api *
 recursive-include changedetectionio/templates *
 recursive-include changedetectionio/static *
 recursive-include changedetectionio/model *
+recursive-include changedetectionio/tests *
 include changedetection.py
 global-exclude *.pyc
 global-exclude node_modules
--- a/README.md
+++ b/README.md
@ -161,50 +161,14 @@ This will re-parse the JSON and apply formatting to the text, making it super ea

 ### JSONPath or jq?

-For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq.
+For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more specific information on jq.

-Notes:
- `jq` must be added manually separately from the installation of changedetection.io (simply run `pip3 install jq`)
- `jq` is not available on Windows or must be manually compiled (No "wheel" package available on pypi)
+One big advantage of `jq` is that you can use logic in your JSON filter, such as filters to only show items that have a value greater than/less than etc.

- The example below adds the price in dollars to each item in the JSON data, and then filters to only show items that are greater than 10.
+See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/JSON-Selector-Filter-help for more information and examples

-#### Sample input data from API
-```
-{
-    "items": [
-        {
-           "name": "Product A",
-           "priceInCents": 2500
-        },
-        {
-           "name": "Product B",
-           "priceInCents": 500
-        },
-        {
-           "name": "Product C",
-           "priceInCents": 2000
-        }
-    ]
-}
-```
-
-#### Sample jq
-`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)`
+Note: `jq` library must be added separately (`pip3 install jq`)

-#### Sample output data
-```
-{
-  "name": "Product A",
-  "priceInCents": 2500,
-  "priceInDollars": 25
-}
-{
-  "name": "Product C",
-  "priceInCents": 2000,
-  "priceInDollars": 20
-}
-```

 ### Parse JSON embedded in HTML!

--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@ -33,7 +33,7 @@ from flask_wtf import CSRFProtect
 from changedetectionio import html_tools
 from changedetectionio.api import api_v1

-__version__ = '0.39.20.2'
+__version__ = '0.39.20.4'

 datastore = None

@ -194,6 +194,9 @@ def changedetection_app(config=None, datastore_o=None):
    watch_api.add_resource(api_v1.Watch, '/api/v1/watch/<string:uuid>',
                           resource_class_kwargs={'datastore': datastore, 'update_q': update_q})

+    watch_api.add_resource(api_v1.SystemInfo, '/api/v1/systeminfo',
+                           resource_class_kwargs={'datastore': datastore, 'update_q': update_q})
+



@ -816,8 +819,10 @@ def changedetection_app(config=None, datastore_o=None):

        newest_file = history[dates[-1]]

+        # Read as text, forcing UTF-8 and ignoring any bytes that fail to decode
+        # Windows may fail to decode in Python if we just use plain 'r' mode (chardet decode exception)
        try:
-            with open(newest_file, 'r') as f:
+            with open(newest_file, 'r', encoding='utf-8', errors='ignore') as f:
                newest_version_file_contents = f.read()
        except Exception as e:
            newest_version_file_contents = "Unable to read {}.\n".format(newest_file)
@ -830,7 +835,7 @@ def changedetection_app(config=None, datastore_o=None):
            previous_file = history[dates[-2]]

        try:
-            with open(previous_file, 'r') as f:
+            with open(previous_file, 'r', encoding='utf-8', errors='ignore') as f:
                previous_version_file_contents = f.read()
        except Exception as e:
            previous_version_file_contents = "Unable to read {}.\n".format(previous_file)
@ -907,7 +912,7 @@ def changedetection_app(config=None, datastore_o=None):
        timestamp = list(watch.history.keys())[-1]
        filename = watch.history[timestamp]
        try:
-            with open(filename, 'r') as f:
+            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                tmp = f.readlines()

                # Get what needs to be highlighted
--- a/changedetectionio/api/api_v1.py
+++ b/changedetectionio/api/api_v1.py
@ -122,3 +122,37 @@ class CreateWatch(Resource):
            return {'status': "OK"}, 200

        return list, 200
+
+class SystemInfo(Resource):
+    def __init__(self, **kwargs):
+        # datastore is a black box dependency
+        self.datastore = kwargs['datastore']
+        self.update_q = kwargs['update_q']
+
+    @auth.check_token
+    def get(self):
+        import time
+        overdue_watches = []
+
+        # Check all watches and report which have not been checked but should have been
+
+        for uuid, watch in self.datastore.data.get('watching', {}).items():
+            # see if now - last_checked is greater than the time that should have been
+            # this is not super accurate (maybe they just edited it) but better than nothing
+            t = watch.threshold_seconds()
+            if not t:
+                # Use the system wide default
+                t = self.datastore.threshold_seconds
+
+            time_since_check = time.time() - watch.get('last_checked')
+
+            # Allow 5 minutes of grace time before we decide it's overdue
+            if time_since_check - (5 * 60) > t:
+                overdue_watches.append(uuid)
+
+        return {
+                   'queue_size': self.update_q.qsize(),
+                   'overdue_watches': overdue_watches,
+                   'uptime': round(time.time() - self.datastore.start_time, 2),
+                   'watch_count': len(self.datastore.data.get('watching', {}))
+               }, 200
--- a/changedetectionio/changedetection.py
+++ b/changedetectionio/changedetection.py
@ -102,6 +102,14 @@ def main():
                    has_password=datastore.data['settings']['application']['password'] != False
                    )

+    # Monitored websites will not receive a Referer header
+    # when a user clicks on an outgoing link.
+    @app.after_request
+    def hide_referrer(response):
+        if os.getenv("HIDE_REFERER", False):
+            response.headers["Referrer-Policy"] = "no-referrer"
+        return response
+
    # Proxy sub-directory support
    # Set environment var USE_X_SETTINGS=1 on this script
    # And then in your proxy_pass settings
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -120,7 +120,10 @@ class model(dict):
        if os.path.isfile(fname):
            logging.debug("Reading history index " + str(time.time()))
            with open(fname, "r") as f:
-                tmp_history = dict(i.strip().split(',', 2) for i in f.readlines())
+                for i in f.readlines():
+                    if ',' in i:
+                        k, v = i.strip().split(',', 2)
+                        tmp_history[k] = v

        if len(tmp_history):
            self.__newest_history_key = list(tmp_history.keys())[-1]
@ -153,28 +156,30 @@ class model(dict):
        import uuid
        import logging

-        output_path = "{}/{}".format(self.__datastore_path, self['uuid'])
+        output_path = os.path.join(self.__datastore_path, self['uuid'])

        self.ensure_data_dir_exists()
+        snapshot_fname = os.path.join(output_path, str(uuid.uuid4()))

-        snapshot_fname = "{}/{}.stripped.txt".format(output_path, uuid.uuid4())
        logging.debug("Saving history text {}".format(snapshot_fname))

+        # in /diff/ and /preview/ we are going to assume for now that it's UTF-8 when reading
+        # most sites are utf-8 and some are even broken utf-8
        with open(snapshot_fname, 'wb') as f:
            f.write(contents)
            f.close()

        # Append to index
        # @todo check last char was \n
-        index_fname = "{}/history.txt".format(output_path)
+        index_fname = os.path.join(output_path, "history.txt")
        with open(index_fname, 'a') as f:
            f.write("{},{}\n".format(timestamp, snapshot_fname))
            f.close()

        self.__newest_history_key = timestamp
-        self.__history_n+=1
+        self.__history_n += 1

-        #@todo bump static cache of the last timestamp so we dont need to examine the file to set a proper ''viewed'' status
+        # @todo bump static cache of the last timestamp so we don't need to examine the file to set a proper ''viewed'' status
        return snapshot_fname

    # Save previous text snapshot for diffing - used for calculating additions and deletions
--- a/changedetectionio/run_all_tests.sh
+++ b/changedetectionio/run_all_tests.sh
@ -9,6 +9,8 @@
 # exit when any command fails
 set -e

+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
 find tests/test_*py -type f|while read test_name
 do
  echo "TEST RUNNING $test_name"
@ -45,7 +47,9 @@ docker kill $$-test_selenium

 echo "TESTING WEBDRIVER FETCH > PLAYWRIGHT/BROWSERLESS..."
 # Not all platforms support playwright (not ARM/rPI), so it's not packaged in requirements.txt
-pip3 install playwright~=1.24
+PLAYWRIGHT_VERSION=$(grep -i -E "RUN pip install.+" "$SCRIPT_DIR/../Dockerfile" | grep --only-matching -i -E "playwright[=><~+]+[0-9\.]+")
+echo "using $PLAYWRIGHT_VERSION"
+pip3 install "$PLAYWRIGHT_VERSION"
 docker run -d --name $$-test_browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm  -p 3000:3000  --shm-size="2g"  browserless/chrome:1.53-chrome-stable
 # takes a while to spin up
 sleep 5
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -30,14 +30,14 @@ class ChangeDetectionStore:
    def __init__(self, datastore_path="/datastore", include_default_watches=True, version_tag="0.0.0"):
        # Should only be active for docker
        # logging.basicConfig(filename='/dev/stdout', level=logging.INFO)
-        self.needs_write = False
+        self.__data = App.model()
        self.datastore_path = datastore_path
        self.json_store_path = "{}/url-watches.json".format(self.datastore_path)
+        self.needs_write = False
        self.proxy_list = None
+        self.start_time = time.time()
        self.stop_thread = False

-        self.__data = App.model()
-
        # Base definition for all watchers
        # deepcopy part of #569 - not sure why its needed exactly
        self.generic_definition = deepcopy(Watch.model(datastore_path = datastore_path, default={}))
--- a/changedetectionio/tests/test_api.py
+++ b/changedetectionio/tests/test_api.py
@ -147,6 +147,16 @@ def test_api_simple(client, live_server):
    # @todo how to handle None/default global values?
    assert watch['history_n'] == 2, "Found replacement history section, which is in its own API"

+    # basic systeminfo check
+    res = client.get(
+        url_for("systeminfo"),
+        headers={'x-api-key': api_key},
+    )
+    info = json.loads(res.data)
+    assert info.get('watch_count') == 1
+    assert info.get('uptime') > 0.5
+
+
    # Finally delete the watch
    res = client.delete(
        url_for("watch", uuid=watch_uuid),
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -45,6 +45,9 @@ services:
  #        Respect proxy_pass type settings, `proxy_set_header Host "localhost";` and `proxy_set_header X-Forwarded-Prefix /app;`
  #        More here https://github.com/dgtlmoon/changedetection.io/wiki/Running-changedetection.io-behind-a-reverse-proxy-sub-directory
  #      - USE_X_SETTINGS=1
+  #
+  #        Hides the `Referer` header so that monitored websites can't see the changedetection.io hostname.
+  #      - HIDE_REFERER=true

      # Comment out ports: when using behind a reverse proxy , enable networks: etc.
      ports:
--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,8 @@
-flask~= 2.0
+flask ~= 2.0
 flask_wtf
-eventlet>=0.31.0
+eventlet >= 0.31.0
 validators
-timeago ~=1.0
+timeago ~= 1.0
 inscriptis ~= 2.2
 feedgen ~= 0.9
 flask-login ~= 0.5
@ -47,3 +47,4 @@ selenium ~= 4.1.0
 werkzeug ~= 2.0.0

 # playwright is installed at Dockerfile build time because it's not available on all platforms
+