merging v0.39.6

3 years ago · 7236572de6
parent fe037064d8 bc74227635
commit 7236572de6
14 changed files with 257 additions and 29 deletions
--- a/README.md
+++ b/README.md
@ -91,6 +91,8 @@ docker run -d --restart always -p "127.0.0.1:5000:5000" -v datastore-volume:/dat
 ```bash
 docker-compose pull && docker-compose up -d
 ```
+### Filters
+XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, and you can use XPath expressions exported from various XPath element query creation tools.

 ### Notifications

@ -144,9 +146,9 @@ When you enable a `json:` filter, you can even automatically extract and parse e

 See the wiki https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration

-### RaspberriPi support?
+### Raspberry Pi support?

-RaspberriPi and linux/arm/v6 linux/arm/v7 arm64 devices are supported! 
+Raspberry Pi and linux/arm/v6 linux/arm/v7 arm64 devices are supported! 

 ### Windows native support?

--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -30,7 +30,7 @@ import datetime
 import pytz
 from copy import deepcopy

-__version__ = '0.39.5'
+__version__ = '0.39.6'

 datastore = None

@ -806,7 +806,8 @@ def changedetection_app(config=None, datastore_o=None):
                         compress_type=zipfile.ZIP_DEFLATED,
                         compresslevel=8)

-        return send_from_directory(datastore_o.datastore_path, backupname, as_attachment=True)
+        # send_from_directory() needs the directory as a full absolute path
+        return send_from_directory(os.path.abspath(datastore_o.datastore_path), backupname, as_attachment=True)

    @app.route("/static/<string:group>/<string:filename>", methods=['GET'])
    def static_content(group, filename):
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -9,6 +9,12 @@ import urllib3.exceptions


 class EmptyReply(Exception):
+    def __init__(self, status_code, url):
+        # Set this so we can use it in other parts of the app
+        self.status_code = status_code
+        self.url = url
+        return
+
    pass

 class Fetcher():
@ -110,6 +116,8 @@ class html_webdriver(Fetcher):

        # @todo - how to check this? is it possible?
        self.status_code = 200
+        # @todo somehow we should try to get this working for WebDriver
+        # raise EmptyReply(url=url, status_code=r.status_code)

        # @todo - dom wait loaded?
        time.sleep(5)
@ -151,10 +159,10 @@ class html_requests(Fetcher):
        # Return bytes here
        html = r.text

-
        # @todo test this
+        # @todo maybe you really want to test zero-byte return pages?
        if not r or not html or not len(html):
-            raise EmptyReply(url)
+            raise EmptyReply(url=url, status_code=r.status_code)

        self.status_code = r.status_code
        self.content = html
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -58,9 +58,7 @@ class perform_site_check():

        watch = self.datastore.data['watching'][uuid]

-        update_obj = {
-                      "last_checked": timestamp
-                      }
+        update_obj = {}

        extra_headers = self.datastore.get_val(uuid, 'headers')

@ -116,15 +114,17 @@ class perform_site_check():
                if 'json:' in css_filter_rule:
                    stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
                    is_html = False
-                else:
-                    # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                    stripped_text_from_html = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)

            if is_html:
                # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                html_content = fetcher.content
                if has_filter_rule:
-                    html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+                    # For HTML/XML we also offer XPath as an option: a rule that starts with "/" is treated as an XPath expression
+                    if css_filter_rule[0] == '/':
+                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                    else:
+                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)

                # get_text() via inscriptis
                stripped_text_from_html = get_text(html_content)
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@ -181,7 +181,7 @@ class ValidateListRegex(object):
                    message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                    raise ValidationError(message % (line))

-class ValidateCSSJSONInput(object):
+class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
    @todo CSS validator ;)
@ -191,6 +191,24 @@ class ValidateCSSJSONInput(object):
        self.message = message

    def __call__(self, form, field):
+
+        # Nothing to see here
+        if not len(field.data.strip()):
+            return
+
+        # Does it look like XPath?
+        if field.data.strip()[0] == '/':
+            from lxml import html, etree
+            tree = html.fromstring("<html></html>")
+
+            try:
+                tree.xpath(field.data.strip())
+            except etree.XPathEvalError as e:
+                message = field.gettext('\'%s\' is not a valid XPath expression. (%s)')
+                raise ValidationError(message % (field.data, str(e)))
+            except:
+                raise ValidationError("A system-error occurred when validating your XPath expression")
+
        if 'json:' in field.data:
            from jsonpath_ng.exceptions import JsonPathParserError, JsonPathLexerError
            from jsonpath_ng.ext import parse
@ -202,6 +220,8 @@ class ValidateCSSJSONInput(object):
            except (JsonPathParserError, JsonPathLexerError) as e:
                message = field.gettext('\'%s\' is not a valid JSONPath expression. (%s)')
                raise ValidationError(message % (input, str(e)))
+            except:
+                raise ValidationError("A system-error occurred when validating your JSONPath expression")

            # Re #265 - maybe in the future fetch the page and offer a
            # warning/notice that its possible the rule doesnt yet match anything?
@ -232,7 +252,7 @@ class watchForm(commonSettingsForm):
    seconds_between_check = html5.IntegerField('Maximum time in seconds until recheck',
                                               [validators.Optional(), validators.NumberRange(min=1,max=59)])
    minutes_or_seconds = RadioField('Minutes or Seconds', choices=[('minutes','Minutes'),('seconds','Seconds')])
-    css_filter = StringField('CSS/JSON Filter', [ValidateCSSJSONInput()])
+    css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()])
    title = StringField('Title')

    ignore_text = StringListField('Ignore Text', [ValidateListRegex()])
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -17,6 +17,20 @@ def css_filter(css_filter, html_content):
    return html_block + "\n"


+# Return str Utf-8 of matched rules
+def xpath_filter(xpath_filter, html_content):
+    from lxml import html
+    from lxml import etree
+
+    tree = html.fromstring(html_content)
+    html_block = ""
+
+    for item in tree.xpath(xpath_filter.strip()):
+        html_block+= etree.tostring(item, pretty_print=True).decode('utf-8')+"<br/>"
+
+    return html_block
+
+
 # Extract/find element
 def extract_element(find='title', html_content=''):

--- a/changedetectionio/templates/edit.html
+++ b/changedetectionio/templates/edit.html
@ -101,8 +101,10 @@ User-Agent: wonderbra 1.0") }}
                        <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
                        <li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <b>"json:"</b>, <a
                                href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
+                        <li>XPATH - Limit text to this XPath rule, simply start with a forward-slash, example  <b>//*[contains(@class, 'sametext')]</b>, <a
+                                href="http://xpather.com/" target="new">test your XPath here</a></li>
                    </ul>
-                    Please be sure that you thoroughly understand how to write CSS or JSONPath selector rules before filing an issue on GitHub! <a
+                    Please be sure that you thoroughly understand how to write CSS, JSONPath or XPath selector rules before filing an issue on GitHub! <a
                                href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
                </span>
                    </div>
@ -113,8 +115,11 @@ User-Agent: wonderbra 1.0") }}
 /some.regex\d{2}/ for case-INsensitive regex
                    ") }}
                    <span class="pure-form-message-inline">
-                    Each line processed separately, any line matching will be ignored.<br/>
-                    Regular Expression support, wrap the line in forward slash <b>/regex/</b>.
+                        <ul>
+                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
+                            <li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
+                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
+                        </ul>
                </span>

            </fieldset>
--- a/changedetectionio/templates/settings.html
+++ b/changedetectionio/templates/settings.html
@ -83,8 +83,13 @@
 /some.regex\d{2}/ for case-INsensitive regex
                    ") }}
                    <span class="pure-form-message-inline">Note: This is applied globally in addition to the per-watch rules.</span><br/>
-                    <span class="pure-form-message-inline">Each line processed separately, any line matching will be ignored.<br/>
-                    Regular Expression support, wrap the line in forward slash <b>/regex/</b>.
+                    <span class="pure-form-message-inline">
+                        <ul>
+                            <li>Note: This is applied globally in addition to the per-watch rules.</li>
+                            <li>Each line processed separately, any line matching will be ignored (removed before creating the checksum)</li>
+                            <li>Regular Expression support, wrap the line in forward slash <b>/regex/</b></li>
+                            <li>Changing this will affect the comparison checksum which may trigger an alert</li>
+                        </ul>
                     </span>
                    </fieldset>
           </div>
--- a/changedetectionio/tests/test_errorhandling.py
+++ b/changedetectionio/tests/test_errorhandling.py
@ -0,0 +1,38 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+from ..html_tools import *
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+
+def test_error_handler(client, live_server):
+
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint_403_error', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(3)
+
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+    assert b'Status Code 403' in res.data
+    assert bytes("just now".encode('utf-8')) in res.data
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@ -0,0 +1,118 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+from ..html_tools import *
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+def set_original_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     <div class="sametext">Some text thats the same</div>
+     <div class="changetext">Some text that will change</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
+def set_modified_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  THIS CHANGES AND SHOULDNT TRIGGER A CHANGE</br>
+     <div class="sametext">Some text thats the same</div>
+     <div class="changetext">Some new text</div>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None
+
+
+def test_check_markup_xpath_filter_restriction(client, live_server):
+    sleep_time_for_fetch_thread = 3
+
+    xpath_filter = "//*[contains(@class, 'sametext')]"
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Go to the edit page and apply our XPath filter to the watch;
+    # the form post should report that the watch was updated
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # view it/reset state back to viewed
+    client.get(url_for("diff_history_page", uuid="first"), follow_redirects=True)
+
+    #  Make a change
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+def test_xpath_validation(client, live_server):
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"is not a valid XPath expression" in res.data
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@ -54,6 +54,13 @@ def live_server_setup(live_server):
            resp.headers['Content-Type'] = 'application/json'
            return resp

+    @live_server.app.route('/test-403')
+    def test_endpoint_403_error():
+
+        from flask import make_response
+        resp = make_response('', 403)
+        return resp
+
    # Just return the headers in the request
    @live_server.app.route('/test-headers')
    def test_headers():
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@ -39,9 +39,10 @@ class update_worker(threading.Thread):
                    changed_detected = False
                    contents = ""
                    update_obj= {}
+                    now = time.time()

                    try:
-                        now = time.time()
+
                        changed_detected, update_obj, contents = update_handler.run(uuid)

                        # Re #342
@ -51,14 +52,13 @@ class update_worker(threading.Thread):
                            raise Exception("Error - returned data from the fetch handler SHOULD be bytes")


-                        # Always record that we atleast tried
-                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3)})
-
                    except PermissionError as e:
                        self.app.logger.error("File permission error updating", uuid, str(e))
                    except content_fetcher.EmptyReply as e:
-                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)})
-
+                        # Some kind of custom to-str handler in the exception handler that does this?
+                        err_text = "EmptyReply: Status Code {}".format(e.status_code)
+                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
+                                                                           'last_check_status': e.status_code})
                    except Exception as e:
                        self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e))
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
@ -66,13 +66,14 @@ class update_worker(threading.Thread):
                    else:
                        try:
                            watch = self.datastore.data['watching'][uuid]
+                            fname = "" # Saved history text filename

                            # For the FIRST time we check a site, or a change detected, save the snapshot.
                            if changed_detected or not watch['last_checked']:
                                # A change was detected
                                fname = self.datastore.save_history_text(watch_uuid=uuid, contents=contents)
                                # Should always be keyed by string(timestamp)
-                                self.datastore.update_watch(uuid, {"history": {str(update_obj["last_checked"]): fname}})
+                                self.datastore.update_watch(uuid, {"history": {str(round(time.time())): fname}})

                            # Generally update anything interesting returned
                            self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
@ -136,6 +137,11 @@ class update_worker(threading.Thread):
                            # Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
                            print("!!!! Exception in update_worker !!!\n", e)

+                    finally:
+                        # Always record that we at least tried
+                        self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
+                                                                           'last_checked': round(time.time())})
+
                self.current_uuid = None  # Done
                self.q.task_done()

--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -43,7 +43,8 @@ services:
      restart: unless-stopped

     # Used for fetching pages via WebDriver+Chrome where you need Javascript support.
-     # Does not work on rPi, https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver
+     # Now working on arm64 (needs testing on rPi - tested on Oracle ARM instance)
+     # replace image with seleniarm/standalone-chromium:4.0.0-20211213

 #    browser-chrome:
 #        hostname: browser-chrome
--- a/requirements.txt
+++ b/requirements.txt
@ -26,8 +26,11 @@ paho-mqtt
 # ERROR: Could not build wheels for cryptography which use PEP 517 and cannot be installed directly
 cryptography ~= 3.4

-# Used for CSS filtering, replace with soupsieve and lxml for xpath
+# Used for CSS filtering
 bs4

+# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
+lxml
+
 # 3.141 was missing socksVersion, 3.150 was not in pypi, so we try 4.1.0
 selenium ~= 4.1.0