Trigger text/wait (#187)

Re #71 - Ability to set filters
pull/186/head
dgtlmoon 3 years ago committed by GitHub
parent ba7b6b0f8b
commit 252d6ee6fd

@@ -404,7 +404,8 @@ def changedetection_app(config=None, datastore_o=None):
                       'tag': form.tag.data.strip(),
                       'title': form.title.data.strip(),
                       'headers': form.headers.data,
-                      'fetch_backend': form.fetch_backend.data
+                      'fetch_backend': form.fetch_backend.data,
+                      'trigger_text': form.trigger_text.data
                       }

           # Notification URLs

@@ -4,10 +4,9 @@ import hashlib
 from inscriptis import get_text
 import urllib3
 from . import html_tools
+import re

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

 # Some common stuff here that can be moved to a base class
@@ -57,6 +56,8 @@ class perform_site_check():
        changed_detected = False
        stripped_text_from_html = ""

+       watch = self.datastore.data['watching'][uuid]
+
        update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'],
                      'history': {},
                      "last_checked": timestamp
@@ -81,7 +82,7 @@ class perform_site_check():
        url = self.datastore.get_val(uuid, 'url')

        # Pluggable content fetcher
-       prefer_backend = self.datastore.data['watching'][uuid]['fetch_backend']
+       prefer_backend = watch['fetch_backend']
        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
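The backend lookup above is just a getattr() on the content_fetcher module, falling back when the stored preference is missing or unknown. A rough standalone sketch of that pattern (the fetcher classes and pick_fetcher helper below are stand-ins defined in the snippet itself, not the project's real code):

import sys

# Stand-in fetcher classes for illustration only
class html_requests:
    def run(self, url):
        return "fetched %s with plain HTTP requests" % url

class html_webdriver:
    def run(self, url):
        return "fetched %s with a real browser" % url

def pick_fetcher(module, prefer_backend, fallback="html_requests"):
    # Use the preferred backend if the module actually defines it, otherwise fall back
    name = prefer_backend if prefer_backend and hasattr(module, prefer_backend) else fallback
    klass = getattr(module, name)
    return klass()

this_module = sys.modules[__name__]
print(pick_fetcher(this_module, "html_webdriver").run("https://example.com"))
print(pick_fetcher(this_module, None).run("https://example.com"))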
@@ -94,8 +95,15 @@ class perform_site_check():
        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?
+       # @note: I feel like the following should be in a more obvious chain system
+       #  - Check filter text
+       #  - Is the checksum different?
+       #  - Do we convert to JSON?
+       # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
+       # return content().textfilter().jsonextract().checksumcompare() ?
+
        is_html = True
-       css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
+       css_filter_rule = watch['css_filter']
        if css_filter_rule and len(css_filter_rule.strip()):
            if 'json:' in css_filter_rule:
                stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
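The @note above only floats an idea (filter, JSON extraction and checksum comparison as a fluent chain); nothing of the sort is implemented in this commit. A hypothetical toy version of such a chain, with made-up class and method names, could look like:

import hashlib

class Content:
    """Toy fluent pipeline; the real code keeps these steps inline instead."""
    def __init__(self, text):
        self.text = text

    def textfilter(self, css_filter=None):
        # a real version would apply the CSS/JSON filter here; pass-through in this sketch
        return self

    def checksum(self):
        return hashlib.md5(self.text.encode('utf8')).hexdigest()

previous_md5 = ""
fetched_md5 = Content("<html>hello</html>").textfilter().checksum()
changed_detected = previous_md5 != fetched_md5
print(changed_detected)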
@@ -107,7 +115,6 @@ class perform_site_check():
        if is_html:
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            html_content = fetcher.content
-           css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
            if css_filter_rule and len(css_filter_rule.strip()):
                html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
@@ -123,17 +130,37 @@ class perform_site_check():
            # If there's text to skip
            # @todo we could abstract out the get_text() to handle this cleaner
-           if len(self.datastore.data['watching'][uuid]['ignore_text']):
-               stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html,
-                                                                self.datastore.data['watching'][uuid]['ignore_text'])
+           if len(watch['ignore_text']):
+               stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, watch['ignore_text'])
            else:
                stripped_text_from_html = stripped_text_from_html.encode('utf8')

            fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()

+           blocked_by_not_found_trigger_text = False
+
+           if len(watch['trigger_text']):
+               blocked_by_not_found_trigger_text = True
+               for line in watch['trigger_text']:
+                   # Because JSON wont serialize a re.compile object
+                   if line[0] == '/' and line[-1] == '/':
+                       regex = re.compile(line.strip('/'), re.IGNORECASE)
+                       # Found it? so we don't wait for it anymore
+                       r = re.search(regex, str(stripped_text_from_html))
+                       if r:
+                           blocked_by_not_found_trigger_text = False
+                           break
+                   elif line.lower() in str(stripped_text_from_html).lower():
+                       # We found it don't wait for it.
+                       blocked_by_not_found_trigger_text = False
+                       break
+
            # could be None or False depending on JSON type
-           if self.datastore.data['watching'][uuid]['previous_md5'] != fetched_md5:
+           # On the first run of a site, watch['previous_md5'] will be an empty string
+           if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
                changed_detected = True

                # Don't confuse people by updating as last-changed, when it actually just changed from None..
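In short: when trigger_text is set, a detected difference is suppressed unless at least one trigger line appears in the filtered text, either as a case-insensitive substring or, for lines wrapped in forward slashes, as a case-insensitive regex. A condensed standalone sketch of that rule (the function name is illustrative, not part of the commit):

import re

def trigger_found(trigger_lines, text):
    """Return True if any trigger line matches the text (case-insensitive)."""
    for line in trigger_lines:
        if line.startswith('/') and line.endswith('/'):
            # /wrapped/ lines are treated as regular expressions
            if re.search(line.strip('/'), text, re.IGNORECASE):
                return True
        elif line.lower() in text.lower():
            # plain lines are simple substring matches
            return True
    return False

assert trigger_found(["foobar123"], "blah FOOBAR123 blah")
assert trigger_found([r"/something \d{3}/"], "regex test<br>something 123")
assert not trigger_found(["foobar123"], "nothing interesting here")

Each line acts as an "OR": the first hit unblocks the change, matching the behaviour described in the template help text further down.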
@@ -144,7 +171,7 @@ class perform_site_check():
        # Extract title as title
        if is_html and self.datastore.data['settings']['application']['extract_title_as_title']:
-           if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
+           if not watch['title'] or not len(watch['title']):
                update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

@@ -4,6 +4,7 @@ from wtforms import widgets
 from wtforms.validators import ValidationError
 from wtforms.fields import html5
 from backend import content_fetcher
+import re

 class StringListField(StringField):
     widget = widgets.TextArea()
@@ -124,7 +125,6 @@ class ValidateListRegex(object):
        self.message = message

    def __call__(self, form, field):
-       import re

        for line in field.data:
            if line[0] == '/' and line[-1] == '/':
@@ -178,6 +178,7 @@ class watchForm(quickWatchForm):
    notification_urls = StringListField('Notification URL List')
    headers = StringDictKeyValue('Request Headers')
    trigger_check = BooleanField('Send test notification on save')
+   trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])

class globalSettingsForm(Form):
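The new trigger_text field reuses ValidateListRegex, so a bad /pattern/ is rejected at save time instead of blowing up during a check. A reduced sketch of that validation idea outside WTForms (the helper name below is made up for illustration):

import re

def check_regex_lines(lines):
    """Raise ValueError for any /wrapped/ line that is not a valid regular expression."""
    for line in lines:
        if line and line[0] == '/' and line[-1] == '/':
            try:
                re.compile(line.strip('/'))
            except re.error:
                raise ValueError("RegEx '%s' is not a valid regular expression." % line)

check_regex_lines(["plain trigger text", r"/foo\d/"])   # both fine
try:
    check_regex_lines(["/unclosed[/"])                  # invalid pattern
except ValueError as e:
    print(e)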

@@ -68,6 +68,7 @@ class ChangeDetectionStore:
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
            'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
            'css_filter': "",
+           'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'fetch_backend': None,
        }
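Defaulting trigger_text to a plain list of strings (rather than pre-compiled patterns) matters because the watch records are persisted as JSON, which is also why the check code above compiles the regex on the fly ("Because JSON wont serialize a re.compile object"). A quick illustration:

import json
import re

# Plain strings round-trip through JSON without any trouble
print(json.dumps({'trigger_text': ["foobar123", r"/foo\d/"]}))

# A compiled pattern does not, so patterns are stored as strings and compiled at check time
try:
    json.dumps({'trigger_text': [re.compile(r"foo\d")]})
except TypeError as e:
    print("can't serialize a compiled regex:", e)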

@@ -10,6 +10,7 @@
            <li class="tab" id="default-tab"><a href="#general">General</a></li>
            <li class="tab"><a href="#notifications">Notifications</a></li>
            <li class="tab"><a href="#filters">Filters</a></li>
+           <li class="tab"><a href="#triggers">Triggers</a></li>
        </ul>
    </div>
@@ -102,8 +103,20 @@ User-Agent: wonderbra 1.0") }}
                </span>
            </fieldset>
        </div>
+       <div class="tab-pane-inner" id="triggers">
+           <fieldset>
+               <div class="pure-control-group">
+                   {{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line
+/some.regex\d{2}/ for case-INsensitive regex
+") }}</br>
+                   <span class="pure-form-message-inline">Text to wait for before triggering a change/notification, all text and regex are tested <i>case-insensitive</i>.</span><br/>
+                   <span class="pure-form-message-inline">Trigger text is processed from the result-text that comes out of any <a href="#filters">CSS/JSON Filters</a> for this watch</span>.<br/>
+                   <span class="pure-form-message-inline">Each line is process separately (think of each line as "OR")</span><br/>
+                   <span class="pure-form-message-inline">Note: Wrap in forward slash / to use regex example: <span style="font-family: monospace; background: #eee">/foo\d/</span> </span>
+               </div>
+           </fieldset>
+       </div>
        <div id="actions">
            <div class="pure-control-group">

@@ -0,0 +1,131 @@
#!/usr/bin/python3

import time
from flask import url_for
from . util import live_server_setup


def set_original_ignore_response():
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def set_modified_original_ignore_response():
    test_return_data = """<html>
<body>
Some NEW nice initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def set_modified_with_trigger_text_response():
    test_return_data = """<html>
<body>
Some NEW nice initial text</br>
<p>Which is across multiple lines</p>
</br>
foobar123
<br/>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def test_trigger_functionality(client, live_server):

    live_server_setup(live_server)

    sleep_time_for_fetch_thread = 3
    trigger_text = "foobar123"
    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # Goto the edit page, add our ignore text
    # Add our URL to the import page
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"trigger_text": trigger_text,
              "url": test_url,
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )
    assert b"Updated watch." in res.data

    # Check it saved
    res = client.get(
        url_for("edit_page", uuid="first"),
    )
    assert bytes(trigger_text.encode('utf-8')) in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data
    assert b'/test-endpoint' in res.data

    # Make a change
    set_modified_original_ignore_response()

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    # Just to be sure.. set a regular modified change..
    time.sleep(sleep_time_for_fetch_thread)
    set_modified_with_trigger_text_response()
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

@@ -0,0 +1,81 @@
#!/usr/bin/python3

import time
from flask import url_for
from . util import live_server_setup


def set_original_ignore_response():
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def test_trigger_regex_functionality(client, live_server):

    live_server_setup(live_server)

    sleep_time_for_fetch_thread = 3

    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (just a new one shouldnt have anything)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    ### test regex
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"trigger_text": '/something \d{3}/',
              "url": test_url,
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("some new noise")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (nothing should match the regex)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("regex test123<br/>\nsomething 123")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

@@ -0,0 +1,84 @@
#!/usr/bin/python3

import time
from flask import url_for
from . util import live_server_setup


def set_original_ignore_response():
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def test_trigger_regex_functionality(client, live_server):

    live_server_setup(live_server)

    sleep_time_for_fetch_thread = 3

    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (just a new one shouldnt have anything)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    ### test regex with filter
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"trigger_text": "/cool.stuff\d/",
              "url": test_url,
              "css_filter": '#in-here',
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )

    # Check that we have the expected text.. but it's not in the css filter we want
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("<html>some new noise with cool stuff2 ok</html>")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (nothing should match the regex and filter)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("<html>some new noise with <span id=in-here>cool stuff6</span> ok</html>")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

@@ -47,9 +47,8 @@ class update_worker(threading.Thread):
                    except content_fetcher.EmptyReply as e:
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)})
-                   #@todo how to handle when it's thrown from webdriver connecting?
                    except Exception as e:
-                       self.app.logger.error("Exception reached", uuid, str(e))
+                       self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e))
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
                    else:
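The logging change swaps positional arguments with no placeholders (which the standard logging module can't format cleanly) for lazy %-style substitution. A small before/after sketch with throwaway values:

import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("update_worker_demo")  # throwaway demo logger, not the app's

uuid, err = "abc-123", "connection refused"

# Before: no %s placeholders, so logging's own formatting step fails
#   logger.error("Exception reached", uuid, err)

# After: logging fills the placeholders itself when the record is emitted
logger.error("Exception reached processing watch UUID:%s - %s", uuid, err)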
