Trigger text/wait (#187)

Re #71 - Ability to set filters
pull/186/head
dgtlmoon 3 years ago committed by GitHub
parent ba7b6b0f8b
commit 252d6ee6fd

@@ -404,7 +404,8 @@ def changedetection_app(config=None, datastore_o=None):
                       'tag': form.tag.data.strip(),
                       'title': form.title.data.strip(),
                       'headers': form.headers.data,
-                      'fetch_backend': form.fetch_backend.data
+                      'fetch_backend': form.fetch_backend.data,
+                      'trigger_text': form.trigger_text.data
                       }

           # Notification URLs

@@ -4,10 +4,9 @@ import hashlib
 from inscriptis import get_text
 import urllib3
 from . import html_tools
+import re

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

 # Some common stuff here that can be moved to a base class
@@ -57,6 +56,8 @@ class perform_site_check():
        changed_detected = False
        stripped_text_from_html = ""

+       watch = self.datastore.data['watching'][uuid]
+
        update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'],
                      'history': {},
                      "last_checked": timestamp
@@ -81,7 +82,7 @@ class perform_site_check():
        url = self.datastore.get_val(uuid, 'url')

        # Pluggable content fetcher
-       prefer_backend = self.datastore.data['watching'][uuid]['fetch_backend']
+       prefer_backend = watch['fetch_backend']
        if hasattr(content_fetcher, prefer_backend):
            klass = getattr(content_fetcher, prefer_backend)
        else:
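The backend lookup above is just a getattr() on the content_fetcher module, falling back when the stored preference is missing or unknown. A rough standalone sketch of that pattern (the fetcher classes and pick_fetcher helper below are stand-ins defined in the snippet itself, not the project's real code):

import sys

# Stand-in fetcher classes for illustration only
class html_requests:
    def run(self, url):
        return "fetched %s with plain HTTP requests" % url

class html_webdriver:
    def run(self, url):
        return "fetched %s with a real browser" % url

def pick_fetcher(module, prefer_backend, fallback="html_requests"):
    # Use the preferred backend if the module actually defines it, otherwise fall back
    name = prefer_backend if prefer_backend and hasattr(module, prefer_backend) else fallback
    klass = getattr(module, name)
    return klass()

this_module = sys.modules[__name__]
print(pick_fetcher(this_module, "html_webdriver").run("https://example.com"))
print(pick_fetcher(this_module, None).run("https://example.com"))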
@@ -94,8 +95,15 @@ class perform_site_check():
        # Fetching complete, now filters
        # @todo move to class / maybe inside of fetcher abstract base?
+       # @note: I feel like the following should be in a more obvious chain system
+       #  - Check filter text
+       #  - Is the checksum different?
+       #  - Do we convert to JSON?
+       # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
+       # return content().textfilter().jsonextract().checksumcompare() ?
+
        is_html = True
-       css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
+       css_filter_rule = watch['css_filter']
        if css_filter_rule and len(css_filter_rule.strip()):
            if 'json:' in css_filter_rule:
                stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
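The @note above only floats an idea (filter, JSON extraction and checksum comparison as a fluent chain); nothing of the sort is implemented in this commit. A hypothetical toy version of such a chain, with made-up class and method names, could look like:

import hashlib

class Content:
    """Toy fluent pipeline; the real code keeps these steps inline instead."""
    def __init__(self, text):
        self.text = text

    def textfilter(self, css_filter=None):
        # a real version would apply the CSS/JSON filter here; pass-through in this sketch
        return self

    def checksum(self):
        return hashlib.md5(self.text.encode('utf8')).hexdigest()

previous_md5 = ""
fetched_md5 = Content("<html>hello</html>").textfilter().checksum()
changed_detected = previous_md5 != fetched_md5
print(changed_detected)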
@@ -107,7 +115,6 @@ class perform_site_check():
        if is_html:
            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
            html_content = fetcher.content
-           css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
            if css_filter_rule and len(css_filter_rule.strip()):
                html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
@@ -123,17 +130,37 @@ class perform_site_check():
            # If there's text to skip
            # @todo we could abstract out the get_text() to handle this cleaner
-           if len(self.datastore.data['watching'][uuid]['ignore_text']):
-               stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html,
-                                                                self.datastore.data['watching'][uuid]['ignore_text'])
+           if len(watch['ignore_text']):
+               stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, watch['ignore_text'])
            else:
                stripped_text_from_html = stripped_text_from_html.encode('utf8')

            fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()

+           blocked_by_not_found_trigger_text = False
+
+           if len(watch['trigger_text']):
+               blocked_by_not_found_trigger_text = True
+               for line in watch['trigger_text']:
+                   # Because JSON wont serialize a re.compile object
+                   if line[0] == '/' and line[-1] == '/':
+                       regex = re.compile(line.strip('/'), re.IGNORECASE)
+                       # Found it? so we don't wait for it anymore
+                       r = re.search(regex, str(stripped_text_from_html))
+                       if r:
+                           blocked_by_not_found_trigger_text = False
+                           break
+                   elif line.lower() in str(stripped_text_from_html).lower():
+                       # We found it don't wait for it.
+                       blocked_by_not_found_trigger_text = False
+                       break
+
            # could be None or False depending on JSON type
-           if self.datastore.data['watching'][uuid]['previous_md5'] != fetched_md5:
+           # On the first run of a site, watch['previous_md5'] will be an empty string
+           if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
                changed_detected = True

                # Don't confuse people by updating as last-changed, when it actually just changed from None..
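In short: when trigger_text is set, a detected difference is suppressed unless at least one trigger line appears in the filtered text, either as a case-insensitive substring or, for lines wrapped in forward slashes, as a case-insensitive regex. A condensed standalone sketch of that rule (the function name is illustrative, not part of the commit):

import re

def trigger_found(trigger_lines, text):
    """Return True if any trigger line matches the text (case-insensitive)."""
    for line in trigger_lines:
        if line.startswith('/') and line.endswith('/'):
            # /wrapped/ lines are treated as regular expressions
            if re.search(line.strip('/'), text, re.IGNORECASE):
                return True
        elif line.lower() in text.lower():
            # plain lines are simple substring matches
            return True
    return False

assert trigger_found(["foobar123"], "blah FOOBAR123 blah")
assert trigger_found([r"/something \d{3}/"], "regex test<br>something 123")
assert not trigger_found(["foobar123"], "nothing interesting here")

Each line acts as an "OR": the first hit unblocks the change, matching the behaviour described in the template help text further down.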
@@ -144,7 +171,7 @@ class perform_site_check():
        # Extract title as title
        if is_html and self.datastore.data['settings']['application']['extract_title_as_title']:
-           if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
+           if not watch['title'] or not len(watch['title']):
                update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

@@ -4,6 +4,7 @@ from wtforms import widgets
 from wtforms.validators import ValidationError
 from wtforms.fields import html5
 from backend import content_fetcher
+import re

 class StringListField(StringField):
     widget = widgets.TextArea()
@@ -124,7 +125,6 @@ class ValidateListRegex(object):
        self.message = message

    def __call__(self, form, field):
-       import re

        for line in field.data:
            if line[0] == '/' and line[-1] == '/':
@@ -178,6 +178,7 @@ class watchForm(quickWatchForm):
    notification_urls = StringListField('Notification URL List')
    headers = StringDictKeyValue('Request Headers')
    trigger_check = BooleanField('Send test notification on save')
+   trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])

class globalSettingsForm(Form):
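The new trigger_text field reuses ValidateListRegex, so a bad /pattern/ is rejected at save time instead of blowing up during a check. A reduced sketch of that validation idea outside WTForms (the helper name below is made up for illustration):

import re

def check_regex_lines(lines):
    """Raise ValueError for any /wrapped/ line that is not a valid regular expression."""
    for line in lines:
        if line and line[0] == '/' and line[-1] == '/':
            try:
                re.compile(line.strip('/'))
            except re.error:
                raise ValueError("RegEx '%s' is not a valid regular expression." % line)

check_regex_lines(["plain trigger text", r"/foo\d/"])   # both fine
try:
    check_regex_lines(["/unclosed[/"])                  # invalid pattern
except ValueError as e:
    print(e)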

@@ -68,6 +68,7 @@ class ChangeDetectionStore:
            'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
            'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
            'css_filter': "",
+           'trigger_text': [],  # List of text or regex to wait for until a change is detected
            'fetch_backend': None,
        }
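Defaulting trigger_text to a plain list of strings (rather than pre-compiled patterns) matters because the watch records are persisted as JSON, which is also why the check code above compiles the regex on the fly ("Because JSON wont serialize a re.compile object"). A quick illustration:

import json
import re

# Plain strings round-trip through JSON without any trouble
print(json.dumps({'trigger_text': ["foobar123", r"/foo\d/"]}))

# A compiled pattern does not, so patterns are stored as strings and compiled at check time
try:
    json.dumps({'trigger_text': [re.compile(r"foo\d")]})
except TypeError as e:
    print("can't serialize a compiled regex:", e)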

@@ -10,6 +10,7 @@
            <li class="tab" id="default-tab"><a href="#general">General</a></li>
            <li class="tab"><a href="#notifications">Notifications</a></li>
            <li class="tab"><a href="#filters">Filters</a></li>
+           <li class="tab"><a href="#triggers">Triggers</a></li>
        </ul>
    </div>
@@ -102,8 +103,20 @@ User-Agent: wonderbra 1.0") }}
                </span>
            </fieldset>
        </div>
+       <div class="tab-pane-inner" id="triggers">
+           <fieldset>
+               <div class="pure-control-group">
+                   {{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line
+/some.regex\d{2}/ for case-INsensitive regex
+") }}</br>
+                   <span class="pure-form-message-inline">Text to wait for before triggering a change/notification, all text and regex are tested <i>case-insensitive</i>.</span><br/>
+                   <span class="pure-form-message-inline">Trigger text is processed from the result-text that comes out of any <a href="#filters">CSS/JSON Filters</a> for this watch</span>.<br/>
+                   <span class="pure-form-message-inline">Each line is process separately (think of each line as "OR")</span><br/>
+                   <span class="pure-form-message-inline">Note: Wrap in forward slash / to use regex example: <span style="font-family: monospace; background: #eee">/foo\d/</span> </span>
+               </div>
+           </fieldset>
+       </div>
        <div id="actions">
            <div class="pure-control-group">

@@ -0,0 +1,131 @@
#!/usr/bin/python3

import time
from flask import url_for
from . util import live_server_setup


def set_original_ignore_response():
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def set_modified_original_ignore_response():
    test_return_data = """<html>
<body>
Some NEW nice initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def set_modified_with_trigger_text_response():
    test_return_data = """<html>
<body>
Some NEW nice initial text</br>
<p>Which is across multiple lines</p>
</br>
foobar123
<br/>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def test_trigger_functionality(client, live_server):

    live_server_setup(live_server)

    sleep_time_for_fetch_thread = 3
    trigger_text = "foobar123"
    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # Goto the edit page, add our ignore text
    # Add our URL to the import page
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"trigger_text": trigger_text,
              "url": test_url,
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )
    assert b"Updated watch." in res.data

    # Check it saved
    res = client.get(
        url_for("edit_page", uuid="first"),
    )
    assert bytes(trigger_text.encode('utf-8')) in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data
    assert b'/test-endpoint' in res.data

    # Make a change
    set_modified_original_ignore_response()

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (no new 'unviewed' class)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    # Just to be sure.. set a regular modified change..
    time.sleep(sleep_time_for_fetch_thread)
    set_modified_with_trigger_text_response()
    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

@@ -0,0 +1,81 @@
#!/usr/bin/python3

import time
from flask import url_for
from . util import live_server_setup


def set_original_ignore_response():
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def test_trigger_regex_functionality(client, live_server):

    live_server_setup(live_server)

    sleep_time_for_fetch_thread = 3

    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (just a new one shouldnt have anything)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    ### test regex
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"trigger_text": '/something \d{3}/',
              "url": test_url,
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("some new noise")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (nothing should match the regex)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("regex test123<br/>\nsomething 123")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

@@ -0,0 +1,84 @@
#!/usr/bin/python3

import time
from flask import url_for
from . util import live_server_setup


def set_original_ignore_response():
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
</body>
</html>
"""
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(test_return_data)


def test_trigger_regex_functionality(client, live_server):

    live_server_setup(live_server)

    sleep_time_for_fetch_thread = 3

    set_original_ignore_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True)
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data

    # Trigger a check
    client.get(url_for("api_watch_checknow"), follow_redirects=True)

    # Give the thread time to pick it up
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (just a new one shouldnt have anything)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    ### test regex with filter
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"trigger_text": "/cool.stuff\d/",
              "url": test_url,
              "css_filter": '#in-here',
              "fetch_backend": "html_requests"},
        follow_redirects=True
    )

    # Check that we have the expected text.. but it's not in the css filter we want
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("<html>some new noise with cool stuff2 ok</html>")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)

    # It should report nothing found (nothing should match the regex and filter)
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data

    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write("<html>some new noise with <span id=in-here>cool stuff6</span> ok</html>")

    client.get(url_for("api_watch_checknow"), follow_redirects=True)
    time.sleep(sleep_time_for_fetch_thread)
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data

@@ -47,9 +47,8 @@ class update_worker(threading.Thread):
                    except content_fetcher.EmptyReply as e:
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)})
-                   #@todo how to handle when it's thrown from webdriver connecting?
                    except Exception as e:
-                       self.app.logger.error("Exception reached", uuid, str(e))
+                       self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e))
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
                    else:
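The logging change swaps positional arguments with no placeholders (which the standard logging module can't format cleanly) for lazy %-style substitution. A small before/after sketch with throwaway values:

import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("update_worker_demo")  # throwaway demo logger, not the app's

uuid, err = "abc-123", "connection refused"

# Before: no %s placeholders, so logging's own formatting step fails
#   logger.error("Exception reached", uuid, err)

# After: logging fills the placeholders itself when the record is emitted
logger.error("Exception reached processing watch UUID:%s - %s", uuid, err)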
