diff --git a/backend/__init__.py b/backend/__init__.py
index 91608993..e42d44d3 100644
--- a/backend/__init__.py
+++ b/backend/__init__.py
@@ -404,7 +404,8 @@ def changedetection_app(config=None, datastore_o=None):
                 'tag': form.tag.data.strip(),
                 'title': form.title.data.strip(),
                 'headers': form.headers.data,
-                'fetch_backend': form.fetch_backend.data
+                'fetch_backend': form.fetch_backend.data,
+                'trigger_text': form.trigger_text.data
             }

             # Notification URLs
diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index 242a46b1..61ff7746 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -4,10 +4,9 @@
 import hashlib
 from inscriptis import get_text
 import urllib3
 from . import html_tools
+import re

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

 # Some common stuff here that can be moved to a base class
@@ -57,6 +56,8 @@ class perform_site_check():
         changed_detected = False
         stripped_text_from_html = ""

+        watch = self.datastore.data['watching'][uuid]
+
         update_obj = {'previous_md5': self.datastore.data['watching'][uuid]['previous_md5'],
                       'history': {},
                       "last_checked": timestamp
@@ -81,7 +82,7 @@ class perform_site_check():
         url = self.datastore.get_val(uuid, 'url')

         # Pluggable content fetcher
-        prefer_backend = self.datastore.data['watching'][uuid]['fetch_backend']
+        prefer_backend = watch['fetch_backend']
         if hasattr(content_fetcher, prefer_backend):
             klass = getattr(content_fetcher, prefer_backend)
         else:
@@ -94,8 +95,15 @@ class perform_site_check():
         # Fetching complete, now filters
         # @todo move to class / maybe inside of fetcher abstract base?
+        # @note: I feel like the following should be in a more obvious chain system
+        #  - Check filter text
+        #  - Is the checksum different?
+        #  - Do we convert to JSON?
+        #  https://stackoverflow.com/questions/41817578/basic-method-chaining ?
+        #  return content().textfilter().jsonextract().checksumcompare() ?
+        is_html = True
-        css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
+        css_filter_rule = watch['css_filter']
         if css_filter_rule and len(css_filter_rule.strip()):
             if 'json:' in css_filter_rule:
                 stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
@@ -107,7 +115,6 @@ class perform_site_check():

         if is_html:
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
             html_content = fetcher.content
-            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
             if css_filter_rule and len(css_filter_rule.strip()):
                 html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
@@ -123,17 +130,37 @@ class perform_site_check():

         # If there's text to skip
         # @todo we could abstract out the get_text() to handle this cleaner
-        if len(self.datastore.data['watching'][uuid]['ignore_text']):
-            stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html,
-                                                             self.datastore.data['watching'][uuid]['ignore_text'])
+        if len(watch['ignore_text']):
+            stripped_text_from_html = self.strip_ignore_text(stripped_text_from_html, watch['ignore_text'])
         else:
             stripped_text_from_html = stripped_text_from_html.encode('utf8')

         fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest()

+        blocked_by_not_found_trigger_text = False
+
+        if len(watch['trigger_text']):
+            blocked_by_not_found_trigger_text = True
+            for line in watch['trigger_text']:
+                # Because JSON won't serialize a re.compile object
+                if line[0] == '/' and line[-1] == '/':
+                    regex = re.compile(line.strip('/'), re.IGNORECASE)
+                    # Found it? Then we don't wait for it anymore
+                    r = re.search(regex, str(stripped_text_from_html))
+                    if r:
+                        blocked_by_not_found_trigger_text = False
+                        break
+
+                elif line.lower() in str(stripped_text_from_html).lower():
+                    # We found it, don't wait for it.
+                    blocked_by_not_found_trigger_text = False
+                    break
+
         # could be None or False depending on JSON type
-        if self.datastore.data['watching'][uuid]['previous_md5'] != fetched_md5:
+        # On the first run of a site, watch['previous_md5'] will be an empty string
+        if not blocked_by_not_found_trigger_text and watch['previous_md5'] != fetched_md5:
             changed_detected = True

             # Don't confuse people by updating as last-changed, when it actually just changed from None..
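The hunk above gates change detection on `trigger_text`: a change only counts once at least one configured line is found in the stripped page text, either as case-insensitive plain text or as a `/regex/`. A minimal standalone sketch of that gate, with a hypothetical function name (`blocked_by_trigger` is not in the patch):

```python
import re

def blocked_by_trigger(trigger_lines, stripped_text):
    # No trigger configured: never block change detection.
    if not trigger_lines:
        return False
    # Block until at least one trigger line is found in the text.
    for line in trigger_lines:
        if line.startswith('/') and line.endswith('/'):
            # Lines wrapped in /slashes/ are case-insensitive regex; they are
            # stored as strings because JSON won't serialize a re.compile object.
            if re.search(re.compile(line.strip('/'), re.IGNORECASE), stripped_text):
                return False
        elif line.lower() in stripped_text.lower():
            return False
    return True  # nothing matched yet, keep waiting

print(blocked_by_trigger(['/something \\d{3}/'], "some new noise"))  # True, still blocked
print(blocked_by_trigger(['/something \\d{3}/'], "something 123"))   # False, trigger found
```

A change is then only reported when this gate is open and the MD5 checksum differs, which is exactly the combined condition in the rewritten `if` above.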
@@ -144,7 +171,7 @@ class perform_site_check():

         # Extract title as title
         if is_html and self.datastore.data['settings']['application']['extract_title_as_title']:
-            if not self.datastore.data['watching'][uuid]['title'] or not len(self.datastore.data['watching'][uuid]['title']):
+            if not watch['title'] or not len(watch['title']):
                 update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

diff --git a/backend/forms.py b/backend/forms.py
index 6e288bd8..ae0847e0 100644
--- a/backend/forms.py
+++ b/backend/forms.py
@@ -4,6 +4,7 @@
 from wtforms import widgets
 from wtforms.validators import ValidationError
 from wtforms.fields import html5
 from backend import content_fetcher
+import re

 class StringListField(StringField):
     widget = widgets.TextArea()
@@ -124,7 +125,6 @@ class ValidateListRegex(object):
         self.message = message

     def __call__(self, form, field):
-        import re

         for line in field.data:
             if line[0] == '/' and line[-1] == '/':
@@ -178,6 +178,7 @@ class watchForm(quickWatchForm):
     notification_urls = StringListField('Notification URL List')
     headers = StringDictKeyValue('Request Headers')
     trigger_check = BooleanField('Send test notification on save')
+    trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()])

 class globalSettingsForm(Form):
diff --git a/backend/store.py b/backend/store.py
index 2d3f7abf..17668717 100644
--- a/backend/store.py
+++ b/backend/store.py
@@ -68,6 +68,7 @@ class ChangeDetectionStore:
             'ignore_text': [],  # List of text to ignore when calculating the comparison checksum
             'notification_urls': [],  # List of URLs to add to the notification Queue (Usually AppRise)
             'css_filter': "",
+            'trigger_text': [],  # List of text or regex to wait for until a change is detected
             'fetch_backend': None,
         }
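Taken together with the `__init__.py` hunk above, the `forms.py` and `store.py` changes wire the setting end to end: the textarea becomes a list of lines (`StringListField`), each `/…/` line must compile as a regex (`ValidateListRegex`), and the value persists on the watch with `[]` as the default. A quick sketch of the validator's contract, with made-up sample lines (not from the patch):

```python
import re

# Lines wrapped in /slashes/ must compile as regex; everything else passes as-is.
for line in ["foobar123", "/something \\d{3}/", "/broken[/"]:
    if line[0] == '/' and line[-1] == '/':
        try:
            re.compile(line.strip('/'))
            print(f"{line!r}: valid regex trigger")
        except re.error:
            print(f"{line!r}: would fail ValidateListRegex")  # the form raises ValidationError here
    else:
        print(f"{line!r}: plain-text trigger, no validation needed")
```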
diff --git a/backend/templates/edit.html b/backend/templates/edit.html
index d806138f..16233e83 100644
--- a/backend/templates/edit.html
+++ b/backend/templates/edit.html
@@ -10,6 +10,7 @@
diff --git a/backend/tests/test_trigger.py b/backend/tests/test_trigger.py
new file mode 100644
--- /dev/null
+++ b/backend/tests/test_trigger.py
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+
+def set_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def set_modified_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some NEW nice initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def set_modified_with_trigger_text_response():
+    test_return_data = """<html>
+       <body>
+     Some NEW nice initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     foobar123
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
diff --git a/backend/tests/test_trigger_regex.py b/backend/tests/test_trigger_regex.py
new file mode 100644
--- /dev/null
+++ b/backend/tests/test_trigger_regex.py
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+
+def set_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def test_trigger_regex_functionality(client, live_server):
+
+    live_server_setup(live_server)
+
+    sleep_time_for_fetch_thread = 3
+
+    set_original_ignore_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (a new watch shouldn't have anything yet)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    ### test regex
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"trigger_text": '/something \d{3}/',
+              "url": test_url,
+              "fetch_backend": "html_requests"},
+        follow_redirects=True
+    )
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("some new noise")
+
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (nothing should match the regex)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("regex test123<br/>\nsomething 123")
+
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
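The assertions above hinge on the `/…/` convention implemented in `fetch_site_status.py`: `some new noise` matches nothing, so the watch stays blocked, while a body containing `something 123` satisfies `/something \d{3}/` and unblocks it. The same check in isolation (the strings are the test's own fixture values):

```python
import re

trigger = re.compile(r'something \d{3}', re.IGNORECASE)
assert re.search(trigger, "some new noise") is None        # blocked: no 'unviewed' flag
assert re.search(trigger, "regex test123\nsomething 123")  # unblocked: change reported
```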
diff --git a/backend/tests/test_trigger_regex_with_filter.py b/backend/tests/test_trigger_regex_with_filter.py
new file mode 100644
--- /dev/null
+++ b/backend/tests/test_trigger_regex_with_filter.py
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+
+def set_original_ignore_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text</br>
+     <p>Which is across multiple lines</p>
+     </br>
+     So let's see what happens.  </br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+
+
+def test_trigger_regex_functionality(client, live_server):
+
+    live_server_setup(live_server)
+
+    sleep_time_for_fetch_thread = 3
+
+    set_original_ignore_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (a new watch shouldn't have anything yet)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    ### test regex with filter
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"trigger_text": "/cool.stuff\d/",
+              "url": test_url,
+              "css_filter": '#in-here',
+              "fetch_backend": "html_requests"},
+        follow_redirects=True
+    )
+
+    # Check that we have the expected text.. but it's not in the css filter we want
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("<html>some new noise with cool stuff2 ok</html>")
+
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should report nothing found (nothing should match the regex and filter)
+    res = client.get(url_for("index"))
+    assert b'unviewed' not in res.data
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("<html>some new noise with <div id=\"in-here\">cool stuff6</div> ok</html>")
+
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    time.sleep(sleep_time_for_fetch_thread)
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
diff --git a/backend/update_worker.py b/backend/update_worker.py
index e27bbfc9..49eabd29 100644
--- a/backend/update_worker.py
+++ b/backend/update_worker.py
@@ -47,9 +47,8 @@ class update_worker(threading.Thread):

                 except content_fetcher.EmptyReply as e:
                     self.datastore.update_watch(uuid=uuid, update_obj={'last_error':str(e)})
-                    #@todo how to handle when it's thrown from webdriver connecting?
                 except Exception as e:
-                    self.app.logger.error("Exception reached", uuid, str(e))
+                    self.app.logger.error("Exception reached processing watch UUID:%s - %s", uuid, str(e))
                     self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})

                 else:
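The `update_worker.py` change is more than cosmetic: Python's `logging` uses lazy printf-style interpolation, so extra positional arguments need matching `%s` placeholders in the format string. The old call had none, so the UUID and exception text never made it into the log line. A minimal illustration with made-up values:

```python
import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("update_worker")

uuid, err = "first-watch-uuid", "connection refused"  # illustrative values only

# Broken: no %s placeholders for the two extra args, so logging's internal
# formatting fails and the details are lost.
logger.error("Exception reached", uuid, err)

# Fixed, as in the patch: lazy %-style interpolation.
logger.error("Exception reached processing watch UUID:%s - %s", uuid, err)
```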