From 2346b42ef2f66970586f4e56f7a9a47ce04ad87a Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Sun, 30 May 2021 21:22:26 +1000
Subject: [PATCH] CSS selector filter (#73)

* Re #9 CSS Selector filtering, Adding test for #9
---
 backend/__init__.py                |  13 +++-
 backend/fetch_site_status.py       |  19 ++++-
 backend/store.py                   |   3 +-
 backend/templates/edit.html        |   8 ++-
 backend/tests/test_css_selector.py | 102 +++++++++++++++++++++++++++++
 requirements.txt                   |   6 ++
 6 files changed, 144 insertions(+), 7 deletions(-)
 create mode 100644 backend/tests/test_css_selector.py

diff --git a/backend/__init__.py b/backend/__init__.py
index 96c26040..07fdcdb1 100644
--- a/backend/__init__.py
+++ b/backend/__init__.py
@@ -386,6 +386,17 @@ def changedetection_app(conig=None, datastore_o=None):
             if len(datastore.data['watching'][uuid]['history']):
                 update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
 
+
+            # CSS Filter
+            css_filter = request.form.get('css_filter')
+            if css_filter:
+                datastore.data['watching'][uuid]['css_filter'] = css_filter.strip()
+
+                # Reset the previous_md5 so we process a new snapshot including stripping ignore text.
+                if len(datastore.data['watching'][uuid]['history']):
+                    update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
+
+
             validators.url(url) # @todo switch to prop/attr/observer
             datastore.data['watching'][uuid].update(update_obj)
             datastore.needs_write = True
@@ -876,7 +887,7 @@ def ticker_thread_check_time_launch_checks():
             if not uuid in running_uuids and uuid not in update_q.queue:
                 update_q.put(uuid)
 
-        time.sleep(1)
+        time.sleep(0.1) # Should be low so we can break this out in testing
 
         app.config.exit.wait(1)
 
diff --git a/backend/fetch_site_status.py b/backend/fetch_site_status.py
index 9fb52c75..8705201d 100644
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@@ -66,25 +66,36 @@ class perform_site_check():
                              timeout=timeout,
                              verify=False)
 
-            stripped_text_from_html = get_text(r.text)
+            # CSS Filter
+            css_filter = self.datastore.data['watching'][uuid]['css_filter']
+            if css_filter and len(css_filter.strip()):
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(r.content, "html.parser")
+                stripped_text_from_html = ""
+                for item in soup.select(css_filter):
+                    text = str(item.get_text())+"\n"
+                    stripped_text_from_html += text
+
+            else:
+                stripped_text_from_html = get_text(r.text)
 
         # Usually from networkIO/requests level
         except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
             update_obj["last_error"] = str(e)
-            print(str(e))
 
         except requests.exceptions.MissingSchema:
             print("Skipping {} due to missing schema/bad url".format(uuid))
 
         # Usually from html2text level
-        except UnicodeDecodeError as e:
-
+        except Exception as e:
+            # except UnicodeDecodeError as e:
             update_obj["last_error"] = str(e)
             print(str(e))
             # figure out how to deal with this cleaner..
             # 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
 
+
         else:
             # We rely on the actual text in the html output.. many sites have random script vars etc,
             # in the future we'll implement other mechanisms.
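For reference, a minimal standalone sketch (not part of the patch) of what the new CSS-filter branch in fetch_site_status.py does: select elements with a CSS rule via BeautifulSoup and keep only their text. It assumes beautifulsoup4 is installed; the sample HTML and the "#sametext" selector are illustrative only.

    # Sketch of the CSS-filter branch above (illustration only, not part of the patch).
    from bs4 import BeautifulSoup

    sample_html = """
    <html><body>
      <p id="sametext">Some text thats the same</p>
      <p id="changetext">Some text that will change</p>
    </body></html>
    """
    css_filter = "#sametext"

    soup = BeautifulSoup(sample_html, "html.parser")
    stripped_text_from_html = ""
    for item in soup.select(css_filter):
        # get_text() drops the markup; one matched element per line, as in the patch
        stripped_text_from_html += str(item.get_text()) + "\n"

    print(stripped_text_from_html)  # -> Some text thats the same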
diff --git a/backend/store.py b/backend/store.py
index 6ae6223d..ecb87088 100644
--- a/backend/store.py
+++ b/backend/store.py
@@ -61,7 +61,8 @@ class ChangeDetectionStore:
             'headers': {}, # Extra headers to send
             'history': {}, # Dict of timestamp and output stripped filename
             'ignore_text': [], # List of text to ignore when calculating the comparison checksum
-            'notification_urls': [] # List of URLs to add to the notification Queue (Usually AppRise)
+            'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
+            'css_filter': "",
         }
 
         if path.isfile('backend/source.txt'):

diff --git a/backend/templates/edit.html b/backend/templates/edit.html
index 14e109d7..d97cc8bb 100644
--- a/backend/templates/edit.html
+++ b/backend/templates/edit.html
@@ -24,7 +24,13 @@
                        size="5"/> <span class="pure-form-message-inline">Minimum 1 minute between recheck</span>
 
             </div>
-
+            <div class="pure-control-group">
+                <label for="css_filter">CSS Filter</label>
+                <input type="text" id="css_filter" name="css_filter"
+                       value="{{watch.css_filter}}"
+                       size="25"/>
+                <span class="pure-form-message-inline">Limit text to this CSS rule, all matching CSS is included.</span>
+            </div>
diff --git a/backend/tests/test_css_selector.py b/backend/tests/test_css_selector.py
new file mode 100644
index 00000000..a3a0ffc6
--- /dev/null
+++ b/backend/tests/test_css_selector.py
@@ -0,0 +1,102 @@
+#!/usr/bin/python3
+
+import time
+from flask import url_for
+from . util import live_server_setup
+
+def test_setup(live_server):
+    live_server_setup(live_server)
+
+def set_original_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text<br/>
+     <p>Which is across multiple lines</p>
+     <br/>
+     So let's see what happens. <br/>
+     <p id="sametext">Some text thats the same</p>
+     <p id="changetext">Some text that will change</p>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/output.txt", "w") as f:
+        f.write(test_return_data)
+    return None
+
+def set_modified_response():
+    test_return_data = """<html>
+       <body>
+     Some initial text<br/>
+     <p>which has this one new line</p>
+     <br/>
+     So let's see what happens. <br/>
+     <p id="sametext">Some text thats the same</p>
+     <p id="changetext">Some text that changes</p>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/output.txt", "w") as f:
+        f.write(test_return_data)
+
+    return None
+
+
+def test_check_markup_css_filter_restriction(client, live_server):
+    sleep_time_for_fetch_thread = 3
+
+    css_filter = "#sametext"
+
+    set_original_response()
+
+    # Give the endpoint time to spin up
+    time.sleep(1)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # Goto the edit page, add our ignore text
+    # Add our URL to the import page
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": ""},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    # Check it saved
+    res = client.get(
+        url_for("edit_page", uuid="first"),
+    )
+    assert bytes(css_filter.encode('utf-8')) in res.data
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+    # Make a change
+    set_modified_response()
+
+    # Trigger a check
+    client.get(url_for("api_watch_checknow"), follow_redirects=True)
+    # Give the thread time to pick it up
+    time.sleep(sleep_time_for_fetch_thread)
+
+    # It should have 'unviewed' still
+    # Because it should be looking at only that 'sametext' id
+    res = client.get(url_for("index"))
+    assert b'unviewed' in res.data
diff --git a/requirements.txt b/requirements.txt
index bcbeed4d..d7370ea7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,4 +11,10 @@ feedgen ~= 0.9
 flask-login ~= 0.5
 pytz
 urllib3
+
+# Notification library
 apprise ~= 0.9
+
+# Used for CSS filtering
+bs4
+
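A small illustration of the property the new test fixtures are built around (not part of the patch; assumes beautifulsoup4, with md5 used here as a stand-in for the per-snapshot checksum the checker compares): the text selected by "#sametext" is identical in both responses, while "#changetext" differs.

    # Illustration only -- not part of the patch.
    import hashlib
    from bs4 import BeautifulSoup

    ORIGINAL = '<p id="sametext">Some text thats the same</p><p id="changetext">Some text that will change</p>'
    MODIFIED = '<p id="sametext">Some text thats the same</p><p id="changetext">Some text that changes</p>'

    def filtered_md5(html, css_filter):
        # Same extraction as fetch_site_status.py; md5 stands in for the snapshot checksum (assumption)
        soup = BeautifulSoup(html, "html.parser")
        text = "".join(str(item.get_text()) + "\n" for item in soup.select(css_filter))
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    # The "#sametext" text is unchanged across the two fixtures; "#changetext" is not
    assert filtered_md5(ORIGINAL, "#sametext") == filtered_md5(MODIFIED, "#sametext")
    assert filtered_md5(ORIGINAL, "#changetext") != filtered_md5(MODIFIED, "#changetext")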