CSS selector filter (#73)

* Re #9: CSS selector filtering, adding a test for #9
pull/79/head
dgtlmoon 4 years ago committed by GitHub
parent 1a0c3f1250
commit 2346b42ef2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -386,6 +386,17 @@ def changedetection_app(conig=None, datastore_o=None):
if len(datastore.data['watching'][uuid]['history']): if len(datastore.data['watching'][uuid]['history']):
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid) update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
# CSS Filter
css_filter = request.form.get('css_filter')
if css_filter:
datastore.data['watching'][uuid]['css_filter'] = css_filter.strip()
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
if len(datastore.data['watching'][uuid]['history']):
update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
validators.url(url) # @todo switch to prop/attr/observer validators.url(url) # @todo switch to prop/attr/observer
datastore.data['watching'][uuid].update(update_obj) datastore.data['watching'][uuid].update(update_obj)
datastore.needs_write = True datastore.needs_write = True
@ -876,7 +887,7 @@ def ticker_thread_check_time_launch_checks():
if not uuid in running_uuids and uuid not in update_q.queue: if not uuid in running_uuids and uuid not in update_q.queue:
update_q.put(uuid) update_q.put(uuid)
time.sleep(1) time.sleep(0.1)
# Should be low so we can break this out in testing # Should be low so we can break this out in testing
app.config.exit.wait(1) app.config.exit.wait(1)

@ -66,25 +66,36 @@ class perform_site_check():
timeout=timeout, timeout=timeout,
verify=False) verify=False)
stripped_text_from_html = get_text(r.text) # CSS Filter
css_filter = self.datastore.data['watching'][uuid]['css_filter']
if css_filter and len(css_filter.strip()):
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.content, "html.parser")
stripped_text_from_html = ""
for item in soup.select(css_filter):
text = str(item.get_text())+"\n"
stripped_text_from_html += text
else:
stripped_text_from_html = get_text(r.text)
# Usually from networkIO/requests level # Usually from networkIO/requests level
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e: except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
update_obj["last_error"] = str(e) update_obj["last_error"] = str(e)
print(str(e)) print(str(e))
except requests.exceptions.MissingSchema: except requests.exceptions.MissingSchema:
print("Skipping {} due to missing schema/bad url".format(uuid)) print("Skipping {} due to missing schema/bad url".format(uuid))
# Usually from html2text level # Usually from html2text level
except UnicodeDecodeError as e: except Exception as e:
# except UnicodeDecodeError as e:
update_obj["last_error"] = str(e) update_obj["last_error"] = str(e)
print(str(e)) print(str(e))
# figure out how to deal with this cleaner.. # figure out how to deal with this cleaner..
# 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte # 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
else: else:
# We rely on the actual text in the html output.. many sites have random script vars etc, # We rely on the actual text in the html output.. many sites have random script vars etc,
# in the future we'll implement other mechanisms. # in the future we'll implement other mechanisms.

@ -61,7 +61,8 @@ class ChangeDetectionStore:
'headers': {}, # Extra headers to send 'headers': {}, # Extra headers to send
'history': {}, # Dict of timestamp and output stripped filename 'history': {}, # Dict of timestamp and output stripped filename
'ignore_text': [], # List of text to ignore when calculating the comparison checksum 'ignore_text': [], # List of text to ignore when calculating the comparison checksum
'notification_urls': [] # List of URLs to add to the notification Queue (Usually AppRise) 'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise)
'css_filter': "",
} }
if path.isfile('backend/source.txt'): if path.isfile('backend/source.txt'):

@ -24,7 +24,13 @@
size="5"/> size="5"/>
<span class="pure-form-message-inline">Minimum 1 minute between recheck</span> <span class="pure-form-message-inline">Minimum 1 minute between recheck</span>
</div> </div>
<br/>
<div class="pure-control-group">
    <!-- The label must reference the id of the input it describes (css_filter), not the recheck-interval field. -->
    <label for="css_filter">CSS Filter</label>
    <input type="text" id="css_filter" name="css_filter" value="{{watch.css_filter}}"
           size="25"/>
    <span class="pure-form-message-inline">Limit text to this CSS rule, all matching CSS is included.</span>
</div>
<!-- @todo: move to tabs ---> <!-- @todo: move to tabs --->
<fieldset class="pure-group"> <fieldset class="pure-group">
<label for="ignore-text">Ignore text</label> <label for="ignore-text">Ignore text</label>

@ -0,0 +1,102 @@
#!/usr/bin/python3
import time
from flask import url_for
from . util import live_server_setup
def test_setup(live_server):
    """Pass the ``live_server`` fixture to the shared ``live_server_setup`` helper."""
    live_server_setup(live_server)
def set_original_response():
    """Write the baseline HTML document to ``test-datastore/output.txt``.

    The document contains a ``#sametext`` div and a ``#changetext`` div so a
    CSS filter can be pointed at one of them. The ``test-datastore`` directory
    must already exist relative to the current working directory.

    Returns:
        None.
    """
    test_return_data = """<html>
<body>
Some initial text</br>
<p>Which is across multiple lines</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that will change</div>
</body>
</html>
"""
    # Pin the encoding so the bytes on disk do not depend on the platform locale.
    with open("test-datastore/output.txt", "w", encoding="utf-8") as f:
        f.write(test_return_data)
def set_modified_response():
    """Write the changed HTML document to ``test-datastore/output.txt``.

    Compared with the baseline document, the ``<p>`` text and the
    ``#changetext`` div differ while the ``#sametext`` div is identical. The
    ``test-datastore`` directory must already exist relative to the current
    working directory.

    Returns:
        None.
    """
    test_return_data = """<html>
<body>
Some initial text</br>
<p>which has this one new line</p>
</br>
So let's see what happens. </br>
<div id="sametext">Some text thats the same</div>
<div id="changetext">Some text that changes</div>
</body>
</html>
"""
    # Pin the encoding so the bytes on disk do not depend on the platform locale.
    with open("test-datastore/output.txt", "w", encoding="utf-8") as f:
        f.write(test_return_data)
def test_check_markup_css_filter_restriction(client, live_server):
    """Exercise a watch whose text is restricted by the ``#sametext`` CSS filter.

    Flow: import the test URL, run a check, save the CSS filter through the
    edit page, confirm the filter is shown again on that page, re-check, swap
    in the modified document, re-check once more, and finally assert that the
    index page contains ``unviewed``.
    """
    fetch_wait_seconds = 3
    css_filter = "#sametext"

    def recheck_and_wait():
        # Queue a check of the watches, then give the fetch thread time to run it.
        client.get(url_for("api_watch_checknow"), follow_redirects=True)
        time.sleep(fetch_wait_seconds)

    set_original_response()

    # Give the endpoint time to spin up
    time.sleep(1)

    # Register the test endpoint as a watch via the import page
    test_url = url_for('test_endpoint', _external=True)
    import_response = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True,
    )
    assert b"1 Imported" in import_response.data

    # First check, taken before any filter is configured
    recheck_and_wait()

    # Save the CSS filter on the watch through the edit page
    edit_response = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": ""},
        follow_redirects=True,
    )
    assert b"Updated watch." in edit_response.data

    # The saved filter should be rendered back on the edit page
    edit_page = client.get(url_for("edit_page", uuid="first"))
    assert css_filter.encode('utf-8') in edit_page.data

    # Check again now that the filter is in place
    recheck_and_wait()

    # Change the served document, then check once more
    set_modified_response()
    recheck_and_wait()

    # It should have 'unviewed' still
    # Because it should be looking at only that 'sametext' id
    index_page = client.get(url_for("index"))
    assert b'unviewed' in index_page.data

@ -11,4 +11,10 @@ feedgen ~= 0.9
flask-login ~= 0.5 flask-login ~= 0.5
pytz pytz
urllib3 urllib3
# Notification library
apprise ~= 0.9 apprise ~= 0.9
# Used for CSS filtering
bs4

Loading…
Cancel
Save