diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index d355c209..27496e1a 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -39,7 +39,7 @@ def element_removal(selectors: List[str], html_content):
def xpath_filter(xpath_filter, html_content):
from lxml import etree, html
- tree = html.fromstring(html_content)
+ tree = html.fromstring(bytes(html_content, encoding='utf-8'))
html_block = ""
for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py
index 75e3b2cd..1ac1dd7e 100644
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -44,6 +44,61 @@ def set_modified_response():
return None
+# Handle replies whose Content-Type declares a UTF-8 charset, see https://github.com/dgtlmoon/changedetection.io/pull/613
+def test_check_xpath_filter_utf8(client, live_server):
+ # Regression test: serve a feed payload with a charset=UTF-8 content type, apply an XPath
+ # filter to the watch, and check that the index page does not show the
+ # 'Unicode strings with encoding declaration are not supported.' error.
+ # XPath: any child of an <item> element that is itself a <description> element.
+ # NOTE(review): `filter` shadows the Python builtin of the same name inside this test.
+ filter='//item/*[self::description]'
+
+ # Payload served by the test endpoint.
+ # NOTE(review): the markup tags of this payload appear to be missing in this copy of the
+ # patch (only the text content is visible) - confirm against the upstream commit.
+ d='''
+
+
+ rpilocator.com
+ https://rpilocator.com
+ Find Raspberry Pi Computers in Stock
+ Thu, 19 May 2022 23:27:30 GMT
+
+ https://rpilocator.com/favicon.png
+ rpilocator.com
+ https://rpilocator.com/
+ 32
+ 32
+
+ -
+ Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni
+ Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni
+ https://rpilocator.com?vendor=pimoroni&utm_source=feed&utm_medium=rss
+ pimoroni
+ UK
+ CM4
+ F9FAB0D9-DF6F-40C8-8DEE5FC0646BB722
+ Thu, 19 May 2022 14:32:32 GMT
+
+
+'''
+
+ # Write the payload to the file the test endpoint presumably serves from - TODO confirm.
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write(d)
+
+ # Add our URL to the import page
+ # The content_type argument asks the endpoint for an RSS content type with charset=UTF-8.
+ test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
+ res = client.post(
+ url_for("import_page"),
+ data={"urls": test_url},
+ follow_redirects=True
+ )
+ assert b"1 Imported" in res.data
+ # Set the XPath expression through the edit form's "css_filter" field.
+ res = client.post(
+ url_for("edit_page", uuid="first"),
+ data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+ follow_redirects=True
+ )
+ assert b"Updated watch." in res.data
+ # Presumably gives the background fetch time to run; the neighbouring test names the
+ # same value `sleep_time_for_fetch_thread` - TODO confirm.
+ time.sleep(3)
+ res = client.get(url_for("index"))
+ # Presumably the error produced when the filter parses a str that carries an encoding
+ # declaration; the html_tools.py hunk of this patch passes bytes to html.fromstring() instead.
+ assert b'Unicode strings with encoding declaration are not supported.' not in res.data
+ # Remove all watches at the end of the test.
+ res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+ assert b'Deleted' in res.data
+
+
def test_check_markup_xpath_filter_restriction(client, live_server):
sleep_time_for_fetch_thread = 3
@@ -95,6 +150,8 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
res = client.get(url_for("index"))
assert b'unviewed' not in res.data
+ res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+ assert b'Deleted' in res.data
def test_xpath_validation(client, live_server):
@@ -117,6 +174,8 @@ def test_xpath_validation(client, live_server):
follow_redirects=True
)
assert b"is not a valid XPath expression" in res.data
+ res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+ assert b'Deleted' in res.data
# actually only really used by the distll.io importer, but could be handy too
@@ -153,8 +212,6 @@ def test_check_with_prefix_css_filter(client, live_server):
follow_redirects=True
)
- with open('/tmp/fuck.html', 'wb') as f:
- f.write(res.data)
assert b"Some text thats the same" in res.data #in selector
assert b"Some text that will change" not in res.data #not in selector