Fix encoding errors with XPath filters from UTF-8 responses (#619)

Branch: pull/624/head
Author: dgtlmoon, 2 years ago
parent 07e279b38d
commit e17c2121f7

@@ -39,7 +39,7 @@ def element_removal(selectors: List[str], html_content):
 def xpath_filter(xpath_filter, html_content):
     from lxml import etree, html
-    tree = html.fromstring(html_content)
+    tree = html.fromstring(bytes(html_content, encoding='utf-8'))
     html_block = ""
     for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):

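For context, the one-line change above works around lxml's refusal to parse a Python str that still carries an XML encoding declaration, which is exactly what a UTF-8 RSS reply looks like once it has been decoded. A minimal standalone sketch (not part of this commit) of the failure and of the fixed code path:

# Minimal sketch, standalone: the error the one-line fix avoids.
from lxml import html

content = ('<?xml version="1.0" encoding="UTF-8"?>'
           '<rss><channel><item><description>Stock Alert (UK)</description></item></channel></rss>')

try:
    html.fromstring(content)  # a str that still carries its encoding declaration
except ValueError as e:
    print(e)  # "Unicode strings with encoding declaration are not supported. ..."

# The fixed code path: hand lxml bytes and let it honour the declaration itself.
tree = html.fromstring(bytes(content, encoding='utf-8'))
items = tree.xpath('//item/*[self::description]')
print(items[0].text if items else 'no match')  # -> Stock Alert (UK)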
@@ -44,6 +44,61 @@ def set_modified_response():
     return None
 
 
+# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
+def test_check_xpath_filter_utf8(client, live_server):
+    filter='//item/*[self::description]'
+    d='''<?xml version="1.0" encoding="UTF-8"?>
+<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
+<channel>
+<title>rpilocator.com</title>
+<link>https://rpilocator.com</link>
+<description>Find Raspberry Pi Computers in Stock</description>
+<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
+<image>
+<url>https://rpilocator.com/favicon.png</url>
+<title>rpilocator.com</title>
+<link>https://rpilocator.com/</link>
+<width>32</width>
+<height>32</height>
+</image>
+<item>
+<title>Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni</title>
+<description>Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni</description>
+<link>https://rpilocator.com?vendor=pimoroni&amp;utm_source=feed&amp;utm_medium=rss</link>
+<category>pimoroni</category>
+<category>UK</category>
+<category>CM4</category>
+<guid isPermaLink="false">F9FAB0D9-DF6F-40C8-8DEE5FC0646BB722</guid>
+<pubDate>Thu, 19 May 2022 14:32:32 GMT</pubDate>
+</item>
+</channel>
+</rss>'''
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(d)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
+    res = client.post(
+        url_for("import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    res = client.post(
+        url_for("edit_page", uuid="first"),
+        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    time.sleep(3)
+    res = client.get(url_for("index"))
+    assert b'Unicode strings with encoding declaration are not supported.' not in res.data
+
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
+
+
 def test_check_markup_xpath_filter_restriction(client, live_server):
     sleep_time_for_fetch_thread = 3
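The charset=UTF-8 parameter on the served Content-Type is what makes the bug reproducible: assuming the requests-based "html_requests" backend hands the filter the decoded response body, the feed reaches xpath_filter as a str (requests' resp.text), XML declaration included, rather than as raw bytes. The sketch below illustrates that decoding step with a hand-built Response; constructing one this way (including setting the private _content attribute) is purely for demonstration and is not how changedetection.io fetches pages.

# Sketch only: how a charset=UTF-8 reply surfaces as an already-decoded str.
import requests

r = requests.models.Response()
r._content = b'<?xml version="1.0" encoding="UTF-8"?><rss></rss>'
r.headers['Content-Type'] = 'application/rss+xml;charset=UTF-8'
r.encoding = requests.utils.get_encoding_from_headers(r.headers)  # -> 'UTF-8'

print(type(r.content))  # <class 'bytes'>  -- safe to hand straight to lxml
print(type(r.text))     # <class 'str'>    -- decoded body, declaration intact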
@@ -95,6 +150,8 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
     res = client.get(url_for("index"))
     assert b'unviewed' not in res.data
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
 
 
 def test_xpath_validation(client, live_server):
@@ -117,6 +174,8 @@ def test_xpath_validation(client, live_server):
         follow_redirects=True
     )
     assert b"is not a valid XPath expression" in res.data
+    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+    assert b'Deleted' in res.data
 
 
 # actually only really used by the distll.io importer, but could be handy too
@@ -153,8 +212,6 @@ def test_check_with_prefix_css_filter(client, live_server):
         follow_redirects=True
     )
 
-    with open('/tmp/fuck.html', 'wb') as f:
-        f.write(res.data)
     assert b"Some text thats the same" in res.data #in selector
     assert b"Some text that will change" not in res.data #not in selector
