Fix encoding errors with XPath filters from UTF-8 responses (#619)

3 years ago · e17c2121f7
parent 07e279b38d
commit e17c2121f7
2 changed files with 60 additions and 3 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -39,7 +39,7 @@ def element_removal(selectors: List[str], html_content):
 def xpath_filter(xpath_filter, html_content):
    from lxml import etree, html
-    tree = html.fromstring(html_content)
+    tree = html.fromstring(bytes(html_content, encoding='utf-8'))
    html_block = ""
    for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}):
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@ -44,6 +44,61 @@ def set_modified_response():
    return None
 # Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613
 def test_check_xpath_filter_utf8(client, live_server):
    filter='//item/*[self::description]'
    d='''<?xml version="1.0" encoding="UTF-8"?>
 <rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
 	<channel>
 		<title>rpilocator.com</title>
 		<link>https://rpilocator.com</link>
 		<description>Find Raspberry Pi Computers in Stock</description>
 		<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
 		<image>
 			<url>https://rpilocator.com/favicon.png</url>
 			<title>rpilocator.com</title>
 			<link>https://rpilocator.com/</link>
 			<width>32</width>
 			<height>32</height>
 		</image>
 		<item>
 			<title>Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni</title>
 			<description>Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni</description>
 			<link>https://rpilocator.com?vendor=pimoroni&amp;utm_source=feed&amp;utm_medium=rss</link>
 			<category>pimoroni</category>
 			<category>UK</category>
 			<category>CM4</category>
 			<guid isPermaLink="false">F9FAB0D9-DF6F-40C8-8DEE5FC0646BB722</guid>
 			<pubDate>Thu, 19 May 2022 14:32:32 GMT</pubDate>
 		</item>
 	</channel>
 </rss>'''
    with open("test-datastore/endpoint-content.txt", "w") as f:
        f.write(d)
    # Add our URL to the import page
    test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8")
    res = client.post(
        url_for("import_page"),
        data={"urls": test_url},
        follow_redirects=True
    )
    assert b"1 Imported" in res.data
    res = client.post(
        url_for("edit_page", uuid="first"),
        data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"},
        follow_redirects=True
    )
    assert b"Updated watch." in res.data
    time.sleep(3)
    res = client.get(url_for("index"))
    assert b'Unicode strings with encoding declaration are not supported.' not in res.data
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
 def test_check_markup_xpath_filter_restriction(client, live_server):
    sleep_time_for_fetch_thread = 3
@ -95,6 +150,8 @@ def test_check_markup_xpath_filter_restriction(client, live_server):
    res = client.get(url_for("index"))
    assert b'unviewed' not in res.data
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
 def test_xpath_validation(client, live_server):
@ -117,6 +174,8 @@ def test_xpath_validation(client, live_server):
        follow_redirects=True
    )
    assert b"is not a valid XPath expression" in res.data
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data
 # actually only really used by the distll.io importer, but could be handy too
@ -153,8 +212,6 @@ def test_check_with_prefix_css_filter(client, live_server):
        follow_redirects=True
    )
    with open('/tmp/fuck.html', 'wb') as f:
        f.write(res.data)
    assert b"Some text thats the same" in res.data #in selector
    assert b"Some text that will change" not in res.data #not in selector