diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index d355c209..27496e1a 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -39,7 +39,7 @@ def element_removal(selectors: List[str], html_content): def xpath_filter(xpath_filter, html_content): from lxml import etree, html - tree = html.fromstring(html_content) + tree = html.fromstring(bytes(html_content, encoding='utf-8')) html_block = "" for item in tree.xpath(xpath_filter.strip(), namespaces={'re':'http://exslt.org/regular-expressions'}): diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py index 75e3b2cd..1ac1dd7e 100644 --- a/changedetectionio/tests/test_xpath_selector.py +++ b/changedetectionio/tests/test_xpath_selector.py @@ -44,6 +44,61 @@ def set_modified_response(): return None +# Handle utf-8 charset replies https://github.com/dgtlmoon/changedetection.io/pull/613 +def test_check_xpath_filter_utf8(client, live_server): + filter='//item/*[self::description]' + + d=''' + + + rpilocator.com + https://rpilocator.com + Find Raspberry Pi Computers in Stock + Thu, 19 May 2022 23:27:30 GMT + + https://rpilocator.com/favicon.png + rpilocator.com + https://rpilocator.com/ + 32 + 32 + + + Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni + Stock Alert (UK): RPi CM4 - 1GB RAM, No MMC, No Wifi is In Stock at Pimoroni + https://rpilocator.com?vendor=pimoroni&utm_source=feed&utm_medium=rss + pimoroni + UK + CM4 + F9FAB0D9-DF6F-40C8-8DEE5FC0646BB722 + Thu, 19 May 2022 14:32:32 GMT + + +''' + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(d) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True, content_type="application/rss+xml;charset=UTF-8") + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + res = client.post( + url_for("edit_page", uuid="first"), + data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + time.sleep(3) + res = client.get(url_for("index")) + assert b'Unicode strings with encoding declaration are not supported.' not in res.data + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + def test_check_markup_xpath_filter_restriction(client, live_server): sleep_time_for_fetch_thread = 3 @@ -95,6 +150,8 @@ def test_check_markup_xpath_filter_restriction(client, live_server): res = client.get(url_for("index")) assert b'unviewed' not in res.data + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data def test_xpath_validation(client, live_server): @@ -117,6 +174,8 @@ def test_xpath_validation(client, live_server): follow_redirects=True ) assert b"is not a valid XPath expression" in res.data + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data # actually only really used by the distll.io importer, but could be handy too @@ -153,8 +212,6 @@ def test_check_with_prefix_css_filter(client, live_server): follow_redirects=True ) - with open('/tmp/fuck.html', 'wb') as f: - f.write(res.data) assert b"Some text thats the same" in res.data #in selector assert b"Some text that will change" not in res.data #not in selector