Bug fix for newer lxml module - module 'lxml.etree' has no attribute '_ElementStringResult' - reimplement _ElementStringResult (#2313 #2312)

pull/2317/head
dgtlmoon 2 weeks ago committed by GitHub
parent d4dac23ba1
commit 74707909f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -169,15 +169,13 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
# And where the matched result doesn't include something that will cause Inscriptis to add a newline
# (This way each 'match' reliably has a new-line in the diff)
# Divs are converted to 4 whitespaces by inscriptis
if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
html_block += TEXT_FILTER_LIST_LINE_SUFFIX
if type(element) == etree._ElementStringResult:
html_block += str(element)
elif type(element) == etree._ElementUnicodeResult:
html_block += str(element)
if isinstance(element, str):
html_block += element
else:
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
html_block += etree.tostring(element, pretty_print=True, encoding='utf-8')
return html_block

@ -1,4 +1,4 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import time
from flask import url_for
@ -255,6 +255,69 @@ def test_xpath23_prefix_validation(client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_xpath1_lxml(client, live_server):
#live_server_setup(live_server)
d = '''<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
<channel>
<title>rpilocator.com</title>
<link>https://rpilocator.com</link>
<description>Find Raspberry Pi Computers in Stock</description>
<lastBuildDate>Thu, 19 May 2022 23:27:30 GMT</lastBuildDate>
<image>
<url>https://rpilocator.com/favicon.png</url>
<title>rpilocator.com</title>
<link>https://rpilocator.com/</link>
<width>32</width>
<height>32</height>
</image>
<item>
<title>Stock Alert (UK): RPi CM4</title>
<foo>something else unrelated</foo>
</item>
<item>
<title>Stock Alert (UK): Big monitorěěěě</title>
<foo>something else unrelated</foo>
</item>
</channel>
</rss>'''.encode('utf-8')
with open("test-datastore/endpoint-content.txt", "wb") as f:
f.write(d)
test_url = url_for('test_endpoint', _external=True)
res = client.post(
url_for("import_page"),
data={"urls": test_url},
follow_redirects=True
)
assert b"1 Imported" in res.data
wait_for_all_checks(client)
res = client.post(
url_for("edit_page", uuid="first"),
data={"include_filters": "xpath1://title/text()", "url": test_url, "tags": "", "headers": "",
'fetch_backend': "html_requests"},
follow_redirects=True
)
##### #2312
wait_for_all_checks(client)
res = client.get(url_for("index"))
assert b'_ElementStringResult' not in res.data # tested with 5.1.1 when it was removed and 5.1.0
assert b'Exception' not in res.data
res = client.get(
url_for("preview_page", uuid="first"),
follow_redirects=True
)
assert b"rpilocator.com" in res.data # in selector
assert "Stock Alert (UK): Big monitorěěěě".encode('utf-8') in res.data # not in selector
#####
def test_xpath1_validation(client, live_server):
# Add our URL to the import page

@ -52,6 +52,7 @@ cryptography~=3.4
beautifulsoup4
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
# #2312 - In 5.1.1 _ElementStringResult was removed - ImportError: cannot import name '_ElementStringResult' from 'lxml.etree'
lxml
# XPath 2.0-3.1 support - 4.2.0 broke something?

Loading…
Cancel
Save