Bug fix - further work on lxml filter extract (#2313 #2312 #2317)

pull/2322/head
dgtlmoon 1 month ago committed by GitHub
parent 6f3c3b7dfb
commit 3ae9bfa6f9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -172,10 +172,12 @@ def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=Fals
if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])): if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
html_block += TEXT_FILTER_LIST_LINE_SUFFIX html_block += TEXT_FILTER_LIST_LINE_SUFFIX
if isinstance(element, str): # Some kind of text, UTF-8 or other
if isinstance(element, (str, bytes)):
html_block += element html_block += element
else: else:
html_block += etree.tostring(element, pretty_print=True, encoding='utf-8') # Return the HTML which will get parsed as text
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
return html_block return html_block

@@ -52,8 +52,7 @@ cryptography~=3.4
beautifulsoup4 beautifulsoup4
# XPath filtering, lxml is required by bs4 anyway, but put it here to be safe. # XPath filtering, lxml is required by bs4 anyway, but put it here to be safe.
# #2312 - In 5.1.1 _ElementStringResult was removed - ImportError: cannot import name '_ElementStringResult' from 'lxml.etree' lxml >=4.8.0,<6
lxml
# XPath 2.0-3.1 support - 4.2.0 broke something? # XPath 2.0-3.1 support - 4.2.0 broke something?
elementpath==4.1.5 elementpath==4.1.5

Loading…
Cancel
Save