html_tools/fix: Add forest_transplanting to handle invalid DOM

pull/2351/head
Constantin Hong 8 months ago
parent d127214d8f
commit 8e1f170924

@ -110,6 +110,24 @@ def elementpath_tostring(obj):
return str(obj) return str(obj)
def forest_transplanting(root):
    """
    Re-home ``root`` and every sibling next to it under one fresh element.

    libxml2 does not enforce the DOM rule of a single root element, so a
    parsed document may end up with several root-level element nodes
    (see https://gitlab.gnome.org/GNOME/libxml2/-/issues/716). The default
    strategy here is simply to transplant all of them, in order, into a new
    ``new_root`` element. This emulates libxml2's XPath 1 handling of HTML
    for expressions such as '/html[2]/*'.

    For this to work, ``elementpath.select`` must be called with
    ``fragment=True``.

    :param root: the element whose preceding and following siblings are
        collected together with it.
    :return: a new ``new_root`` element containing the preceding siblings,
        ``root`` itself, and the following siblings, in that order.
    """
    from itertools import chain
    from lxml import etree

    # Both sibling runs are materialised up front, before append() touches
    # any node, so the traversal never runs over a tree that is being edited.
    before = list(root.itersiblings(preceding=True))
    after = list(root.itersiblings())

    container = etree.Element("new_root")
    # The preceding siblings are consumed back-to-front (the iterator walks
    # away from root, per the lxml docs) to restore their original order.
    for element in chain(reversed(before), (root,), after):
        container.append(element)
    return container
# Return str Utf-8 of matched rules # Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
from lxml import etree, html from lxml import etree, html
@ -123,9 +141,10 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
parser = etree.XMLParser(strip_cdata=False) parser = etree.XMLParser(strip_cdata=False)
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
tree = forest_transplanting(tree)
html_block = "" html_block = ""
r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser, fragment=True)
#@note: //title/text() wont work where <title>CDATA.. #@note: //title/text() wont work where <title>CDATA..
if type(r) != list: if type(r) != list:

Loading…
Cancel
Save