tests/test_xpath_selector_unit/feat: Do forest_transplanting by default

pull/2351/head
Constantin Hong 8 months ago
parent 66a7dae381
commit 4d266cac9f

@ -123,25 +123,13 @@ def forest_transplanting(root):
root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)] root_siblings_preceding = [ s for s in root.itersiblings(preceding=True)]
root_siblings = [s for s in root.itersiblings()] root_siblings = [s for s in root.itersiblings()]
Is_fragment=False
# If an element node exists among the root element node's siblings, it is a fragment.
for node in chain(root_siblings_preceding, root_siblings):
if not hasattr(node.tag, '__name__'):
Is_fragment=True
# Early exit: the root is already a root element node, so finding another
# element node means two root element nodes were detected (a DOM violation).
break
if Is_fragment:
new_root = etree.Element("new_root") new_root = etree.Element("new_root")
root_siblings_preceding.reverse() root_siblings_preceding.reverse()
for node in chain(root_siblings_preceding, [root], root_siblings): for node in chain(root_siblings_preceding, [root], root_siblings):
new_root.append(node) new_root.append(node)
return new_root, True return new_root, True
return root, False
# Return str Utf-8 of matched rules # Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
from lxml import etree, html from lxml import etree, html

@ -218,6 +218,9 @@ DOM_violation_two_html_root_element = """<!DOCTYPE html>
</html>""" </html>"""
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element]) @pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [ @pytest.mark.parametrize("xpath, answer", [
(".", "First paragraph."),
("/*", "First paragraph."),
("/html", "First paragraph."),
("/html/body/p[1]", "First paragraph."), ("/html/body/p[1]", "First paragraph."),
("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"), ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
("count(/html/body/p[1])", "2"), ("count(/html/body/p[1])", "2"),

Loading…
Cancel
Save