"""Unit tests for ``html_tools.xpath_filter`` with XPath 2.0-3.1 expressions.

Each test feeds a fixture document and an XPath expression to
``html_tools.xpath_filter`` and checks that an expected fragment is (or is
not) part of the returned text.
"""
import sys
import os
import pytest
# Make the parent directory importable so that ``html_tools`` resolves when
# the tests are run from this directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import html_tools

# test generation guide.
# 1. Do not include encoding in the xml declaration if the test object is a str type.
# 2. Always paraphrase test.

# NOTE(review): the fixture strings below contain only text values as seen
# here, while the XPath expressions reference elements and attributes such as
# hotel/branch/staff, manager/@name and trips/trip. The markup may have been
# lost before this file reached review - confirm against the canonical file.


def _filter_as_text(xpath, html_content):
    """Run ``html_tools.xpath_filter`` and return its result, checked to be ``str``.

    Args:
        xpath: XPath expression to evaluate.
        html_content: Document text the expression is evaluated against.

    Returns:
        The value returned by ``html_tools.xpath_filter`` when called with
        ``append_pretty_line_formatting=True``.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    assert isinstance(filtered, str)
    return filtered


hotels = """ Christopher Anderson 25 Christopher Carter 30 Lisa Walker 60 Jessica Walker 32 Jennifer Roberts 50 """


@pytest.mark.parametrize("html_content", [hotels])
@pytest.mark.parametrize("xpath, answer", [
    ('(//staff/given_name, //staff/age)', '25'),
    ("xs:date('2023-10-10')", '2023-10-10'),
    ("if (/hotel/branch[@location = 'California']/staff[1]/age = 25) then 'is 25' else 'is not 25'", 'is 25'),
    ("if (//hotel/branch[@location = 'California']/staff[1]/age = 25) then 'is 25' else 'is not 25'", 'is 25'),
    ("if (count(/hotel/branch/staff) = 5) then true() else false()", 'true'),
    ("if (count(//hotel/branch/staff) = 5) then true() else false()", 'true'),
    ("for $i in /hotel/branch/staff return if ($i/age >= 40) then upper-case($i/surname) else lower-case($i/surname)", 'anderson'),
    ("given_name = 'Christopher' and age = 40", 'false'),
    ("//given_name = 'Christopher' and //age = 40", 'false'),
    #("(staff/given_name, staff/age)", 'Lisa'),
    ("(//staff/given_name, //staff/age)", 'Lisa'),
    #("hotel/branch[@location = 'California']/staff/age union hotel/branch[@location = 'Las Vegas']/staff/age", ''),
    ("(//hotel/branch[@location = 'California']/staff/age union //hotel/branch[@location = 'Las Vegas']/staff/age)", '60'),
    ("(200 to 210)", "205"),
    ("(//hotel/branch[@location = 'California']/staff/age union //hotel/branch[@location = 'Las Vegas']/staff/age)", "50"),
    ("(1, 9, 9, 5)", "5"),
    ("(3, (), (14, 15), 92, 653)", "653"),
    ("for $i in /hotel/branch/staff return $i/given_name", "Christopher"),
    ("for $i in //hotel/branch/staff return $i/given_name", "Christopher"),
    ("distinct-values(for $i in /hotel/branch/staff return $i/given_name)", "Jessica"),
    ("distinct-values(for $i in //hotel/branch/staff return $i/given_name)", "Jessica"),
    ("for $i in (7 to 15) return $i*10", "130"),
    ("some $i in /hotel/branch/staff satisfies $i/age < 20", "false"),
    ("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"),
    ("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"),
    ("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"),
    ("let $x := branch[@location = 'California'], $y := branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
    ("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
    ("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"),
    ("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"),
    ("'XPATH2.0-3.1 dissemination' instance of xs:string ", "true"),
    ("'new stackoverflow question incoming' instance of xs:integer ", "false"),
    ("'50000' cast as xs:integer", "50000"),
    ("//branch[@location = 'California']/staff[1]/surname eq 'Anderson'", "true"),
    ("fn:false()", "false"),
])
def test_hotels(html_content, xpath, answer):
    """Each expression evaluated on the ``hotels`` fixture yields text containing ``answer``."""
    assert answer in _filter_as_text(xpath, html_content)


branches_to_visit = """ Area 51 A place with no name Stalsk12 Stalsk12 Barcelona Paris """


@pytest.mark.parametrize("html_content", [branches_to_visit])
@pytest.mark.parametrize("xpath, answer", [
    ("manager[@name = 'Godot']/branch union manager[@name = 'Freya']/branch", "Area 51"),
    ("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"),
    ("manager[@name = 'Godot']/branch | manager[@name = 'Freya']/branch", "Stalsk12"),
    ("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"),
    ("manager/branch intersect manager[@name = 'Godot']/branch", "A place with no name"),
    ("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"),
    ("manager[@name = 'Godot']/branch intersect manager[@name = 'Freya']/branch", ""),
    ("manager/branch except manager[@name = 'Godot']/branch", "Barcelona"),
    ("manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
    ("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
    ("manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
    ("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
    ("manager[@name = 'Godot']/branch[2] eq manager[@name = 'Freya']/branch[2]", "false"),
    ("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"),
    ("manager[1]/@room_no lt manager[2]/@room_no", "false"),
    ("//manager[1]/@room_no lt //manager[2]/@room_no", "false"),
    ("manager[1]/@room_no gt manager[2]/@room_no", "true"),
    ("//manager[1]/@room_no gt //manager[2]/@room_no", "true"),
    ("manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
    ("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
    ("manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
    ("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
    ("manager[@name = 'Godot']/branch = 'Area 51'", "true"),
    ("//manager[@name = 'Godot']/branch = 'Area 51'", "true"),
    ("manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
    ("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
    ("manager[1]/@room_no > manager[2]/@room_no", "true"),
    ("//manager[1]/@room_no > //manager[2]/@room_no", "true"),
    ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[1]", "false"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"),
    ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[1]/branch[3]", "true"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"),
    ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << manager[1]/branch[1]", "false"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"),
    ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> manager[1]/branch[1]", "true"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"),
    ("manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
    ("manager[1]/@name || manager[2]/@name", "GodotFreya"),
    ("//manager[1]/@name || //manager[2]/@name", "GodotFreya"),
])
def test_branches_to_visit(html_content, xpath, answer):
    """Each expression evaluated on ``branches_to_visit`` yields text containing ``answer``."""
    assert answer in _filter_as_text(xpath, html_content)


trips = """ 2023-10-06 2023-10-10 4 2000.00 2023-10-06 2023-10-12 6 3500.34 """


@pytest.mark.parametrize("html_content", [trips])
@pytest.mark.parametrize("xpath, answer", [
    ("1 + 9 * 9 + 5 div 5", "83"),
    ("(1 + 9 * 9 + 5) div 6", "14.5"),
    ("23 idiv 3", "7"),
    ("23 div 3", "7.66666666"),
    ("for $i in ./trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
    ("for $i in ./trip return $i/traveler/duration ", "4"),
    ("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
    ("sum(for $i in ./trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
    ("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
    #("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"),
    #("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"),
    #("trip[1]/depart + trip[1]/arrive", "fail_to_get_answer"),
    #("xs:date(trip[1]/depart) + xs:date(trip[1]/arrive)", "fail_to_get_answer"),
    ("(//trip[1]/arrive cast as xs:date) - (//trip[1]/depart cast as xs:date)", "P4D"),
    ("(//trip[1]/depart cast as xs:date) - (//trip[1]/arrive cast as xs:date)", "-P4D"),
    ("(//trip[1]/depart cast as xs:date) + xs:dayTimeDuration('P3D')", "2023-10-09"),
    ("(//trip[1]/depart cast as xs:date) - xs:dayTimeDuration('P3D')", "2023-10-03"),
    ("(456, 623) instance of xs:integer", "false"),
    ("(456, 623) instance of xs:integer*", "true"),
    ("/trips/trip instance of element()", "false"),
    ("/trips/trip instance of element()*", "true"),
    ("/trips/trip[1]/arrive instance of xs:date", "false"),
    ("date(/trips/trip[1]/arrive) instance of xs:date", "true"),
    ("'8' cast as xs:integer", "8"),
    ("'11.1E3' cast as xs:double", "11100"),
    ("6.5 cast as xs:integer", "6"),
    #("/trips/trip[1]/arrive cast as xs:dateTime", "fail_to_get_answer"),
    ("/trips/trip[1]/arrive cast as xs:date", "2023-10-10"),
    ("('2023-10-12') cast as xs:date", "2023-10-12"),
    ("for $i in //trip return concat($i/depart, ' ', $i/arrive)", "2023-10-06 2023-10-10"),
])
def test_trips(html_content, xpath, answer):
    """Each expression evaluated on the ``trips`` fixture yields text containing ``answer``."""
    assert answer in _filter_as_text(xpath, html_content)


DOM_violation_two_html_root_element = """

Hello world

First paragraph.

Hello world

Browsers parse this part by fixing it but lxml doesn't and returns two root element node

Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.

"""


@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
    (".", "First paragraph."),
    ("/*", "First paragraph."),
    ("/html", "First paragraph."),
    ("/html/body/p[1]", "First paragraph."),
    ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("count(/html/body/p[1])", "2"),
    ("count(/html)", "2"),
    ("count(//html)", "2"),
    ("count(//body)", "2"),
    ("count(/html/body)", "2"),
    ("//html/body/p[1]", "First paragraph."),
    ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("//body/p[1]", "First paragraph."),
    ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
])
def test_broken_DOM_01(html_content, xpath, answer):
    """Direct ``elementpath.select`` raises on the fixture, while ``xpath_filter`` still answers.

    In normal situation, DOM's root element node is only one. So when DOM
    violation happens, Exception occurs.
    """
    from lxml import etree, html
    import elementpath
    from elementpath.xpath3 import XPath3Parser

    # Imports and parsing stay outside ``pytest.raises`` so that only a
    # failure of the XPath evaluation itself can satisfy the expectation.
    parser = etree.HTMLParser()
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)

    # just example xpath
    with pytest.raises(Exception):
        # Error will occur.
        elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)

    assert answer in _filter_as_text(xpath, html_content)


@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
    ("/html[2]/body/p[1]", "First paragraph."),
    ("//html[2]/body/p[1]", "First paragraph."),
])
def test_Broken_DOM_02(html_content, xpath, answer):
    """Selecting under ``html[2]`` must not return the text given as ``answer``."""
    # Check the answer is *not in* the html_content
    assert answer not in _filter_as_text(xpath, html_content)


@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
    ("/html/body/p[1]", 2),
    ("/html", 2),
    ("//html", 2),
    ("//body", 2),
    ("/html/body", 2),
])
def test_Broken_DOM_03(html_content, xpath, answer):
    """just test for xpath1"""
    from lxml import etree, html

    parser = etree.HTMLParser()
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    # test xpath 1
    assert len(tree.xpath(xpath)) == answer