import sys
import os
import pytest

# Put the parent directory of this test folder on the import path so that the
# project-local ``html_tools`` module can be imported below.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import html_tools

# test generation guide.
# 1. Do not include encoding in the xml declaration if the test object is a str type.
# 2. Always paraphrase test.

# NOTE(review): this literal contains no markup, yet the XPath cases below
# address elements such as hotel/branch/staff/given_name -- the XML tags look
# to have been lost. Confirm against version control before relying on it.
hotels = """ Christopher Anderson 25 Christopher Carter 30 Lisa Walker 60 Jessica Walker 32 Jennifer Roberts 50 """


@pytest.mark.parametrize("html_content", [hotels])
@pytest.mark.parametrize("xpath, answer", [
    ('(//staff/given_name, //staff/age)', '25'),
    ("xs:date('2023-10-10')", '2023-10-10'),
    ("if (/hotel/branch[@location = 'California']/staff[1]/age = 25) then 'is 25' else 'is not 25'", 'is 25'),
    ("if (//hotel/branch[@location = 'California']/staff[1]/age = 25) then 'is 25' else 'is not 25'", 'is 25'),
    ("if (count(/hotel/branch/staff) = 5) then true() else false()", 'true'),
    ("if (count(//hotel/branch/staff) = 5) then true() else false()", 'true'),
    ("for $i in /hotel/branch/staff return if ($i/age >= 40) then upper-case($i/surname) else lower-case($i/surname)", 'anderson'),
    ("given_name = 'Christopher' and age = 40", 'false'),
    ("//given_name = 'Christopher' and //age = 40", 'false'),
    #("(staff/given_name, staff/age)", 'Lisa'),
    ("(//staff/given_name, //staff/age)", 'Lisa'),
    #("hotel/branch[@location = 'California']/staff/age union hotel/branch[@location = 'Las Vegas']/staff/age", ''),
    ("(//hotel/branch[@location = 'California']/staff/age union //hotel/branch[@location = 'Las Vegas']/staff/age)", '60'),
    ("(200 to 210)", "205"),
    ("(//hotel/branch[@location = 'California']/staff/age union //hotel/branch[@location = 'Las Vegas']/staff/age)", "50"),
    ("(1, 9, 9, 5)", "5"),
    ("(3, (), (14, 15), 92, 653)", "653"),
    ("for $i in /hotel/branch/staff return $i/given_name", "Christopher"),
    ("for $i in //hotel/branch/staff return $i/given_name", "Christopher"),
    ("distinct-values(for $i in /hotel/branch/staff return $i/given_name)", "Jessica"),
    ("distinct-values(for $i in //hotel/branch/staff return $i/given_name)", "Jessica"),
    ("for $i in (7 to 15) return $i*10", "130"),
    ("some $i in /hotel/branch/staff satisfies $i/age < 20", "false"),
    ("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"),
    ("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"),
    ("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"),
    ("let $x := hotel/branch[@location = 'California'], $y := hotel/branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
    ("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
    ("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"),
    ("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"),
    ("'XPATH2.0-3.1 dissemination' instance of xs:string ", "true"),
    ("'new stackoverflow question incoming' instance of xs:integer ", "false"),
    ("'50000' cast as xs:integer", "50000"),
    ("//branch[@location = 'California']/staff[1]/surname eq 'Anderson'", "true"),
    ("fn:false()", "false"),
])
def test_hotels(html_content, xpath, answer):
    """Filter the ``hotels`` fixture with ``xpath`` and expect ``answer`` in the text.

    The expression is passed to ``html_tools.xpath_filter``; the result must be
    a string that contains ``answer`` as a substring.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    assert isinstance(filtered, str)
    assert answer in filtered


# NOTE(review): no markup is present here either, while the cases below use
# branches_to_visit/manager[@name]/branch and @room_no -- presumably the XML
# tags and attributes were stripped. Confirm against version control.
branches_to_visit = """ Area 51 A place with no name Stalsk12 Stalsk12 Barcelona Paris """


@pytest.mark.parametrize("html_content", [branches_to_visit])
@pytest.mark.parametrize("xpath, answer", [
    ("branches_to_visit/manager[@name = 'Godot']/branch union branches_to_visit/manager[@name = 'Freya']/branch", "Area 51"),
    ("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"),
    ("branches_to_visit/manager[@name = 'Godot']/branch | branches_to_visit/manager[@name = 'Freya']/branch", "Stalsk12"),
    ("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"),
    ("branches_to_visit/manager/branch intersect branches_to_visit/manager[@name = 'Godot']/branch", "A place with no name"),
    ("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"),
    ("branches_to_visit/manager[@name = 'Godot']/branch intersect branches_to_visit/manager[@name = 'Freya']/branch", ""),
    ("branches_to_visit/manager/branch except branches_to_visit/manager[@name = 'Godot']/branch", "Barcelona"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
    ("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
    ("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[2] eq branches_to_visit/manager[@name = 'Freya']/branch[2]", "false"),
    ("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"),
    ("branches_to_visit/manager[1]/@room_no lt branches_to_visit/manager[2]/@room_no", "false"),
    ("//manager[1]/@room_no lt //manager[2]/@room_no", "false"),
    ("branches_to_visit/manager[1]/@room_no gt branches_to_visit/manager[2]/@room_no", "true"),
    ("//manager[1]/@room_no gt //manager[2]/@room_no", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
    ("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
    ("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
    ("branches_to_visit/manager[@name = 'Godot']/branch = 'Area 51'", "true"),
    ("//manager[@name = 'Godot']/branch = 'Area 51'", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
    ("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
    ("branches_to_visit/manager[1]/@room_no > branches_to_visit/manager[2]/@room_no", "true"),
    ("//manager[1]/@room_no > //manager[2]/@room_no", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[1]", "false"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[3]", "true"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << branches_to_visit/manager[1]/branch[1]", "false"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> branches_to_visit/manager[1]/branch[1]", "true"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"),
    ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
    ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
    ("branches_to_visit/manager[1]/@name || branches_to_visit/manager[2]/@name", "GodotFreya"),
    ("//manager[1]/@name || //manager[2]/@name", "GodotFreya"),
])
def test_branches_to_visit(html_content, xpath, answer):
    """Filter the ``branches_to_visit`` fixture and expect ``answer`` in the text.

    Covers set operators, value/general/node comparisons and string
    concatenation as listed in the parametrize table above. The result of
    ``html_tools.xpath_filter`` must be a string containing ``answer``.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    assert isinstance(filtered, str)
    assert answer in filtered


# NOTE(review): only the text values are present; the cases below address
# trips/trip/depart, arrive and traveler/duration, traveler/price, so the XML
# tags appear to be missing. Confirm against version control.
trips = """ 2023-10-06 2023-10-10 4 2000.00 2023-10-06 2023-10-12 6 3500.34 """


@pytest.mark.parametrize("html_content", [trips])
@pytest.mark.parametrize("xpath, answer", [
    ("1 + 9 * 9 + 5 div 5", "83"),
    ("(1 + 9 * 9 + 5) div 6", "14.5"),
    ("23 idiv 3", "7"),
    ("23 div 3", "7.66666666"),
    ("for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
    ("for $i in ./trips/trip return $i/traveler/duration ", "4"),
    ("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
    ("sum(for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
    ("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
    #("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"),
    #("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"),
    #("trip[1]/depart + trip[1]/arrive", "fail_to_get_answer"),
    #("xs:date(trip[1]/depart) + xs:date(trip[1]/arrive)", "fail_to_get_answer"),
    ("(//trip[1]/arrive cast as xs:date) - (//trip[1]/depart cast as xs:date)", "P4D"),
    ("(//trip[1]/depart cast as xs:date) - (//trip[1]/arrive cast as xs:date)", "-P4D"),
    ("(//trip[1]/depart cast as xs:date) + xs:dayTimeDuration('P3D')", "2023-10-09"),
    ("(//trip[1]/depart cast as xs:date) - xs:dayTimeDuration('P3D')", "2023-10-03"),
    ("(456, 623) instance of xs:integer", "false"),
    ("(456, 623) instance of xs:integer*", "true"),
    ("/trips/trip instance of element()", "false"),
    ("/trips/trip instance of element()*", "true"),
    ("/trips/trip[1]/arrive instance of xs:date", "false"),
    ("date(/trips/trip[1]/arrive) instance of xs:date", "true"),
    ("'8' cast as xs:integer", "8"),
    ("'11.1E3' cast as xs:double", "11100"),
    ("6.5 cast as xs:integer", "6"),
    #("/trips/trip[1]/arrive cast as xs:dateTime", "fail_to_get_answer"),
    ("/trips/trip[1]/arrive cast as xs:date", "2023-10-10"),
    ("('2023-10-12') cast as xs:date", "2023-10-12"),
    ("for $i in //trip return concat($i/depart, ' ', $i/arrive)", "2023-10-06 2023-10-10"),
])
def test_trips(html_content, xpath, answer):
    """Filter the ``trips`` fixture with ``xpath`` and expect ``answer`` in the text.

    Covers arithmetic, date/duration arithmetic, ``instance of`` and ``cast as``
    expressions as listed in the parametrize table above. The result of
    ``html_tools.xpath_filter`` must be a string containing ``answer``.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    assert isinstance(filtered, str)
    assert answer in filtered


# NOTE(review): the tests below query /html/body/p and count two <html>
# elements, but no tags are visible in this literal -- the HTML markup looks
# to have been stripped. Confirm against version control.
DOM_violation_two_html_root_element = """

Hello world1

First paragraph.

Hello world2

Browsers parse this part by fixing it but lxml doesn't and returns two root element node

Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.

"""


@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
    (".", "Hello world1"),
    (".", "First paragraph."),
    (".", "Hello world2"),
    (".", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    (".", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
    ("/*", "Hello world1"),
    ("/*", "First paragraph."),
    ("/*", "Hello world2"),
    ("/*", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("/*", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
    ("html", "Hello world1"),
    ("html", "First paragraph."),
    ("html", "Hello world2"),
    ("html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
    ("/html", "Hello world1"),
    ("/html", "First paragraph."),
    ("/html", "Hello world2"),
    ("/html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("/html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
    ("/html/body/p[1]", "First paragraph."),
    ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("count(/html/body/p[1])", "2"),
    ("count(/html)", "2"),
    ("count(//html)", "2"),
    ("count(//body)", "2"),
    ("count(/html/body)", "2"),
    ("//html/body/p[1]", "First paragraph."),
    ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("//body/p[1]", "First paragraph."),
    ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
])
def test_broken_DOM_01(html_content, xpath, answer):
    """Check ``xpath_filter`` copes with a document that has two root elements.

    First demonstrates that calling ``elementpath.select`` directly on the
    lxml tree raises, then checks that ``html_tools.xpath_filter`` still
    returns a string containing ``answer`` for the same input.
    """
    # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs.
    from lxml import etree, html
    import elementpath
    from elementpath.xpath3 import XPath3Parser

    parser = etree.HTMLParser()
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)

    # Only the select() call is expected to fail. Imports and parsing stay
    # outside the block so that a missing dependency or a parser error cannot
    # be mistaken for the expected exception.
    with pytest.raises(Exception):
        # just example xpath
        # Error will occur.
        elementpath.select(tree, xpath.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)

    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    assert isinstance(filtered, str)
    assert answer in filtered


@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
    ("/html[2]/body/p[1]", "First paragraph."),
    ("//html[2]/body/p[1]", "First paragraph."),
])
def test_Broken_DOM_02(html_content, xpath, answer):
    """Check that selecting under ``html[2]`` does not return the first paragraph.

    The result of ``html_tools.xpath_filter`` must be a string that does NOT
    contain ``answer``.
    """
    # In normal situation, DOM's root element node is only one. So when DOM violation happens, Exception occurs.
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    assert isinstance(filtered, str)
    # Check the answer is *not in* the html_content
    assert answer not in filtered


@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize("xpath, answer", [
    ("/html/body/p[1]", 2),
    ("/html", 2),
    ("//html", 2),
    ("//body", 2),
    ("/html/body", 2),
])
def test_Broken_DOM_03(html_content, xpath, answer):
    """just test for xpath1

    Parses the fixture with lxml's HTML parser and checks that the native
    ``tree.xpath`` call returns ``answer`` matching nodes.
    """
    from lxml import etree, html

    parser = etree.HTMLParser()
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    # test xpath 1
    assert len(tree.xpath(xpath)) == answer