import sys
import os
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import html_tools
# Test generation guide:
# 1. Do not include an encoding in the XML declaration if the test object is a str.
# 2. Always paraphrase the test.
# Input document handed to html_tools.xpath_filter by test_hotels (passed in
# through the "html_content" parametrization).
# NOTE(review): the XPath expressions in test_hotels address hotel/branch/staff
# elements and a @location attribute, but no markup is visible in this literal --
# confirm the element tags have not been stripped from the fixture.
hotels = """
Christopher
Anderson
25
Christopher
Carter
30
Lisa
Walker
60
Jessica
Walker
32
Jennifer
Roberts
50
"""
@pytest.mark.parametrize("html_content", [hotels])
@pytest.mark.parametrize(
    "xpath, answer",
    [
        ('(//staff/given_name, //staff/age)', '25'),
        ("xs:date('2023-10-10')", '2023-10-10'),
        ("if (/hotel/branch[@location = 'California']/staff[1]/age = 25) then 'is 25' else 'is not 25'", 'is 25'),
        ("if (//hotel/branch[@location = 'California']/staff[1]/age = 25) then 'is 25' else 'is not 25'", 'is 25'),
        ("if (count(/hotel/branch/staff) = 5) then true() else false()", 'true'),
        ("if (count(//hotel/branch/staff) = 5) then true() else false()", 'true'),
        ("for $i in /hotel/branch/staff return if ($i/age >= 40) then upper-case($i/surname) else lower-case($i/surname)", 'anderson'),
        ("given_name = 'Christopher' and age = 40", 'false'),
        ("//given_name = 'Christopher' and //age = 40", 'false'),
        # Disabled case, kept for reference:
        # ("(staff/given_name, staff/age)", 'Lisa'),
        ("(//staff/given_name, //staff/age)", 'Lisa'),
        # Disabled case, kept for reference:
        # ("hotel/branch[@location = 'California']/staff/age union hotel/branch[@location = 'Las Vegas']/staff/age", ''),
        ("(//hotel/branch[@location = 'California']/staff/age union //hotel/branch[@location = 'Las Vegas']/staff/age)", '60'),
        ("(200 to 210)", "205"),
        ("(//hotel/branch[@location = 'California']/staff/age union //hotel/branch[@location = 'Las Vegas']/staff/age)", "50"),
        ("(1, 9, 9, 5)", "5"),
        ("(3, (), (14, 15), 92, 653)", "653"),
        ("for $i in /hotel/branch/staff return $i/given_name", "Christopher"),
        ("for $i in //hotel/branch/staff return $i/given_name", "Christopher"),
        ("distinct-values(for $i in /hotel/branch/staff return $i/given_name)", "Jessica"),
        ("distinct-values(for $i in //hotel/branch/staff return $i/given_name)", "Jessica"),
        ("for $i in (7 to 15) return $i*10", "130"),
        ("some $i in /hotel/branch/staff satisfies $i/age < 20", "false"),
        ("some $i in //hotel/branch/staff satisfies $i/age < 20", "false"),
        ("every $i in /hotel/branch/staff satisfies $i/age > 20", "true"),
        ("every $i in //hotel/branch/staff satisfies $i/age > 20 ", "true"),
        ("let $x := hotel/branch[@location = 'California'], $y := hotel/branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
        ("let $x := //branch[@location = 'California'], $y := //branch[@location = 'Las Vegas'] return (avg($x/staff/age), avg($y/staff/age))", "27.5"),
        ("let $nu := 1, $de := 1000 return 'probability = ' || $nu div $de * 100 || '%'", "0.1%"),
        ("let $nu := 2, $probability := function ($argument) { 'probability = ' || $nu div $argument * 100 || '%'}, $de := 5 return $probability($de)", "40%"),
        ("'XPATH2.0-3.1 dissemination' instance of xs:string ", "true"),
        ("'new stackoverflow question incoming' instance of xs:integer ", "false"),
        ("'50000' cast as xs:integer", "50000"),
        ("//branch[@location = 'California']/staff[1]/surname eq 'Anderson'", "true"),
        ("fn:false()", "false"),
    ],
)
def test_hotels(html_content, xpath, answer):
    """Run each XPath expression through html_tools.xpath_filter on the hotels fixture.

    The filtered output must be exactly of type ``str`` and must contain the
    expected ``answer`` fragment as a substring.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    # Exact type check (not isinstance): str subclasses are rejected, as before.
    assert type(filtered) is str
    assert answer in filtered
# Input document handed to html_tools.xpath_filter by test_branches_to_visit.
# NOTE(review): the XPath expressions in that test address manager/branch
# elements plus @name and @room_no attributes, none of which are visible in
# this literal -- confirm the markup has not been stripped from the fixture.
branches_to_visit = """
Area 51
A place with no name
Stalsk12
Stalsk12
Barcelona
Paris
"""
@pytest.mark.parametrize("html_content", [branches_to_visit])
@pytest.mark.parametrize(
    "xpath, answer",
    [
        ("branches_to_visit/manager[@name = 'Godot']/branch union branches_to_visit/manager[@name = 'Freya']/branch", "Area 51"),
        ("//manager[@name = 'Godot']/branch union //manager[@name = 'Freya']/branch", "Stalsk12"),
        ("branches_to_visit/manager[@name = 'Godot']/branch | branches_to_visit/manager[@name = 'Freya']/branch", "Stalsk12"),
        ("//manager[@name = 'Godot']/branch | //manager[@name = 'Freya']/branch", "Stalsk12"),
        ("branches_to_visit/manager/branch intersect branches_to_visit/manager[@name = 'Godot']/branch", "A place with no name"),
        ("//manager/branch intersect //manager[@name = 'Godot']/branch", "A place with no name"),
        ("branches_to_visit/manager[@name = 'Godot']/branch intersect branches_to_visit/manager[@name = 'Freya']/branch", ""),
        ("branches_to_visit/manager/branch except branches_to_visit/manager[@name = 'Godot']/branch", "Barcelona"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
        ("//manager[@name = 'Godot']/branch[1] eq 'Area 51'", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
        ("//manager[@name = 'Godot']/branch[1] eq 'Seoul'", "false"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[2] eq branches_to_visit/manager[@name = 'Freya']/branch[2]", "false"),
        ("//manager[@name = 'Godot']/branch[2] eq //manager[@name = 'Freya']/branch[2]", "false"),
        ("branches_to_visit/manager[1]/@room_no lt branches_to_visit/manager[2]/@room_no", "false"),
        ("//manager[1]/@room_no lt //manager[2]/@room_no", "false"),
        ("branches_to_visit/manager[1]/@room_no gt branches_to_visit/manager[2]/@room_no", "true"),
        ("//manager[1]/@room_no gt //manager[2]/@room_no", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
        ("//manager[@name = 'Godot']/branch[1] = 'Area 51'", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
        ("//manager[@name = 'Godot']/branch[1] = 'Seoul'", "false"),
        ("branches_to_visit/manager[@name = 'Godot']/branch = 'Area 51'", "true"),
        ("//manager[@name = 'Godot']/branch = 'Area 51'", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
        ("//manager[@name = 'Godot']/branch = 'Barcelona'", "false"),
        ("branches_to_visit/manager[1]/@room_no > branches_to_visit/manager[2]/@room_no", "true"),
        ("//manager[1]/@room_no > //manager[2]/@room_no", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[1]", "false"),
        ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[1]", "false"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[1]/branch[3]", "true"),
        ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[1]/branch[3]", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << branches_to_visit/manager[1]/branch[1]", "false"),
        ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] << //manager[1]/branch[1]", "false"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> branches_to_visit/manager[1]/branch[1]", "true"),
        ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] >> //manager[1]/branch[1]", "true"),
        ("branches_to_visit/manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is branches_to_visit/manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
        ("//manager[@name = 'Godot']/branch[ . = 'Stalsk12'] is //manager[@name = 'Freya']/branch[ . = 'Stalsk12']", "false"),
        ("branches_to_visit/manager[1]/@name || branches_to_visit/manager[2]/@name", "GodotFreya"),
        ("//manager[1]/@name || //manager[2]/@name", "GodotFreya"),
    ],
)
def test_branches_to_visit(html_content, xpath, answer):
    """Run set, comparison, node-order and concatenation expressions on the branches fixture.

    Each expression goes through html_tools.xpath_filter; the output must be
    exactly of type ``str`` and contain ``answer`` as a substring. An empty
    ``answer`` therefore only checks that a string came back.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    # Exact type check (not isinstance): str subclasses are rejected, as before.
    assert type(filtered) is str
    assert answer in filtered
# Input document handed to html_tools.xpath_filter by test_trips.
# NOTE(review): the XPath expressions in that test address trips/trip elements
# with depart, arrive and traveler/duration, traveler/price children, but no
# markup is visible in this literal -- confirm the tags have not been stripped.
trips = """
2023-10-06
2023-10-10
4
2000.00
2023-10-06
2023-10-12
6
3500.34
"""
@pytest.mark.parametrize("html_content", [trips])
@pytest.mark.parametrize(
    "xpath, answer",
    [
        ("1 + 9 * 9 + 5 div 5", "83"),
        ("(1 + 9 * 9 + 5) div 6", "14.5"),
        ("23 idiv 3", "7"),
        ("23 div 3", "7.66666666"),
        ("for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
        ("for $i in ./trips/trip return $i/traveler/duration ", "4"),
        ("for $i in .//trip return $i/traveler/duration * $i/traveler/price", "21002.04"),
        ("sum(for $i in ./trips/trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
        ("sum(for $i in .//trip return $i/traveler/duration * $i/traveler/price)", "29002.04"),
        # Disabled cases, kept for reference (marked "fail_to_get_answer"):
        # ("trip[1]/depart - trip[1]/arrive", "fail_to_get_answer"),
        # ("//trip[1]/depart - //trip[1]/arrive", "fail_to_get_answer"),
        # ("trip[1]/depart + trip[1]/arrive", "fail_to_get_answer"),
        # ("xs:date(trip[1]/depart) + xs:date(trip[1]/arrive)", "fail_to_get_answer"),
        ("(//trip[1]/arrive cast as xs:date) - (//trip[1]/depart cast as xs:date)", "P4D"),
        ("(//trip[1]/depart cast as xs:date) - (//trip[1]/arrive cast as xs:date)", "-P4D"),
        ("(//trip[1]/depart cast as xs:date) + xs:dayTimeDuration('P3D')", "2023-10-09"),
        ("(//trip[1]/depart cast as xs:date) - xs:dayTimeDuration('P3D')", "2023-10-03"),
        ("(456, 623) instance of xs:integer", "false"),
        ("(456, 623) instance of xs:integer*", "true"),
        ("/trips/trip instance of element()", "false"),
        ("/trips/trip instance of element()*", "true"),
        ("/trips/trip[1]/arrive instance of xs:date", "false"),
        ("date(/trips/trip[1]/arrive) instance of xs:date", "true"),
        ("'8' cast as xs:integer", "8"),
        ("'11.1E3' cast as xs:double", "11100"),
        ("6.5 cast as xs:integer", "6"),
        # Disabled case, kept for reference (marked "fail_to_get_answer"):
        # ("/trips/trip[1]/arrive cast as xs:dateTime", "fail_to_get_answer"),
        ("/trips/trip[1]/arrive cast as xs:date", "2023-10-10"),
        ("('2023-10-12') cast as xs:date", "2023-10-12"),
        ("for $i in //trip return concat($i/depart, ' ', $i/arrive)", "2023-10-06 2023-10-10"),
    ],
)
def test_trips(html_content, xpath, answer):
    """Run arithmetic, date, type-test and cast expressions on the trips fixture.

    Each expression goes through html_tools.xpath_filter; the output must be
    exactly of type ``str`` and contain ``answer`` as a substring.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    # Exact type check (not isinstance): str subclasses are rejected, as before.
    assert type(filtered) is str
    assert answer in filtered
# Input document shared by test_broken_DOM_01, test_Broken_DOM_02 and
# test_Broken_DOM_03.
# NOTE(review): the variable name and those tests (e.g. count(/html) == 2,
# /html[2]/body/p[1]) imply two <html> root elements with body/p children, but
# no markup is visible in this literal -- confirm the tags have not been stripped.
DOM_violation_two_html_root_element = """
Hello world1
First paragraph.
Hello world2
Browsers parse this part by fixing it but lxml doesn't and returns two root element node
Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one.
"""
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize(
    "xpath, answer",
    [
        (".", "Hello world1"),
        (".", "First paragraph."),
        (".", "Hello world2"),
        (".", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        (".", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
        ("/*", "Hello world1"),
        ("/*", "First paragraph."),
        ("/*", "Hello world2"),
        ("/*", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("/*", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
        ("html", "Hello world1"),
        ("html", "First paragraph."),
        ("html", "Hello world2"),
        ("html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
        ("/html", "Hello world1"),
        ("/html", "First paragraph."),
        ("/html", "Hello world2"),
        ("/html", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("/html", "Therefore, if the path is /html/body/p[1], lxml(libxml2) returns two element nodes not one."),
        ("/html/body/p[1]", "First paragraph."),
        ("/html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("count(/html/body/p[1])", "2"),
        ("count(/html)", "2"),
        ("count(//html)", "2"),
        ("count(//body)", "2"),
        ("count(/html/body)", "2"),
        ("//html/body/p[1]", "First paragraph."),
        ("//html/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("//body/p[1]", "First paragraph."),
        ("//body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("/html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
        ("//html[2]/body/p[1]", "Browsers parse this part by fixing it but lxml doesn't and returns two root element node"),
    ],
)
def test_broken_DOM_01(html_content, xpath, answer):
    """Check xpath_filter copes with a document that violates the single-root rule.

    Two things are verified for every (xpath, answer) pair:

    1. Evaluating the expression directly with ``elementpath.select`` and
       ``XPath3Parser`` on the lxml tree raises an exception.
    2. ``html_tools.xpath_filter`` nevertheless returns a value that is exactly
       of type ``str`` and contains ``answer`` as a substring.
    """
    from lxml import etree, html
    import elementpath
    from elementpath.xpath3 import XPath3Parser

    # Build the tree outside the pytest.raises block: an ImportError or a parse
    # failure must fail the test instead of being mistaken for the expected error.
    parser = etree.HTMLParser()
    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)

    # Normally a DOM has exactly one root element node; with this document the
    # direct elementpath evaluation is expected to raise.
    with pytest.raises(Exception):
        elementpath.select(
            tree,
            xpath.strip(),
            namespaces={'re': 'http://exslt.org/regular-expressions'},
            parser=XPath3Parser,
        )

    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    # Exact type check (not isinstance): str subclasses are rejected, as before.
    assert type(filtered) is str
    assert answer in filtered
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize(
    "xpath, answer",
    [
        ("/html[2]/body/p[1]", "First paragraph."),
        ("//html[2]/body/p[1]", "First paragraph."),
    ],
)
def test_Broken_DOM_02(html_content, xpath, answer):
    """Negative check on the two-root fixture.

    Expressions that select through ``html[2]`` go through
    html_tools.xpath_filter; the output must be exactly of type ``str`` and
    must NOT contain ``answer``.
    """
    filtered = html_tools.xpath_filter(xpath, html_content, append_pretty_line_formatting=True)
    # Exact type check (not isinstance): str subclasses are rejected, as before.
    assert type(filtered) is str
    # The expected text must be absent from the filtered output.
    assert answer not in filtered
@pytest.mark.parametrize("html_content", [DOM_violation_two_html_root_element])
@pytest.mark.parametrize(
    "xpath, answer",
    [
        ("/html/body/p[1]", 2),
        ("/html", 2),
        ("//html", 2),
        ("//body", 2),
        ("/html/body", 2),
    ],
)
def test_Broken_DOM_03(html_content, xpath, answer):
    """XPath 1 only: count the nodes lxml's own ``xpath()`` returns for each expression.

    The fixture is parsed with ``etree.HTMLParser`` and the number of matches
    is compared with the expected count in ``answer``.
    """
    from lxml import etree, html

    html_parser = etree.HTMLParser()
    document = html.fromstring(html_content.encode('utf-8'), parser=html_parser)
    # Evaluate with lxml's built-in xpath(), not elementpath.
    matches = document.xpath(xpath)
    assert len(matches) == answer