Filters - Process all CSS and XPath 'subtract' selectors in a single pass to prevent index shifting and reference loss during DOM manipulation. (#2754)

Branch: pull/2757/head
Author: dgtlmoon, committed by GitHub (2 months ago)
Parent: 5e95dc62a5
Commit: 87510becb5
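
To see the failure mode the commit title describes, here is a minimal sketch (mine, not part of the commit): removing matches one selector at a time lets positional selectors such as :nth-child drift, while resolving every selector in a single select() pass before removal, as the patched code does for CSS filters, does not. The three-cell table and the two selectors are invented for illustration; only bs4 is assumed, which the real code also uses.

from bs4 import BeautifulSoup

html = "<table><tr><td>A</td><td>B</td><td>C</td></tr></table>"
selectors = ["td:nth-child(2)", "td:nth-child(3)"]

# Old behaviour: each selector runs against the already-modified DOM.
soup = BeautifulSoup(html, "html.parser")
for sel in selectors:
    for el in soup.select(sel):
        el.decompose()
# After removing B, the old third cell C becomes the second child,
# so "td:nth-child(3)" no longer matches anything.
print(soup.get_text())  # -> "AC"

# New behaviour: resolve every selector in one combined pass, then remove.
soup = BeautifulSoup(html, "html.parser")
for el in soup.select(" , ".join(selectors)):
    el.decompose()
print(soup.get_text())  # -> "A"

The same reference-loss issue exists within a single selector, which is why subtractive_css_selector below now builds the full element list before the decompose() loop instead of decomposing while iterating over select().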

@@ -54,29 +54,64 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting

 def subtractive_css_selector(css_selector, html_content):
     from bs4 import BeautifulSoup
     soup = BeautifulSoup(html_content, "html.parser")
-    for item in soup.select(css_selector):
+
+    # So that the elements dont shift their index, build a list of elements here which will be pointers to their place in the DOM
+    elements_to_remove = soup.select(css_selector)
+
+    # Then, remove them in a separate loop
+    for item in elements_to_remove:
         item.decompose()
+
     return str(soup)

-def subtractive_xpath_selector(xpath_selector, html_content):
+def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str:
+    # Parse the HTML content using lxml
     html_tree = etree.HTML(html_content)
-    elements_to_remove = html_tree.xpath(xpath_selector)

+    # First, collect all elements to remove
+    elements_to_remove = []
+
+    # Iterate over the list of XPath selectors
+    for selector in selectors:
+        # Collect elements for each selector
+        elements_to_remove.extend(html_tree.xpath(selector))
+
+    # Then, remove them in a separate loop
     for element in elements_to_remove:
-        element.getparent().remove(element)
+        if element.getparent() is not None:  # Ensure the element has a parent before removing
+            element.getparent().remove(element)

+    # Convert the modified HTML tree back to a string
     modified_html = etree.tostring(html_tree, method="html").decode("utf-8")
     return modified_html

 def element_removal(selectors: List[str], html_content):
-    """Removes elements that match a list of CSS or xPath selectors."""
+    """Removes elements that match a list of CSS or XPath selectors."""
     modified_html = html_content
+    css_selectors = []
+    xpath_selectors = []
+
     for selector in selectors:
         if selector.startswith(('xpath:', 'xpath1:', '//')):
+            # Handle XPath selectors separately
             xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
-            modified_html = subtractive_xpath_selector(xpath_selector, modified_html)
+            xpath_selectors.append(xpath_selector)
         else:
-            modified_html = subtractive_css_selector(selector, modified_html)
+            # Collect CSS selectors as one "hit", see comment in subtractive_css_selector
+            css_selectors.append(selector.strip().strip(","))
+
+    if xpath_selectors:
+        modified_html = subtractive_xpath_selector(xpath_selectors, modified_html)
+
+    if css_selectors:
+        # Remove duplicates, then combine all CSS selectors into one string, separated by commas
+        # This stops the elements index shifting
+        unique_selectors = list(set(css_selectors))  # Ensure uniqueness
+        combined_css_selector = " , ".join(unique_selectors)
+        modified_html = subtractive_css_selector(combined_css_selector, modified_html)
+
     return modified_html

 def elementpath_tostring(obj):
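
For XPath filters, the patched subtractive_xpath_selector applies the same collect-then-remove idea on the lxml side: every expression is evaluated against the untouched tree before anything is detached. A small self-contained sketch of that pattern (an illustration with invented markup and //td[n] expressions, not code lifted from the repository):

from lxml import etree

html = "<table><tr><td>A</td><td>B</td><td>C</td></tr></table>"
tree = etree.HTML(html)

# Evaluate every XPath expression against the untouched tree first...
to_remove = []
for xp in ["//td[2]", "//td[3]"]:
    to_remove.extend(tree.xpath(xp))

# ...then detach the collected nodes, so earlier removals cannot
# renumber the positional predicates of later expressions.
for el in to_remove:
    if el.getparent() is not None:  # skip nodes that are already detached, or the root
        el.getparent().remove(el)

print(etree.tostring(tree, method="html").decode("utf-8"))
# -> <html><body><table><tr><td>A</td></tr></table></body></html>

The getparent() guard mirrors the new check in the diff: once overlapping expressions are batched into one removal loop, the same node can appear twice in the list (its parent is gone after the first removal), and an expression that matches the document root has no parent at all.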

@@ -11,6 +11,35 @@ from .util import live_server_setup, wait_for_all_checks
 def test_setup(live_server):
     live_server_setup(live_server)

+def set_response_with_multiple_index():
+    data= """<!DOCTYPE html>
+<html>
+<body>
+<!-- NOTE!! CHROME WILL ADD TBODY HERE IF ITS NOT THERE!! -->
+<table style="width:100%">
+  <tr>
+    <th>Person 1</th>
+    <th>Person 2</th>
+    <th>Person 3</th>
+  </tr>
+  <tr>
+    <td>Emil</td>
+    <td>Tobias</td>
+    <td>Linus</td>
+  </tr>
+  <tr>
+    <td>16</td>
+    <td>14</td>
+    <td>10</td>
+  </tr>
+</table>
+</body>
+</html>
+"""
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(data)
+
 def set_original_response():
     test_return_data = """<html>
@@ -177,3 +206,61 @@ def test_element_removal_full(client, live_server, measure_memory_usage):
     # There should not be an unviewed change, as changes should be removed
     res = client.get(url_for("index"))
     assert b"unviewed" not in res.data
+
+
+# Re #2752
+def test_element_removal_nth_offset_no_shift(client, live_server, measure_memory_usage):
+    #live_server_setup(live_server)
+
+    set_response_with_multiple_index()
+    subtractive_selectors_data = ["""
+body > table > tr:nth-child(1) > th:nth-child(2)
+body > table > tr:nth-child(2) > td:nth-child(2)
+body > table > tr:nth-child(3) > td:nth-child(2)
+body > table > tr:nth-child(1) > th:nth-child(3)
+body > table > tr:nth-child(2) > td:nth-child(3)
+body > table > tr:nth-child(3) > td:nth-child(3)""",
+"""//body/table/tr[1]/th[2]
+//body/table/tr[2]/td[2]
+//body/table/tr[3]/td[2]
+//body/table/tr[1]/th[3]
+//body/table/tr[2]/td[3]
+//body/table/tr[3]/td[3]"""]
+
+    for selector_list in subtractive_selectors_data:
+
+        res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+        assert b'Deleted' in res.data
+
+        # Add our URL to the import page
+        test_url = url_for("test_endpoint", _external=True)
+        res = client.post(
+            url_for("import_page"), data={"urls": test_url}, follow_redirects=True
+        )
+        assert b"1 Imported" in res.data
+        wait_for_all_checks(client)
+
+        res = client.post(
+            url_for("edit_page", uuid="first"),
+            data={
+                "subtractive_selectors": selector_list,
+                "url": test_url,
+                "tags": "",
+                "fetch_backend": "html_requests",
+            },
+            follow_redirects=True,
+        )
+        assert b"Updated watch." in res.data
+        wait_for_all_checks(client)
+
+        res = client.get(
+            url_for("preview_page", uuid="first"),
+            follow_redirects=True
+        )
+
+        assert b"Tobias" not in res.data
+        assert b"Linus" not in res.data
+        assert b"Person 2" not in res.data
+        assert b"Person 3" not in res.data
+        # First column should exist
+        assert b"Emil" in res.data
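
The new test drives the fix through the web UI, but the expectation itself can be reproduced in isolation. A rough standalone check (my illustration, not part of the test suite; it assumes only bs4, and the fixture drops the <body> wrapper so the selectors lose their "body > " prefix) that one combined select() pass removes columns 2 and 3 of the table while leaving column 1 intact:

from bs4 import BeautifulSoup

fixture = """<table style="width:100%">
  <tr><th>Person 1</th><th>Person 2</th><th>Person 3</th></tr>
  <tr><td>Emil</td><td>Tobias</td><td>Linus</td></tr>
  <tr><td>16</td><td>14</td><td>10</td></tr>
</table>"""

selectors = [
    "table > tr:nth-child(1) > th:nth-child(2)",
    "table > tr:nth-child(2) > td:nth-child(2)",
    "table > tr:nth-child(3) > td:nth-child(2)",
    "table > tr:nth-child(1) > th:nth-child(3)",
    "table > tr:nth-child(2) > td:nth-child(3)",
    "table > tr:nth-child(3) > td:nth-child(3)",
]

soup = BeautifulSoup(fixture, "html.parser")
# Combine all selectors into one pass so no indexes shift between removals
for el in soup.select(" , ".join(selectors)):
    el.decompose()

text = soup.get_text()
assert "Emil" in text and "16" in text                   # first column survives
assert "Tobias" not in text and "Person 3" not in text   # columns 2 and 3 are gone

If the six selectors were instead applied one at a time, the :nth-child(3) entries would be evaluated after column 2 had already been decomposed and would silently miss, leaving "Person 3", "Linus" and "10" in the output, which is exactly the regression reported in #2752.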
