From 87510becb5d33b2f5542894d33160a2d79c34263 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 30 Oct 2024 12:00:53 +0100 Subject: [PATCH] Filters - Process all CSS and XPath 'subtract' selectors in a single pass to prevent index shifting and reference loss during DOM manipulation. (#2754) --- changedetectionio/html_tools.py | 49 +++++++++-- .../tests/test_element_removal.py | 87 +++++++++++++++++++ 2 files changed, 129 insertions(+), 7 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 6e4ebca0..b710077f 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -54,29 +54,64 @@ def include_filters(include_filters, html_content, append_pretty_line_formatting def subtractive_css_selector(css_selector, html_content): from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, "html.parser") - for item in soup.select(css_selector): + + # So that the elements dont shift their index, build a list of elements here which will be pointers to their place in the DOM + elements_to_remove = soup.select(css_selector) + + # Then, remove them in a separate loop + for item in elements_to_remove: item.decompose() + return str(soup) -def subtractive_xpath_selector(xpath_selector, html_content): +def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str: + # Parse the HTML content using lxml html_tree = etree.HTML(html_content) - elements_to_remove = html_tree.xpath(xpath_selector) + # First, collect all elements to remove + elements_to_remove = [] + + # Iterate over the list of XPath selectors + for selector in selectors: + # Collect elements for each selector + elements_to_remove.extend(html_tree.xpath(selector)) + + # Then, remove them in a separate loop for element in elements_to_remove: - element.getparent().remove(element) + if element.getparent() is not None: # Ensure the element has a parent before removing + element.getparent().remove(element) + # Convert the modified HTML tree back to a string modified_html = etree.tostring(html_tree, method="html").decode("utf-8") return modified_html + def element_removal(selectors: List[str], html_content): - """Removes elements that match a list of CSS or xPath selectors.""" + """Removes elements that match a list of CSS or XPath selectors.""" modified_html = html_content + css_selectors = [] + xpath_selectors = [] + for selector in selectors: if selector.startswith(('xpath:', 'xpath1:', '//')): + # Handle XPath selectors separately xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:') - modified_html = subtractive_xpath_selector(xpath_selector, modified_html) + xpath_selectors.append(xpath_selector) else: - modified_html = subtractive_css_selector(selector, modified_html) + # Collect CSS selectors as one "hit", see comment in subtractive_css_selector + css_selectors.append(selector.strip().strip(",")) + + if xpath_selectors: + modified_html = subtractive_xpath_selector(xpath_selectors, modified_html) + + if css_selectors: + # Remove duplicates, then combine all CSS selectors into one string, separated by commas + # This stops the elements index shifting + unique_selectors = list(set(css_selectors)) # Ensure uniqueness + combined_css_selector = " , ".join(unique_selectors) + modified_html = subtractive_css_selector(combined_css_selector, modified_html) + + return modified_html def elementpath_tostring(obj): diff --git a/changedetectionio/tests/test_element_removal.py b/changedetectionio/tests/test_element_removal.py index d7474279..3dc51033 100644 --- a/changedetectionio/tests/test_element_removal.py +++ b/changedetectionio/tests/test_element_removal.py @@ -11,6 +11,35 @@ from .util import live_server_setup, wait_for_all_checks def test_setup(live_server): live_server_setup(live_server) +def set_response_with_multiple_index(): + data= """ + + + + + + + + + + + + + + + + + + + + +
Person 1Person 2Person 3
EmilTobiasLinus
161410
+ + +""" + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(data) + def set_original_response(): test_return_data = """ @@ -177,3 +206,61 @@ def test_element_removal_full(client, live_server, measure_memory_usage): # There should not be an unviewed change, as changes should be removed res = client.get(url_for("index")) assert b"unviewed" not in res.data + +# Re #2752 +def test_element_removal_nth_offset_no_shift(client, live_server, measure_memory_usage): + #live_server_setup(live_server) + + set_response_with_multiple_index() + subtractive_selectors_data = [""" +body > table > tr:nth-child(1) > th:nth-child(2) +body > table > tr:nth-child(2) > td:nth-child(2) +body > table > tr:nth-child(3) > td:nth-child(2) +body > table > tr:nth-child(1) > th:nth-child(3) +body > table > tr:nth-child(2) > td:nth-child(3) +body > table > tr:nth-child(3) > td:nth-child(3)""", +"""//body/table/tr[1]/th[2] +//body/table/tr[2]/td[2] +//body/table/tr[3]/td[2] +//body/table/tr[1]/th[3] +//body/table/tr[2]/td[3] +//body/table/tr[3]/td[3]"""] + + for selector_list in subtractive_selectors_data: + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + # Add our URL to the import page + test_url = url_for("test_endpoint", _external=True) + res = client.post( + url_for("import_page"), data={"urls": test_url}, follow_redirects=True + ) + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + res = client.post( + url_for("edit_page", uuid="first"), + data={ + "subtractive_selectors": selector_list, + "url": test_url, + "tags": "", + "fetch_backend": "html_requests", + }, + follow_redirects=True, + ) + assert b"Updated watch." in res.data + wait_for_all_checks(client) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + assert b"Tobias" not in res.data + assert b"Linus" not in res.data + assert b"Person 2" not in res.data + assert b"Person 3" not in res.data + # First column should exist + assert b"Emil" in res.data +