From 919812bf8bb12a4f62aef72feff06a3ceae3beec Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 10 Sep 2024 14:31:09 +0200 Subject: [PATCH] Automatically apply any XML/RSS namespaces --- changedetectionio/html_tools.py | 24 ++++++++++++++++++- .../processors/text_json_diff/processor.py | 7 +++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index bd5fdb8f..456bdcfb 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -8,6 +8,7 @@ from xml.sax.saxutils import escape as xml_escape import json import re +from loguru import logger # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" @@ -108,6 +109,20 @@ def elementpath_tostring(obj): return str(obj) +def extract_namespaces(xml_content): + """ + Extracts all namespaces from the XML content. + """ + from lxml import etree + from io import BytesIO + + it = etree.iterparse(BytesIO(xml_content), events=('start-ns',)) + namespaces = {} + for _, ns in it: + prefix, uri = ns + namespaces[prefix] = uri + return namespaces + # Return str Utf-8 of matched rules def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False): from lxml import etree, html @@ -123,7 +138,14 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser) html_block = "" - r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser) + # Automatically extract all namespaces from the XML content + namespaces = {'re': 'http://exslt.org/regular-expressions'} + try: + namespaces.update(extract_namespaces(html_content.encode('utf-8'))) + except Exception as e: + logger.warning(f"Problem extracting namespaces from HTMl/XML content {str(e)}") + + r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser) #@note: //title/text() wont work where CDATA.. 
if type(r) != list: diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 1de5bafb..90cad63e 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -77,11 +77,12 @@ class perform_site_check(difference_detection_processor): ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower() # Go into RSS preprocess for converting CDATA/comment to usable text - if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']): - if '<rss' in self.fetcher.content[:100].lower(): + # Ctype_header could be unset if we are just reprocessing the existing content + if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']) or not ctype_header: + top_text = self.fetcher.content[:200].lower().strip() + if '<rss' in top_text or 'search.yahoo.com/mrss/' in top_text: self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content) is_rss = True - # source: support, basically treat it as plaintext if watch.is_source_type_url: is_html = False