From 919812bf8bb12a4f62aef72feff06a3ceae3beec Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Tue, 10 Sep 2024 14:31:09 +0200
Subject: [PATCH] Automatically apply any XML/RSS namespaces

---
 changedetectionio/html_tools.py               | 24 ++++++++++++++++++-
 .../processors/text_json_diff/processor.py    |  7 +++---
 2 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index bd5fdb8f..456bdcfb 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -8,6 +8,7 @@ from xml.sax.saxutils import escape as xml_escape
 import json
 import re
 
+from loguru import logger
 
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
@@ -108,6 +109,20 @@ def elementpath_tostring(obj):
 
     return str(obj)
 
+def extract_namespaces(xml_content):
+    """
+    Return a {prefix: uri} dict of the namespaces declared in xml_content (bytes), read from lxml 'start-ns' parse events.
+    """
+    from lxml import etree
+    from io import BytesIO
+
+    it = etree.iterparse(BytesIO(xml_content), events=('start-ns',))
+    namespaces = {}
+    for _, ns in it:  # each 'start-ns' event carries a (prefix, uri) pair
+        prefix, uri = ns
+        namespaces[prefix] = uri  # a prefix declared again later overwrites the earlier URI
+    return namespaces
+
 # Return str Utf-8 of matched rules
 def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
     from lxml import etree, html
@@ -123,7 +138,14 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
     tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
     html_block = ""
 
-    r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
+    # Automatically extract all namespaces from the XML content
+    namespaces = {'re': 'http://exslt.org/regular-expressions'}
+    try:
+        namespaces.update(extract_namespaces(html_content.encode('utf-8')))
+    except Exception as e:
+        logger.warning(f"Problem extracting namespaces from HTMl/XML content {str(e)}")
+
+    r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
     #@note: //title/text() wont work where <title>CDATA..
 
     if type(r) != list:
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index 1de5bafb..90cad63e 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -77,11 +77,12 @@ class perform_site_check(difference_detection_processor):
 
         ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
         # Go into RSS preprocess for converting CDATA/comment to usable text
-        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
-            if '<rss' in self.fetcher.content[:100].lower():
+        # ctype_header could be unset if we are just reprocessing the existing content
+        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']) or not ctype_header:
+            top_text = self.fetcher.content[:200].lower().strip()
+            if '<rss' in top_text or 'search.yahoo.com/mrss/' in top_text:
                 self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
                 is_rss = True
-
         # source: support, basically treat it as plaintext
         if watch.is_source_type_url:
             is_html = False