From a4e6fd1ec35c5df9b15c887afbaf85f016d794bb Mon Sep 17 00:00:00 2001
From: Maciej Rapacz <mrapacz@users.noreply.github.com>
Date: Tue, 30 May 2023 06:57:17 +0000
Subject: [PATCH] Fetcher / Parser - Automatically attempt to extract JSON from
 document when document contains JSON but could be wrapped in HTML (#1593)

---
 changedetectionio/html_tools.py               | 43 ++++++++++---------
 .../tests/test_jsonpath_jq_selector.py        | 31 +++++++++++--
 2 files changed, 50 insertions(+), 24 deletions(-)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 63848030..0cdaeea4 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -137,12 +137,13 @@ def _get_stripped_text_from_json_match(match):
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
     stripped_text_from_html = False
 
-    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
+    # Try to parse/filter the content as JSON; if we get a parser error, the JSON may be embedded within HTML tags
     try:
         stripped_text_from_html = _parse_json(json.loads(content), json_filter)
     except json.JSONDecodeError:
 
         # Foreach <script json></script> blob.. just return the first that matches json_filter
+        # As a last resort, try to parse the whole <body>
         s = []
         soup = BeautifulSoup(content, 'html.parser')
 
@@ -150,32 +151,34 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
             bs_result = soup.findAll('script', {"type": "application/ld+json"})
         else:
             bs_result = soup.findAll('script')
+        bs_result += soup.findAll('body')
 
-
-        if not bs_result:
-            raise JSONNotFound("No parsable JSON found in this document")
-
+        bs_jsons = []
         for result in bs_result:
             # Skip empty tags, and things that dont even look like JSON
-            if not result.string or not '{' in result.string:
+            if not result.text or '{' not in result.text:
                 continue
-                
             try:
-                json_data = json.loads(result.string)
+                json_data = json.loads(result.text)
+                bs_jsons.append(json_data)
             except json.JSONDecodeError:
-                # Just skip it
+                # Skip objects which cannot be parsed
                 continue
-            else:
-                stripped_text_from_html = _parse_json(json_data, json_filter)
-                if ensure_is_ldjson_info_type:
-                    # Could sometimes be list, string or something else random
-                    if isinstance(json_data, dict):
-                        # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
-                        # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
-                        if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
-                            break
-                elif stripped_text_from_html:
-                    break
+
+        if not bs_jsons:
+            raise JSONNotFound("No parsable JSON found in this document")
+        
+        for json_data in bs_jsons:
+            stripped_text_from_html = _parse_json(json_data, json_filter)
+            if ensure_is_ldjson_info_type:
+                # Could sometimes be list, string or something else random
+                if isinstance(json_data, dict):
+                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
+                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
+                    if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
+                        break
+            elif stripped_text_from_html:
+                break
 
     if not stripped_text_from_html:
         # Re 265 - Just return an empty string when filter not found
diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py
index 300bbf76..f18cafe5 100644
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@@ -64,6 +64,24 @@ and it can also be repeated
         with pytest.raises(html_tools.JSONNotFound) as e_info:
             html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
 
+
+def test_unittest_inline_extract_body():
+    content = """
+    <html>
+        <head></head>
+        <body>
+            <pre style="word-wrap: break-word; white-space: pre-wrap;">
+                {"testKey": 42}
+            </pre>
+        </body>
+    </html>
+    """
+    from .. import html_tools
+
+    # JSON wrapped only in <body>/<pre> markup (no <script> tag) should still be extracted and match our filter
+    text = html_tools.extract_json_as_string(content, "json:$.testKey")
+    assert text == '42'
+
 def set_original_ext_response():
     data = """
         [
@@ -437,7 +455,6 @@ def test_ignore_json_order(client, live_server):
     assert b'Deleted' in res.data
 
 def test_correct_header_detect(client, live_server):
-    
     # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
     # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
     with open("test-datastore/endpoint-content.txt", "w") as f:
@@ -453,11 +470,17 @@ def test_correct_header_detect(client, live_server):
     )
     assert b"1 Imported" in res.data
     wait_for_all_checks(client)
+    res = client.get(url_for("index"))
 
+    # Fixed in #1593
+    assert b'No parsable JSON found in this document' not in res.data
 
-    res = client.get(url_for("index"))
-    # This will be fixed in #1593
-    assert b'No parsable JSON found in this document' in res.data
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+    assert b'&#34;world&#34;:' in res.data
+    assert res.data.count(b'{') >= 2
 
     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data