Fetcher / Parser - Automatically attempt to extract JSON from document when document contains JSON but could be wrapped in HTML (#1593)

2 years ago · a4e6fd1ec3
parent d8b9f0fd78
commit a4e6fd1ec3
2 changed files with 50 additions and 24 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -137,12 +137,13 @@ def _get_stripped_text_from_json_match(match):
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    stripped_text_from_html = False
-    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
+    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
    try:
        stripped_text_from_html = _parse_json(json.loads(content), json_filter)
    except json.JSONDecodeError:
        # Foreach <script json></script> blob.. just return the first that matches json_filter
        # As a last resort, try to parse the whole <body>
        s = []
        soup = BeautifulSoup(content, 'html.parser')
@ -150,22 +151,24 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
            bs_result = soup.findAll('script', {"type": "application/ld+json"})
        else:
            bs_result = soup.findAll('script')
        bs_result += soup.findAll('body')
-
+        bs_jsons = []
        if not bs_result:
            raise JSONNotFound("No parsable JSON found in this document")
        for result in bs_result:
            # Skip empty tags, and things that dont even look like JSON
-            if not result.string or not '{' in result.string:
+            if not result.text or '{' not in result.text:
                continue
            try:
-                json_data = json.loads(result.string)
+                json_data = json.loads(result.text)
                bs_jsons.append(json_data)
            except json.JSONDecodeError:
-                # Just skip it
+                # Skip objects which cannot be parsed
                continue
-            else:
+
        if not bs_jsons:
            raise JSONNotFound("No parsable JSON found in this document")
        for json_data in bs_jsons:
            stripped_text_from_html = _parse_json(json_data, json_filter)
            if ensure_is_ldjson_info_type:
                # Could sometimes be list, string or something else random
--- a/changedetectionio/tests/test_jsonpath_jq_selector.py
+++ b/changedetectionio/tests/test_jsonpath_jq_selector.py
@ -64,6 +64,24 @@ and it can also be repeated
        with pytest.raises(html_tools.JSONNotFound) as e_info:
            html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
 def test_unittest_inline_extract_body():
    """Check that JSON sitting inside an HTML <body> (here within a <pre>) is extracted and filtered."""
    content = """
    <html>
        <head></head>
        <body>
            <pre style="word-wrap: break-word; white-space: pre-wrap;">
                {"testKey": 42}
            </pre>
        </body>
    </html>
    """
    from .. import html_tools
    # The fixture has no <script> tag at all: the only JSON is the text inside <body>/<pre>,
    # so the filter must be applied to JSON recovered from the HTML wrapper itself
    text = html_tools.extract_json_as_string(content, "json:$.testKey")
    assert text == '42'
 def set_original_ext_response():
    data = """
        [
@ -437,7 +455,6 @@ def test_ignore_json_order(client, live_server):
    assert b'Deleted' in res.data
 def test_correct_header_detect(client, live_server):
    # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
    # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
    with open("test-datastore/endpoint-content.txt", "w") as f:
@ -453,11 +470,17 @@ def test_correct_header_detect(client, live_server):
    )
    assert b"1 Imported" in res.data
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    # Fixed in #1593
    assert b'No parsable JSON found in this document' not in res.data
-    res = client.get(url_for("index"))
+    res = client.get(
-    # This will be fixed in #1593
+        url_for("preview_page", uuid="first"),
-    assert b'No parsable JSON found in this document' in res.data
+        follow_redirects=True
    )
    assert b'&#34;world&#34;:' in res.data
    assert res.data.count(b'{') >= 2
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
    assert b'Deleted' in res.data