Fetcher / Parser - Automatically attempt to extract JSON from the document when it contains JSON but may be wrapped in HTML (#1593)

Maciej Rapacz, 2 years ago, committed by GitHub
parent d8b9f0fd78
commit a4e6fd1ec3
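In short: when a fetcher such as Browserless/Puppeteer returns a JSON endpoint wrapped in boilerplate HTML, the parser now falls back to scanning the whole <body> in addition to <script> tags before giving up with JSONNotFound. A minimal standalone sketch of that fallback, assuming only BeautifulSoup and the standard library (extract_first_json and its error message are illustrative, not the project's API):

    import json
    from bs4 import BeautifulSoup

    def extract_first_json(content):
        # Happy path: the document is already plain JSON
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            pass
        # Fallback: scan <script> blobs, then the whole <body>, for parsable JSON
        soup = BeautifulSoup(content, 'html.parser')
        for tag in soup.findAll('script') + soup.findAll('body'):
            if not tag.text or '{' not in tag.text:
                continue  # empty tag, or doesn't even look like JSON
            try:
                return json.loads(tag.text)
            except json.JSONDecodeError:
                continue
        raise ValueError("No parsable JSON found in this document")

    # JSON wrapped in HTML by a headless-browser fetcher still parses:
    wrapped = '<html><body><pre>{"hello": "world"}</pre></body></html>'
    print(extract_first_json(wrapped))  # -> {'hello': 'world'}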

@@ -137,12 +137,13 @@ def _get_stripped_text_from_json_match(match):
 def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
     stripped_text_from_html = False
-    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
+    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
     try:
         stripped_text_from_html = _parse_json(json.loads(content), json_filter)
     except json.JSONDecodeError:
         # Foreach <script json></script> blob.. just return the first that matches json_filter
+        # As a last resort, try to parse the whole <body>
         s = []
         soup = BeautifulSoup(content, 'html.parser')
@@ -150,32 +151,34 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
             bs_result = soup.findAll('script', {"type": "application/ld+json"})
         else:
             bs_result = soup.findAll('script')
+        bs_result += soup.findAll('body')
 
-        if not bs_result:
-            raise JSONNotFound("No parsable JSON found in this document")
-
+        bs_jsons = []
         for result in bs_result:
             # Skip empty tags, and things that dont even look like JSON
-            if not result.string or not '{' in result.string:
+            if not result.text or '{' not in result.text:
                 continue
             try:
-                json_data = json.loads(result.string)
+                json_data = json.loads(result.text)
+                bs_jsons.append(json_data)
             except json.JSONDecodeError:
-                # Just skip it
+                # Skip objects which cannot be parsed
                 continue
-            else:
-                stripped_text_from_html = _parse_json(json_data, json_filter)
-                if ensure_is_ldjson_info_type:
-                    # Could sometimes be list, string or something else random
-                    if isinstance(json_data, dict):
-                        # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
-                        # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
-                        if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
-                            break
-                elif stripped_text_from_html:
-                    break
+
+        if not bs_jsons:
+            raise JSONNotFound("No parsable JSON found in this document")
+
+        for json_data in bs_jsons:
+            stripped_text_from_html = _parse_json(json_data, json_filter)
+            if ensure_is_ldjson_info_type:
+                # Could sometimes be list, string or something else random
+                if isinstance(json_data, dict):
+                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
+                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
+                    if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html:
+                        break
+            elif stripped_text_from_html:
+                break
 
     if not stripped_text_from_html:
         # Re 265 - Just return an empty string when filter not found
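Design note on the hunk above: collecting every parsable blob into bs_jsons first means JSONNotFound now fires only when nothing in the document parses (previously it fired as soon as no candidate tags were found), and the json_filter/@type selection runs as a second pass over the full candidate list. A self-contained sketch of that second pass with illustrative data, where the 'offers' check stands in for "the filter matched something":

    import json

    # Two ld+json blobs share @type 'Product'; only one carries the price part.
    blobs = [
        json.loads('{"@type": "Product", "review": {"ratingValue": "4.5"}}'),
        json.loads('{"@type": "Product", "offers": {"price": "19.99"}}'),
    ]

    found = None
    for blob in blobs:
        # Could sometimes be a list, string or something else entirely
        if isinstance(blob, dict) and blob.get('@type', '').lower() == 'product':
            if blob.get('offers'):  # stand-in for a successful filter match
                found = blob
                break

    print(found['offers']['price'])  # -> 19.99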

@@ -64,6 +64,24 @@ and it can also be repeated
     with pytest.raises(html_tools.JSONNotFound) as e_info:
         html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
 
+def test_unittest_inline_extract_body():
+    content = """
+    <html>
+    <head></head>
+    <body>
+    <pre style="word-wrap: break-word; white-space: pre-wrap;">
+            {"testKey": 42}
+    </pre>
+    </body>
+    </html>
+    """
+    from .. import html_tools
+
+    # See that we can pull the JSON out of a plain <body>/<pre> wrapper (no <script> tag) and match our filter
+    text = html_tools.extract_json_as_string(content, "json:$.testKey")
+    assert text == '42'
+
 def set_original_ext_response():
     data = """
     [
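A note on the filter used by the new test above: "json:$.testKey" is a JSONPath expression, and extract_json_as_string returns the matched value serialized back to JSON text, which is why the assertion compares against the string '42' rather than the integer 42.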
@@ -437,7 +455,6 @@ def test_ignore_json_order(client, live_server):
     assert b'Deleted' in res.data
 
 def test_correct_header_detect(client, live_server):
     # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593
     # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc
     with open("test-datastore/endpoint-content.txt", "w") as f:
@@ -453,11 +470,17 @@ def test_correct_header_detect(client, live_server):
     )
     assert b"1 Imported" in res.data
     wait_for_all_checks(client)
 
     res = client.get(url_for("index"))
-    # This will be fixed in #1593
-    assert b'No parsable JSON found in this document' in res.data
+    # Fixed in #1593
+    assert b'No parsable JSON found in this document' not in res.data
+
+    res = client.get(
+        url_for("preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b'&#34;world&#34;:' in res.data
+    assert res.data.count(b'{') >= 2
 
     res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
     assert b'Deleted' in res.data
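The odd-looking bytes in the new assertion are plain HTML escaping: the preview page renders the stored JSON into an HTML page, so every double quote becomes &#34;. A quick way to reproduce that encoding, assuming the page goes through Jinja/markupsafe autoescaping as in any Flask app:

    from markupsafe import escape

    print(str(escape('{"hello": "world"}')))
    # -> {&#34;hello&#34;: &#34;world&#34;}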
