From a4e6fd1ec35c5df9b15c887afbaf85f016d794bb Mon Sep 17 00:00:00 2001 From: Maciej Rapacz Date: Tue, 30 May 2023 06:57:17 +0000 Subject: [PATCH] Fetcher / Parser - Automatically attempt to extract JSON from document when document contains JSON but could be wrapped in HTML (#1593) --- changedetectionio/html_tools.py | 43 ++++++++++--------- .../tests/test_jsonpath_jq_selector.py | 31 +++++++++++-- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 63848030..0cdaeea4 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -137,12 +137,13 @@ def _get_stripped_text_from_json_match(match): def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None): stripped_text_from_html = False - # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded blob.. just return the first that matches json_filter + # As a last resort, try to parse the whole s = [] soup = BeautifulSoup(content, 'html.parser') @@ -150,32 +151,34 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None bs_result = soup.findAll('script', {"type": "application/ld+json"}) else: bs_result = soup.findAll('script') + bs_result += soup.findAll('body') - - if not bs_result: - raise JSONNotFound("No parsable JSON found in this document") - + bs_jsons = [] for result in bs_result: # Skip empty tags, and things that dont even look like JSON - if not result.string or not '{' in result.string: + if not result.text or '{' not in result.text: continue - try: - json_data = json.loads(result.string) + json_data = json.loads(result.text) + bs_jsons.append(json_data) except json.JSONDecodeError: - # Just skip it + # Skip objects which cannot be parsed continue - else: - stripped_text_from_html = _parse_json(json_data, json_filter) - if ensure_is_ldjson_info_type: - # Could sometimes be list, string or something else random - if isinstance(json_data, dict): - # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search - # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) - if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html: - break - elif stripped_text_from_html: - break + + if not bs_jsons: + raise JSONNotFound("No parsable JSON found in this document") + + for json_data in bs_jsons: + stripped_text_from_html = _parse_json(json_data, json_filter) + if ensure_is_ldjson_info_type: + # Could sometimes be list, string or something else random + if isinstance(json_data, dict): + # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search + # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) + if json_data.get('@type', False) and json_data.get('@type','').lower() == ensure_is_ldjson_info_type.lower() and stripped_text_from_html: + break + elif stripped_text_from_html: + break if not stripped_text_from_html: # Re 265 - Just return an empty string when filter not found diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index 300bbf76..f18cafe5 100644 --- a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -64,6 +64,24 @@ and it can also be repeated with pytest.raises(html_tools.JSONNotFound) as e_info: html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id") + +def test_unittest_inline_extract_body(): + content = """ + + + +
+                {"testKey": 42}
+            
+ + + """ + from .. import html_tools + + # See that we can find the second