Re #154 Ldjson extract parse (#158)

* Where possible, use parsable JSON embedded in <script type="application/ld+json">; if it matches the filter rule, use it. * Update README.md
3 years ago · e2304b2ce0
parent b87236ea20
commit e2304b2ce0
4 changed files with 108 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -102,6 +102,20 @@ This will re-parse the JSON and apply formatting to the text, making it super ea

 ![image](https://user-images.githubusercontent.com/275001/125165995-d9ea5580-e1dc-11eb-8030-f0deced2661a.png)

+#### Parse JSON embedded in HTML!
+
+When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. 
+
+```
+<html>
+...
+<script type="application/ld+json">
+  {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","price": 23.50 }
+</script>
+```  
+
+`json:$.price` would give `23.5`, or you can extract the whole structure.
+
 ### Proxy

 A proxy for ChangeDetection.io can be configured by setting the environment 
--- a/backend/fetch_site_status.py
+++ b/backend/fetch_site_status.py
@ -92,27 +92,8 @@ class perform_site_check():
            css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
            if css_filter_rule and len(css_filter_rule.strip()):
                if 'json:' in css_filter_rule:
-                    # POC hack, @todo rename vars, see how it fits in with the javascript version
-                    import json
-                    from jsonpath_ng import jsonpath, parse
-
-                    json_data = json.loads(html)
-                    jsonpath_expression = parse(css_filter_rule.replace('json:', ''))
-                    match = jsonpath_expression.find(json_data)
-                    s = []
-
-                    # More than one result, we will return it as a JSON list.
-                    if len(match) > 1:
-                        for i in match:
-                            s.append(i.value)
-
-                    # Single value, use just the value, as it could be later used in a token in notifications.
-                    if len(match) == 1:
-                        s = match[0].value
-
-                    stripped_text_from_html = json.dumps(s, indent=4)
+                    stripped_text_from_html = html_tools.extract_json_as_string(html, css_filter_rule)
                    is_html = False
-
                else:
                    # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                    html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)
--- a/backend/html_tools.py
+++ b/backend/html_tools.py
@ -1,6 +1,12 @@
+import json
 from bs4 import BeautifulSoup
+from jsonpath_ng import parse


+class JSONNotFound(ValueError):
+    def __init__(self, msg):
+        ValueError.__init__(self, msg)
+
 # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
 def css_filter(css_filter, html_content):
    soup = BeautifulSoup(html_content, "html.parser")
@ -24,3 +30,54 @@ def extract_element(find='title', html_content=''):

    return element_text

+#
+def _parse_json(json_data, jsonpath_filter):
+    s=[]
+    jsonpath_expression = parse(jsonpath_filter.replace('json:', ''))
+    match = jsonpath_expression.find(json_data)
+
+    # More than one result, we will return it as a JSON list.
+    if len(match) > 1:
+        for i in match:
+            s.append(i.value)
+
+    # Single value, use just the value, as it could be later used in a token in notifications.
+    if len(match) == 1:
+        s = match[0].value
+
+    if not s:
+        raise JSONNotFound("No Matching JSON could be found for the rule {}".format(jsonpath_filter.replace('json:', '')))
+
+    stripped_text_from_html = json.dumps(s, indent=4)
+
+    return stripped_text_from_html
+
+def extract_json_as_string(content, jsonpath_filter):
+
+    stripped_text_from_html = False
+
+    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
+    try:
+        stripped_text_from_html = _parse_json(json.loads(content), jsonpath_filter)
+    except json.JSONDecodeError:
+
+        # Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
+        s = []
+        soup = BeautifulSoup(content, 'html.parser')
+        bs_result = soup.findAll('script')
+
+        if not bs_result:
+            raise JSONNotFound("No parsable JSON found in this document")
+
+        for result in bs_result:
+            try:
+                json_data = json.loads(result.string)
+            except json.JSONDecodeError:
+                # Just skip it
+                continue
+            else:
+                stripped_text_from_html = _parse_json(json_data, jsonpath_filter)
+                if stripped_text_from_html:
+                    break
+
+    return stripped_text_from_html
--- a/backend/tests/test_jsonpath_selector.py
+++ b/backend/tests/test_jsonpath_selector.py
@ -4,6 +4,42 @@ import time
 from flask import url_for
 from . util import live_server_setup

+def test_unittest_inline_html_extract():
+    # So lets pretend that the JSON we want is inside some HTML
+    content="""
+    <html>
+    
+    food and stuff and more
+    <script>
+    alert('nothing really good here');
+    </script>
+    
+    <script type="application/ld+json">
+  xx {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
+</script>
+<body>
+and it can also be repeated
+<script type="application/ld+json">
+  {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula  800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cow’s milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
+</script>
+<h4>ok</h4>
+</body>
+</html>
+
+    """
+    from .. import html_tools
+
+    # See that we can find the second <script> one, which is not broken, and matches our filter
+    text = html_tools.extract_json_as_string(content, "$.offers.price")
+    assert text == "23.5"
+
+    text = html_tools.extract_json_as_string('{"id":5}', "$.id")
+    assert text == "5"
+
+#    @todo how to test for exception raised
+#    text = html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
+
+
 def test_setup(live_server):
    live_server_setup(live_server)