Re #154 Ldjson extract parse (#158)

* Use parsable JSON hiding in <script type="application/ld+json"> where possible, if it matches the filter rule, use it.
* Update README.md
pull/159/head
dgtlmoon 3 years ago committed by GitHub
parent b87236ea20
commit e2304b2ce0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -102,6 +102,20 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
![image](https://user-images.githubusercontent.com/275001/125165995-d9ea5580-e1dc-11eb-8030-f0deced2661a.png)
#### Parse JSON embedded in HTML!
When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
```
<html>
...
<script type="application/ld+json">
{"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","price": 23.50 }
</script>
```
`json:$.price` would give `23.5` (the number is re-serialised, so the trailing zero is dropped), or you can extract the whole structure
### Proxy
A proxy for ChangeDetection.io can be configured by setting environment the

@ -92,27 +92,8 @@ class perform_site_check():
css_filter_rule = self.datastore.data['watching'][uuid]['css_filter']
if css_filter_rule and len(css_filter_rule.strip()):
if 'json:' in css_filter_rule:
# POC hack, @todo rename vars, see how it fits in with the javascript version
import json
from jsonpath_ng import jsonpath, parse
json_data = json.loads(html)
jsonpath_expression = parse(css_filter_rule.replace('json:', ''))
match = jsonpath_expression.find(json_data)
s = []
# More than one result, we will return it as a JSON list.
if len(match) > 1:
for i in match:
s.append(i.value)
# Single value, use just the value, as it could be later used in a token in notifications.
if len(match) == 1:
s = match[0].value
stripped_text_from_html = json.dumps(s, indent=4)
stripped_text_from_html = html_tools.extract_json_as_string(html, css_filter_rule)
is_html = False
else:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html = html_tools.css_filter(css_filter=css_filter_rule, html_content=r.content)

@ -1,6 +1,12 @@
import json
from bs4 import BeautifulSoup
from jsonpath_ng import parse
class JSONNotFound(ValueError):
    """Error raised when no usable JSON can be located for a filter rule."""

    def __init__(self, msg):
        # Delegate straight to ValueError so str(exc) is the given message.
        super().__init__(msg)
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def css_filter(css_filter, html_content):
soup = BeautifulSoup(html_content, "html.parser")
@ -24,3 +30,54 @@ def extract_element(find='title', html_content=''):
return element_text
#
def _parse_json(json_data, jsonpath_filter):
    """Apply a JSONPath filter to decoded JSON and return the result as text.

    Args:
        json_data: Already-decoded JSON data (e.g. the result of
            ``json.loads``).
        jsonpath_filter: JSONPath expression; any ``json:`` marker it carries
            is stripped before the expression is parsed.

    Returns:
        The selected data serialised with ``json.dumps(..., indent=4)``. A
        single match is serialised as the bare value, several matches are
        serialised as a JSON list of the matched values.

    Raises:
        JSONNotFound: If the expression matches nothing in ``json_data``.
    """
    jsonpath_rule = jsonpath_filter.replace('json:', '')
    matches = parse(jsonpath_rule).find(json_data)

    # Decide on the match list itself, not on the extracted value: a matched
    # value that happens to be falsy (0, false, "", [], null) is still a
    # valid result and must not be reported as "not found".
    if not matches:
        raise JSONNotFound("No Matching JSON could be found for the rule {}".format(jsonpath_rule))

    if len(matches) == 1:
        # Single value, use just the value, as it could be later used in a token in notifications.
        selected = matches[0].value
    else:
        # More than one result, we will return it as a JSON list.
        selected = [found.value for found in matches]

    return json.dumps(selected, indent=4)
def extract_json_as_string(content, jsonpath_filter):
    """Return the JSON selected by a JSONPath filter from raw JSON or HTML.

    ``content`` is first decoded as a JSON document. If that fails, it is
    parsed as HTML and every ``<script>`` element is tried in document order;
    the first one that contains valid JSON matching the filter is used.

    Args:
        content: A JSON document, or HTML that embeds JSON in ``<script>``
            elements (for example ``<script type="application/ld+json">``).
        jsonpath_filter: JSONPath expression, passed through to
            ``_parse_json``.

    Returns:
        The formatted JSON text produced by ``_parse_json`` for the first
        source that matches the filter.

    Raises:
        JSONNotFound: If ``content`` is valid JSON but the filter matches
            nothing, if the HTML contains no ``<script>`` element, if no
            ``<script>`` element holds parsable JSON, or if none of the
            parsable ones matches the filter.
    """
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
    try:
        json_data = json.loads(content)
    except json.JSONDecodeError:
        pass
    else:
        return _parse_json(json_data, jsonpath_filter)

    # Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
    soup = BeautifulSoup(content, 'html.parser')
    script_tags = soup.findAll('script')
    if not script_tags:
        raise JSONNotFound("No parsable JSON found in this document")

    first_miss = None
    for tag in script_tags:
        # .string is None for an empty <script> (or one that only has a src
        # attribute); json.loads(None) would raise TypeError, so skip those.
        if not tag.string:
            continue

        try:
            json_data = json.loads(tag.string)
        except json.JSONDecodeError:
            # Just skip it
            continue

        try:
            return _parse_json(json_data, jsonpath_filter)
        except JSONNotFound as not_matched:
            # Valid JSON, but the filter does not match it: keep looking in
            # the remaining <script> elements instead of giving up here.
            if first_miss is None:
                first_miss = not_matched

    if first_miss is not None:
        # At least one script held JSON, but none matched the filter.
        raise first_miss

    # Scripts were present, but none of them contained parsable JSON. Raise
    # the same error as for a document without any <script> element rather
    # than handing a non-string placeholder back to the caller.
    raise JSONNotFound("No parsable JSON found in this document")

@ -4,6 +4,42 @@ import time
from flask import url_for
from . util import live_server_setup
def test_unittest_inline_html_extract():
    """Check JSON extraction from inline <script> blobs and from raw JSON.

    Covers three cases: HTML whose first JSON-looking script is broken and
    whose second one is valid, a plain JSON document, and input that holds
    no JSON at all (which must raise JSONNotFound).
    """
    # Imported locally so the module's top-level imports stay untouched.
    import pytest

    # So lets pretend that the JSON we want is inside some HTML
    content="""
<html>
food and stuff and more
<script>
alert('nothing really good here');
</script>
<script type="application/ld+json">
xx {"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cows milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
</script>
<body>
and it can also be repeated
<script type="application/ld+json">
{"@context":"http://schema.org","@type":"Product","name":"Nan Optipro Stage 1 Baby Formula 800g","description":"During the first year of life, nutrition is critical for your baby. NAN OPTIPRO 1 is tailored to ensure your formula fed infant receives balanced, high quality nutrition.<br />Starter infant formula. The age optimised protein source (whey dominant) is from cows milk.<br />Backed by more than 150 years of Nestlé expertise.<br />For hygiene and convenience, it is available in an innovative packaging format with a separate storage area for the scoop, and a semi-transparent window which allows you to see how much powder is left in the can without having to open it.","image":"https://cdn0.woolworths.media/content/wowproductimages/large/155536.jpg","brand":{"@context":"http://schema.org","@type":"Organization","name":"Nan"},"gtin13":"7613287517388","offers":{"@context":"http://schema.org","@type":"Offer","potentialAction":{"@context":"http://schema.org","@type":"BuyAction"},"availability":"http://schema.org/InStock","itemCondition":"http://schema.org/NewCondition","price":23.5,"priceCurrency":"AUD"},"review":[],"sku":"155536"}
</script>
<h4>ok</h4>
</body>
</html>
"""
    from .. import html_tools

    # See that we can find the second <script> one, which is not broken, and matches our filter
    text = html_tools.extract_json_as_string(content, "$.offers.price")
    assert text == "23.5"

    # A plain JSON document is filtered directly, no HTML parsing involved
    text = html_tools.extract_json_as_string('{"id":5}', "$.id")
    assert text == "5"

    # Neither valid JSON nor HTML with a <script> element: must raise
    with pytest.raises(html_tools.JSONNotFound):
        html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
def test_setup(live_server):
    # Hand the live_server argument to the live_server_setup helper imported
    # from .util; what that helper configures is not visible in this file.
    live_server_setup(live_server)

Loading…
Cancel
Save