Adds support for jq JSON path querying engine (#1001)

pull/1004/head
Yusef Ouda 2 years ago committed by GitHub
parent cd467df97a
commit dfa7fc3a81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -33,7 +33,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
#### Key Features
- Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules
- Target elements with XPath and CSS selectors, and easily monitor complex JSON with JSONPath or jq
- Switch between fast non-JS and Chrome JS based "fetchers"
- Easily specify how often a site should be checked
- Execute JS before extracting text (Good for logging in, see examples in the UI!)

@ -47,7 +47,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W
#### Key Features
- Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions!
- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules
- Target elements with XPath and CSS selectors, and easily monitor complex JSON with JSONPath or jq
- Switch between fast non-JS and Chrome JS based "fetchers"
- Easily specify how often a site should be checked
- Execute JS before extracting text (Good for logging in, see examples in the UI!)
@ -121,7 +121,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io
## Filters
XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools.
(We support LXML `re:test`, `re:match` and `re:replace`.)
@ -151,7 +151,7 @@ Now you can also customise your notification content!
## JSON API Monitoring
Detect changes and monitor data in JSON API's by using the built-in JSONPath selectors as a filter / selector.
Detect changes and monitor data in JSON APIs by using either JSONPath or jq to filter, parse, and restructure JSON as needed.
![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-filter-field-example.png)
@ -159,9 +159,52 @@ This will re-parse the JSON and apply formatting to the text, making it super ea
![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-diff-example.png)
### JSONPath or jq?
For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq.
The example below adds the price in dollars to each item in the JSON data, and then filters the result to show only the items whose price in dollars is greater than 10.
#### Sample input data from API
```
{
"items": [
{
"name": "Product A",
"priceInCents": 2500
},
{
"name": "Product B",
"priceInCents": 500
},
{
"name": "Product C",
"priceInCents": 2000
}
]
}
```
#### Sample jq
`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)`
#### Sample output data
```
{
"name": "Product A",
"priceInCents": 2500,
"priceInDollars": 25
}
{
"name": "Product C",
"priceInCents": 2000,
"priceInDollars": 20
}
```
### Parse JSON embedded in HTML!
When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
When you enable a `json:` or `jq:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites.
```
<html>
@ -171,7 +214,7 @@ When you enable a `json:` filter, you can even automatically extract and parse e
</script>
```
`json:$.price` would give `23.50`, or you can extract the whole structure
`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure
## Proxy configuration

@ -141,8 +141,9 @@ class perform_site_check():
has_filter_rule = True
if has_filter_rule:
if 'json:' in css_filter_rule:
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule)
json_filter_prefixes = ['json:', 'jq:']
if any(prefix in css_filter_rule for prefix in json_filter_prefixes):
stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule)
is_html = False
if is_html or is_source:

@ -304,6 +304,21 @@ class ValidateCSSJSONXPATHInput(object):
# Re #265 - maybe in the future fetch the page and offer a
# warning/notice that it's possible the rule doesn't yet match anything?
# Validate jq rules: any line carrying the 'jq:' marker must compile as a jq program.
if 'jq:' in line:
    # jq is a JSON-only filter, so refuse it on fields that do not accept JSON rules.
    if not self.allow_json:
        raise ValidationError("jq not permitted in this field!")

    # Imported locally so the jq binding is only loaded when a jq rule is checked.
    import jq

    # Drop the 'jq:' marker to get the bare jq program text.
    # (Named jq_expression rather than `input` to avoid shadowing the builtin.)
    jq_expression = line.replace('jq:', '')
    try:
        # Compiling is enough to surface syntax errors; nothing is executed here.
        jq.compile(jq_expression)
    except ValueError as e:
        # Report the compiler's own message back to the user.
        message = field.gettext('\'%s\' is not a valid jq expression. (%s)')
        raise ValidationError(message % (jq_expression, str(e))) from e
    except Exception as e:
        # Catch-all for unexpected library failures. `Exception` (not a bare
        # `except:`) lets KeyboardInterrupt/SystemExit propagate normally.
        raise ValidationError("A system-error occurred when validating your jq expression") from e
class quickWatchForm(Form):
url = fields.URLField('URL', validators=[validateURL()])

@ -3,6 +3,7 @@ from typing import List
from bs4 import BeautifulSoup
from jsonpath_ng.ext import parse
import jq
import re
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
@ -79,19 +80,26 @@ def extract_element(find='title', html_content=''):
return element_text
#
def _parse_json(json_data, jsonpath_filter):
s=[]
jsonpath_expression = parse(jsonpath_filter.replace('json:', ''))
match = jsonpath_expression.find(json_data)
def _parse_json(json_data, json_filter):
    """Run a prefixed JSON filter against already-decoded JSON data.

    :param json_data: Decoded JSON (the result of ``json.loads``).
    :param json_filter: Filter rule starting with ``json:`` (JSONPath) or
        ``jq:`` (jq program).
    :return: Whatever ``_get_stripped_text_from_json_match`` produces for the
        matches, or ``None`` when the rule carries neither known prefix.
    """
    rule = json_filter.strip()

    # Dispatch on the *leading* marker only. Substring tests plus str.replace()
    # would misroute a jq program that merely mentions "json:" inside a string
    # literal, and would also delete that text from the expression itself.
    if rule.startswith('json:'):
        jsonpath_expression = parse(rule[len('json:'):])
        match = jsonpath_expression.find(json_data)
        return _get_stripped_text_from_json_match(match)

    if rule.startswith('jq:'):
        jq_expression = jq.compile(rule[len('jq:'):])
        # .all() collects every value the jq program emits into a list.
        match = jq_expression.input(json_data).all()
        return _get_stripped_text_from_json_match(match)

    # Unknown or missing prefix: nothing to apply.
    return None
def _get_stripped_text_from_json_match(match):
s = []
# More than one result, we will return it as a JSON list.
if len(match) > 1:
for i in match:
s.append(i.value)
s.append(i.value if hasattr(i, 'value') else i)
# Single value, use just the value, as it could be later used in a token in notifications.
if len(match) == 1:
s = match[0].value
s = match[0].value if hasattr(match[0], 'value') else match[0]
# Re #257 - Better handling where it does not exist, in the case the original 's' value was False..
if not match:
@ -103,16 +111,16 @@ def _parse_json(json_data, jsonpath_filter):
return stripped_text_from_html
def extract_json_as_string(content, jsonpath_filter):
def extract_json_as_string(content, json_filter):
stripped_text_from_html = False
# Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded <script type=ldjson>
try:
stripped_text_from_html = _parse_json(json.loads(content), jsonpath_filter)
stripped_text_from_html = _parse_json(json.loads(content), json_filter)
except json.JSONDecodeError:
# Foreach <script json></script> blob.. just return the first that matches jsonpath_filter
# Foreach <script json></script> blob.. just return the first that matches json_filter
s = []
soup = BeautifulSoup(content, 'html.parser')
bs_result = soup.findAll('script')
@ -131,7 +139,7 @@ def extract_json_as_string(content, jsonpath_filter):
# Just skip it
continue
else:
stripped_text_from_html = _parse_json(json_data, jsonpath_filter)
stripped_text_from_html = _parse_json(json_data, json_filter)
if stripped_text_from_html:
break

@ -184,8 +184,12 @@ User-Agent: wonderbra 1.0") }}
<span class="pure-form-message-inline">
<ul>
<li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li>
<li>JSON - Limit text to this JSON rule, using <a href="https://pypi.org/project/jsonpath-ng/">JSONPath</a>, prefix with <code>"json:"</code>, use <code>json:$</code> to force re-formatting if required, <a
href="https://jsonpath.com/" target="new">test your JSONPath here</a></li>
<li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a>.
<ul>
<li>JSONPath: Prefix with <code>json:</code>, use <code>json:$</code> to force re-formatting if required, <a href="https://jsonpath.com/" target="new">test your JSONPath here</a>.</li>
<li>jq: Prefix with <code>jq:</code> and <a href="https://jqplay.org/" target="new">test your jq here</a>. Using <a href="https://stedolan.github.io/jq/" target="new">jq</a> allows for complex filtering and processing of JSON data with built-in functions, regex, filtering, and more. See examples and documentation <a href="https://stedolan.github.io/jq/manual/" target="new">here</a>.</li>
</ul>
</li>
<li>XPath - Limit text to this XPath rule, simply start with a forward-slash,
<ul>
<li>Example: <code>//*[contains(@class, 'sametext')]</code> or <code>xpath://*[contains(@class, 'sametext')]</code>, <a
@ -194,7 +198,7 @@ User-Agent: wonderbra 1.0") }}
</ul>
</li>
</ul>
Please be sure that you thoroughly understand how to write CSS or JSONPath, XPath selector rules before filing an issue on GitHub! <a
Please be sure that you thoroughly understand how to write CSS, JSONPath, XPath, or jq selector rules before filing an issue on GitHub! <a
href="https://github.com/dgtlmoon/changedetection.io/wiki/CSS-Selector-help">here for more CSS selector help</a>.<br/>
</span>
</div>

@ -2,7 +2,7 @@
# coding=utf-8
import time
from flask import url_for
from flask import url_for, escape
from . util import live_server_setup
import pytest
@ -36,16 +36,26 @@ and it can also be repeated
from .. import html_tools
# See that we can find the second <script> one, which is not broken, and matches our filter
text = html_tools.extract_json_as_string(content, "$.offers.price")
text = html_tools.extract_json_as_string(content, "json:$.offers.price")
assert text == "23.5"
text = html_tools.extract_json_as_string('{"id":5}', "$.id")
# also check for jq
text = html_tools.extract_json_as_string(content, "jq:.offers.price")
assert text == "23.5"
text = html_tools.extract_json_as_string('{"id":5}', "json:$.id")
assert text == "5"
text = html_tools.extract_json_as_string('{"id":5}', "jq:.id")
assert text == "5"
# When nothing at all is found, it should raise JSONNotFound,
# which is caught and shown to the user in the watch-overview table
with pytest.raises(html_tools.JSONNotFound) as e_info:
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "$.id")
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "json:$.id")
with pytest.raises(html_tools.JSONNotFound) as e_info:
html_tools.extract_json_as_string('COMPLETE GIBBERISH, NO JSON!', "jq:.id")
def set_original_ext_response():
data = """
@ -66,6 +76,7 @@ def set_original_ext_response():
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(data)
return None
def set_modified_ext_response():
data = """
@ -86,6 +97,7 @@ def set_modified_ext_response():
with open("test-datastore/endpoint-content.txt", "w") as f:
f.write(data)
return None
def set_original_response():
test_return_data = """
@ -184,10 +196,10 @@ def test_check_json_without_filter(client, live_server):
assert b'&#34;&lt;b&gt;' in res.data
assert res.data.count(b'{\n') >= 2
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_json_filter(client, live_server):
json_filter = 'json:boss.name'
def check_json_filter(json_filter, client, live_server):
set_original_response()
# Give the endpoint time to spin up
@ -226,7 +238,7 @@ def test_check_json_filter(client, live_server):
res = client.get(
url_for("edit_page", uuid="first"),
)
assert bytes(json_filter.encode('utf-8')) in res.data
assert bytes(escape(json_filter).encode('utf-8')) in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
@ -252,10 +264,16 @@ def test_check_json_filter(client, live_server):
# And #462 - check we see the proper utf-8 string there
assert "Örnsköldsvik".encode('utf-8') in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_filter(client, live_server):
    """Run the shared JSON filter scenario with a JSONPath rule."""
    check_json_filter(
        json_filter='json:boss.name',
        client=client,
        live_server=live_server,
    )
def test_check_json_filter_bool_val(client, live_server):
json_filter = "json:$['available']"
def test_check_jq_filter(client, live_server):
    """Run the shared JSON filter scenario with the equivalent jq rule."""
    check_json_filter(
        json_filter='jq:.boss.name',
        client=client,
        live_server=live_server,
    )
def check_json_filter_bool_val(json_filter, client, live_server):
set_original_response()
# Give the endpoint time to spin up
@ -304,14 +322,21 @@ def test_check_json_filter_bool_val(client, live_server):
# But the change should be there, tho its hard to test the change was detected because it will show old and new versions
assert b'false' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_filter_bool_val(client, live_server):
    """Run the shared boolean-value scenario with a JSONPath rule."""
    check_json_filter_bool_val(
        json_filter="json:$['available']",
        client=client,
        live_server=live_server,
    )
def test_check_jq_filter_bool_val(client, live_server):
    """Run the shared boolean-value scenario with the equivalent jq rule."""
    check_json_filter_bool_val(
        json_filter="jq:.available",
        client=client,
        live_server=live_server,
    )
# Re #265 - Extended JSON selector test
# Stuff to consider here
# - Selector should be allowed to return empty when it doesnt match (people might wait for some condition)
# - The 'diff' tab could show the old and new content
# - Form should let us enter a selector that doesnt (yet) match anything
def test_check_json_ext_filter(client, live_server):
json_filter = 'json:$[?(@.status==Sold)]'
def check_json_ext_filter(json_filter, client, live_server):
set_original_ext_response()
# Give the endpoint time to spin up
@ -350,7 +375,7 @@ def test_check_json_ext_filter(client, live_server):
res = client.get(
url_for("edit_page", uuid="first"),
)
assert bytes(json_filter.encode('utf-8')) in res.data
assert bytes(escape(json_filter).encode('utf-8')) in res.data
# Trigger a check
client.get(url_for("form_watch_checknow"), follow_redirects=True)
@ -376,3 +401,11 @@ def test_check_json_ext_filter(client, live_server):
assert b'ForSale' not in res.data
assert b'Sold' in res.data
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
def test_check_jsonpath_ext_filter(client, live_server):
    """Run the shared extended-selector scenario with a JSONPath filter rule."""
    check_json_ext_filter(
        json_filter='json:$[?(@.status==Sold)]',
        client=client,
        live_server=live_server,
    )
def test_check_jq_ext_filter(client, live_server):
    """Run the shared extended-selector scenario with a jq select() rule."""
    check_json_ext_filter(
        json_filter='jq:.[] | select(.status | contains("Sold"))',
        client=client,
        live_server=live_server,
    )

@ -16,6 +16,7 @@ chardet > 2.3.0
wtforms ~= 3.0
jsonpath-ng ~= 1.5.3
jq ~= 1.3.0
# Notification library
apprise ~= 1.1.0

Loading…
Cancel
Save