diff --git a/README-pip.md b/README-pip.md index 746175db..b6a00d32 100644 --- a/README-pip.md +++ b/README-pip.md @@ -33,7 +33,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W #### Key Features - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions! -- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules +- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq - Switch between fast non-JS and Chrome JS based "fetchers" - Easily specify how often a site should be checked - Execute JS before extracting text (Good for logging in, see examples in the UI!) diff --git a/README.md b/README.md index 0d08d129..797f8c56 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W #### Key Features - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions! -- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules +- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq - Switch between fast non-JS and Chrome JS based "fetchers" - Easily specify how often a site should be checked - Execute JS before extracting text (Good for logging in, see examples in the UI!) @@ -121,7 +121,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io ## Filters -XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. +XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. (We support LXML `re:test`, `re:math` and `re:replace`.) 
@@ -151,7 +151,7 @@ Now you can also customise your notification content! ## JSON API Monitoring -Detect changes and monitor data in JSON API's by using the built-in JSONPath selectors as a filter / selector. +Detect changes and monitor data in JSON APIs by using either JSONPath or jq to filter, parse, and restructure JSON as needed. ![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-filter-field-example.png) @@ -159,9 +159,52 @@ This will re-parse the JSON and apply formatting to the text, making it super ea ![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-diff-example.png) +### JSONPath or jq? + +For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq. + +The example below adds the price in dollars to each item in the JSON data, and then filters to only show items whose price in dollars is greater than 10. + +#### Sample input data from API +``` +{ + "items": [ + { + "name": "Product A", + "priceInCents": 2500 + }, + { + "name": "Product B", + "priceInCents": 500 + }, + { + "name": "Product C", + "priceInCents": 2000 + } + ] +} +``` + +#### Sample jq +`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)` + +#### Sample output data +``` +{ + "name": "Product A", + "priceInCents": 2500, + "priceInDollars": 25 +} +{ + "name": "Product C", + "priceInCents": 2000, + "priceInDollars": 20 +} +``` + ### Parse JSON embedded in HTML! -When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. +When you enable a `json:` or `jq:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! 
Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. ``` @@ -171,7 +214,7 @@ When you enable a `json:` filter, you can even automatically extract and parse e ``` -`json:$.price` would give `23.50`, or you can extract the whole structure +`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure ## Proxy configuration diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 79e282b5..0f84da16 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -141,8 +141,9 @@ class perform_site_check(): has_filter_rule = True if has_filter_rule: - if 'json:' in css_filter_rule: - stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) + json_filter_prefixes = ['json:', 'jq:'] + if any(prefix in css_filter_rule for prefix in json_filter_prefixes): + stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule) is_html = False if is_html or is_source: diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 279f7c7f..7fa17f90 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -304,6 +304,21 @@ class ValidateCSSJSONXPATHInput(object): # Re #265 - maybe in the future fetch the page and offer a # warning/notice that its possible the rule doesnt yet match anything? + if 'jq:' in line: + if not self.allow_json: + raise ValidationError("jq not permitted in this field!") + + import jq + input = line.replace('jq:', '') + + try: + jq.compile(input) + except (ValueError) as e: + message = field.gettext('\'%s\' is not a valid jq expression. 
(%s)') + raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your jq expression") + class quickWatchForm(Form): url = fields.URLField('URL', validators=[validateURL()]) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index a851a4d6..6cc8e20a 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -3,6 +3,7 @@ from typing import List from bs4 import BeautifulSoup from jsonpath_ng.ext import parse +import jq import re from inscriptis import get_text from inscriptis.model.config import ParserConfig @@ -79,19 +80,26 @@ def extract_element(find='title', html_content=''): return element_text # -def _parse_json(json_data, jsonpath_filter): - s=[] - jsonpath_expression = parse(jsonpath_filter.replace('json:', '')) - match = jsonpath_expression.find(json_data) - +def _parse_json(json_data, json_filter): + if 'json:' in json_filter: + jsonpath_expression = parse(json_filter.replace('json:', '')) + match = jsonpath_expression.find(json_data) + return _get_stripped_text_from_json_match(match) + if 'jq:' in json_filter: + jq_expression = jq.compile(json_filter.replace('jq:', '')) + match = jq_expression.input(json_data).all() + return _get_stripped_text_from_json_match(match) + +def _get_stripped_text_from_json_match(match): + s = [] # More than one result, we will return it as a JSON list. if len(match) > 1: for i in match: - s.append(i.value) + s.append(i.value if hasattr(i, 'value') else i) # Single value, use just the value, as it could be later used in a token in notifications. if len(match) == 1: - s = match[0].value + s = match[0].value if hasattr(match[0], 'value') else match[0] # Re #257 - Better handling where it does not exist, in the case the original 's' value was False.. 
if not match: @@ -103,16 +111,16 @@ def _parse_json(json_data, jsonpath_filter): return stripped_text_from_html -def extract_json_as_string(content, jsonpath_filter): +def extract_json_as_string(content, json_filter): stripped_text_from_html = False # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded blob.. just return the first that matches jsonpath_filter + # Foreach blob.. just return the first that matches json_filter s = [] soup = BeautifulSoup(content, 'html.parser') bs_result = soup.findAll('script') @@ -131,7 +139,7 @@ def extract_json_as_string(content, jsonpath_filter): # Just skip it continue else: - stripped_text_from_html = _parse_json(json_data, jsonpath_filter) + stripped_text_from_html = _parse_json(json_data, json_filter) if stripped_text_from_html: break diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 64e9cee3..907894e1 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -184,8 +184,12 @@ User-Agent: wonderbra 1.0") }}