diff --git a/.github/workflows/test-container-build.yml b/.github/workflows/test-container-build.yml new file mode 100644 index 00000000..dc6ab712 --- /dev/null +++ b/.github/workflows/test-container-build.yml @@ -0,0 +1,46 @@ +name: ChangeDetection.io Container Build Test + +# Triggers the workflow on push or pull request events +on: + push: + paths: + - requirements.txt + - Dockerfile + + # Changes to requirements.txt packages and Dockerfile may or may not always be compatible with arm etc, so worth testing + # @todo: some kind of path filter for requirements.txt and Dockerfile +jobs: + test-container-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + # Just test that the build works, some libraries won't compile on ARM/rPi etc + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + with: + image: tonistiigi/binfmt:latest + platforms: all + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + install: true + version: latest + driver-opts: image=moby/buildkit:master + + - name: Test that the docker containers can build + id: docker_build + uses: docker/build-push-action@v2 + # https://github.com/docker/build-push-action#customizing + with: + context: ./ + file: ./Dockerfile + platforms: linux/arm/v7,linux/arm/v6,linux/amd64,linux/arm64, + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,dest=/tmp/.buildx-cache diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index baf1d178..aac97335 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -1,28 +1,25 @@ -name: ChangeDetection.io Test +name: ChangeDetection.io App Test # Triggers the workflow on push or pull request events on: [push, pull_request] jobs: - test-build: + test-application: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - name: Set up Python 3.9 uses: actions/setup-python@v2 with: python-version: 3.9 - - name: Show env vars - run: set - - name: Install dependencies run: | python -m pip install --upgrade pip pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi + - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -39,7 +36,4 @@ jobs: # Each test is totally isolated and performs its own cleanup/reset cd changedetectionio; ./run_all_tests.sh - # https://github.com/docker/build-push-action/blob/master/docs/advanced/test-before-push.md ? - # https://github.com/docker/buildx/issues/59 ? Needs to be one platform? - # https://github.com/docker/buildx/issues/495#issuecomment-918925854 diff --git a/Dockerfile b/Dockerfile index 4e797e5c..24d3490e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,13 +5,14 @@ FROM python:3.8-slim as builder ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1 RUN apt-get update && apt-get install -y --no-install-recommends \ - libssl-dev \ - libffi-dev \ + g++ \ gcc \ libc-dev \ + libffi-dev \ + libssl-dev \ libxslt-dev \ - zlib1g-dev \ - g++ + make \ + zlib1g-dev RUN mkdir /install WORKDIR /install @@ -22,7 +23,7 @@ RUN pip install --target=/dependencies -r /requirements.txt # Playwright is an alternative to Selenium # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing -RUN pip install --target=/dependencies playwright~=1.25 \ +RUN pip install --target=/dependencies playwright~=1.26 \ || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled." # Final image stage diff --git a/README-pip.md b/README-pip.md index 746175db..b6a00d32 100644 --- a/README-pip.md +++ b/README-pip.md @@ -33,7 +33,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W #### Key Features - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions! -- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules +- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq - Switch between fast non-JS and Chrome JS based "fetchers" - Easily specify how often a site should be checked - Execute JS before extracting text (Good for logging in, see examples in the UI!) diff --git a/README.md b/README.md index 0d08d129..797f8c56 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W #### Key Features - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions! -- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules +- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq - Switch between fast non-JS and Chrome JS based "fetchers" - Easily specify how often a site should be checked - Execute JS before extracting text (Good for logging in, see examples in the UI!) @@ -121,7 +121,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io ## Filters -XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. +XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. (We support LXML `re:test`, `re:math` and `re:replace`.) @@ -151,7 +151,7 @@ Now you can also customise your notification content! ## JSON API Monitoring -Detect changes and monitor data in JSON API's by using the built-in JSONPath selectors as a filter / selector. +Detect changes and monitor data in JSON API's by using either JSONPath or jq to filter, parse, and restructure JSON as needed. ![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-filter-field-example.png) @@ -159,9 +159,52 @@ This will re-parse the JSON and apply formatting to the text, making it super ea ![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-diff-example.png) +### JSONPath or jq? + +For more complex parsing, filtering, and modifying of JSON data, jq is recommended due to the built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq. + +The example below adds the price in dollars to each item in the JSON data, and then filters to only show items that are greater than 10. + +#### Sample input data from API +``` +{ + "items": [ + { + "name": "Product A", + "priceInCents": 2500 + }, + { + "name": "Product B", + "priceInCents": 500 + }, + { + "name": "Product C", + "priceInCents": 2000 + } + ] +} +``` + +#### Sample jq +`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)` + +#### Sample output data +``` +{ + "name": "Product A", + "priceInCents": 2500, + "priceInDollars": 25 +} +{ + "name": "Product C", + "priceInCents": 2000, + "priceInDollars": 20 +} +``` + ### Parse JSON embedded in HTML! -When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. +When you enable a `json:` or `jq:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. ``` @@ -171,7 +214,7 @@ When you enable a `json:` filter, you can even automatically extract and parse e ``` -`json:$.price` would give `23.50`, or you can extract the whole structure +`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure ## Proxy configuration diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 31b65657..52844c48 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -33,7 +33,7 @@ from flask_wtf import CSRFProtect from changedetectionio import html_tools from changedetectionio.api import api_v1 -__version__ = '0.39.19.1' +__version__ = '0.39.20.1' datastore = None diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 3543a285..ca938048 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -404,6 +404,11 @@ class base_html_playwright(Fetcher): raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url) + else: + # JS eval was run, now we also wait some time if possible to let the page settle + if self.render_extract_delay: + page.wait_for_timeout(self.render_extract_delay * 1000) + page.wait_for_timeout(500) self.content = page.content() @@ -529,8 +534,6 @@ class base_html_webdriver(Fetcher): # Selenium doesn't automatically wait for actions as good as Playwright, so wait again self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) - self.screenshot = self.driver.get_screenshot_as_png() - # @todo - how to check this? is it possible? self.status_code = 200 # @todo somehow we should try to get this working for WebDriver @@ -542,6 +545,8 @@ class base_html_webdriver(Fetcher): self.raw_content = self.driver.page_source self.headers = {} + self.screenshot = self.driver.get_screenshot_as_png() + # Does the connection to the webdriver work? run a test connection. def is_ready(self): from selenium import webdriver @@ -580,6 +585,11 @@ class html_requests(Fetcher): ignore_status_codes=False, current_css_filter=None): + # Make requests use a more modern looking user-agent + if not 'User-Agent' in request_headers: + request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') + proxies = {} # Allows override the proxy on a per-request basis diff --git a/changedetectionio/fetch_processor/json_html_plaintext.py b/changedetectionio/fetch_processor/json_html_plaintext.py index 4fddbc02..ec0d02ec 100644 --- a/changedetectionio/fetch_processor/json_html_plaintext.py +++ b/changedetectionio/fetch_processor/json_html_plaintext.py @@ -143,8 +143,9 @@ class perform_site_check(fetch_processor): has_filter_rule = True if has_filter_rule: - if 'json:' in css_filter_rule: - stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) + json_filter_prefixes = ['json:', 'jq:'] + if any(prefix in css_filter_rule for prefix in json_filter_prefixes): + stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule) is_html = False if is_html or is_source: diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index a3d6731f..5a44b225 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -304,6 +304,21 @@ class ValidateCSSJSONXPATHInput(object): # Re #265 - maybe in the future fetch the page and offer a # warning/notice that its possible the rule doesnt yet match anything? + if 'jq:' in line: + if not self.allow_json: + raise ValidationError("jq not permitted in this field!") + + import jq + input = line.replace('jq:', '') + + try: + jq.compile(input) + except (ValueError) as e: + message = field.gettext('\'%s\' is not a valid jq expression. (%s)') + raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your jq expression") + class quickWatchForm(Form): from . import fetch_processor diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index a851a4d6..6cc8e20a 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -3,6 +3,7 @@ from typing import List from bs4 import BeautifulSoup from jsonpath_ng.ext import parse +import jq import re from inscriptis import get_text from inscriptis.model.config import ParserConfig @@ -79,19 +80,26 @@ def extract_element(find='title', html_content=''): return element_text # -def _parse_json(json_data, jsonpath_filter): - s=[] - jsonpath_expression = parse(jsonpath_filter.replace('json:', '')) - match = jsonpath_expression.find(json_data) - +def _parse_json(json_data, json_filter): + if 'json:' in json_filter: + jsonpath_expression = parse(json_filter.replace('json:', '')) + match = jsonpath_expression.find(json_data) + return _get_stripped_text_from_json_match(match) + if 'jq:' in json_filter: + jq_expression = jq.compile(json_filter.replace('jq:', '')) + match = jq_expression.input(json_data).all() + return _get_stripped_text_from_json_match(match) + +def _get_stripped_text_from_json_match(match): + s = [] # More than one result, we will return it as a JSON list. if len(match) > 1: for i in match: - s.append(i.value) + s.append(i.value if hasattr(i, 'value') else i) # Single value, use just the value, as it could be later used in a token in notifications. if len(match) == 1: - s = match[0].value + s = match[0].value if hasattr(match[0], 'value') else match[0] # Re #257 - Better handling where it does not exist, in the case the original 's' value was False.. if not match: @@ -103,16 +111,16 @@ def _parse_json(json_data, jsonpath_filter): return stripped_text_from_html -def extract_json_as_string(content, jsonpath_filter): +def extract_json_as_string(content, json_filter): stripped_text_from_html = False # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded blob.. just return the first that matches jsonpath_filter + # Foreach blob.. just return the first that matches json_filter s = [] soup = BeautifulSoup(content, 'html.parser') bs_result = soup.findAll('script') @@ -131,7 +139,7 @@ def extract_json_as_string(content, jsonpath_filter): # Just skip it continue else: - stripped_text_from_html = _parse_json(json_data, jsonpath_filter) + stripped_text_from_html = _parse_json(json_data, json_filter) if stripped_text_from_html: break diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index c5f0e977..daedde1b 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -13,10 +13,6 @@ class model(dict): 'watching': {}, 'settings': { 'headers': { - 'User-Agent': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', - 'Accept-Encoding': 'gzip, deflate', # No support for brolti in python requests yet. - 'Accept-Language': 'en-GB,en-US;q=0.9,en;' }, 'requests': { 'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), # Default 45 seconds diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 4eb5dcd0..bd86039a 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -81,8 +81,6 @@ class ChangeDetectionStore: except (FileNotFoundError, json.decoder.JSONDecodeError): if include_default_watches: print("Creating JSON store at", self.datastore_path) - - self.add_watch(url='http://www.quotationspage.com/random.php', tag='test') self.add_watch(url='https://news.ycombinator.com/', tag='Tech news') self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io') @@ -577,3 +575,11 @@ class ChangeDetectionStore: continue return + + # We incorrectly used common header overrides that should only apply to Requests + # These are now handled in content_fetcher::html_requests and shouldnt be passed to Playwright/Selenium + def update_7(self): + # These were hard-coded in early versions + for v in ['User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language']: + if self.data['settings']['headers'].get(v): + del self.data['settings']['headers'][v] diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 95d59cfa..7ff711f5 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -79,6 +79,7 @@

Use the Basic method (default) where your watched site doesn't need Javascript to render.

The Chrome/Javascript method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'.

+ Tip: Connect using BrightData Proxies, find out more here.
{% if form.proxy %} @@ -188,8 +189,12 @@ User-Agent: wonderbra 1.0") }}