diff --git a/.github/workflows/test-container-build.yml b/.github/workflows/test-container-build.yml new file mode 100644 index 00000000..dc6ab712 --- /dev/null +++ b/.github/workflows/test-container-build.yml @@ -0,0 +1,46 @@ +name: ChangeDetection.io Container Build Test + +# Triggers the workflow on push or pull request events +on: + push: + paths: + - requirements.txt + - Dockerfile + + # Changes to requirements.txt packages and Dockerfile may or may not always be compatible with arm etc, so worth testing + # @todo: some kind of path filter for requirements.txt and Dockerfile +jobs: + test-container-build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + # Just test that the build works, some libraries won't compile on ARM/rPi etc + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + with: + image: tonistiigi/binfmt:latest + platforms: all + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + install: true + version: latest + driver-opts: image=moby/buildkit:master + + - name: Test that the docker containers can build + id: docker_build + uses: docker/build-push-action@v2 + # https://github.com/docker/build-push-action#customizing + with: + context: ./ + file: ./Dockerfile + platforms: linux/arm/v7,linux/arm/v6,linux/amd64,linux/arm64, + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,dest=/tmp/.buildx-cache diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index baf1d178..aac97335 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -1,28 +1,25 @@ -name: ChangeDetection.io Test +name: ChangeDetection.io App Test # Triggers the workflow on push or pull request events on: [push, pull_request] jobs: - test-build: + test-application: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - name: Set up Python 3.9 uses: 
actions/setup-python@v2 with: python-version: 3.9 - - name: Show env vars - run: set - - name: Install dependencies run: | python -m pip install --upgrade pip pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi + - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -39,7 +36,4 @@ jobs: # Each test is totally isolated and performs its own cleanup/reset cd changedetectionio; ./run_all_tests.sh - # https://github.com/docker/build-push-action/blob/master/docs/advanced/test-before-push.md ? - # https://github.com/docker/buildx/issues/59 ? Needs to be one platform? - # https://github.com/docker/buildx/issues/495#issuecomment-918925854 diff --git a/Dockerfile b/Dockerfile index 03463647..24d3490e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,13 +5,14 @@ FROM python:3.8-slim as builder ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1 RUN apt-get update && apt-get install -y --no-install-recommends \ - libssl-dev \ - libffi-dev \ + g++ \ gcc \ libc-dev \ + libffi-dev \ + libssl-dev \ libxslt-dev \ - zlib1g-dev \ - g++ + make \ + zlib1g-dev RUN mkdir /install WORKDIR /install @@ -22,7 +23,7 @@ RUN pip install --target=/dependencies -r /requirements.txt # Playwright is an alternative to Selenium # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing -RUN pip install --target=/dependencies playwright~=1.24 \ +RUN pip install --target=/dependencies playwright~=1.26 \ || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled." # Final image stage diff --git a/README-pip.md b/README-pip.md index 746175db..b6a00d32 100644 --- a/README-pip.md +++ b/README-pip.md @@ -33,7 +33,7 @@ _Need an actual Chrome runner with Javascript support? 
We support fetching via W #### Key Features - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions! -- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules +- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq - Switch between fast non-JS and Chrome JS based "fetchers" - Easily specify how often a site should be checked - Execute JS before extracting text (Good for logging in, see examples in the UI!) diff --git a/README.md b/README.md index f2e75672..797f8c56 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,14 @@ Know when important content changes, we support notifications via Discord, Teleg [**Don't have time? Let us host it for you! try our $6.99/month subscription - use our proxies and support!**](https://lemonade.changedetection.io/start) , _half the price of other website change monitoring services and comes with unlimited watches & checks!_ +- Chrome browser included. +- Super fast, no registration needed setup. +- Start watching and receiving change notifications instantly. -- Automatic Updates, Automatic Backups, No Heroku "paused application", don't miss a change! -- Javascript browser included -- Unlimited checks and watches! +Easily see what changed, examine by word, line, or individual character. + + #### Example use cases @@ -44,22 +47,18 @@ _Need an actual Chrome runner with Javascript support? We support fetching via W #### Key Features - Lots of trigger filters, such as "Trigger on text", "Remove text by selector", "Ignore text", "Extract text", also using regular-expressions! 
-- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JsonPath rules +- Target elements with xPath and CSS Selectors, Easily monitor complex JSON with JSONPath or jq - Switch between fast non-JS and Chrome JS based "fetchers" - Easily specify how often a site should be checked - Execute JS before extracting text (Good for logging in, see examples in the UI!) - Override Request Headers, Specify `POST` or `GET` and other methods - Use the "Visual Selector" to help target specific elements +- Configurable [proxy per watch](https://github.com/dgtlmoon/changedetection.io/wiki/Proxy-configuration) +We [recommend and use Bright Data](https://brightdata.grsm.io/n0r16zf7eivq) global proxy services; Bright Data will match any first deposit up to $100 using our signup link. ## Screenshots -### Examine differences in content. - -Easily see what changed, examine by word, line, or individual character. - - - Please :star: star :star: this project and help it grow! https://github.com/dgtlmoon/changedetection.io/ ### Filter by elements using the Visual Selector tool. @@ -122,7 +121,7 @@ See the wiki for more information https://github.com/dgtlmoon/changedetection.io ## Filters -XPath, JSONPath and CSS support comes baked in! You can be as specific as you need, use XPath exported from various XPath element query creation tools. +XPath, JSONPath, jq, and CSS support comes baked in! You can be as specific as you need; for example, use XPath exported from various XPath element query creation tools. (We support LXML `re:test`, `re:math` and `re:replace`.) @@ -152,7 +151,7 @@ Now you can also customise your notification content! ## JSON API Monitoring -Detect changes and monitor data in JSON API's by using the built-in JSONPath selectors as a filter / selector. +Detect changes and monitor data in JSON APIs by using either JSONPath or jq to filter, parse, and restructure JSON as needed. 
![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-filter-field-example.png) @@ -160,9 +159,52 @@ This will re-parse the JSON and apply formatting to the text, making it super ea ![image](https://raw.githubusercontent.com/dgtlmoon/changedetection.io/master/docs/json-diff-example.png) +### JSONPath or jq? + +For more complex parsing, filtering, and modifying of JSON data, jq is recommended because of its built-in operators and functions. Refer to the [documentation](https://stedolan.github.io/jq/manual/) for more information on jq. + +The example below adds the price in dollars to each item in the JSON data, and then filters to show only the items whose price in dollars is greater than 10. + +#### Sample input data from API +``` +{ + "items": [ + { + "name": "Product A", + "priceInCents": 2500 + }, + { + "name": "Product B", + "priceInCents": 500 + }, + { + "name": "Product C", + "priceInCents": 2000 + } + ] +} +``` + +#### Sample jq +`jq:.items[] | . + { "priceInDollars": (.priceInCents / 100) } | select(.priceInDollars > 10)` + +#### Sample output data +``` +{ + "name": "Product A", + "priceInCents": 2500, + "priceInDollars": 25 +} +{ + "name": "Product C", + "priceInCents": 2000, + "priceInDollars": 20 +} +``` + ### Parse JSON embedded in HTML! -When you enable a `json:` filter, you can even automatically extract and parse embedded JSON inside a HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. +When you enable a `json:` or `jq:` filter, you can even automatically extract and parse embedded JSON inside an HTML page! Amazingly handy for sites that build content based on JSON, such as many e-commerce websites. 
``` @@ -172,7 +214,7 @@ When you enable a `json:` filter, you can even automatically extract and parse e ``` -`json:$.price` would give `23.50`, or you can extract the whole structure +`json:$.price` or `jq:.price` would give `23.50`, or you can extract the whole structure ## Proxy configuration diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index bdfe8b9f..8f6d5a55 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -33,7 +33,7 @@ from flask_wtf import CSRFProtect from changedetectionio import html_tools from changedetectionio.api import api_v1 -__version__ = '0.39.19' +__version__ = '0.39.20.1' datastore = None @@ -547,6 +547,7 @@ def changedetection_app(config=None, datastore_o=None): # Defaults for proxy choice if datastore.proxy_list is not None: # When enabled + # @todo # Radio needs '' not None, or incase that the chosen one no longer exists if default['proxy'] is None or not any(default['proxy'] in tup for tup in datastore.proxy_list): default['proxy'] = '' @@ -560,7 +561,10 @@ def changedetection_app(config=None, datastore_o=None): # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead del form.proxy else: - form.proxy.choices = [('', 'Default')] + datastore.proxy_list + form.proxy.choices = [('', 'Default')] + for p in datastore.proxy_list: + form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label']))) + if request.method == 'POST' and form.validate(): extra_update_obj = {} @@ -657,15 +661,16 @@ def changedetection_app(config=None, datastore_o=None): default = deepcopy(datastore.data['settings']) if datastore.proxy_list is not None: + available_proxies = list(datastore.proxy_list.keys()) # When enabled system_proxy = datastore.data['settings']['requests']['proxy'] # In the case it doesnt exist anymore - if not any([system_proxy in tup for tup in datastore.proxy_list]): + if not system_proxy in available_proxies: system_proxy = None - 
default['requests']['proxy'] = system_proxy if system_proxy is not None else datastore.proxy_list[0][0] + default['requests']['proxy'] = system_proxy if system_proxy is not None else available_proxies[0] # Used by the form handler to keep or remove the proxy settings - default['proxy_list'] = datastore.proxy_list + default['proxy_list'] = available_proxies[0] # Don't use form.data on POST so that it doesnt overrid the checkbox status from the POST status @@ -680,7 +685,10 @@ def changedetection_app(config=None, datastore_o=None): # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead del form.requests.form.proxy else: - form.requests.form.proxy.choices = datastore.proxy_list + form.requests.form.proxy.choices = [] + for p in datastore.proxy_list: + form.requests.form.proxy.choices.append(tuple((p, datastore.proxy_list[p]['label']))) + if request.method == 'POST': # Password unset is a GET, but we can lock the session to a salted env password to always need the password @@ -1189,7 +1197,7 @@ def changedetection_app(config=None, datastore_o=None): datastore.delete(uuid.strip()) flash("{} watches deleted".format(len(uuids))) - if (op == 'pause'): + elif (op == 'pause'): for uuid in uuids: uuid = uuid.strip() if datastore.data['watching'].get(uuid): @@ -1197,13 +1205,40 @@ def changedetection_app(config=None, datastore_o=None): flash("{} watches paused".format(len(uuids))) - if (op == 'unpause'): + elif (op == 'unpause'): for uuid in uuids: uuid = uuid.strip() if datastore.data['watching'].get(uuid): datastore.data['watching'][uuid.strip()]['paused'] = False flash("{} watches unpaused".format(len(uuids))) + elif (op == 'mute'): + for uuid in uuids: + uuid = uuid.strip() + if datastore.data['watching'].get(uuid): + datastore.data['watching'][uuid.strip()]['notification_muted'] = True + flash("{} watches muted".format(len(uuids))) + + elif (op == 'unmute'): + for uuid in uuids: + uuid = uuid.strip() + if datastore.data['watching'].get(uuid): 
+ datastore.data['watching'][uuid.strip()]['notification_muted'] = False + flash("{} watches un-muted".format(len(uuids))) + + elif (op == 'notification-default'): + from changedetectionio.notification import ( + default_notification_format_for_watch + ) + for uuid in uuids: + uuid = uuid.strip() + if datastore.data['watching'].get(uuid): + datastore.data['watching'][uuid.strip()]['notification_title'] = None + datastore.data['watching'][uuid.strip()]['notification_body'] = None + datastore.data['watching'][uuid.strip()]['notification_urls'] = [] + datastore.data['watching'][uuid.strip()]['notification_format'] = default_notification_format_for_watch + flash("{} watches set to use default notification settings".format(len(uuids))) + return redirect(url_for('index')) @app.route("/api/share-url", methods=['GET']) @@ -1341,6 +1376,8 @@ def ticker_thread_check_time_launch_checks(): import random from changedetectionio import update_worker + proxy_last_called_time = {} + recheck_time_minimum_seconds = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 20)) print("System env MINIMUM_SECONDS_RECHECK_TIME", recheck_time_minimum_seconds) @@ -1401,10 +1438,30 @@ def ticker_thread_check_time_launch_checks(): if watch.jitter_seconds == 0: watch.jitter_seconds = random.uniform(-abs(jitter), jitter) - seconds_since_last_recheck = now - watch['last_checked'] + if seconds_since_last_recheck >= (threshold + watch.jitter_seconds) and seconds_since_last_recheck >= recheck_time_minimum_seconds: if not uuid in running_uuids and uuid not in [q_uuid for p,q_uuid in update_q.queue]: + + # Proxies can be set to have a limit on seconds between which they can be called + watch_proxy = datastore.get_preferred_proxy_for_watch(uuid=uuid) + if watch_proxy and watch_proxy in list(datastore.proxy_list.keys()): + # Proxy may also have some threshold minimum + proxy_list_reuse_time_minimum = int(datastore.proxy_list.get(watch_proxy, {}).get('reuse_time_minimum', 0)) + if proxy_list_reuse_time_minimum: + 
proxy_last_used_time = proxy_last_called_time.get(watch_proxy, 0) + time_since_proxy_used = int(time.time() - proxy_last_used_time) + if time_since_proxy_used < proxy_list_reuse_time_minimum: + # Not enough time difference reached, skip this watch + print("> Skipped UUID {} using proxy '{}', not enough time between proxy requests {}s/{}s".format(uuid, + watch_proxy, + time_since_proxy_used, + proxy_list_reuse_time_minimum)) + continue + else: + # Record the last used time + proxy_last_called_time[watch_proxy] = int(time.time()) + # Use Epoch time as priority, so we get a "sorted" PriorityQueue, but we can still push a priority 1 into it. priority = int(time.time()) print( diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d831ce84..416ed6df 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -316,6 +316,7 @@ class base_html_playwright(Fetcher): import playwright._impl._api_types from playwright._impl._api_types import Error, TimeoutError response = None + with sync_playwright() as p: browser_type = getattr(p, self.browser_type) @@ -373,8 +374,11 @@ class base_html_playwright(Fetcher): print("response object was none") raise EmptyReply(url=url, status_code=None) - # Bug 2(?) 
Set the viewport size AFTER loading the page - page.set_viewport_size({"width": 1280, "height": 1024}) + + # Removed browser-set-size, seemed to be needed to make screenshots work reliably in older playwright versions + # Was causing exceptions like 'waiting for page but content is changing' etc + # https://www.browserstack.com/docs/automate/playwright/change-browser-window-size 1280x720 should be the default + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay time.sleep(extra_wait) @@ -398,6 +402,13 @@ class base_html_playwright(Fetcher): raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url) + else: + # JS eval was run, now we also wait some time if possible to let the page settle + if self.render_extract_delay: + page.wait_for_timeout(self.render_extract_delay * 1000) + + page.wait_for_timeout(500) + self.content = page.content() self.status_code = response.status self.headers = response.all_headers() @@ -514,8 +525,6 @@ class base_html_webdriver(Fetcher): # Selenium doesn't automatically wait for actions as good as Playwright, so wait again self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) - self.screenshot = self.driver.get_screenshot_as_png() - # @todo - how to check this? is it possible? self.status_code = 200 # @todo somehow we should try to get this working for WebDriver @@ -526,6 +535,8 @@ class base_html_webdriver(Fetcher): self.content = self.driver.page_source self.headers = {} + self.screenshot = self.driver.get_screenshot_as_png() + # Does the connection to the webdriver work? run a test connection. 
def is_ready(self): from selenium import webdriver @@ -564,6 +575,11 @@ class html_requests(Fetcher): ignore_status_codes=False, current_css_filter=None): + # Make requests use a more modern looking user-agent + if not 'User-Agent' in request_headers: + request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') + proxies = {} # Allows override the proxy on a per-request basis diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index d8bcb1d0..c1fc373f 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -21,34 +21,6 @@ class perform_site_check(): super().__init__(*args, **kwargs) self.datastore = datastore - # If there was a proxy list enabled, figure out what proxy_args/which proxy to use - # if watch.proxy use that - # fetcher.proxy_override = watch.proxy or main config proxy - # Allows override the proxy on a per-request basis - # ALWAYS use the first one is nothing selected - - def set_proxy_from_list(self, watch): - proxy_args = None - if self.datastore.proxy_list is None: - return None - - # If its a valid one - if any([watch['proxy'] in p for p in self.datastore.proxy_list]): - proxy_args = watch['proxy'] - - # not valid (including None), try the system one - else: - system_proxy = self.datastore.data['settings']['requests']['proxy'] - # Is not None and exists - if any([system_proxy in p for p in self.datastore.proxy_list]): - proxy_args = system_proxy - - # Fallback - Did not resolve anything, use the first available - if proxy_args is None: - proxy_args = self.datastore.proxy_list[0][0] - - return proxy_args - # Doesn't look like python supports forward slash auto enclosure in re.findall # So convert it to inline flag "foobar(?i)" type configuration def forward_slash_enclosed_regex_to_options(self, regex): @@ -69,6 +41,8 @@ class 
perform_site_check(): stripped_text_from_html = "" watch = self.datastore.data['watching'].get(uuid) + if not watch: + return # Protect against file:// access if re.search(r'^file', watch['url'], re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): @@ -91,7 +65,7 @@ class perform_site_check(): if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') - timeout = self.datastore.data['settings']['requests']['timeout'] + timeout = self.datastore.data['settings']['requests'].get('timeout') url = watch.get('url') request_body = self.datastore.data['watching'][uuid].get('body') request_method = self.datastore.data['watching'][uuid].get('method') @@ -111,9 +85,13 @@ class perform_site_check(): # If the klass doesnt exist, just use a default klass = getattr(content_fetcher, "html_requests") + proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) + proxy_url = None + if proxy_id: + proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') + print ("UUID {} Using proxy {}".format(uuid, proxy_url)) - proxy_args = self.set_proxy_from_list(watch) - fetcher = klass(proxy_override=proxy_args) + fetcher = klass(proxy_override=proxy_url) # Configurable per-watch or global extra delay before extracting text (for webDriver types) system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) @@ -164,8 +142,9 @@ class perform_site_check(): has_filter_rule = True if has_filter_rule: - if 'json:' in css_filter_rule: - stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, jsonpath_filter=css_filter_rule) + json_filter_prefixes = ['json:', 'jq:'] + if any(prefix in css_filter_rule for prefix in json_filter_prefixes): + stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule) is_html = False if is_html or is_source: diff 
--git a/changedetectionio/forms.py b/changedetectionio/forms.py index 7c67fa9f..0008d98b 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -304,6 +304,21 @@ class ValidateCSSJSONXPATHInput(object): # Re #265 - maybe in the future fetch the page and offer a # warning/notice that its possible the rule doesnt yet match anything? + if 'jq:' in line: + if not self.allow_json: + raise ValidationError("jq not permitted in this field!") + + import jq + input = line.replace('jq:', '') + + try: + jq.compile(input) + except (ValueError) as e: + message = field.gettext('\'%s\' is not a valid jq expression. (%s)') + raise ValidationError(message % (input, str(e))) + except: + raise ValidationError("A system-error occurred when validating your jq expression") + class ValidateDiffFilters(object): """ Validates that at least one filter checkbox is selected @@ -316,6 +331,7 @@ class ValidateDiffFilters(object): message = field.gettext('At least one filter checkbox must be selected') raise ValidationError(message) + class quickWatchForm(Form): url = fields.URLField('URL', validators=[validateURL()]) tag = StringField('Group tag', [validators.Optional()]) @@ -325,14 +341,14 @@ class quickWatchForm(Form): # Common to a single watch and the global settings class commonSettingsForm(Form): - notification_urls = StringListField('Notification URL list', validators=[validators.Optional(), ValidateAppRiseServers()]) - notification_title = StringField('Notification title', default=default_notification_title, validators=[validators.Optional(), ValidateTokensList()]) - notification_body = TextAreaField('Notification body', default=default_notification_body, validators=[validators.Optional(), ValidateTokensList()]) - notification_format = SelectField('Notification format', choices=valid_notification_formats.keys(), default=default_notification_format) + notification_title = StringField('Notification title', validators=[validators.Optional(), ValidateTokensList()]) + 
notification_body = TextAreaField('Notification body', validators=[validators.Optional(), ValidateTokensList()]) + notification_format = SelectField('Notification format', choices=valid_notification_formats.keys()) fetch_backend = RadioField(u'Fetch method', choices=content_fetcher.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) extract_title_as_title = BooleanField('Extract
Use the Basic method (default) where your watched sites don't need Javascript to render.
The Chrome/Javascript method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'.
+