diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index a4c7b87c..f2864680 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -93,7 +93,7 @@ jobs: - name: Playwright and SocketPuppetBrowser - Headers and requests run: | # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers - docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' + docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .' - name: Playwright and SocketPuppetBrowser - Restock detection run: | @@ -231,9 +231,9 @@ jobs: docker logs test-cdio-basic-tests > output-logs/test-cdio-basic-tests-stdout-${{ env.PYTHON_VERSION }}.txt docker logs test-cdio-basic-tests 2> output-logs/test-cdio-basic-tests-stderr-${{ env.PYTHON_VERSION }}.txt - - name: Store container log + - name: Store everything including test-datastore if: always() uses: actions/upload-artifact@v4 with: name: test-cdio-basic-tests-output-py${{ env.PYTHON_VERSION }} - path: output-logs + path: . 
diff --git a/Dockerfile b/Dockerfile index 626759cb..6641b947 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,6 +40,8 @@ FROM python:${PYTHON_VERSION}-slim-bookworm RUN apt-get update && apt-get install -y --no-install-recommends \ libxslt1.1 \ + # For presenting price amounts correctly in the restock/price detection overview + locales \ # For pdftohtml poppler-utils \ zlib1g \ diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py index 19d83612..9b3eb440 100644 --- a/changedetectionio/api/api_v1.py +++ b/changedetectionio/api/api_v1.py @@ -12,9 +12,10 @@ import copy # See docs/README.md for rebuilding the docs/apidoc information from . import api_schema +from ..model import watch_base # Build a JSON Schema atleast partially based on our Watch model -from changedetectionio.model.Watch import base_config as watch_base_config +watch_base_config = watch_base() schema = api_schema.build_watch_json_schema(watch_base_config) schema_create_watch = copy.deepcopy(schema) diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py index 62a7dab3..8d7df73f 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -30,7 +30,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): def long_task(uuid, preferred_proxy): import time from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions - from changedetectionio.processors import text_json_diff + from changedetectionio.processors.text_json_diff import text_json_diff from changedetectionio.safe_jinja import render as jinja_render status = {'status': '', 'length': 0, 'text': ''} diff --git a/changedetectionio/blueprint/price_data_follower/__init__.py b/changedetectionio/blueprint/price_data_follower/__init__.py index 89a2fc67..a41552d8 100644 --- a/changedetectionio/blueprint/price_data_follower/__init__.py +++ 
b/changedetectionio/blueprint/price_data_follower/__init__.py @@ -17,6 +17,8 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q: PriorityQueue @price_data_follower_blueprint.route("/<string:uuid>/accept", methods=['GET']) def accept(uuid): datastore.data['watching'][uuid]['track_ldjson_price_data'] = PRICE_DATA_TRACK_ACCEPT + datastore.data['watching'][uuid]['processor'] = 'restock_diff' + datastore.data['watching'][uuid].clear_watch() update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False})) return redirect(url_for("index")) diff --git a/changedetectionio/blueprint/tags/__init__.py b/changedetectionio/blueprint/tags/__init__.py index 7a49822b..e826aea0 100644 --- a/changedetectionio/blueprint/tags/__init__.py +++ b/changedetectionio/blueprint/tags/__init__.py @@ -103,7 +103,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): default = datastore.data['settings']['application']['tags'].get(uuid) - form = forms.watchForm(formdata=request.form if request.method == 'POST' else None, + form = forms.processor_text_json_diff_form(formdata=request.form if request.method == 'POST' else None, data=default, ) form.datastore=datastore # needed?
@@ -126,7 +126,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): default = datastore.data['settings']['application']['tags'].get(uuid) - form = forms.watchForm(formdata=request.form if request.method == 'POST' else None, + form = forms.processor_text_json_diff_form(formdata=request.form if request.method == 'POST' else None, data=default, ) # @todo subclass form so validation works diff --git a/changedetectionio/content_fetchers/exceptions/__init__.py b/changedetectionio/content_fetchers/exceptions/__init__.py index 9552b838..80ebae69 100644 --- a/changedetectionio/content_fetchers/exceptions/__init__.py +++ b/changedetectionio/content_fetchers/exceptions/__init__.py @@ -1,6 +1,5 @@ from loguru import logger - class Non200ErrorCodeReceived(Exception): def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None): # Set this so we can use it in other parts of the app @@ -81,7 +80,7 @@ class ScreenshotUnavailable(Exception): self.status_code = status_code self.url = url if page_html: - from html_tools import html_to_text + from changedetectionio.html_tools import html_to_text self.page_text = html_to_text(page_html) return diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index d3f341d5..f07bbfcb 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1,18 +1,22 @@ #!/usr/bin/python3 import datetime +import flask_login +import locale import os +import pytz import queue import threading import time +import timeago + +from .processors import find_processors, get_parent_module, get_custom_watch_obj_for_processor from .safe_jinja import render as jinja_render from changedetectionio.strtobool import strtobool from copy import deepcopy from functools import wraps from threading import Event -import flask_login -import pytz -import timeago + from feedgen.feed import FeedGenerator from flask import ( Flask, @@ -79,6 +83,14 @@ csrf = CSRFProtect() csrf.init_app(app) 
notification_debug_log=[] +# get locale ready +default_locale = locale.getdefaultlocale() +logger.info(f"System locale default is {default_locale}") +try: + locale.setlocale(locale.LC_ALL, default_locale) +except locale.Error: + logger.warning(f"Unable to set locale {default_locale}, locale is not installed maybe?") + watch_api = Api(app, decorators=[csrf.exempt]) def init_app_secret(datastore_path): @@ -108,6 +120,14 @@ def get_darkmode_state(): def get_css_version(): return __version__ +@app.template_filter('format_number_locale') +def _jinja2_filter_format_number_locale(value: float) -> str: + "Formats for example 4000.10 to the local locale default of 4,000.10" + # Format the number with two decimal places (locale format string will return 6 decimal) + formatted_value = locale.format_string("%.2f", value, grouping=True) + + return formatted_value + # We use the whole watch object from the store/JSON so we can see if there's some related status in terms of a thread # running or something similar. @app.template_filter('format_last_checked_time') @@ -616,11 +636,11 @@ def changedetection_app(config=None, datastore_o=None): @login_optionally_required # https://stackoverflow.com/questions/42984453/wtforms-populate-form-with-data-if-data-exists # https://wtforms.readthedocs.io/en/3.0.x/forms/#wtforms.form.Form.populate_obj ? - def edit_page(uuid): from . import forms from .blueprint.browser_steps.browser_steps import browser_step_ui_config from . 
import processors + import importlib # More for testing, possible to return the first/only if not datastore.data['watching'].keys(): @@ -652,9 +672,30 @@ def changedetection_app(config=None, datastore_o=None): # Radio needs '' not None, or incase that the chosen one no longer exists if default['proxy'] is None or not any(default['proxy'] in tup for tup in datastore.proxy_list): default['proxy'] = '' - # proxy_override set to the json/text list of the items - form = forms.watchForm(formdata=request.form if request.method == 'POST' else None, + + # Find the (module, name) tuple of this watch's processor so its custom edit form (if one exists) can be loaded + processor_name = datastore.data['watching'][uuid].get('processor', '') + processor_classes = next((tpl for tpl in find_processors() if tpl[1] == processor_name), None) + if not processor_classes: + flash(f"Cannot load the edit form for processor/plugin '{processor_name}', plugin missing?", 'error') + return redirect(url_for('index')) + + parent_module = get_parent_module(processor_classes[0]) + + try: + # Get the parent of the "processor.py" go up one, get the form (kinda spaghetti but its reusing existing code) + forms_module = importlib.import_module(f"{parent_module.__name__}.forms") + # Access the 'processor_settings_form' class from the 'forms' module + form_class = getattr(forms_module, 'processor_settings_form') + except ModuleNotFoundError: + # No .forms module beside the processor, fall back to the default edit form + form_class = forms.processor_text_json_diff_form + except AttributeError: + # .forms exists but has no 'processor_settings_form' (or the parent module could not be resolved), fall back to the default edit form + form_class = forms.processor_text_json_diff_form + + form = form_class(formdata=request.form if request.method == 'POST' else None, data=default ) @@ -679,6 +720,11 @@ def changedetection_app(config=None, datastore_o=None): if request.method == 'POST' and form.validate(): + # If they changed processor, it makes sense to reset it.
+ if datastore.data['watching'][uuid].get('processor') != form.data.get('processor'): + datastore.data['watching'][uuid].clear_watch() + flash("Reset watch history due to change of processor") + extra_update_obj = { 'consecutive_filter_failures': 0, 'last_error' : False @@ -720,10 +766,11 @@ def changedetection_app(config=None, datastore_o=None): datastore.data['watching'][uuid].update(form.data) datastore.data['watching'][uuid].update(extra_update_obj) - if request.args.get('unpause_on_save'): - flash("Updated watch - unpaused!") - else: - flash("Updated watch.") + # Recast it if need be to right data Watch handler + watch_class = get_custom_watch_obj_for_processor(form.data.get('processor')) + datastore.data['watching'][uuid] = watch_class(datastore_path=datastore_o.datastore_path, default=datastore.data['watching'][uuid]) + + flash("Updated watch - unpaused!" if request.args.get('unpause_on_save') else "Updated watch.") # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds # But in the case something is added we should save straight away @@ -753,6 +800,7 @@ def changedetection_app(config=None, datastore_o=None): jq_support = False watch = datastore.data['watching'].get(uuid) + system_uses_webdriver = datastore.data['settings']['application']['fetch_backend'] == 'html_webdriver' is_html_webdriver = False @@ -761,23 +809,41 @@ def changedetection_app(config=None, datastore_o=None): # Only works reliably with Playwright visualselector_enabled = os.getenv('PLAYWRIGHT_DRIVER_URL', False) and is_html_webdriver + template_args = { + 'available_processors': processors.available_processors(), + 'browser_steps_config': browser_step_ui_config, + 'emailprefix': os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False), + 'extra_title': f" - Edit - {watch.label}", + 'extra_processor_config': form.extra_tab_content(), + 'form': form, + 'has_default_notification_urls': True if len(datastore.data['settings']['application']['notification_urls']) else 
False, + 'has_extra_headers_file': len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0, + 'has_special_tag_options': _watch_has_tag_options_set(watch=watch), + 'is_html_webdriver': is_html_webdriver, + 'jq_support': jq_support, + 'playwright_enabled': os.getenv('PLAYWRIGHT_DRIVER_URL', False), + 'settings_application': datastore.data['settings']['application'], + 'using_global_webdriver_wait': not default['webdriver_delay'], + 'uuid': uuid, + 'visualselector_enabled': visualselector_enabled, + 'watch': watch + } + + included_content = None + if form.extra_form_content(): + # So that the extra panels can access _helpers.html etc, we set the environment to load from templates/ + # And then render the code from the module + from jinja2 import Environment, FileSystemLoader + import importlib.resources + templates_dir = str(importlib.resources.files("changedetectionio").joinpath('templates')) + env = Environment(loader=FileSystemLoader(templates_dir)) + template = env.from_string(form.extra_form_content()) + included_content = template.render(**template_args) + output = render_template("edit.html", - available_processors=processors.available_processors(), - browser_steps_config=browser_step_ui_config, - emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False), - extra_title=f" - Edit - {watch.label}", - form=form, - has_default_notification_urls=True if len(datastore.data['settings']['application']['notification_urls']) else False, - has_extra_headers_file=len(datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) > 0, - has_special_tag_options=_watch_has_tag_options_set(watch=watch), - is_html_webdriver=is_html_webdriver, - jq_support=jq_support, - playwright_enabled=os.getenv('PLAYWRIGHT_DRIVER_URL', False), - settings_application=datastore.data['settings']['application'], - using_global_webdriver_wait=not default['webdriver_delay'], - uuid=uuid, - visualselector_enabled=visualselector_enabled, - watch=watch + 
extra_tab_content=form.extra_tab_content() if form.extra_tab_content() else None, + extra_form_content=included_content, + **template_args ) return output @@ -887,7 +953,7 @@ def changedetection_app(config=None, datastore_o=None): if request.values.get('urls') and len(request.values.get('urls').strip()): # Import and push into the queue for immediate update check importer = import_url_list() - importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor')) + importer.run(data=request.values.get('urls'), flash=flash, datastore=datastore, processor=request.values.get('processor', 'text_json_diff')) for uuid in importer.new_uuids: update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': True})) @@ -1388,7 +1454,7 @@ def changedetection_app(config=None, datastore_o=None): update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': watch_uuid, 'skip_when_checksum_same': False})) i += 1 - flash("{} watches queued for rechecking.".format(i)) + flash(f"{i} watches queued for rechecking.") return redirect(url_for('index', tag=tag)) @app.route("/form/checkbox-operations", methods=['POST']) diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 16720aa2..2cefae90 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -1,5 +1,6 @@ import os import re + from changedetectionio.strtobool import strtobool from wtforms import ( @@ -419,15 +420,18 @@ class quickWatchForm(Form): # Common to a single watch and the global settings class commonSettingsForm(Form): + from . 
import processors - notification_urls = StringListField('Notification URL List', validators=[validators.Optional(), ValidateAppRiseServers(), ValidateJinja2Template()]) - notification_title = StringField('Notification Title', default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()]) + extract_title_as_title = BooleanField('Extract from document and use as watch title', default=False) + fetch_backend = RadioField(u'Fetch Method', choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) notification_body = TextAreaField('Notification Body', default='{{ watch_url }} had a change.', validators=[validators.Optional(), ValidateJinja2Template()]) notification_format = SelectField('Notification format', choices=valid_notification_formats.keys()) - fetch_backend = RadioField(u'Fetch Method', choices=content_fetchers.available_fetchers(), validators=[ValidateContentFetcherIsReady()]) - extract_title_as_title = BooleanField('Extract <title> from document and use as watch title', default=False) + notification_title = StringField('Notification Title', default='ChangeDetection.io Notification - {{ watch_url }}', validators=[validators.Optional(), ValidateJinja2Template()]) + notification_urls = StringListField('Notification URL List', validators=[validators.Optional(), ValidateAppRiseServers(), ValidateJinja2Template()]) + processor = RadioField( label=u"Processor - What do you want to achieve?", choices=processors.available_processors(), default="text_json_diff") webdriver_delay = IntegerField('Wait seconds before extracting text', validators=[validators.Optional(), validators.NumberRange(min=1, message="Should contain one or more seconds")]) + class importForm(Form): from . 
import processors processor = RadioField(u'Processor', choices=processors.available_processors(), default="text_json_diff") @@ -447,7 +451,7 @@ class SingleBrowserStep(Form): # remove_button = SubmitField('-', render_kw={"type": "button", "class": "pure-button pure-button-primary", 'title': 'Remove'}) # add_button = SubmitField('+', render_kw={"type": "button", "class": "pure-button pure-button-primary", 'title': 'Add new step after'}) -class watchForm(commonSettingsForm): +class processor_text_json_diff_form(commonSettingsForm): url = fields.URLField('URL', validators=[validateURL()]) tags = StringTagUUID('Group tag', [validators.Optional()], default='') @@ -475,9 +479,6 @@ class watchForm(commonSettingsForm): filter_text_replaced = BooleanField('Replaced/changed lines', default=True) filter_text_removed = BooleanField('Removed lines', default=True) - # @todo this class could be moved to its own text_json_diff_watchForm and this goes to restock_diff_Watchform perhaps - in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True) - trigger_text = StringListField('Trigger/wait for text', [validators.Optional(), ValidateListRegex()]) if os.getenv("PLAYWRIGHT_DRIVER_URL"): browser_steps = FieldList(FormField(SingleBrowserStep), min_entries=10) @@ -493,6 +494,12 @@ class watchForm(commonSettingsForm): notification_muted = BooleanField('Notifications Muted / Off', default=False) notification_screenshot = BooleanField('Attach screenshot to notification (where possible)', default=False) + def extra_tab_content(self): + return None + + def extra_form_content(self): + return None + def validate(self, **kwargs): if not super().validate(): return False @@ -513,7 +520,6 @@ class watchForm(commonSettingsForm): result = False return result - class SingleExtraProxy(Form): # maybe better to set some <script>var.. 
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 232a558c..bd5fdb8f 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -243,7 +243,7 @@ def _get_stripped_text_from_json_match(match): # ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector) def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None): stripped_text_from_html = False - +# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags try: stripped_text_from_html = _parse_json(json.loads(content), json_filter) @@ -282,17 +282,19 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None if isinstance(json_data, dict): # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part) - # @type could also be a list (Product, SubType) + # @type could also be a list although non-standard ("@type": ["Product", "SubType"],) # LD_JSON auto-extract also requires some content PLUS the ldjson to be present # 1833 - could be either str or dict, should not be anything else - if json_data.get('@type') and stripped_text_from_html: - try: - if json_data.get('@type') == str or json_data.get('@type') == dict: - types = [json_data.get('@type')] if isinstance(json_data.get('@type'), str) else json_data.get('@type') - if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in types]: - break - except: - continue + + t = json_data.get('@type') + if t and stripped_text_from_html: + + if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower(): + break + # The non-standard part, some have a list + elif isinstance(t, list): + if 
ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]: + break elif stripped_text_from_html: break diff --git a/changedetectionio/model/Tag.py b/changedetectionio/model/Tag.py index 1592cf08..60e27a48 100644 --- a/changedetectionio/model/Tag.py +++ b/changedetectionio/model/Tag.py @@ -1,19 +1,14 @@ -from .Watch import base_config -import uuid -class model(dict): +from changedetectionio.model import watch_base - def __init__(self, *arg, **kw): +class model(watch_base): - self.update(base_config) + def __init__(self, *arg, **kw): - self['uuid'] = str(uuid.uuid4()) + super(model, self).__init__(*arg, **kw) if kw.get('default'): self.update(kw['default']) del kw['default'] - # Goes at the end so we update the default object with the initialiser - super(model, self).__init__(*arg, **kw) - diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 553f6227..e4697f38 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -1,10 +1,8 @@ from changedetectionio.strtobool import strtobool from changedetectionio.safe_jinja import render as jinja_render - +from . import watch_base import os import re -import time -import uuid from pathlib import Path from loguru import logger @@ -15,69 +13,6 @@ SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):' minimum_seconds_recheck_time = int(os.getenv('MINIMUM_SECONDS_RECHECK_TIME', 3)) mtable = {'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 86400 * 7} -from changedetectionio.notification import ( - default_notification_format_for_watch -) - -base_config = { - 'body': None, - 'browser_steps': [], - 'browser_steps_last_error_step': None, - 'check_unique_lines': False, # On change-detected, compare against all history if its something new - 'check_count': 0, - 'date_created': None, - 'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine. 
- 'extract_text': [], # Extract text by regex after filters - 'extract_title_as_title': False, - 'fetch_backend': 'system', # plaintext, playwright etc - 'fetch_time': 0.0, - 'processor': 'text_json_diff', # could be restock_diff or others from .processors - 'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), - 'filter_text_added': True, - 'filter_text_replaced': True, - 'filter_text_removed': True, - 'has_ldjson_price_data': None, - 'track_ldjson_price_data': None, - 'headers': {}, # Extra headers to send - 'ignore_text': [], # List of text to ignore when calculating the comparison checksum - 'in_stock' : None, - 'in_stock_only' : True, # Only trigger change on going to instock from out-of-stock - 'include_filters': [], - 'last_checked': 0, - 'last_error': False, - 'last_viewed': 0, # history key value of the last viewed via the [diff] link - 'method': 'GET', - 'notification_alert_count': 0, - # Custom notification content - 'notification_body': None, - 'notification_format': default_notification_format_for_watch, - 'notification_muted': False, - 'notification_title': None, - 'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL - 'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise) - 'paused': False, - 'previous_md5': False, - 'previous_md5_before_filters': False, # Used for skipping changedetection entirely - 'proxy': None, # Preferred proxy connection - 'remote_server_reply': None, # From 'server' reply header - 'sort_text_alphabetically': False, - 'subtractive_selectors': [], - 'tag': '', # Old system of text name for a tag, to be removed - 'tags': [], # list of UUIDs to App.Tags - 'text_should_not_be_present': [], # Text that should not present - # Re #110, so then if this is set to None, we know to use the default value instead - # Requires setting to None on submit if it's the same as the default - # 
Should be all None by default, so we use the system default in this case. - 'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None}, - 'time_between_check_use_default': True, - 'title': None, - 'trigger_text': [], # List of text or regex to wait for until a change is detected - 'url': '', - 'uuid': str(uuid.uuid4()), - 'webdriver_delay': None, - 'webdriver_js_execute_code': None, # Run before change-detection -} - def is_safe_url(test_url): # See https://github.com/dgtlmoon/changedetection.io/issues/1358 @@ -94,30 +29,26 @@ def is_safe_url(test_url): return True -class model(dict): + +class model(watch_base): __newest_history_key = None __history_n = 0 jitter_seconds = 0 def __init__(self, *arg, **kw): - - self.update(base_config) self.__datastore_path = kw['datastore_path'] - - self['uuid'] = str(uuid.uuid4()) - del kw['datastore_path'] - + super(model, self).__init__(*arg, **kw) if kw.get('default'): self.update(kw['default']) del kw['default'] + if self.get('default'): + del self['default'] + # Be sure the cached timestamp is ready bump = self.history - # Goes at the end so we update the default object with the initialiser - super(model, self).__init__(*arg, **kw) - @property def viewed(self): # Don't return viewed when last_viewed is 0 and newest_key is 0 @@ -157,6 +88,33 @@ class model(dict): ready_url=ready_url.replace('source:', '') return ready_url + def clear_watch(self): + import pathlib + + # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc + for item in pathlib.Path(str(self.watch_data_dir)).rglob("*.*"): + os.unlink(item) + + # Force the attr to recalculate + bump = self.history + + # Do this last because it will trigger a recheck due to last_checked being zero + self.update({ + 'browser_steps_last_error_step': None, + 'check_count': 0, + 'fetch_time': 0.0, + 'has_ldjson_price_data': None, + 'last_checked': 0, + 'last_error': False, + 'last_notification_error': False, + 
'last_viewed': 0, + 'previous_md5': False, + 'previous_md5_before_filters': False, + 'remote_server_reply': None, + 'track_ldjson_price_data': None + }) + return + @property def is_source_type_url(self): return self.get('url', '').startswith('source:') @@ -258,6 +216,13 @@ class model(dict): return has_browser_steps + @property + def has_restock_info(self): + if self.get('restock') and self['restock'].get('in_stock') != None: + return True + + return False + # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. @property def newest_history_key(self): diff --git a/changedetectionio/model/__init__.py b/changedetectionio/model/__init__.py index e69de29b..e439de4f 100644 --- a/changedetectionio/model/__init__.py +++ b/changedetectionio/model/__init__.py @@ -0,0 +1,73 @@ +import os +import uuid + +from changedetectionio import strtobool +from changedetectionio.notification import default_notification_format_for_watch + +class watch_base(dict): + + def __init__(self, *arg, **kw): + self.update({ + # Custom notification content + # Re #110, so then if this is set to None, we know to use the default value instead + # Requires setting to None on submit if it's the same as the default + # Should be all None by default, so we use the system default in this case. + 'body': None, + 'browser_steps': [], + 'browser_steps_last_error_step': None, + 'check_count': 0, + 'check_unique_lines': False, # On change-detected, compare against all history if its something new + 'consecutive_filter_failures': 0, # Every time the CSS/xPath filter cannot be located, reset when all is fine. 
+ 'date_created': None, + 'extract_text': [], # Extract text by regex after filters + 'extract_title_as_title': False, + 'fetch_backend': 'system', # plaintext, playwright etc + 'fetch_time': 0.0, + 'filter_failure_notification_send': strtobool(os.getenv('FILTER_FAILURE_NOTIFICATION_SEND_DEFAULT', 'True')), + 'filter_text_added': True, + 'filter_text_removed': True, + 'filter_text_replaced': True, + 'follow_price_changes': True, + 'has_ldjson_price_data': None, + 'headers': {}, # Extra headers to send + 'ignore_text': [], # List of text to ignore when calculating the comparison checksum + 'in_stock_only': True, # Only trigger change on going to instock from out-of-stock + 'include_filters': [], + 'last_checked': 0, + 'last_error': False, + 'last_viewed': 0, # history key value of the last viewed via the [diff] link + 'method': 'GET', + 'notification_alert_count': 0, + 'notification_body': None, + 'notification_format': default_notification_format_for_watch, + 'notification_muted': False, + 'notification_screenshot': False, # Include the latest screenshot if available and supported by the apprise URL + 'notification_title': None, + 'notification_urls': [], # List of URLs to add to the notification Queue (Usually AppRise) + 'paused': False, + 'previous_md5': False, + 'previous_md5_before_filters': False, # Used for skipping changedetection entirely + 'processor': 'text_json_diff', # could be restock_diff or others from .processors + 'price_change_threshold_percent': None, + 'proxy': None, # Preferred proxy connection + 'remote_server_reply': None, # From 'server' reply header + 'sort_text_alphabetically': False, + 'subtractive_selectors': [], + 'tag': '', # Old system of text name for a tag, to be removed + 'tags': [], # list of UUIDs to App.Tags + 'text_should_not_be_present': [], # Text that should not present + 'time_between_check': {'weeks': None, 'days': None, 'hours': None, 'minutes': None, 'seconds': None}, + 'time_between_check_use_default': True, + 'title': 
None, + 'track_ldjson_price_data': None, + 'trigger_text': [], # List of text or regex to wait for until a change is detected + 'url': '', + 'uuid': str(uuid.uuid4()), + 'webdriver_delay': None, + 'webdriver_js_execute_code': None, # Run before change-detection + }) + + super(watch_base, self).__init__(*arg, **kw) + + if self.get('default'): + del self['default'] \ No newline at end of file diff --git a/changedetectionio/processors/README.md b/changedetectionio/processors/README.md index 547ae4e8..0cc55572 100644 --- a/changedetectionio/processors/README.md +++ b/changedetectionio/processors/README.md @@ -8,4 +8,8 @@ The concept here is to be able to switch between different domain specific probl Some suggestions for the future - `graphical` -- `restock_and_price` - extract price AND stock text \ No newline at end of file + +## Todo + +- Make each processor return a extra list of sub-processed (so you could configure a single processor in different ways) +- move restock_diff to its own pip/github repo diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index d24c9a9d..0ce96497 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -1,11 +1,14 @@ from abc import abstractmethod from changedetectionio.strtobool import strtobool -from changedetectionio.model import Watch + from copy import deepcopy from loguru import logger import hashlib import os import re +import importlib +import pkgutil +import inspect class difference_detection_processor(): @@ -139,7 +142,7 @@ class difference_detection_processor(): # After init, call run_changedetection() which will do the actual change-detection @abstractmethod - def run_changedetection(self, watch: Watch, skip_when_checksum_same=True): + def run_changedetection(self, watch, skip_when_checksum_same=True): update_obj = {'last_notification_error': False, 'last_error': False} some_data = 'xxxxx' update_obj["previous_md5"] = 
hashlib.md5(some_data.encode('utf-8')).hexdigest() @@ -147,8 +150,83 @@ class difference_detection_processor(): return changed_detected, update_obj, ''.encode('utf-8') +def find_sub_packages(package_name): + """ + Find all sub-packages within the given package. + + :param package_name: The name of the base package to scan for sub-packages. + :return: A list of sub-package names. + """ + package = importlib.import_module(package_name) + return [name for _, name, is_pkg in pkgutil.iter_modules(package.__path__) if is_pkg] + + +def find_processors(): + """ + Find all subclasses of DifferenceDetectionProcessor in the specified package. + + :param package_name: The name of the package to scan for processor modules. + :return: A list of (module, class) tuples. + """ + package_name = "changedetectionio.processors" # Name of the current package/module + + processors = [] + sub_packages = find_sub_packages(package_name) + + for sub_package in sub_packages: + module_name = f"{package_name}.{sub_package}.processor" + try: + module = importlib.import_module(module_name) + + # Iterate through all classes in the module + for name, obj in inspect.getmembers(module, inspect.isclass): + if issubclass(obj, difference_detection_processor) and obj is not difference_detection_processor: + processors.append((module, sub_package)) + except (ModuleNotFoundError, ImportError) as e: + logger.warning(f"Failed to import module {module_name}: {e} (find_processors())") + + return processors + + +def get_parent_module(module): + module_name = module.__name__ + if '.' 
not in module_name: + return None # Top-level module has no parent + parent_module_name = module_name.rsplit('.', 1)[0] + try: + return importlib.import_module(parent_module_name) + except Exception as e: + pass + + return False + + + +def get_custom_watch_obj_for_processor(processor_name): + from changedetectionio.model import Watch + watch_class = Watch.model + processor_classes = find_processors() + custom_watch_obj = next((tpl for tpl in processor_classes if tpl[1] == processor_name), None) + if custom_watch_obj: + # Parent of .processor.py COULD have its own Watch implementation + parent_module = get_parent_module(custom_watch_obj[0]) + if hasattr(parent_module, 'Watch'): + watch_class = parent_module.Watch + + return watch_class + + def available_processors(): - from . import restock_diff, text_json_diff - x=[('text_json_diff', text_json_diff.name), ('restock_diff', restock_diff.name)] - # @todo Make this smarter with introspection of sorts. - return x + """ + Get a list of processors by name and description for the UI elements + :return: A list :) + """ + + processor_classes = find_processors() + + available = [] + for package, processor_class in processor_classes: + available.append((processor_class, package.name)) + + return available + diff --git a/changedetectionio/processors/exceptions.py b/changedetectionio/processors/exceptions.py new file mode 100644 index 00000000..01c99a63 --- /dev/null +++ b/changedetectionio/processors/exceptions.py @@ -0,0 +1,10 @@ +class ProcessorException(Exception): + def __init__(self, message=None, status_code=None, url=None, screenshot=None, has_filters=False, html_content='', xpath_data=None): + self.message = message + self.status_code = status_code + self.url = url + self.screenshot = screenshot + self.has_filters = has_filters + self.html_content = html_content + self.xpath_data = xpath_data + return diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py deleted file 
mode 100644 index a948eb0d..00000000 --- a/changedetectionio/processors/restock_diff.py +++ /dev/null @@ -1,62 +0,0 @@ - -from . import difference_detection_processor -from loguru import logger -import hashlib -import urllib3 - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -name = 'Re-stock detection for single product pages' -description = 'Detects if the product goes back to in-stock' - -class UnableToExtractRestockData(Exception): - def __init__(self, status_code): - # Set this so we can use it in other parts of the app - self.status_code = status_code - return - -class perform_site_check(difference_detection_processor): - screenshot = None - xpath_data = None - - def run_changedetection(self, watch, skip_when_checksum_same=True): - - if not watch: - raise Exception("Watch no longer exists.") - - # Unset any existing notification error - update_obj = {'last_notification_error': False, 'last_error': False} - - self.screenshot = self.fetcher.screenshot - self.xpath_data = self.fetcher.xpath_data - - # Track the content type - update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '') - update_obj["last_check_status"] = self.fetcher.get_last_status_code() - - # Main detection method - fetched_md5 = None - if self.fetcher.instock_data: - fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest() - # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. 
- update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False - logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.") - else: - raise UnableToExtractRestockData(status_code=self.fetcher.status_code) - - # The main thing that all this at the moment comes down to :) - changed_detected = False - logger.debug(f"Watch UUID {watch.get('uuid')} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}") - - if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5: - # Yes if we only care about it going to instock, AND we are in stock - if watch.get('in_stock_only') and update_obj["in_stock"]: - changed_detected = True - - if not watch.get('in_stock_only'): - # All cases - changed_detected = True - - # Always record the new checksum - update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8').strip() diff --git a/changedetectionio/processors/restock_diff/__init__.py b/changedetectionio/processors/restock_diff/__init__.py new file mode 100644 index 00000000..3c48eec9 --- /dev/null +++ b/changedetectionio/processors/restock_diff/__init__.py @@ -0,0 +1,65 @@ + +from changedetectionio.model.Watch import model as BaseWatch +import re +from babel.numbers import parse_decimal + +class Restock(dict): + + def parse_currency(self, raw_value: str) -> float: + # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer. + standardized_value = raw_value + + if ',' in standardized_value and '.' 
in standardized_value: + # Identify the correct decimal separator + if standardized_value.rfind('.') > standardized_value.rfind(','): + standardized_value = standardized_value.replace(',', '') + else: + standardized_value = standardized_value.replace('.', '').replace(',', '.') + else: + standardized_value = standardized_value.replace(',', '.') + + # Remove any non-numeric characters except for the decimal point + standardized_value = re.sub(r'[^\d.-]', '', standardized_value) + + # Convert to float + return float(parse_decimal(standardized_value, locale='en')) + + def __init__(self, *args, **kwargs): + # Define default values + default_values = { + 'in_stock': None, + 'price': None, + 'currency': None, + 'original_price': None + } + + # Initialize the dictionary with default values + super().__init__(default_values) + + # Update with any provided positional arguments (dictionaries) + if args: + if len(args) == 1 and isinstance(args[0], dict): + self.update(args[0]) + else: + raise ValueError("Only one positional argument of type 'dict' is allowed") + + def __setitem__(self, key, value): + # Custom logic to handle setting price and original_price + if key == 'price': + if isinstance(value, str): + value = self.parse_currency(raw_value=value) + + if value and not self.get('original_price'): + self['original_price'] = value + + super().__setitem__(key, value) + +class Watch(BaseWatch): + def __init__(self, *arg, **kw): + super().__init__(*arg, **kw) + self['restock'] = Restock(kw['default']['restock']) if kw.get('default') and kw['default'].get('restock') else Restock() + + def clear_watch(self): + super().clear_watch() + self.update({'restock': Restock()}) + diff --git a/changedetectionio/processors/restock_diff/forms.py b/changedetectionio/processors/restock_diff/forms.py new file mode 100644 index 00000000..1ab01e6e --- /dev/null +++ b/changedetectionio/processors/restock_diff/forms.py @@ -0,0 +1,61 @@ + +from wtforms import ( + BooleanField, + validators, + 
FloatField +) + +from changedetectionio.forms import processor_text_json_diff_form + +class processor_settings_form(processor_text_json_diff_form): + in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True) + price_change_min = FloatField('Minimum amount to trigger notification', [validators.Optional()], + render_kw={"placeholder": "No limit", "size": "10"}) + price_change_max = FloatField('Maximum amount to trigger notification', [validators.Optional()], + render_kw={"placeholder": "No limit", "size": "10"}) + price_change_threshold_percent = FloatField('Threshold in % for price changes', validators=[ + validators.Optional(), + validators.NumberRange(min=0, max=100, message="Should be between 0 and 100"), + ], render_kw={"placeholder": "0%", "size": "5"}) + + follow_price_changes = BooleanField('Follow price changes', default=False) + + def extra_tab_content(self): + return 'Restock & Price Detection' + + def extra_form_content(self): + return """ + {% from '_helpers.html' import render_field, render_checkbox_field, render_button %} + <script> + $(document).ready(function () { + toggleOpacity('#follow_price_changes', '.price-change-minmax', true); + }); + </script> + + + <fieldset> + <div class="pure-control-group"> + <fieldset class="pure-group"> + {{ render_checkbox_field(form.in_stock_only) }} + <span class="pure-form-message-inline">Only trigger notifications when page changes from <strong>out of stock</strong> to <strong>back in stock</strong></span> + </fieldset> + <fieldset class="pure-group"> + {{ render_checkbox_field(form.follow_price_changes) }} + <span class="pure-form-message-inline">Changes in price should trigger a notification</span> + <span class="pure-form-message-inline">When OFF - only care about restock detection</span> + </fieldset> + <fieldset class="pure-group price-change-minmax"> + {{ render_field(form.price_change_min, placeholder=watch['restock']['price']) }} + <span 
class="pure-form-message-inline">Minimum amount, only trigger a change when the price is less than this amount.</span> + </fieldset> + <fieldset class="pure-group price-change-minmax"> + {{ render_field(form.price_change_max, placeholder=watch['restock']['price']) }} + <span class="pure-form-message-inline">Maximum amount, only trigger a change when the price is more than this amount.</span> + </fieldset> + <fieldset class="pure-group price-change-minmax"> + {{ render_field(form.price_change_threshold_percent) }} + <span class="pure-form-message-inline">Price must change more than this % to trigger a change.</span><br> + <span class="pure-form-message-inline">For example, If the product is $1,000 USD, <strong>2%</strong> would mean it has to change more than $20 since the first check.</span><br> + </fieldset> + </div> + </fieldset>""" \ No newline at end of file diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py new file mode 100644 index 00000000..42dd9bd4 --- /dev/null +++ b/changedetectionio/processors/restock_diff/processor.py @@ -0,0 +1,247 @@ +from .. import difference_detection_processor +from ..exceptions import ProcessorException +from . 
import Restock +from loguru import logger +import hashlib +import re +import urllib3 +import time + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +name = 'Re-stock & Price detection for single product pages' +description = 'Detects if the product goes back to in-stock' + +class UnableToExtractRestockData(Exception): + def __init__(self, status_code): + # Set this so we can use it in other parts of the app + self.status_code = status_code + return + +class MoreThanOnePriceFound(Exception): + def __init__(self): + return + +def _search_prop_by_value(matches, value): + for properties in matches: + for prop in properties: + if value in prop[0]: + return prop[1] # Yield the desired value and exit the function + +# should return Restock() +# add casting? +def get_itemprop_availability(html_content) -> Restock: + """ + Kind of funny/cool way to find price/availability in one many different possibilities. + Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it. + """ + from jsonpath_ng import parse + + now = time.time() + import extruct + logger.trace(f"Imported extruct module in {time.time() - now:.3f}s") + + value = {} + now = time.time() + # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest. 
+ + syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph'] + + data = extruct.extract(html_content, syntaxes=syntaxes) + logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s") + + # First phase, dead simple scanning of anything that looks useful + value = Restock() + if data: + logger.debug(f"Using jsonpath to find price/availability/etc") + price_parse = parse('$..(price|Price)') + pricecurrency_parse = parse('$..(pricecurrency|currency|priceCurrency )') + availability_parse = parse('$..(availability|Availability)') + + price_result = price_parse.find(data) + if price_result: + # Right now, we just support single product items, maybe we will store the whole actual metadata seperately in teh future and + # parse that for the UI? + prices_found = set(str(item.value).replace('$', '') for item in price_result) + if len(price_result) > 1 and len(prices_found) > 1: + # See of all prices are different, in the case that one product has many embedded data types with the same price + # One might have $121.95 and another 121.95 etc + logger.warning(f"More than one price found {prices_found}, throwing exception, cant use this plugin.") + raise MoreThanOnePriceFound() + + value['price'] = price_result[0].value + + pricecurrency_result = pricecurrency_parse.find(data) + if pricecurrency_result: + value['currency'] = pricecurrency_result[0].value + + availability_result = availability_parse.find(data) + if availability_result: + value['availability'] = availability_result[0].value + + if value.get('availability'): + value['availability'] = re.sub(r'(?i)^(https|http)://schema.org/', '', + value.get('availability').strip(' "\'').lower()) if value.get('availability') else None + + # Second, go dig OpenGraph which is something that jsonpath_ng cant do because of the tuples and double-dots (:) + if not value.get('price') or value.get('availability'): + logger.debug(f"Alternatively digging through OpenGraph properties for 
restock/price info..") + jsonpath_expr = parse('$..properties') + + for match in jsonpath_expr.find(data): + if not value.get('price'): + value['price'] = _search_prop_by_value([match.value], "price:amount") + if not value.get('availability'): + value['availability'] = _search_prop_by_value([match.value], "product:availability") + if not value.get('currency'): + value['currency'] = _search_prop_by_value([match.value], "price:currency") + logger.trace(f"Processed with Extruct in {time.time()-now:.3f}s") + + return value + + +def is_between(number, lower=None, upper=None): + """ + Check if a number is between two values. + + Parameters: + number (float): The number to check. + lower (float or None): The lower bound (inclusive). If None, no lower bound. + upper (float or None): The upper bound (inclusive). If None, no upper bound. + + Returns: + bool: True if the number is between the lower and upper bounds, False otherwise. + """ + return (lower is None or lower <= number) and (upper is None or number <= upper) + + +class perform_site_check(difference_detection_processor): + screenshot = None + xpath_data = None + + def run_changedetection(self, watch, skip_when_checksum_same=True): + if not watch: + raise Exception("Watch no longer exists.") + + # Unset any existing notification error + update_obj = {'last_notification_error': False, 'last_error': False, 'restock': Restock()} + + self.screenshot = self.fetcher.screenshot + self.xpath_data = self.fetcher.xpath_data + + # Track the content type + update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '') + update_obj["last_check_status"] = self.fetcher.get_last_status_code() + + itemprop_availability = {} + try: + itemprop_availability = get_itemprop_availability(html_content=self.fetcher.content) + except MoreThanOnePriceFound as e: + # Add the real data + raise ProcessorException(message="Cannot run, more than one price detected, this plugin is only for product pages with ONE product, try the 
content-change detection mode.", + url=watch.get('url'), + status_code=self.fetcher.get_last_status_code(), + screenshot=self.fetcher.screenshot, + xpath_data=self.fetcher.xpath_data + ) + + # Something valid in get_itemprop_availability() by scraping metadata ? + if itemprop_availability.get('price') or itemprop_availability.get('availability'): + # Store for other usage + update_obj['restock'] = itemprop_availability + + if itemprop_availability.get('availability'): + # @todo: Configurable? + if any(substring.lower() in itemprop_availability['availability'].lower() for substring in [ + 'instock', + 'instoreonly', + 'limitedavailability', + 'onlineonly', + 'presale'] + ): + update_obj['restock']['in_stock'] = True + else: + update_obj['restock']['in_stock'] = False + + # Main detection method + fetched_md5 = None + + if not self.fetcher.instock_data and not itemprop_availability.get('availability'): + raise ProcessorException( + message=f"Unable to extract restock data for this page unfortunately. (Got code {self.fetcher.get_last_status_code()} from server), no embedded stock information was found and nothing interesting in the text, try using this watch with Chrome.", + url=watch.get('url'), + status_code=self.fetcher.get_last_status_code(), + screenshot=self.fetcher.screenshot, + xpath_data=self.fetcher.xpath_data + ) + + # Nothing automatic in microdata found, revert to scraping the page + if self.fetcher.instock_data and itemprop_availability.get('availability') is None: + # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. + # Careful! 
this does not really come from chrome/js when the watch is set to plaintext + update_obj['restock']["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False + logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.") + + # What we store in the snapshot + price = update_obj.get('restock').get('price') if update_obj.get('restock').get('price') else "" + snapshot_content = f"{update_obj.get('restock').get('in_stock')} - {price}" + + # Main detection method + fetched_md5 = hashlib.md5(snapshot_content.encode('utf-8')).hexdigest() + + # The main thing that all this at the moment comes down to :) + changed_detected = False + logger.debug(f"Watch UUID {watch.get('uuid')} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}") + + # out of stock -> back in stock only? + if watch.get('restock') and watch['restock'].get('in_stock') != update_obj['restock'].get('in_stock'): + # Yes if we only care about it going to instock, AND we are in stock + if watch.get('in_stock_only') and update_obj['restock']['in_stock']: + changed_detected = True + + if not watch.get('in_stock_only'): + # All cases + changed_detected = True + + if watch.get('follow_price_changes') and watch.get('restock') and update_obj.get('restock') and update_obj['restock'].get('price'): + price = float(update_obj['restock'].get('price')) + # Default to current price if no previous price found + if watch['restock'].get('original_price'): + previous_price = float(watch['restock'].get('original_price')) + # It was different, but negate it further down + if price != previous_price: + changed_detected = True + + # Minimum/maximum price limit + if update_obj.get('restock') and update_obj['restock'].get('price'): + logger.debug( + f"{watch.get('uuid')} - Change was detected, 'price_change_max' is '{watch.get('price_change_max', '')}' 'price_change_min' is '{watch.get('price_change_min', '')}', price from 
website is '{update_obj['restock'].get('price', '')}'.") + if update_obj['restock'].get('price'): + min_limit = float(watch.get('price_change_min')) if watch.get('price_change_min') else None + max_limit = float(watch.get('price_change_max')) if watch.get('price_change_max') else None + + price = float(update_obj['restock'].get('price')) + logger.debug(f"{watch.get('uuid')} after float conversion - Min limit: '{min_limit}' Max limit: '{max_limit}' Price: '{price}'") + if min_limit or max_limit: + if is_between(number=price, lower=min_limit, upper=max_limit): + logger.trace(f"{watch.get('uuid')} {price} is between {min_limit} and {max_limit}") + if changed_detected: + logger.debug(f"{watch.get('uuid')} Override change-detected to FALSE because price was inside threshold") + changed_detected = False + else: + logger.trace(f"{watch.get('uuid')} {price} is NOT between {min_limit} and {max_limit}") + + # Price comparison by % + if watch['restock'].get('original_price') and changed_detected and watch.get('price_change_threshold_percent'): + previous_price = float(watch['restock'].get('original_price')) + pc = float(watch.get('price_change_threshold_percent')) + change = abs((price - previous_price) / previous_price * 100) + if change and change <= pc: + logger.debug(f"{watch.get('uuid')} Override change-detected to FALSE because % threshold ({pc}%) was {change:.3f}%") + changed_detected = False + else: + logger.debug(f"{watch.get('uuid')} Price change was {change:.3f}% , (threshold {pc}%)") + + # Always record the new checksum + update_obj["previous_md5"] = fetched_md5 + + return changed_detected, update_obj, snapshot_content.encode('utf-8').strip() diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff/processor.py similarity index 98% rename from 
changedetectionio/processors/text_json_diff.py rename to changedetectionio/processors/text_json_diff/processor.py index 797b6c2b..1dc501a2 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -6,8 +6,8 @@ import os import re import urllib3 -from . import difference_detection_processor -from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text +from changedetectionio.processors import difference_detection_processor +from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text from changedetectionio import html_tools, content_fetchers from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from loguru import logger @@ -16,6 +16,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Webpage Text/HTML, JSON and PDF changes' description = 'Detects all text changes where possible' + json_filter_prefixes = ['json:', 'jq:', 'jqraw:'] class FilterNotFoundInResponse(ValueError): @@ -217,7 +218,7 @@ class perform_site_check(difference_detection_processor): # Rewrite's the processing text based on only what diff result they want to see if watch.has_special_diff_filter_options_set() and len(watch.history.keys()): # Now the content comes from the diff-parser and not the returned HTTP traffic, so could be some differences - from .. 
import diff + from changedetectionio import diff # needs to not include (added) etc or it may get used twice # Replace the processed text with the preferred result rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_text_before_filters(), diff --git a/changedetectionio/static/js/watch-settings.js b/changedetectionio/static/js/watch-settings.js index 73c66191..6d45dc76 100644 --- a/changedetectionio/static/js/watch-settings.js +++ b/changedetectionio/static/js/watch-settings.js @@ -1,8 +1,8 @@ -function toggleOpacity(checkboxSelector, fieldSelector) { +function toggleOpacity(checkboxSelector, fieldSelector, inverted) { const checkbox = document.querySelector(checkboxSelector); const fields = document.querySelectorAll(fieldSelector); function updateOpacity() { - const opacityValue = checkbox.checked ? 0.6 : 1; + const opacityValue = !checkbox.checked ? (inverted ? 0.6 : 1) : (inverted ? 1 : 0.6); fields.forEach(field => { field.style.opacity = opacityValue; }); @@ -25,6 +25,8 @@ $(document).ready(function () { $('#notification-tokens-info').toggle(); }); - toggleOpacity('#time_between_check_use_default', '#time_between_check'); + toggleOpacity('#time_between_check_use_default', '#time_between_check', false); + + }); diff --git a/changedetectionio/static/styles/scss/styles.scss b/changedetectionio/static/styles/scss/styles.scss index e090ef84..aa805285 100644 --- a/changedetectionio/static/styles/scss/styles.scss +++ b/changedetectionio/static/styles/scss/styles.scss @@ -186,12 +186,17 @@ code { } } -.watch-tag-list { - color: var(--color-white); +.inline-tag { white-space: nowrap; - background: var(--color-text-watch-tag-list); border-radius: 5px; padding: 2px 5px; + margin-right: 4px; +} + +.watch-tag-list { + color: var(--color-white); + background: var(--color-text-watch-tag-list); + @extend .inline-tag; } .box { @@ -1061,9 +1066,8 @@ ul { .tracking-ldjson-price-data { background-color: var(--color-background-button-green); 
color: #000; - padding: 3px; - border-radius: 3px; - white-space: nowrap; + opacity: 0.6; + @extend .inline-tag; } .ldjson-price-track-offer { @@ -1109,9 +1113,12 @@ ul { background-color: var(--color-background-button-cancel); color: #777; } - padding: 3px; - border-radius: 3px; - white-space: nowrap; + &.error { + background-color: var(--color-background-button-error); + color: #fff; + opacity: 0.7; + } + @extend .inline-tag; } #chrome-extension-link { diff --git a/changedetectionio/static/styles/styles.css b/changedetectionio/static/styles/styles.css index da60835b..b9469fd0 100644 --- a/changedetectionio/static/styles/styles.css +++ b/changedetectionio/static/styles/styles.css @@ -531,12 +531,15 @@ code { content: url(); margin: 0 3px 0 5px; } -.watch-tag-list { - color: var(--color-white); +.inline-tag, .watch-tag-list, .tracking-ldjson-price-data, .restock-label { white-space: nowrap; - background: var(--color-text-watch-tag-list); border-radius: 5px; - padding: 2px 5px; } + padding: 2px 5px; + margin-right: 4px; } + +.watch-tag-list { + color: var(--color-white); + background: var(--color-text-watch-tag-list); } .box { max-width: 80%; @@ -1153,9 +1156,7 @@ ul { .tracking-ldjson-price-data { background-color: var(--color-background-button-green); color: #000; - padding: 3px; - border-radius: 3px; - white-space: nowrap; } + opacity: 0.6; } .ldjson-price-track-offer { font-weight: bold; @@ -1180,16 +1181,18 @@ ul { #quick-watch-processor-type ul li > * { display: inline-block; } -.restock-label { - padding: 3px; - border-radius: 3px; - white-space: nowrap; } - .restock-label.in-stock { - background-color: var(--color-background-button-green); - color: #fff; } - .restock-label.not-in-stock { - background-color: var(--color-background-button-cancel); - color: #777; } +.restock-label.in-stock { + background-color: var(--color-background-button-green); + color: #fff; } + +.restock-label.not-in-stock { + background-color: var(--color-background-button-cancel); + 
color: #777; } + +.restock-label.error { + background-color: var(--color-background-button-error); + color: #fff; + opacity: 0.7; } #chrome-extension-link { padding: 9px; diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 5967091b..c9c7ad10 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -18,6 +18,9 @@ import time import uuid as uuid_builder from loguru import logger +from .processors import get_custom_watch_obj_for_processor +from .processors.restock_diff import Restock + # Because the server will run as a daemon and wont know the URL for notification links when firing off a notification BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)' @@ -80,9 +83,15 @@ class ChangeDetectionStore: self.__data['settings']['application'].update(from_disk['settings']['application']) # Convert each existing watch back to the Watch.model object + for uuid, watch in self.__data['watching'].items(): - watch['uuid']=uuid - self.__data['watching'][uuid] = Watch.model(datastore_path=self.datastore_path, default=watch) + watch['uuid'] = uuid + watch_class = get_custom_watch_obj_for_processor(watch.get('processor')) + if watch.get('uuid') != 'text_json_diff': + logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}") + + self.__data['watching'][uuid] = watch_class(datastore_path=self.datastore_path, default=watch) + logger.info(f"Watching: {uuid} {self.__data['watching'][uuid]['url']}") # First time ran, Create the datastore. 
@@ -240,32 +249,7 @@ class ChangeDetectionStore: # Remove a watchs data but keep the entry (URL etc) def clear_watch_history(self, uuid): - import pathlib - - # JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc - for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"): - unlink(item) - - # Force the attr to recalculate - bump = self.__data['watching'][uuid].history - - # Do this last because it will trigger a recheck due to last_checked being zero - self.__data['watching'][uuid].update({ - 'browser_steps_last_error_step' : None, - 'check_count': 0, - 'fetch_time' : 0.0, - 'has_ldjson_price_data': None, - 'in_stock': None, - 'last_checked': 0, - 'last_error': False, - 'last_notification_error': False, - 'last_viewed': 0, - 'previous_md5': False, - 'previous_md5_before_filters': False, - 'remote_server_reply': None, - 'track_ldjson_price_data': None, - }) - + self.__data['watching'][uuid].clear_watch() self.needs_write_urgent = True def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now=True): @@ -342,11 +326,13 @@ class ChangeDetectionStore: if apply_extras.get('tags'): apply_extras['tags'] = list(set(apply_extras.get('tags'))) - new_watch = Watch.model(datastore_path=self.datastore_path, url=url) + # If the processor also has its own Watch implementation + watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor')) + new_watch = watch_class(datastore_path=self.datastore_path, url=url) new_uuid = new_watch.get('uuid') - logger.debug(f"Adding URL {url} - {new_uuid}") + logger.debug(f"Adding URL '{url}' - {new_uuid}") for k in ['uuid', 'history', 'last_checked', 'last_changed', 'newest_history_key', 'previous_md5', 'viewed']: if k in apply_extras: @@ -582,7 +568,8 @@ class ChangeDetectionStore: # Eventually almost everything todo with a watch will apply as a Tag # So we use the same model as a Watch with self.lock: - new_tag = 
Watch.model(datastore_path=self.datastore_path, default={ + from .model import Tag + new_tag = Tag.model(datastore_path=self.datastore_path, default={ 'title': name.strip(), 'date_created': int(time.time()) }) @@ -621,6 +608,12 @@ class ChangeDetectionStore: return next((v for v in tags if v.get('title', '').lower() == tag_name.lower()), None) + def any_watches_have_processor_by_name(self, processor_name): + for watch in self.data['watching'].values(): + if watch.get('processor') == processor_name: + return True + return False + def get_updates_available(self): import inspect updates_available = [] @@ -849,3 +842,12 @@ class ChangeDetectionStore: for uuid, watch in self.data['watching'].items(): if isinstance(watch.get('tags'), str): self.data['watching'][uuid]['tags'] = [] + + # Migrate old 'in_stock' values to the new Restock + def update_17(self): + for uuid, watch in self.data['watching'].items(): + if 'in_stock' in watch: + watch['restock'] = Restock({'in_stock': watch.get('in_stock')}) + del watch['in_stock'] + + diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 69fed0aa..0c3dfecb 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -16,7 +16,7 @@ const email_notification_prefix=JSON.parse('{{ emailprefix|tojson }}'); {% endif %} const notification_base_url="{{url_for('ajax_callback_send_notification_test', watch_uuid=uuid)}}"; - const playwright_enabled={% if playwright_enabled %} true {% else %} false {% endif %}; + const playwright_enabled={% if playwright_enabled %}true{% else %}false{% endif %}; const recheck_proxy_start_url="{{url_for('check_proxies.start_check', uuid=uuid)}}"; const proxy_recheck_status_url="{{url_for('check_proxies.get_recheck_status', uuid=uuid)}}"; const screenshot_url="{{url_for('static_content', group='screenshot', filename=uuid)}}"; @@ -41,18 +41,16 @@ <ul> <li class="tab" id=""><a href="#general">General</a></li> <li class="tab"><a 
href="#request">Request</a></li> + {% if extra_tab_content %} + <li class="tab"><a href="#extras_tab">{{ extra_tab_content }}</a></li> + {% endif %} {% if playwright_enabled %} <li class="tab"><a id="browsersteps-tab" href="#browser-steps">Browser Steps</a></li> {% endif %} - {% if watch['processor'] == 'text_json_diff' %} <li class="tab"><a id="visualselector-tab" href="#visualselector">Visual Filter Selector</a></li> <li class="tab"><a href="#filters-and-triggers">Filters & Triggers</a></li> {% endif %} - - {% if watch['processor'] == 'restock_diff' %} - <li class="tab"><a href="#restock">Restock Detection</a></li> - {% endif %} <li class="tab"><a href="#notifications">Notifications</a></li> <li class="tab"><a href="#stats">Stats</a></li> </ul> @@ -69,16 +67,9 @@ {{ render_field(form.url, placeholder="https://...", required=true, class="m-d") }} <span class="pure-form-message-inline">Some sites use JavaScript to create the content, for this you should <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Fetching-pages-with-WebDriver">use the Chrome/WebDriver Fetcher</a></span><br> <span class="pure-form-message-inline">You can use variables in the URL, perfect for inserting the current date and other logic, <a href="https://github.com/dgtlmoon/changedetection.io/wiki/Handling-variables-in-the-watched-URL">help and examples here</a></span><br> - <span class="pure-form-message-inline"> - {% if watch['processor'] == 'text_json_diff' %} - Current mode: <strong>Webpage Text/HTML, JSON and PDF changes.</strong><br> - <a href="{{url_for('edit_page', uuid=uuid)}}?switch_processor=restock_diff" class="pure-button button-xsmall">Switch to re-stock detection mode.</a> - {% else %} - Current mode: <strong>Re-stock detection.</strong><br> - <a href="{{url_for('edit_page', uuid=uuid)}}?switch_processor=text_json_diff" class="pure-button button-xsmall">Switch to Webpage Text/HTML, JSON and PDF changes mode.</a> - {% endif %} - </span> - + </div> + <div 
class="pure-control-group inline-radio"> + {{ render_field(form.processor) }} </div> <div class="pure-control-group"> {{ render_field(form.title, class="m-d") }} @@ -413,18 +404,12 @@ Unavailable") }} </div> </div> {% endif %} - - {% if watch['processor'] == 'restock_diff' %} - <div class="tab-pane-inner" id="restock"> - <fieldset> - <div class="pure-control-group"> - {{ render_checkbox_field(form.in_stock_only) }} - <span class="pure-form-message-inline">Only trigger notifications when page changes from <strong>out of stock</strong> to <strong>back in stock</strong></span> - </div> - </fieldset> + {# rendered sub Template #} + {% if extra_form_content %} + <div class="tab-pane-inner" id="extras_tab"> + {{ extra_form_content|safe }} </div> - {% endif %} - + {% endif %} {% if watch['processor'] == 'text_json_diff' %} <div class="tab-pane-inner visual-selector-ui" id="visualselector"> <img class="beta-logo" src="{{url_for('static_content', group='images', filename='beta-logo.png')}}" alt="New beta functionality"> diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 47e95cd9..736e19da 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -59,6 +59,11 @@ {% set sort_order = sort_order or 'asc' %} {% set sort_attribute = sort_attribute or 'last_changed' %} {% set pagination_page = request.args.get('page', 0) %} + {% set cols_required = 6 %} + {% set any_has_restock_price_processor = datastore.any_watches_have_processor_by_name("restock_diff") %} + {% if any_has_restock_price_processor %} + {% set cols_required = cols_required + 1 %} + {% endif %} <div id="watch-table-wrapper"> @@ -70,6 +75,9 @@ <th><input style="vertical-align: middle" type="checkbox" id="check-all" > <a class="{{ 'active '+link_order if sort_attribute == 'date_created' else 'inactive' }}" href="{{url_for('index', sort='date_created', order=link_order, tag=active_tag_uuid)}}"># 
<span class='arrow {{link_order}}'></span></a></th> <th class="empty-cell"></th> <th><a class="{{ 'active '+link_order if sort_attribute == 'label' else 'inactive' }}" href="{{url_for('index', sort='label', order=link_order, tag=active_tag_uuid)}}">Website <span class='arrow {{link_order}}'></span></a></th> + {% if any_has_restock_price_processor %} + <th>Restock & Price</th> + {% endif %} <th><a class="{{ 'active '+link_order if sort_attribute == 'last_checked' else 'inactive' }}" href="{{url_for('index', sort='last_checked', order=link_order, tag=active_tag_uuid)}}">Last Checked <span class='arrow {{link_order}}'></span></a></th> <th><a class="{{ 'active '+link_order if sort_attribute == 'last_changed' else 'inactive' }}" href="{{url_for('index', sort='last_changed', order=link_order, tag=active_tag_uuid)}}">Last Changed <span class='arrow {{link_order}}'></span></a></th> <th class="empty-cell"></th> @@ -78,7 +86,7 @@ <tbody> {% if not watches|length %} <tr> - <td colspan="6" style="text-wrap: wrap;">No website watches configured, please add a URL in the box above, or <a href="{{ url_for('import_page')}}" >import a list</a>.</td> + <td colspan="{{ cols_required }}" style="text-wrap: wrap;">No website watches configured, please add a URL in the box above, or <a href="{{ url_for('import_page')}}" >import a list</a>.</td> </tr> {% endif %} {% for watch in (watches|sort(attribute=sort_attribute, reverse=sort_order == 'asc'))|pagination_slice(skip=pagination.skip) %} @@ -91,6 +99,7 @@ {% if watch.last_notification_error is defined and watch.last_notification_error != False %}error{% endif %} {% if watch.paused is defined and watch.paused != False %}paused{% endif %} {% if is_unviewed %}unviewed{% endif %} + {% if watch.has_restock_info %} has-restock-info {% if watch['restock']['in_stock'] %}in-stock{% else %}not-in-stock{% endif %} {% else %}no-restock-info{% endif %} {% if watch.uuid in queued_uuids %}queued{% endif %}"> <td class="inline checkbox-uuid" ><input 
name="uuids" type="checkbox" value="{{ watch.uuid}} " > <span>{{ loop.index+pagination.skip }}</span></td> <td class="inline watch-controls"> @@ -135,30 +144,39 @@ {% if watch['processor'] == 'text_json_diff' %} {% if watch['has_ldjson_price_data'] and not watch['track_ldjson_price_data'] %} - <div class="ldjson-price-track-offer">Embedded price data detected, follow only price data? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div> - {% endif %} - {% if watch['track_ldjson_price_data'] == 'accepted' %} - <span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}" class="status-icon price-follow-tag-icon" > Price</span> + <div class="ldjson-price-track-offer">Switch to Restock & Price watch mode? <a href="{{url_for('price_data_follower.accept', uuid=watch.uuid)}}" class="pure-button button-xsmall">Yes</a> <a href="{{url_for('price_data_follower.reject', uuid=watch.uuid)}}" class="">No</a></div> {% endif %} {% endif %} - - {% if watch['processor'] == 'restock_diff' %} - <span class="restock-label {{'in-stock' if watch['in_stock'] else 'not-in-stock' }}" title="detecting restock conditions"> - <!-- maybe some object watch['processor'][restock_diff] or.. 
--> - {% if watch['last_checked'] and watch['in_stock'] != None %} - {% if watch['in_stock'] %} In stock {% else %} Not in stock {% endif %} - {% else %} - Not yet checked - {% endif %} - </span> + {% if watch['processor'] == 'restock_diff' %} + <span class="tracking-ldjson-price-data" title="Automatically following embedded price information"><img src="{{url_for('static_content', group='images', filename='price-tag-icon.svg')}}" class="status-icon price-follow-tag-icon" > Price</span> {% endif %} - - {% for watch_tag_uuid, watch_tag in datastore.get_all_tags_for_watch(watch['uuid']).items() %} <span class="watch-tag-list">{{ watch_tag.title }}</span> {% endfor %} + </td> + <!-- @todo make it so any watch handler obj can expose this ---> +{% if any_has_restock_price_processor %} + <td class="restock-and-price"> + {% if watch['processor'] == 'restock_diff' %} + {% if watch.has_restock_info %} + <span class="restock-label {{'in-stock' if watch['restock']['in_stock'] else 'not-in-stock' }}" title="Detecting restock and price"> + <!-- maybe some object watch['processor'][restock_diff] or.. 
--> + {% if watch['restock']['in_stock'] %} In stock {% else %} Not in stock {% endif %} + </span> + {% endif %} + {% if watch.get('restock') and watch['restock']['price'] != None %} + {% if watch['restock']['price'] != None %} + <span class="restock-label price" title="Price"> + {{ watch['restock']['price']|format_number_locale }} {{ watch['restock']['currency'] }} + </span> + {% endif %} + {% elif not watch.has_restock_info %} + <span class="restock-label error">No information</span> + {% endif %} + {% endif %} </td> +{% endif %} <td class="last-checked" data-timestamp="{{ watch.last_checked }}">{{watch|format_last_checked_time|safe}}</td> <td class="last-changed" data-timestamp="{{ watch.last_changed }}">{% if watch.history_n >=2 and watch.last_changed >0 %} {{watch.last_changed|format_timestamp_timeago}} diff --git a/changedetectionio/tests/test_add_replace_remove_filter.py b/changedetectionio/tests/test_add_replace_remove_filter.py index 3d6d9f32..0ad2b461 100644 --- a/changedetectionio/tests/test_add_replace_remove_filter.py +++ b/changedetectionio/tests/test_add_replace_remove_filter.py @@ -140,6 +140,7 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa url_for("edit_page", uuid="first"), data={"trigger_text": 'Oh yes please', "url": test_url, + 'processor': 'text_json_diff', 'fetch_backend': "html_requests", 'filter_text_removed': '', 'filter_text_added': 'y'}, diff --git a/changedetectionio/tests/test_automatic_follow_ldjson_price.py b/changedetectionio/tests/test_automatic_follow_ldjson_price.py index c27c58b4..686584d4 100644 --- a/changedetectionio/tests/test_automatic_follow_ldjson_price.py +++ b/changedetectionio/tests/test_automatic_follow_ldjson_price.py @@ -100,12 +100,8 @@ def test_check_ldjson_price_autodetect(client, live_server, measure_memory_usage # Accept it uuid = extract_UUID_from_client(client) - time.sleep(1) + #time.sleep(1) client.get(url_for('price_data_follower.accept', uuid=uuid, 
follow_redirects=True)) - wait_for_all_checks(client) - - # Trigger a check - time.sleep(1) client.get(url_for("form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) # Offer should be gone @@ -120,8 +116,8 @@ def test_check_ldjson_price_autodetect(client, live_server, measure_memory_usage headers={'x-api-key': api_key}, ) - # Should see this (dont know where the whitespace came from) - assert b'"highPrice": 8099900' in res.data + assert b'8097000' in res.data + # And not this cause its not the ld-json assert b"So let's see what happens" not in res.data @@ -235,4 +231,3 @@ def test_bad_ldjson_is_correctly_ignored(client, live_server, measure_memory_usa # f.write(test_return_data) # # _test_runner_check_bad_format_ignored(live_server=live_server, client=client, has_ldjson_price_data=False) - diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py index b31004ea..ecb5036d 100644 --- a/changedetectionio/tests/test_history_consistency.py +++ b/changedetectionio/tests/test_history_consistency.py @@ -74,3 +74,8 @@ def test_consistent_history(client, live_server, measure_memory_usage): assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot" + + + json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json') + with open(json_db_file, 'r') as f: + assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved" diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 45f73392..f480c91b 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -9,8 +9,6 @@ def test_setup(live_server): # Unit test of the stripper # Always we are dealing in utf-8 def 
test_strip_regex_text_func(): - from ..processors import text_json_diff as fetch_site_status - test_content = """ but sometimes we want to remove the lines. diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index dd643ddf..33cde7a9 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -11,9 +11,6 @@ def test_setup(live_server): # Unit test of the stripper # Always we are dealing in utf-8 def test_strip_text_func(): - from ..processors import text_json_diff as fetch_site_status - - test_content = """ Some content is listed here diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 402ab7ad..942a42cf 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -378,11 +378,17 @@ def test_headers_textfile_in_request(client, live_server, measure_memory_usage): with open('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt', 'w') as f: f.write("watch-header: nice") + wait_for_all_checks(client) client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up + # Give the thread time to pick it up, this actually is not super reliable and pytest can terminate before the check is ran wait_for_all_checks(client) + # WARNING - pytest and 'wait_for_all_checks' shuts down before it has actually stopped processing when using pyppeteer fetcher + # so adding more time here + if os.getenv('FAST_PUPPETEER_CHROME_FETCHER'): + time.sleep(6) + res = client.get(url_for("edit_page", uuid="first")) assert b"Extra headers file found and will be added to this watch" in res.data diff --git a/changedetectionio/tests/test_restock_itemprop.py b/changedetectionio/tests/test_restock_itemprop.py new file mode 100644 index 00000000..e33de270 --- /dev/null +++ b/changedetectionio/tests/test_restock_itemprop.py @@ -0,0 +1,312 @@ +#!/usr/bin/python3 
+import time + +from flask import url_for +from .util import live_server_setup, wait_for_all_checks, extract_UUID_from_client + +instock_props = [ + # LD+JSON with non-standard list of 'type' https://github.com/dgtlmoon/changedetection.io/issues/1833 + '<script type=\'application/ld+json\'>{"@context": "http://schema.org","@type": ["Product", "SubType"],"name": "My test product","description":"","Offers": { "@type": "Offer", "offeredBy": { "@type": "Organization", "name":"Person", "telephone":"+1 999 999 999" }, "price": $$PRICE$$, "priceCurrency": "EUR", "url": "/some/url", "availability": "http://schema.org/InStock"} }</script>', + # LD JSON + '<script id="product-jsonld" type="application/ld+json">{"@context":"https://schema.org","@type":"Product","brand":{"@type":"Brand","name":"Ubiquiti"},"name":"UniFi Express","sku":"UX","description":"Impressively compact UniFi Cloud Gateway and WiFi 6 access point that runs UniFi Network. Powers an entire network or simply meshes as an access point.","url":"https://store.ui.com/us/en/products/ux","image":{"@type":"ImageObject","url":"https://cdn.ecomm.ui.com/products/4ed25b4c-db92-4b98-bbf3-b0989f007c0e/123417a2-895e-49c7-ba04-b6cd8f6acc03.png","width":"1500","height":"1500"},"offers":{"@type":"Offer","availability":"https://schema.org/InStock","priceSpecification":{"@type":"PriceSpecification","price":$$PRICE$$,"priceCurrency":"USD","valueAddedTaxIncluded":false}}}</script>', + '<script id="product-schema" type="application/ld+json">{"@context": "https://schema.org","@type": "Product","itemCondition": "https://schema.org/NewCondition","image": "//1.com/hmgo","name": "Polo MuscleFit","color": "Beige","description": "Polo","sku": "0957102010","brand": {"@type": "Brand","name": "H&M"},"category": {"@type": "Thing","name": "Polo"},"offers": [{"@type": "Offer","url": "https:/www2.xxxxxx.com/fr_fr/productpage.0957102010.html","priceCurrency": "EUR","price": $$PRICE$$,"availability": "http://schema.org/InStock","seller": { 
"@type": "Organization", "name": "H&M"}}]}</script>' + # Microdata + '<div itemscope itemtype="https://schema.org/Product"><h1 itemprop="name">Example Product</h1><p itemprop="description">This is a sample product description.</p><div itemprop="offers" itemscope itemtype="https://schema.org/Offer"><p>Price: <span itemprop="price">$$$PRICE$$</span></p><link itemprop="availability" href="https://schema.org/InStock" /></div></div>' +] + +out_of_stock_props = [ + # out of stock AND contains multiples + '<script type="application/ld+json">{"@context":"http://schema.org","@type":"WebSite","url":"https://www.medimops.de/","potentialAction":{"@type":"SearchAction","target":"https://www.medimops.de/produkte-C0/?fcIsSearch=1&searchparam={searchparam}","query-input":"required name=searchparam"}}</script><script type="application/ld+json">{"@context":"http://schema.org","@type":"Product","name":"Horsetrader: Robert Sangster and the Rise and Fall of the Sport of Kings","image":"https://images2.medimops.eu/product/43a982/M00002551322-large.jpg","productID":"isbn:9780002551328","gtin13":"9780002551328","category":"Livres en langue étrangère","offers":{"@type":"Offer","priceCurrency":"EUR","price":$$PRICE$$,"itemCondition":"UsedCondition","availability":"OutOfStock"},"brand":{"@type":"Thing","name":"Patrick Robinson","url":"https://www.momox-shop.fr/,patrick-robinson/"}}</script>' +] + +def set_original_response(props_markup='', price="121.95"): + + props_markup=props_markup.replace('$$PRICE$$', price) + test_return_data = f"""<html> + <body> + Some initial text<br> + <p>Which is across multiple lines</p> + <br> + So let's see what happens. 
<br> + <div>price: ${price}</div> + {props_markup} + </body> + </html> + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + time.sleep(1) + return None + + + + +def test_setup(client, live_server): + + live_server_setup(live_server) + +def test_restock_itemprop_basic(client, live_server): + + #live_server_setup(live_server) + + test_url = url_for('test_endpoint', _external=True) + + for p in instock_props: + set_original_response(props_markup=p) + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'more than one price detected' not in res.data + assert b'has-restock-info' in res.data + assert b' in-stock' in res.data + assert b' not-in-stock' not in res.data + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + + for p in out_of_stock_props: + set_original_response(props_markup=p) + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": '', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + res = client.get(url_for("index")) + + assert b'has-restock-info not-in-stock' in res.data + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + +def test_itemprop_price_change(client, live_server): + #live_server_setup(live_server) + + test_url = url_for('test_endpoint', _external=True) + + set_original_response(props_markup=instock_props[0], price="190.95") + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + + # A change in price, should trigger a change by default + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'190.95' in res.data + + 
# basic price change, look for notification + set_original_response(props_markup=instock_props[0], price='180.45') + client.get(url_for("form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'180.45' in res.data + assert b'unviewed' in res.data + client.get(url_for("mark_all_viewed"), follow_redirects=True) + + # turning off price change trigger, but it should show the new price, with no change notification + set_original_response(props_markup=instock_props[0], price='120.45') + res = client.post( + url_for("edit_page", uuid="first"), + data={"follow_price_changes": "", "url": test_url, "tags": "", "headers": "", 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + client.get(url_for("form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'120.45' in res.data + assert b'unviewed' not in res.data + + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + +def test_itemprop_price_minmax_limit(client, live_server): + #live_server_setup(live_server) + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + test_url = url_for('test_endpoint', _external=True) + + set_original_response(props_markup=instock_props[0], price="950.95") + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + + # A change in price, should trigger a change by default + wait_for_all_checks(client) + + + res = client.post( + url_for("edit_page", uuid="first"), + data={"follow_price_changes": "y", + "price_change_min": 900.0, + "price_change_max": 1100.10, + "url": test_url, + "tags": "", + "headers": "", + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + assert b"Updated watch." 
in res.data + wait_for_all_checks(client) + + client.get(url_for("mark_all_viewed")) + + # price changed to something greater than min (900), and less than max (1100).. should be no change + set_original_response(props_markup=instock_props[0], price='1000.45') + client.get(url_for("form_watch_checknow")) + wait_for_all_checks(client) + res = client.get(url_for("index")) + + assert b'more than one price detected' not in res.data + # BUT the new price should show, even tho its within limits + assert b'1,000.45' or b'1000.45' in res.data #depending on locale + assert b'unviewed' not in res.data + + + # price changed to something LESS than min (900), SHOULD be a change + set_original_response(props_markup=instock_props[0], price='890.45') + # let previous runs wait + time.sleep(1) + res = client.get(url_for("form_watch_checknow"), follow_redirects=True) + assert b'1 watches queued for rechecking.' in res.data + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'890.45' in res.data + assert b'unviewed' in res.data + + client.get(url_for("mark_all_viewed")) + + # price changed to something MORE than max (1100.10), SHOULD be a change + set_original_response(props_markup=instock_props[0], price='1890.45') + client.get(url_for("form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'1,890.45' or b'1890.45' in res.data + assert b'unviewed' in res.data + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + +def test_itemprop_percent_threshold(client, live_server): + #live_server_setup(live_server) + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + test_url = url_for('test_endpoint', _external=True) + + set_original_response(props_markup=instock_props[0], price="950.95") + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock 
tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + + # A change in price, should trigger a change by default + wait_for_all_checks(client) + + res = client.post( + url_for("edit_page", uuid="first"), + data={"follow_price_changes": "y", + "price_change_threshold_percent": 5.0, + "url": test_url, + "tags": "", + "headers": "", + 'fetch_backend': "html_requests" + }, + follow_redirects=True + ) + assert b"Updated watch." in res.data + wait_for_all_checks(client) + + + # Basic change should not trigger + set_original_response(props_markup=instock_props[0], price='960.45') + client.get(url_for("form_watch_checknow")) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'960.45' in res.data + assert b'unviewed' not in res.data + + # Bigger INCREASE change than the threshold should trigger + set_original_response(props_markup=instock_props[0], price='1960.45') + client.get(url_for("form_watch_checknow")) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'1,960.45' or b'1960.45' in res.data #depending on locale + assert b'unviewed' in res.data + + + # Small decrease should NOT trigger + client.get(url_for("mark_all_viewed")) + set_original_response(props_markup=instock_props[0], price='1950.45') + client.get(url_for("form_watch_checknow")) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'1,950.45' or b'1950.45' in res.data #depending on locale + assert b'unviewed' not in res.data + + + + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + +def test_data_sanity(client, live_server): + #live_server_setup(live_server) + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + test_url = url_for('test_endpoint', _external=True) + test_url2 = url_for('test_endpoint2', _external=True) + set_original_response(props_markup=instock_props[0], price="950.95") + 
client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + + + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert b'950.95' in res.data + + # Check the restock model object doesnt store the value by mistake and used in a new one + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url2, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + res = client.get(url_for("index")) + assert str(res.data.decode()).count("950.95") == 1, "Price should only show once (for the watch added, no other watches yet)" + + ## different test, check the edit page works on an empty request result + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data + + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url2, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + + res = client.get( + url_for("edit_page", uuid="first")) + assert test_url2.encode('utf-8') in res.data diff --git a/changedetectionio/tests/unit/test_restock_logic.py b/changedetectionio/tests/unit/test_restock_logic.py new file mode 100644 index 00000000..c189844f --- /dev/null +++ b/changedetectionio/tests/unit/test_restock_logic.py @@ -0,0 +1,21 @@ +#!/usr/bin/python3 + +# run from dir above changedetectionio/ dir +# python3 -m unittest changedetectionio.tests.unit.test_restock_logic + +import unittest +import os + +from changedetectionio.processors import restock_diff + +# mostly +class TestDiffBuilder(unittest.TestCase): + + def test_logic(self): + assert restock_diff.is_between(number=10, lower=9, upper=11) == True, "Between 9 and 11" + assert restock_diff.is_between(number=10, lower=0, upper=11) == True, "Between 9 and 11" + assert restock_diff.is_between(number=10, lower=None, upper=11) == 
True, "Between None and 11" + assert not restock_diff.is_between(number=12, lower=None, upper=11) == True, "12 is not between None and 11" + +if __name__ == '__main__': + unittest.main() diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py index 186fc736..1ee87ea9 100644 --- a/changedetectionio/tests/util.py +++ b/changedetectionio/tests/util.py @@ -121,18 +121,21 @@ def extract_UUID_from_client(client): return uuid.strip() def wait_for_all_checks(client): + # actually this is not entirely true, it can still be 'processing' but not in the queue # Loop waiting until done.. attempt=0 - time.sleep(0.1) + # because sub-second rechecks are problematic in testing, use lots of delays + time.sleep(1) while attempt < 60: - time.sleep(1) res = client.get(url_for("index")) if not b'Checking now' in res.data: break logging.getLogger().info("Waiting for watch-list to not say 'Checking now'.. {}".format(attempt)) - + time.sleep(1) attempt += 1 + time.sleep(1) + def live_server_setup(live_server): @live_server.app.route('/test-random-content-endpoint') @@ -140,6 +143,9 @@ def live_server_setup(live_server): import secrets return "Random content - {}\n".format(secrets.token_hex(64)) + @live_server.app.route('/test-endpoint2') + def test_endpoint2(): + return "<html><body>some basic content</body></html>" @live_server.app.route('/test-endpoint') def test_endpoint(): diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 91453c47..a5af5c2b 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -1,8 +1,10 @@ +from .processors.exceptions import ProcessorException from . 
import content_fetchers -from .processors.restock_diff import UnableToExtractRestockData -from .processors.text_json_diff import FilterNotFoundInResponse + +from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse from changedetectionio import html_tools -from copy import deepcopy + +import importlib import os import queue import threading @@ -13,7 +15,6 @@ import time # Requests for checking on a single site(watch) from a queue of watches # (another process inserts watches into the queue that are time-ready for checking) -import sys from loguru import logger class update_worker(threading.Thread): @@ -27,7 +28,6 @@ class update_worker(threading.Thread): super().__init__(*args, **kwargs) def queue_notification_for_watch(self, notification_q, n_object, watch): - from changedetectionio import diff dates = [] trigger_text = '' @@ -226,8 +226,6 @@ class update_worker(threading.Thread): os.unlink(full_path) def run(self): - - from .processors import text_json_diff, restock_diff now = time.time() while not self.app.config.exit.is_set(): @@ -258,24 +256,21 @@ class update_worker(threading.Thread): try: # Processor is what we are using for detecting the "Change" processor = watch.get('processor', 'text_json_diff') - # if system... 
- # Abort processing when the content was the same as the last fetch skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same') - # @todo some way to switch by name - # Init a new 'difference_detection_processor' + # Init a new 'difference_detection_processor', first look in processors + processor_module_name = f"changedetectionio.processors.{processor}.processor" + try: + processor_module = importlib.import_module(processor_module_name) + except ModuleNotFoundError as e: + print(f"Processor module '{processor}' not found.") + raise e - if processor == 'restock_diff': - update_handler = restock_diff.perform_site_check(datastore=self.datastore, + update_handler = processor_module.perform_site_check(datastore=self.datastore, watch_uuid=uuid ) - else: - # Used as a default and also by some tests - update_handler = text_json_diff.perform_site_check(datastore=self.datastore, - watch_uuid=uuid - ) update_handler.call_browser() @@ -293,6 +288,16 @@ class update_worker(threading.Thread): logger.critical(f"File permission error updating file, watch: {uuid}") logger.critical(str(e)) process_changedetection_results = False + + # A generic other-exception thrown by processors + except ProcessorException as e: + if e.screenshot: + watch.save_screenshot(screenshot=e.screenshot) + if e.xpath_data: + watch.save_xpath_data(data=e.xpath_data) + self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message}) + process_changedetection_results = False + except content_fetchers.exceptions.ReplyWithContentButNoText as e: # Totally fine, it's by choice - just continue on, nothing more to care about # Page had elements/content but no renderable text @@ -466,12 +471,6 @@ class update_worker(threading.Thread): process_changedetection_results = False logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}") - except UnableToExtractRestockData as e: - # Usually when fetcher.instock_data returns empty - 
logger.error(f"Exception (UnableToExtractRestockData) reached processing watch UUID: {uuid}") - logger.error(str(e)) - self.datastore.update_watch(uuid=uuid, update_obj={'last_error': f"Unable to extract restock data for this page unfortunately. (Got code {e.status_code} from server)"}) - process_changedetection_results = False except Exception as e: logger.error(f"Exception reached processing watch UUID: {uuid}") logger.error(str(e)) diff --git a/requirements.txt b/requirements.txt index 44049fb2..fcfdf774 100644 --- a/requirements.txt +++ b/requirements.txt @@ -82,5 +82,12 @@ pytest-flask ~=1.2 jsonschema==4.17.3 loguru + +# For scraping all possible metadata relating to products so we can do better restock detection +extruct + +# For cleaning up unknown currency formats +babel + # Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096 greenlet >= 3.0.3