Make processors more cleanly separated

pull/2041/head
dgtlmoon 4 months ago
parent b05c7aef1c
commit 7f5bdd47ae

@ -30,7 +30,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def long_task(uuid, preferred_proxy):
import time
from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions
from changedetectionio.processors import text_json_diff
from changedetectionio.processors.text_json_diff import text_json_diff
from changedetectionio.safe_jinja import render as jinja_render
status = {'status': '', 'length': 0, 'text': ''}

@ -8,6 +8,7 @@ import time
from jinja2 import Template
from .processors import find_processors, get_parent_module
from .safe_jinja import render as jinja_render
from changedetectionio.strtobool import strtobool
from copy import deepcopy
@ -623,6 +624,7 @@ def changedetection_app(config=None, datastore_o=None):
from . import forms
from .blueprint.browser_steps.browser_steps import browser_step_ui_config
from . import processors
import importlib
# More for testing, possible to return the first/only
if not datastore.data['watching'].keys():
@ -656,12 +658,20 @@ def changedetection_app(config=None, datastore_o=None):
default['proxy'] = ''
# proxy_override set to the json/text list of the items
processor = datastore.data['watching'][uuid].get('processor', '')
form_class_name = f"processor_{processor}_form"
form_class = forms
# Does it use some custom form? does one exist?
processor_name = datastore.data['watching'][uuid].get('processor', '')
custom_processor_class = next((tpl for tpl in find_processors() if tpl[1] == processor_name), None)
if custom_processor_class:
try:
form_class = getattr(forms, form_class_name)
except AttributeError:
flash(f"Cannot load the edit form for processor/plugin '{processor}', plugin missing?", 'error')
# Get the parent of the "processor.py" go up one, get the form (kinda spaghetti but its reusing existing code)
parent_module = get_parent_module(custom_processor_class[0])
forms_module = importlib.import_module(f"{parent_module.__name__}.forms")
# Access the 'processor_settings_form' class from the 'forms' module
form_class = getattr(forms_module, 'processor_settings_form')
except AttributeError as e:
flash(f"Cannot load the edit form for processor/plugin '{custom_processor_class[1]}', plugin missing?", 'error')
return redirect(url_for('index'))
form = form_class(formdata=request.form if request.method == 'POST' else None,

@ -1,8 +1,6 @@
import os
import re
from wtforms.fields.numeric import FloatField
from changedetectionio.strtobool import strtobool
from wtforms import (
@ -522,59 +520,6 @@ class processor_text_json_diff_form(commonSettingsForm):
result = False
return result
class processor_restock_diff_form(processor_text_json_diff_form):
in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True)
price_change_min = FloatField('Minimum amount to trigger notification', [validators.Optional()], render_kw={"placeholder": "No limit", "size": "10"})
price_change_max = FloatField('Maximum amount to trigger notification', [validators.Optional()], render_kw={"placeholder": "No limit", "size": "10"})
price_change_threshold_percent = FloatField('Threshold in % for price changes', validators=[
validators.Optional(),
validators.NumberRange(min=0, max=100, message="Should be between 0 and 100"),
], render_kw={"placeholder": "0%", "size": "5"})
follow_price_changes = BooleanField('Follow price changes', default=False)
def extra_tab_content(self):
return 'Restock & Price Detection'
def extra_form_content(self):
return """
{% from '_helpers.html' import render_field, render_checkbox_field, render_button %}
<script>
$(document).ready(function () {
toggleOpacity('#follow_price_changes', '.price-change-minmax', true);
});
</script>
<fieldset>
<div class="pure-control-group">
<fieldset class="pure-group">
{{ render_checkbox_field(form.in_stock_only) }}
<span class="pure-form-message-inline">Only trigger notifications when page changes from <strong>out of stock</strong> to <strong>back in stock</strong></span>
</fieldset>
<fieldset class="pure-group">
{{ render_checkbox_field(form.follow_price_changes) }}
<span class="pure-form-message-inline">Changes in price should trigger a notification</span>
<span class="pure-form-message-inline">When OFF - only care about restock detection</span>
</fieldset>
<fieldset class="pure-group price-change-minmax">
{{ render_field(form.price_change_min, placeholder=watch['restock']['price']) }}
<span class="pure-form-message-inline">Minimum amount, only trigger a change when the price is less than this amount.</span>
</fieldset>
<fieldset class="pure-group price-change-minmax">
{{ render_field(form.price_change_max, placeholder=watch['restock']['price']) }}
<span class="pure-form-message-inline">Maximum amount, only trigger a change when the price is more than this amount.</span>
</fieldset>
<fieldset class="pure-group price-change-minmax">
{{ render_field(form.price_change_threshold_percent) }}
<span class="pure-form-message-inline">Price must change more than this % to trigger a change.</span><br>
<span class="pure-form-message-inline">For example, If the product is $1,000 USD, <strong>2%</strong> would mean it has to change more than $20 since the first check.</span><br>
</fieldset>
</div>
</fieldset>"""
class SingleExtraProxy(Form):
# maybe better to set some <script>var..

@ -39,13 +39,15 @@ class model(watch_base):
def __init__(self, *arg, **kw):
self.__datastore_path = kw['datastore_path']
del kw['datastore_path']
super(model, self).__init__(*arg, **kw)
if kw.get('default'):
self.update(kw['default'])
del kw['default']
if self.get('default'):
del self['default']
# Be sure the cached timestamp is ready
bump = self.history
@ -88,6 +90,34 @@ class model(watch_base):
ready_url=ready_url.replace('source:', '')
return ready_url
def clear_watch(self):
import pathlib
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
for item in pathlib.Path(str(self.watch_data_dir)).rglob("*.*"):
os.unlink(item)
# Force the attr to recalculate
bump = self.history
# Do this last because it will trigger a recheck due to last_checked being zero
self.update({
'browser_steps_last_error_step': None,
'check_count': 0,
'fetch_time': 0.0,
'has_ldjson_price_data': None,
'in_stock': None,
'last_checked': 0,
'last_error': False,
'last_notification_error': False,
'last_viewed': 0,
'previous_md5': False,
'previous_md5_before_filters': False,
'remote_server_reply': None,
'track_ldjson_price_data': None
})
return
@property
def is_source_type_url(self):
return self.get('url', '').startswith('source:')

@ -4,30 +4,6 @@ import uuid
from changedetectionio import strtobool
from changedetectionio.notification import default_notification_format_for_watch
class Restock(dict):
def __init__(self, *args, **kwargs):
# Define default values
default_values = {
'in_stock': None,
'price': None,
'currency': None,
'original_price': None
}
# Initialize the dictionary with default values
super().__init__(default_values)
# Update with any provided positional arguments (dictionaries)
if args:
if len(args) == 1 and isinstance(args[0], dict):
self.update(args[0])
else:
raise ValueError("Only one positional argument of type 'dict' is allowed")
# Update with keyword arguments
self.update(kwargs)
class watch_base(dict):
def __init__(self, *arg, **kw):

@ -8,4 +8,8 @@ The concept here is to be able to switch between different domain specific probl
Some suggestions for the future
- `graphical`
- `restock_and_price` - extract price AND stock text
## Todo
- Make each processor return a extra list of sub-processed (so you could configure a single processor in different ways)
- move restock_diff to its own pip/github repo

@ -6,6 +6,9 @@ from loguru import logger
import hashlib
import os
import re
import importlib
import pkgutil
import inspect
class difference_detection_processor():
@ -147,8 +150,75 @@ class difference_detection_processor():
return changed_detected, update_obj, ''.encode('utf-8')
def find_sub_packages(package_name):
"""
Find all sub-packages within the given package.
:param package_name: The name of the base package to scan for sub-packages.
:return: A list of sub-package names.
"""
package = importlib.import_module(package_name)
return [name for _, name, is_pkg in pkgutil.iter_modules(package.__path__) if is_pkg]
def find_processors():
"""
Find all subclasses of DifferenceDetectionProcessor in the specified package.
:param package_name: The name of the package to scan for processor modules.
:return: A list of (module, class) tuples.
"""
package_name = "changedetectionio.processors" # Name of the current package/module
processors = []
sub_packages = find_sub_packages(package_name)
for sub_package in sub_packages:
module_name = f"{package_name}.{sub_package}.processor"
try:
module = importlib.import_module(module_name)
# Iterate through all classes in the module
for name, obj in inspect.getmembers(module, inspect.isclass):
if issubclass(obj, difference_detection_processor) and obj is not difference_detection_processor:
processors.append((module, sub_package))
except (ModuleNotFoundError, ImportError) as e:
print(f"Failed to import module {module_name}: {e}")
return processors
def get_parent_module(module):
module_name = module.__name__
if '.' not in module_name:
return None # Top-level module has no parent
parent_module_name = module_name.rsplit('.', 1)[0]
return importlib.import_module(parent_module_name)
def get_custom_watch_obj_for_processor(processor_name):
watch_class = Watch.model
processor_classes = find_processors()
custom_watch_obj = next((tpl for tpl in processor_classes if tpl[1] == processor_name), None)
if custom_watch_obj:
# Parent of .processor.py COULD have its own Watch implementation
parent_module = get_parent_module(custom_watch_obj[0])
if hasattr(parent_module, 'Watch'):
watch_class = parent_module.Watch
return watch_class
def available_processors():
from . import restock_diff, text_json_diff
x=[('text_json_diff', text_json_diff.name), ('restock_diff', restock_diff.name)]
# @todo Make this smarter with introspection of sorts.
return x
"""
Get a list of processors by name and description for the UI elements
:return: A list :)
"""
processor_classes = find_processors()
available = []
for package, processor_class in processor_classes:
available.append((processor_class, package.name))
return available

@ -0,0 +1,44 @@
from changedetectionio.model.Watch import model as BaseWatch
class Restock(dict):
def __init__(self, *args, **kwargs):
# Define default values
default_values = {
'in_stock': None,
'price': None,
'currency': None,
'original_price': None
}
# Initialize the dictionary with default values
super().__init__(default_values)
# Update with any provided positional arguments (dictionaries)
if args:
if len(args) == 1 and isinstance(args[0], dict):
self.update(args[0])
else:
raise ValueError("Only one positional argument of type 'dict' is allowed")
# Update with keyword arguments
self.update(kwargs)
def __setitem__(self, key, value):
# Custom logic to handle setting price and original_price
if key == 'price':
if value and not self.get('original_price'):
self['original_price'] = value
super().__setitem__(key, value)
class Watch(BaseWatch):
def __init__(self, *arg, **kw):
super().__init__(*arg, **kw)
self.update({'restock': Restock()})
def clear_watch(self):
super().clear_watch()
self.update({'restock': Restock()})

@ -0,0 +1,61 @@
from wtforms import (
BooleanField,
validators,
FloatField
)
from changedetectionio.forms import processor_text_json_diff_form
class processor_settings_form(processor_text_json_diff_form):
in_stock_only = BooleanField('Only trigger when product goes BACK to in-stock', default=True)
price_change_min = FloatField('Minimum amount to trigger notification', [validators.Optional()],
render_kw={"placeholder": "No limit", "size": "10"})
price_change_max = FloatField('Maximum amount to trigger notification', [validators.Optional()],
render_kw={"placeholder": "No limit", "size": "10"})
price_change_threshold_percent = FloatField('Threshold in % for price changes', validators=[
validators.Optional(),
validators.NumberRange(min=0, max=100, message="Should be between 0 and 100"),
], render_kw={"placeholder": "0%", "size": "5"})
follow_price_changes = BooleanField('Follow price changes', default=False)
def extra_tab_content(self):
return 'Restock & Price Detection'
def extra_form_content(self):
return """
{% from '_helpers.html' import render_field, render_checkbox_field, render_button %}
<script>
$(document).ready(function () {
toggleOpacity('#follow_price_changes', '.price-change-minmax', true);
});
</script>
<fieldset>
<div class="pure-control-group">
<fieldset class="pure-group">
{{ render_checkbox_field(form.in_stock_only) }}
<span class="pure-form-message-inline">Only trigger notifications when page changes from <strong>out of stock</strong> to <strong>back in stock</strong></span>
</fieldset>
<fieldset class="pure-group">
{{ render_checkbox_field(form.follow_price_changes) }}
<span class="pure-form-message-inline">Changes in price should trigger a notification</span>
<span class="pure-form-message-inline">When OFF - only care about restock detection</span>
</fieldset>
<fieldset class="pure-group price-change-minmax">
{{ render_field(form.price_change_min, placeholder=watch['restock']['price']) }}
<span class="pure-form-message-inline">Minimum amount, only trigger a change when the price is less than this amount.</span>
</fieldset>
<fieldset class="pure-group price-change-minmax">
{{ render_field(form.price_change_max, placeholder=watch['restock']['price']) }}
<span class="pure-form-message-inline">Maximum amount, only trigger a change when the price is more than this amount.</span>
</fieldset>
<fieldset class="pure-group price-change-minmax">
{{ render_field(form.price_change_threshold_percent) }}
<span class="pure-form-message-inline">Price must change more than this % to trigger a change.</span><br>
<span class="pure-form-message-inline">For example, If the product is $1,000 USD, <strong>2%</strong> would mean it has to change more than $20 since the first check.</span><br>
</fieldset>
</div>
</fieldset>"""

@ -1,5 +1,5 @@
from . import difference_detection_processor
from ..model import Restock
from .. import difference_detection_processor
from . import Restock
from loguru import logger
import hashlib
import re
@ -7,11 +7,9 @@ import urllib3
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Re-stock & Price detection for single product pages'
description = 'Detects if the product goes back to in-stock'
class UnableToExtractRestockData(Exception):
def __init__(self, status_code):
# Set this so we can use it in other parts of the app
@ -47,6 +45,7 @@ def get_itemprop_availability(html_content) -> Restock:
logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")
# First phase, dead simple scanning of anything that looks useful
value = Restock()
if data:
logger.debug(f"Using jsonpath to find price/availability/etc")
price_parse = parse('$..(price|Price|highPrice)')
@ -84,7 +83,7 @@ def get_itemprop_availability(html_content) -> Restock:
logger.trace(f"Processed with Extruct in {time.time()-now:.3f}s")
return Restock(value)
return value
def is_between(number, lower=None, upper=None):
@ -154,7 +153,6 @@ class perform_site_check(difference_detection_processor):
# Main detection method
fetched_md5 = None
if self.fetcher.instock_data:
fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
# 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")

@ -6,8 +6,8 @@ import os
import re
import urllib3
from . import difference_detection_processor
from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
from changedetectionio.processors import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger
@ -16,6 +16,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Webpage Text/HTML, JSON and PDF changes'
description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
class FilterNotFoundInResponse(ValueError):

@ -18,6 +18,8 @@ import time
import uuid as uuid_builder
from loguru import logger
from .processors import get_custom_watch_obj_for_processor
# Because the server will run as a daemon and wont know the URL for notification links when firing off a notification
BASE_URL_NOT_SET_TEXT = '("Base URL" not set - see settings - notifications)'
@ -80,9 +82,15 @@ class ChangeDetectionStore:
self.__data['settings']['application'].update(from_disk['settings']['application'])
# Convert each existing watch back to the Watch.model object
for uuid, watch in self.__data['watching'].items():
watch['uuid']=uuid
self.__data['watching'][uuid] = Watch.model(datastore_path=self.datastore_path, default=watch)
watch['uuid'] = uuid
watch_class = get_custom_watch_obj_for_processor(watch.get('processor'))
if watch.get('uuid') != 'text_json_diff':
logger.trace(f"Loading Watch object '{watch_class.__module__}.{watch_class.__name__}' for UUID {uuid}")
self.__data['watching'][uuid] = watch_class(datastore_path=self.datastore_path, default=watch)
logger.info(f"Watching: {uuid} {self.__data['watching'][uuid]['url']}")
# First time ran, Create the datastore.
@ -240,34 +248,7 @@ class ChangeDetectionStore:
# Remove a watchs data but keep the entry (URL etc)
def clear_watch_history(self, uuid):
import pathlib
from .model import Restock
# JSON Data, Screenshots, Textfiles (history index and snapshots), HTML in the future etc
for item in pathlib.Path(os.path.join(self.datastore_path, uuid)).rglob("*.*"):
unlink(item)
# Force the attr to recalculate
bump = self.__data['watching'][uuid].history
# Do this last because it will trigger a recheck due to last_checked being zero
self.__data['watching'][uuid].update({
'browser_steps_last_error_step' : None,
'check_count': 0,
'fetch_time' : 0.0,
'has_ldjson_price_data': None,
'in_stock': None,
'last_checked': 0,
'last_error': False,
'last_notification_error': False,
'last_viewed': 0,
'previous_md5': False,
'previous_md5_before_filters': False,
'remote_server_reply': None,
'track_ldjson_price_data': None,
'restock': Restock()
})
self.__data['watching'][uuid].clear_watch()
self.needs_write_urgent = True
def add_watch(self, url, tag='', extras=None, tag_uuids=None, write_to_disk_now=True):
@ -344,11 +325,13 @@ class ChangeDetectionStore:
if apply_extras.get('tags'):
apply_extras['tags'] = list(set(apply_extras.get('tags')))
new_watch = Watch.model(datastore_path=self.datastore_path, url=url)
# If the processor also has its own Watch implementation
watch_class = get_custom_watch_obj_for_processor(apply_extras.get('processor'))
new_watch = watch_class(datastore_path=self.datastore_path, url=url)
new_uuid = new_watch.get('uuid')
logger.debug(f"Adding URL {url} - {new_uuid}")
logger.debug(f"Adding URL '{url}' - {new_uuid}")
for k in ['uuid', 'history', 'last_checked', 'last_changed', 'newest_history_key', 'previous_md5', 'viewed']:
if k in apply_extras:

@ -74,3 +74,8 @@ def test_consistent_history(client, live_server):
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
with open(json_db_file, 'r') as f:
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

@ -9,8 +9,6 @@ def test_setup(live_server):
# Unit test of the stripper
# Always we are dealing in utf-8
def test_strip_regex_text_func():
from ..processors import text_json_diff as fetch_site_status
test_content = """
but sometimes we want to remove the lines.

@ -11,9 +11,6 @@ def test_setup(live_server):
# Unit test of the stripper
# Always we are dealing in utf-8
def test_strip_text_func():
from ..processors import text_json_diff as fetch_site_status
test_content = """
Some content
is listed here

@ -286,4 +286,19 @@ def test_data_sanity(client, live_server):
res = client.get(url_for("index"))
assert str(res.data.decode()).count("950.95") == 1, "Price should only show once (for the watch added, no other watches yet)"
## different test, check the edit page works on an empty request result
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
assert b'Deleted' in res.data
client.post(
url_for("form_quick_watch_add"),
data={"url": test_url2, "tags": 'restock tests', 'processor': 'restock_diff'},
follow_redirects=True
)
wait_for_all_checks(client)
res = client.get(
url_for("edit_page", uuid="first"))
assert test_url2.encode('utf-8') in res.data
# @todo look at the url-watches and make sure there is not key called "default" !!!

@ -1,8 +1,9 @@
from . import content_fetchers
from .processors.restock_diff import UnableToExtractRestockData
from .processors.text_json_diff import FilterNotFoundInResponse
from .processors.restock_diff.processor import UnableToExtractRestockData
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
from copy import deepcopy
import importlib
import os
import queue
import threading
@ -13,7 +14,6 @@ import time
# Requests for checking on a single site(watch) from a queue of watches
# (another process inserts watches into the queue that are time-ready for checking)
import sys
from loguru import logger
class update_worker(threading.Thread):
@ -226,8 +226,6 @@ class update_worker(threading.Thread):
os.unlink(full_path)
def run(self):
from .processors import text_json_diff, restock_diff
now = time.time()
while not self.app.config.exit.is_set():
@ -258,22 +256,19 @@ class update_worker(threading.Thread):
try:
# Processor is what we are using for detecting the "Change"
processor = watch.get('processor', 'text_json_diff')
# if system...
# Abort processing when the content was the same as the last fetch
skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
# @todo some way to switch by name
# Init a new 'difference_detection_processor'
# Init a new 'difference_detection_processor', first look in processors
processor_module_name = f"changedetectionio.processors.{processor}.processor"
try:
processor_module = importlib.import_module(processor_module_name)
except ModuleNotFoundError as e:
print(f"Processor module '{processor}' not found.")
raise e
if processor == 'restock_diff':
update_handler = restock_diff.perform_site_check(datastore=self.datastore,
watch_uuid=uuid
)
else:
# Used as a default and also by some tests
update_handler = text_json_diff.perform_site_check(datastore=self.datastore,
update_handler = processor_module.perform_site_check(datastore=self.datastore,
watch_uuid=uuid
)

Loading…
Cancel
Save