From 70842193b06de26d120003e133386b56f46579da Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sat, 25 Nov 2023 20:05:18 +0100 Subject: [PATCH] second attempt at plugins --- changedetectionio/__init__.py | 4 + changedetectionio/api/api_v1.py | 2 +- .../blueprint/browser_steps/__init__.py | 2 +- .../blueprint/check_proxies/__init__.py | 2 +- changedetectionio/processors/__init__.py | 84 ++++++++----------- changedetectionio/processors/restock_diff.py | 5 +- .../processors/text_json_diff.py | 4 +- changedetectionio/update_worker.py | 67 +++++++++------ 8 files changed, 90 insertions(+), 80 deletions(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 6a95b156..0e08044f 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -16,6 +16,7 @@ import logging import os import pytz import queue +import sys import threading import time import timeago @@ -80,6 +81,9 @@ csrf = CSRFProtect() csrf.init_app(app) notification_debug_log=[] +from pathlib import Path +sys.path.append(os.path.join(Path.home(), 'changedetectionio-plugins')) + watch_api = Api(app, decorators=[csrf.exempt]) def init_app_secret(datastore_path): diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py index 783827ca..e014abb1 100644 --- a/changedetectionio/api/api_v1.py +++ b/changedetectionio/api/api_v1.py @@ -76,7 +76,7 @@ class Watch(Resource): # Properties are not returned as a JSON, so add the required props manually watch['history_n'] = watch.history_n watch['last_changed'] = watch.last_changed - + watch['viewed'] = watch.viewed return watch @auth.check_token diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py index 11fb208d..6ca20021 100644 --- a/changedetectionio/blueprint/browser_steps/__init__.py +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -97,7 +97,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): proxy=proxy) # For test - #browsersteps_start_session['browserstepper'].action_goto_url(value="http://example.com?time="+str(time.time())) + #browsersteps_start_session['browserstepper'].action_goto_url(value="http://exbaseample.com?time="+str(time.time())) return browsersteps_start_session diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py index ea68376a..8f1e49f2 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -41,7 +41,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): now = time.time() try: update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid) - update_handler.call_browser() + update_handler.fetch_content() # title, size is len contents not len xfer except content_fetcher.Non200ErrorCodeReceived as e: if e.status_code == 404: diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 10c9138c..8c74d2b7 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -4,10 +4,8 @@ import hashlib import re from changedetectionio import content_fetcher from copy import deepcopy -from distutils.util import strtobool - -class difference_detection_processor(): +class difference_detection_processor_interface(): browser_steps = None datastore = None fetcher = None @@ -15,52 +13,36 @@ class difference_detection_processor(): watch = None xpath_data = None - def __init__(self, 
*args, datastore, watch_uuid, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) - def call_browser(self): - - # Protect against file:// access - if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): - if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): - raise Exception( - "file:// type access is denied for security reasons." - ) - - url = self.watch.link - - # Requests, playwright, other browser via wss:// etc, fetch_extra_something - prefer_fetch_backend = self.watch.get('fetch_backend', 'system') - - # Proxy ID "key" - preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid')) + @abstractmethod + def run_changedetection(self, uuid, skip_when_checksum_same=True): + update_obj = {'last_notification_error': False, 'last_error': False} + some_data = 'xxxxx' + update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest() + changed_detected = False + return changed_detected, update_obj, ''.encode('utf-8') - # Pluggable content self.fetcher - if not prefer_fetch_backend or prefer_fetch_backend == 'system': - prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') - # In the case that the preferred fetcher was a browser config with custom connection URL.. - # @todo - on save watch, if its extra_browser_ then it should be obvious it will use playwright (like if its requests now..) - browser_connection_url = None - if prefer_fetch_backend.startswith('extra_browser_'): - (t, key) = prefer_fetch_backend.split('extra_browser_') - connection = list( - filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', []))) - if connection: - prefer_fetch_backend = 'base_html_playwright' - browser_connection_url = connection[0].get('browser_connection_url') +class text_content_difference_detection_processor(difference_detection_processor_interface): + def __init__(self, *args, datastore, watch_uuid, prefer_fetch_backend, **kwargs): + self.datastore = datastore + self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) + self.prefer_fetch_backend = prefer_fetch_backend + super().__init__(*args, **kwargs) + ######################################## + # Attach the correct fetcher and proxy # + ######################################## # Grab the right kind of 'fetcher', (playwright, requests, etc) - if hasattr(content_fetcher, prefer_fetch_backend): - fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) + if hasattr(content_fetcher, self.prefer_fetch_backend): + fetcher_obj = getattr(content_fetcher, self.prefer_fetch_backend) else: # If the klass doesnt exist, just use a default fetcher_obj = getattr(content_fetcher, "html_requests") - + # Proxy ID "key" + preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid')) proxy_url = None if preferred_proxy_id: proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url') @@ -69,9 +51,23 @@ class difference_detection_processor(): # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. 
         # When browser_connection_url is None, the method should default to working out what the best defaults are (os env vars etc)
         self.fetcher = fetcher_obj(proxy_override=proxy_url,
-                                   browser_connection_url=browser_connection_url
+                                   browser_connection_url=None # Default, let each fetcher work it out
         )

+    def fetch_content(self):
+
+        url = self.watch.link
+
+        # In the case that the preferred fetcher was a browser config with a custom connection URL.
+        # @todo - on save watch, if it's extra_browser_ then it should be obvious it will use playwright (like it is for requests now)
+        if self.prefer_fetch_backend.startswith('extra_browser_'):
+            (t, key) = self.prefer_fetch_backend.split('extra_browser_')
+            connection = list(
+                filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
+            if connection:
+                prefer_fetch_backend = 'base_html_playwright'
+                browser_connection_url = connection[0].get('browser_connection_url')
+
         if self.watch.has_browser_steps:
             self.fetcher.browser_steps = self.watch.get('browser_steps', [])
             self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
@@ -115,14 +111,6 @@ class difference_detection_processor():

         # After init, call run_changedetection() which will do the actual change-detection

-    @abstractmethod
-    def run_changedetection(self, uuid, skip_when_checksum_same=True):
-        update_obj = {'last_notification_error': False, 'last_error': False}
-        some_data = 'xxxxx'
-        update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
-        changed_detected = False
-        return changed_detected, update_obj, ''.encode('utf-8')
-
 def available_processors():
     from . import restock_diff, text_json_diff

diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py
index 9751a195..22f4185b 100644
--- a/changedetectionio/processors/restock_diff.py
+++ b/changedetectionio/processors/restock_diff.py
@@ -1,8 +1,9 @@
 import hashlib
 import urllib3

-from . import difference_detection_processor
+#from . import browser_content_difference_detection_processor
 from copy import deepcopy
+from . import text_content_difference_detection_processor

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -15,7 +16,7 @@ class UnableToExtractRestockData(Exception):
         self.status_code = status_code
         return

-class perform_site_check(difference_detection_processor):
+class perform_site_check(text_content_difference_detection_processor):

     screenshot = None
     xpath_data = None
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index b503c5be..d9ff3023 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -10,8 +10,8 @@ import urllib3
 from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from copy import deepcopy
-from . import difference_detection_processor
 from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
+from . 
import text_content_difference_detection_processor urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -31,7 +31,7 @@ class PDFToHTMLToolNotFound(ValueError): # Some common stuff here that can be moved to a base class # (set_proxy_from_list) -class perform_site_check(difference_detection_processor): +class perform_site_check(text_content_difference_detection_processor): def run_changedetection(self, uuid, skip_when_checksum_same=True): changed_detected = False diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index c5ab7de9..5d3ec493 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -1,9 +1,13 @@ +import importlib import os +import re import threading import queue import time +from distutils.util import strtobool from changedetectionio import content_fetcher, html_tools + from .processors.text_json_diff import FilterNotFoundInResponse from .processors.restock_diff import UnableToExtractRestockData @@ -15,6 +19,7 @@ from .processors.restock_diff import UnableToExtractRestockData import logging import sys + class update_worker(threading.Thread): current_uuid = None @@ -24,6 +29,7 @@ class update_worker(threading.Thread): self.app = app self.notification_q = notification_q self.datastore = datastore + super().__init__(*args, **kwargs) def queue_notification_for_watch(self, n_object, watch): @@ -209,7 +215,7 @@ class update_worker(threading.Thread): from .processors import text_json_diff, restock_diff while not self.app.config.exit.is_set(): - update_handler = None + change_processor = None try: queued_item_data = self.q.get(block=False) @@ -230,35 +236,46 @@ class update_worker(threading.Thread): now = time.time() try: - # Processor is what we are using for detecting the "Change" - processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff') - # if system... + # Protect against file:// access + if re.search(r'^file://', self.datastore.data['watching'][uuid].get('url', '').strip(), re.IGNORECASE): + if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): + raise Exception( + "file:// type access is denied for security reasons." 
+                            )

-                    # Abort processing when the content was the same as the last fetch
-                    skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
+                    prefer_fetch_backend = self.datastore.data['watching'][uuid].get('fetch_backend', 'system')
+                    if not prefer_fetch_backend or prefer_fetch_backend == 'system':
+                        prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend')

                     processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff')

-                    # @todo some way to switch by name
-                    # Init a new 'difference_detection_processor'
+                    # processor = 'cdio_whois_diff'  # Example override that forces an external plugin processor, for local testing only

-                    if processor == 'restock_diff':
-                        update_handler = restock_diff.perform_site_check(datastore=self.datastore,
-                                                                         watch_uuid=uuid
-                                                                         )
+                    if processor in ['text_json_diff', 'restock_diff']:
+                        base_processor_module = f"changedetectionio.processors.{processor}"
                     else:
-                        # Used as a default and also by some tests
-                        update_handler = text_json_diff.perform_site_check(datastore=self.datastore,
-                                                                           watch_uuid=uuid
-                                                                           )
+                        # Each plugin is one processor exactly
+                        base_processor_module = f"{processor}.processor"
+
+                    # The chosen processor dictates which fetcher gets used
+
+                    # These should inherit the right fetcher too
+                    module = importlib.import_module(base_processor_module)
+                    change_processor = getattr(module, 'perform_site_check')
+                    change_processor = change_processor(datastore=self.datastore,
+                                                        watch_uuid=uuid,
+                                                        prefer_fetch_backend=prefer_fetch_backend
+                                                        )

                     # Clear last errors (move to preflight func?)
                     self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None

-                    update_handler.call_browser()
-
-                    changed_detected, update_obj, contents = update_handler.run_changedetection(uuid,
-                                                                                                skip_when_checksum_same=skip_when_same_checksum,
-                                                                                                )
+                    skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')

+                    # Each processor extends the base class of the kind of fetcher it needs to run
+                    change_processor.fetch_content()
+                    changed_detected, update_obj, contents = change_processor.run_changedetection(uuid,
+                                                                                                  skip_when_checksum_same=skip_when_same_checksum
+                                                                                                  )

                     # Re #342
                     # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
@@ -465,10 +482,10 @@ class update_worker(threading.Thread):
                         })

                 # Always save the screenshot if it's available
-                if update_handler.screenshot:
-                    self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
-                if update_handler.xpath_data:
-                    self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
+                if change_processor.screenshot:
+                    self.datastore.save_screenshot(watch_uuid=uuid, screenshot=change_processor.screenshot)
+                if change_processor.xpath_data:
+                    self.datastore.save_xpath_data(watch_uuid=uuid, data=change_processor.xpath_data)

                 self.current_uuid = None  # Done
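
For illustration, a plugin laid out the way this patch expects might look like the sketch below. This is not part of the patch: the package name cdio_whois_diff is only the example plugin name referenced above, the path under ~/changedetectionio-plugins/ follows the sys.path entry added in changedetectionio/__init__.py, and the md5 comparison against watch.get('previous_md5') simply mirrors what the built-in processors do. It assumes self.fetcher.content is populated once update_worker has called fetch_content(), as it is for text_json_diff and restock_diff.

    # ~/changedetectionio-plugins/cdio_whois_diff/processor.py  (hypothetical example path)
    # Minimal sketch of an external plugin processor under the conventions this patch implies.
    import hashlib

    from changedetectionio.processors import text_content_difference_detection_processor

    class perform_site_check(text_content_difference_detection_processor):

        def run_changedetection(self, uuid, skip_when_checksum_same=True):
            # update_worker calls fetch_content() before this, so the fetched page
            # should already be available on self.fetcher.content
            update_obj = {'last_notification_error': False, 'last_error': False}

            content = self.fetcher.content
            contents = content.encode('utf-8') if isinstance(content, str) else content

            fetched_md5 = hashlib.md5(contents).hexdigest()
            update_obj["previous_md5"] = fetched_md5

            # Report a change whenever the checksum differs from the last one stored on the watch
            changed_detected = self.watch.get('previous_md5') != fetched_md5

            return changed_detected, update_obj, contents

With the sys.path entry in place, setting a watch's processor to cdio_whois_diff should make importlib.import_module('cdio_whois_diff.processor') resolve to this module, and update_worker then calls fetch_content() followed by run_changedetection() exactly as it does for the built-in processors.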