From 70842193b06de26d120003e133386b56f46579da Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sat, 25 Nov 2023 20:05:18 +0100 Subject: [PATCH] second attempt at plugins --- changedetectionio/__init__.py | 4 + changedetectionio/api/api_v1.py | 2 +- .../blueprint/browser_steps/__init__.py | 2 +- .../blueprint/check_proxies/__init__.py | 2 +- changedetectionio/processors/__init__.py | 84 ++++++++----------- changedetectionio/processors/restock_diff.py | 5 +- .../processors/text_json_diff.py | 4 +- changedetectionio/update_worker.py | 67 +++++++++------ 8 files changed, 90 insertions(+), 80 deletions(-) diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 6a95b156..0e08044f 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -16,6 +16,7 @@ import logging import os import pytz import queue +import sys import threading import time import timeago @@ -80,6 +81,9 @@ csrf = CSRFProtect() csrf.init_app(app) notification_debug_log=[] +from pathlib import Path +sys.path.append(os.path.join(Path.home(), 'changedetectionio-plugins')) + watch_api = Api(app, decorators=[csrf.exempt]) def init_app_secret(datastore_path): diff --git a/changedetectionio/api/api_v1.py b/changedetectionio/api/api_v1.py index 783827ca..e014abb1 100644 --- a/changedetectionio/api/api_v1.py +++ b/changedetectionio/api/api_v1.py @@ -76,7 +76,7 @@ class Watch(Resource): # Properties are not returned as a JSON, so add the required props manually watch['history_n'] = watch.history_n watch['last_changed'] = watch.last_changed - + watch['viewed'] = watch.viewed return watch @auth.check_token diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py index 11fb208d..6ca20021 100644 --- a/changedetectionio/blueprint/browser_steps/__init__.py +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -97,7 +97,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): proxy=proxy) # For test - #browsersteps_start_session['browserstepper'].action_goto_url(value="http://example.com?time="+str(time.time())) + #browsersteps_start_session['browserstepper'].action_goto_url(value="http://exbaseample.com?time="+str(time.time())) return browsersteps_start_session diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py index ea68376a..8f1e49f2 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -41,7 +41,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): now = time.time() try: update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid) - update_handler.call_browser() + update_handler.fetch_content() # title, size is len contents not len xfer except content_fetcher.Non200ErrorCodeReceived as e: if e.status_code == 404: diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 10c9138c..8c74d2b7 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -4,10 +4,8 @@ import hashlib import re from changedetectionio import content_fetcher from copy import deepcopy -from distutils.util import strtobool - -class difference_detection_processor(): +class difference_detection_processor_interface(): browser_steps = None datastore = None fetcher = None @@ -15,52 +13,36 @@ class difference_detection_processor(): watch = None xpath_data = None - def __init__(self, 
*args, datastore, watch_uuid, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) - def call_browser(self): - - # Protect against file:// access - if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): - if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): - raise Exception( - "file:// type access is denied for security reasons." - ) - - url = self.watch.link - - # Requests, playwright, other browser via wss:// etc, fetch_extra_something - prefer_fetch_backend = self.watch.get('fetch_backend', 'system') - - # Proxy ID "key" - preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid')) + @abstractmethod + def run_changedetection(self, uuid, skip_when_checksum_same=True): + update_obj = {'last_notification_error': False, 'last_error': False} + some_data = 'xxxxx' + update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest() + changed_detected = False + return changed_detected, update_obj, ''.encode('utf-8') - # Pluggable content self.fetcher - if not prefer_fetch_backend or prefer_fetch_backend == 'system': - prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') - # In the case that the preferred fetcher was a browser config with custom connection URL.. - # @todo - on save watch, if its extra_browser_ then it should be obvious it will use playwright (like if its requests now..) - browser_connection_url = None - if prefer_fetch_backend.startswith('extra_browser_'): - (t, key) = prefer_fetch_backend.split('extra_browser_') - connection = list( - filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', []))) - if connection: - prefer_fetch_backend = 'base_html_playwright' - browser_connection_url = connection[0].get('browser_connection_url') +class text_content_difference_detection_processor(difference_detection_processor_interface): + def __init__(self, *args, datastore, watch_uuid, prefer_fetch_backend, **kwargs): + self.datastore = datastore + self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) + self.prefer_fetch_backend = prefer_fetch_backend + super().__init__(*args, **kwargs) + ######################################## + # Attach the correct fetcher and proxy # + ######################################## # Grab the right kind of 'fetcher', (playwright, requests, etc) - if hasattr(content_fetcher, prefer_fetch_backend): - fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) + if hasattr(content_fetcher, self.prefer_fetch_backend): + fetcher_obj = getattr(content_fetcher, self.prefer_fetch_backend) else: # If the klass doesnt exist, just use a default fetcher_obj = getattr(content_fetcher, "html_requests") - + # Proxy ID "key" + preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid')) proxy_url = None if preferred_proxy_id: proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url') @@ -69,9 +51,23 @@ class difference_detection_processor(): # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. 
         # When browser_connection_url is None, the method should default to working out what the best defaults are (os env vars etc)
         self.fetcher = fetcher_obj(proxy_override=proxy_url,
-                                   browser_connection_url=browser_connection_url
+                                   browser_connection_url=None # Default, let each fetcher work it out
         )

+    def fetch_content(self):
+
+        url = self.watch.link
+
+        # In the case that the preferred fetcher was a browser config with a custom connection URL.
+        # @todo - on save watch, if it's extra_browser_ then it should be obvious it will use playwright (like it is for requests now)
+        if self.prefer_fetch_backend.startswith('extra_browser_'):
+            (t, key) = self.prefer_fetch_backend.split('extra_browser_')
+            connection = list(
+                filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
+            if connection:
+                prefer_fetch_backend = 'base_html_playwright'
+                browser_connection_url = connection[0].get('browser_connection_url')
+
         if self.watch.has_browser_steps:
             self.fetcher.browser_steps = self.watch.get('browser_steps', [])
             self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
@@ -115,14 +111,6 @@ class difference_detection_processor():

         # After init, call run_changedetection() which will do the actual change-detection

-    @abstractmethod
-    def run_changedetection(self, uuid, skip_when_checksum_same=True):
-        update_obj = {'last_notification_error': False, 'last_error': False}
-        some_data = 'xxxxx'
-        update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
-        changed_detected = False
-        return changed_detected, update_obj, ''.encode('utf-8')
-
 def available_processors():
     from . import restock_diff, text_json_diff

diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py
index 9751a195..22f4185b 100644
--- a/changedetectionio/processors/restock_diff.py
+++ b/changedetectionio/processors/restock_diff.py
@@ -1,8 +1,9 @@
 import hashlib
 import urllib3

-from . import difference_detection_processor
+#from . import browser_content_difference_detection_processor
 from copy import deepcopy
+from . import text_content_difference_detection_processor

 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

@@ -15,7 +16,7 @@ class UnableToExtractRestockData(Exception):
         self.status_code = status_code
         return

-class perform_site_check(difference_detection_processor):
+class perform_site_check(text_content_difference_detection_processor):

     screenshot = None
     xpath_data = None
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index b503c5be..d9ff3023 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -10,8 +10,8 @@ import urllib3
 from changedetectionio import content_fetcher, html_tools
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from copy import deepcopy
-from . import difference_detection_processor
 from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
+from . 
import text_content_difference_detection_processor urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -31,7 +31,7 @@ class PDFToHTMLToolNotFound(ValueError): # Some common stuff here that can be moved to a base class # (set_proxy_from_list) -class perform_site_check(difference_detection_processor): +class perform_site_check(text_content_difference_detection_processor): def run_changedetection(self, uuid, skip_when_checksum_same=True): changed_detected = False diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index c5ab7de9..5d3ec493 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -1,9 +1,13 @@ +import importlib import os +import re import threading import queue import time +from distutils.util import strtobool from changedetectionio import content_fetcher, html_tools + from .processors.text_json_diff import FilterNotFoundInResponse from .processors.restock_diff import UnableToExtractRestockData @@ -15,6 +19,7 @@ from .processors.restock_diff import UnableToExtractRestockData import logging import sys + class update_worker(threading.Thread): current_uuid = None @@ -24,6 +29,7 @@ class update_worker(threading.Thread): self.app = app self.notification_q = notification_q self.datastore = datastore + super().__init__(*args, **kwargs) def queue_notification_for_watch(self, n_object, watch): @@ -209,7 +215,7 @@ class update_worker(threading.Thread): from .processors import text_json_diff, restock_diff while not self.app.config.exit.is_set(): - update_handler = None + change_processor = None try: queued_item_data = self.q.get(block=False) @@ -230,35 +236,46 @@ class update_worker(threading.Thread): now = time.time() try: - # Processor is what we are using for detecting the "Change" - processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff') - # if system... + # Protect against file:// access + if re.search(r'^file://', self.datastore.data['watching'][uuid].get('url', '').strip(), re.IGNORECASE): + if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): + raise Exception( + "file:// type access is denied for security reasons." 
+                            )

-                    # Abort processing when the content was the same as the last fetch
-                    skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
+                    prefer_fetch_backend = self.datastore.data['watching'][uuid].get('fetch_backend', 'system')
+                    if not prefer_fetch_backend or prefer_fetch_backend == 'system':
+                        prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend')

                     processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff')

-                    # @todo some way to switch by name
-                    # Init a new 'difference_detection_processor'
+                    # processor = 'cdio_whois_diff'  # Example override that forces an external plugin processor, for local testing only

-                    if processor == 'restock_diff':
-                        update_handler = restock_diff.perform_site_check(datastore=self.datastore,
-                                                                         watch_uuid=uuid
-                                                                         )
+                    if processor in ['text_json_diff', 'restock_diff']:
+                        base_processor_module = f"changedetectionio.processors.{processor}"
                     else:
-                        # Used as a default and also by some tests
-                        update_handler = text_json_diff.perform_site_check(datastore=self.datastore,
-                                                                           watch_uuid=uuid
-                                                                           )
+                        # Each plugin is one processor exactly
+                        base_processor_module = f"{processor}.processor"
+
+                    # The chosen processor dictates which fetcher gets used
+
+                    # These should inherit the right fetcher too
+                    module = importlib.import_module(base_processor_module)
+                    change_processor = getattr(module, 'perform_site_check')
+                    change_processor = change_processor(datastore=self.datastore,
+                                                        watch_uuid=uuid,
+                                                        prefer_fetch_backend=prefer_fetch_backend
+                                                        )

                     # Clear last errors (move to preflight func?)
                     self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None

-                    update_handler.call_browser()
-
-                    changed_detected, update_obj, contents = update_handler.run_changedetection(uuid,
-                                                                                                skip_when_checksum_same=skip_when_same_checksum,
-                                                                                                )
+                    skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')

+                    # Each processor extends the base class of the kind of fetcher it needs to run
+                    change_processor.fetch_content()
+                    changed_detected, update_obj, contents = change_processor.run_changedetection(uuid,
+                                                                                                  skip_when_checksum_same=skip_when_same_checksum
+                                                                                                  )

                     # Re #342
                     # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
@@ -465,10 +482,10 @@ class update_worker(threading.Thread):
                         })

                 # Always save the screenshot if it's available
-                if update_handler.screenshot:
-                    self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
-                if update_handler.xpath_data:
-                    self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
+                if change_processor.screenshot:
+                    self.datastore.save_screenshot(watch_uuid=uuid, screenshot=change_processor.screenshot)
+                if change_processor.xpath_data:
+                    self.datastore.save_xpath_data(watch_uuid=uuid, data=change_processor.xpath_data)

                 self.current_uuid = None  # Done
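
For illustration, a plugin laid out the way this patch expects might look like the sketch below. This is not part of the patch: the package name cdio_whois_diff is only the example plugin name referenced above, the path under ~/changedetectionio-plugins/ follows the sys.path entry added in changedetectionio/__init__.py, and the md5 comparison against watch.get('previous_md5') simply mirrors what the built-in processors do. It assumes self.fetcher.content is populated once update_worker has called fetch_content(), as it is for text_json_diff and restock_diff.

    # ~/changedetectionio-plugins/cdio_whois_diff/processor.py  (hypothetical example path)
    # Minimal sketch of an external plugin processor under the conventions this patch implies.
    import hashlib

    from changedetectionio.processors import text_content_difference_detection_processor

    class perform_site_check(text_content_difference_detection_processor):

        def run_changedetection(self, uuid, skip_when_checksum_same=True):
            # update_worker calls fetch_content() before this, so the fetched page
            # should already be available on self.fetcher.content
            update_obj = {'last_notification_error': False, 'last_error': False}

            content = self.fetcher.content
            contents = content.encode('utf-8') if isinstance(content, str) else content

            fetched_md5 = hashlib.md5(contents).hexdigest()
            update_obj["previous_md5"] = fetched_md5

            # Report a change whenever the checksum differs from the last one stored on the watch
            changed_detected = self.watch.get('previous_md5') != fetched_md5

            return changed_detected, update_obj, contents

With the sys.path entry in place, setting a watch's processor to cdio_whois_diff should make importlib.import_module('cdio_whois_diff.processor') resolve to this module, and update_worker then calls fetch_content() followed by run_changedetection() exactly as it does for the built-in processors.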