second attempt at plugins

pull/2004/head
dgtlmoon 1 year ago
parent 0285d00f13
commit 70842193b0

@ -16,6 +16,7 @@ import logging
import os import os
import pytz import pytz
import queue import queue
import sys
import threading import threading
import time import time
import timeago import timeago
@ -80,6 +81,9 @@ csrf = CSRFProtect()
csrf.init_app(app) csrf.init_app(app)
notification_debug_log=[] notification_debug_log=[]
from pathlib import Path
sys.path.append(os.path.join(Path.home(), 'changedetectionio-plugins'))
watch_api = Api(app, decorators=[csrf.exempt]) watch_api = Api(app, decorators=[csrf.exempt])
def init_app_secret(datastore_path): def init_app_secret(datastore_path):

@ -76,7 +76,7 @@ class Watch(Resource):
# Properties are not returned as a JSON, so add the required props manually # Properties are not returned as a JSON, so add the required props manually
watch['history_n'] = watch.history_n watch['history_n'] = watch.history_n
watch['last_changed'] = watch.last_changed watch['last_changed'] = watch.last_changed
watch['viewed'] = watch.viewed
return watch return watch
@auth.check_token @auth.check_token

@ -97,7 +97,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
proxy=proxy) proxy=proxy)
# For test # For test
#browsersteps_start_session['browserstepper'].action_goto_url(value="http://example.com?time="+str(time.time())) #browsersteps_start_session['browserstepper'].action_goto_url(value="http://example.com?time="+str(time.time()))
return browsersteps_start_session return browsersteps_start_session

@ -41,7 +41,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
now = time.time() now = time.time()
try: try:
update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid) update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
update_handler.call_browser() update_handler.fetch_content()
# title, size is len contents not len xfer # title, size is len contents not len xfer
except content_fetcher.Non200ErrorCodeReceived as e: except content_fetcher.Non200ErrorCodeReceived as e:
if e.status_code == 404: if e.status_code == 404:

@ -4,10 +4,8 @@ import hashlib
import re import re
from changedetectionio import content_fetcher from changedetectionio import content_fetcher
from copy import deepcopy from copy import deepcopy
from distutils.util import strtobool
class difference_detection_processor():
class difference_detection_processor_interface():
browser_steps = None browser_steps = None
datastore = None datastore = None
fetcher = None fetcher = None
@ -15,52 +13,36 @@ class difference_detection_processor():
watch = None watch = None
xpath_data = None xpath_data = None
def __init__(self, *args, datastore, watch_uuid, **kwargs):
super().__init__(*args, **kwargs)
self.datastore = datastore
self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
def call_browser(self): @abstractmethod
def run_changedetection(self, uuid, skip_when_checksum_same=True):
# Protect against file:// access update_obj = {'last_notification_error': False, 'last_error': False}
if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): some_data = 'xxxxx'
if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
raise Exception( changed_detected = False
"file:// type access is denied for security reasons." return changed_detected, update_obj, ''.encode('utf-8')
)
url = self.watch.link
# Requests, playwright, other browser via wss:// etc, fetch_extra_something
prefer_fetch_backend = self.watch.get('fetch_backend', 'system')
# Proxy ID "key"
preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
# Pluggable content self.fetcher
if not prefer_fetch_backend or prefer_fetch_backend == 'system':
prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend')
# In the case that the preferred fetcher was a browser config with custom connection URL.. class text_content_difference_detection_processor(difference_detection_processor_interface):
# @todo - on save watch, if it's extra_browser_ then it should be obvious it will use playwright (like if it's requests now..)
browser_connection_url = None
if prefer_fetch_backend.startswith('extra_browser_'):
(t, key) = prefer_fetch_backend.split('extra_browser_')
connection = list(
filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
if connection:
prefer_fetch_backend = 'base_html_playwright'
browser_connection_url = connection[0].get('browser_connection_url')
def __init__(self, *args, datastore, watch_uuid, prefer_fetch_backend, **kwargs):
self.datastore = datastore
self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
self.prefer_fetch_backend = prefer_fetch_backend
super().__init__(*args, **kwargs)
########################################
# Attach the correct fetcher and proxy #
########################################
# Grab the right kind of 'fetcher', (playwright, requests, etc) # Grab the right kind of 'fetcher', (playwright, requests, etc)
if hasattr(content_fetcher, prefer_fetch_backend): if hasattr(content_fetcher, self.prefer_fetch_backend):
fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) fetcher_obj = getattr(content_fetcher, self.prefer_fetch_backend)
else: else:
# If the klass doesn't exist, just use a default # If the klass doesn't exist, just use a default
fetcher_obj = getattr(content_fetcher, "html_requests") fetcher_obj = getattr(content_fetcher, "html_requests")
# Proxy ID "key"
preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
proxy_url = None proxy_url = None
if preferred_proxy_id: if preferred_proxy_id:
proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url') proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
@ -69,9 +51,23 @@ class difference_detection_processor():
# Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
# When browser_connection_url is None, the method should default to working out what the best defaults are (os env vars etc) # When browser_connection_url is None, the method should default to working out what the best defaults are (os env vars etc)
self.fetcher = fetcher_obj(proxy_override=proxy_url, self.fetcher = fetcher_obj(proxy_override=proxy_url,
browser_connection_url=browser_connection_url browser_connection_url=None # Default, let each fetcher work it out
) )
def fetch_content(self):
url = self.watch.link
# In the case that the preferred fetcher was a browser config with custom connection URL..
# @todo - on save watch, if it's extra_browser_ then it should be obvious it will use playwright (like if it's requests now..)
if self.prefer_fetch_backend.startswith('extra_browser_'):
(t, key) = self.prefer_fetch_backend.split('extra_browser_')
connection = list(
filter(lambda s: (s['browser_name'] == key), self.datastore.data['settings']['requests'].get('extra_browsers', [])))
if connection:
prefer_fetch_backend = 'base_html_playwright'
browser_connection_url = connection[0].get('browser_connection_url')
if self.watch.has_browser_steps: if self.watch.has_browser_steps:
self.fetcher.browser_steps = self.watch.get('browser_steps', []) self.fetcher.browser_steps = self.watch.get('browser_steps', [])
self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid')) self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
@ -115,14 +111,6 @@ class difference_detection_processor():
# After init, call run_changedetection() which will do the actual change-detection # After init, call run_changedetection() which will do the actual change-detection
@abstractmethod
def run_changedetection(self, uuid, skip_when_checksum_same=True):
update_obj = {'last_notification_error': False, 'last_error': False}
some_data = 'xxxxx'
update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
changed_detected = False
return changed_detected, update_obj, ''.encode('utf-8')
def available_processors(): def available_processors():
from . import restock_diff, text_json_diff from . import restock_diff, text_json_diff

@ -1,8 +1,9 @@
import hashlib import hashlib
import urllib3 import urllib3
from . import difference_detection_processor #from . import browser_content_difference_detection_processor
from copy import deepcopy from copy import deepcopy
from . import text_content_difference_detection_processor
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -15,7 +16,7 @@ class UnableToExtractRestockData(Exception):
self.status_code = status_code self.status_code = status_code
return return
class perform_site_check(difference_detection_processor): class perform_site_check(text_content_difference_detection_processor):
screenshot = None screenshot = None
xpath_data = None xpath_data = None

@ -10,8 +10,8 @@ import urllib3
from changedetectionio import content_fetcher, html_tools from changedetectionio import content_fetcher, html_tools
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from copy import deepcopy from copy import deepcopy
from . import difference_detection_processor
from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
from . import text_content_difference_detection_processor
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -31,7 +31,7 @@ class PDFToHTMLToolNotFound(ValueError):
# Some common stuff here that can be moved to a base class # Some common stuff here that can be moved to a base class
# (set_proxy_from_list) # (set_proxy_from_list)
class perform_site_check(difference_detection_processor): class perform_site_check(text_content_difference_detection_processor):
def run_changedetection(self, uuid, skip_when_checksum_same=True): def run_changedetection(self, uuid, skip_when_checksum_same=True):
changed_detected = False changed_detected = False

@ -1,9 +1,13 @@
import importlib
import os import os
import re
import threading import threading
import queue import queue
import time import time
from distutils.util import strtobool
from changedetectionio import content_fetcher, html_tools from changedetectionio import content_fetcher, html_tools
from .processors.text_json_diff import FilterNotFoundInResponse from .processors.text_json_diff import FilterNotFoundInResponse
from .processors.restock_diff import UnableToExtractRestockData from .processors.restock_diff import UnableToExtractRestockData
@ -15,6 +19,7 @@ from .processors.restock_diff import UnableToExtractRestockData
import logging import logging
import sys import sys
class update_worker(threading.Thread): class update_worker(threading.Thread):
current_uuid = None current_uuid = None
@ -24,6 +29,7 @@ class update_worker(threading.Thread):
self.app = app self.app = app
self.notification_q = notification_q self.notification_q = notification_q
self.datastore = datastore self.datastore = datastore
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
def queue_notification_for_watch(self, n_object, watch): def queue_notification_for_watch(self, n_object, watch):
@ -209,7 +215,7 @@ class update_worker(threading.Thread):
from .processors import text_json_diff, restock_diff from .processors import text_json_diff, restock_diff
while not self.app.config.exit.is_set(): while not self.app.config.exit.is_set():
update_handler = None change_processor = None
try: try:
queued_item_data = self.q.get(block=False) queued_item_data = self.q.get(block=False)
@ -230,35 +236,46 @@ class update_worker(threading.Thread):
now = time.time() now = time.time()
try: try:
# Processor is what we are using for detecting the "Change" # Protect against file:// access
processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff') if re.search(r'^file://', self.datastore.data['watching'][uuid].get('url', '').strip(), re.IGNORECASE):
# if system... if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
raise Exception(
"file:// type access is denied for security reasons."
)
# Abort processing when the content was the same as the last fetch prefer_fetch_backend = self.datastore.data['watching'][uuid].get('fetch_backend', 'system')
skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same') if not prefer_fetch_backend or prefer_fetch_backend == 'system':
prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend')
processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff')
# @todo some way to switch by name processor = 'cdio_whois_diff'
# Init a new 'difference_detection_processor'
if processor == 'restock_diff': if processor in ['text_json_diff', 'restock_diff']:
update_handler = restock_diff.perform_site_check(datastore=self.datastore, base_processor_module = f"changedetectionio.processors.{processor}"
watch_uuid=uuid
)
else: else:
# Used as a default and also by some tests # Each plugin is one processor exactly
update_handler = text_json_diff.perform_site_check(datastore=self.datastore, base_processor_module = f"{processor}.processor"
watch_uuid=uuid
) # it's correct that the processor dictates which fetcher it uses, I think
# these should inherit the right fetcher too
module = importlib.import_module(base_processor_module)
change_processor = getattr(module, 'perform_site_check')
change_processor = change_processor(datastore=self.datastore,
watch_uuid=uuid,
prefer_fetch_backend=prefer_fetch_backend
)
# Clear last errors (move to preflight func?) # Clear last errors (move to preflight func?)
self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
update_handler.call_browser() skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
# Each processor extends base class of the kind of fetcher it needs to run anyway
changed_detected, update_obj, contents = update_handler.run_changedetection(uuid, change_processor.fetch_content()
skip_when_checksum_same=skip_when_same_checksum, changed_detected, update_obj, contents = change_processor.run_changedetection(uuid,
) skip_when_checksum_same=skip_when_same_checksum
)
# Re #342 # Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
@ -465,10 +482,10 @@ class update_worker(threading.Thread):
}) })
# Always save the screenshot if it's available # Always save the screenshot if it's available
if update_handler.screenshot: if change_processor.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot) self.datastore.save_screenshot(watch_uuid=uuid, screenshot=change_processor.screenshot)
if update_handler.xpath_data: if change_processor.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data) self.datastore.save_xpath_data(watch_uuid=uuid, data=change_processor.xpath_data)
self.current_uuid = None # Done self.current_uuid = None # Done

Loading…
Cancel
Save