From 97e591fa24745d3fb8db5e1f97f34529d41942ec Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 13:38:46 +0100 Subject: [PATCH 1/8] Remove duplicate code --- changedetectionio/content_fetcher.py | 6 +- changedetectionio/model/Watch.py | 9 ++ changedetectionio/processors/__init__.py | 84 +++++++++++- changedetectionio/processors/restock_diff.py | 85 ++---------- .../processors/text_json_diff.py | 125 ++++-------------- changedetectionio/update_worker.py | 24 +++- 6 files changed, 149 insertions(+), 184 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index c17419af..31080a97 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -419,11 +419,7 @@ class base_html_playwright(Fetcher): is_binary=False): # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!) - has_browser_steps = self.browser_steps and list(filter( - lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), - self.browser_steps)) - - if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'): + if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'): if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')): # Temporary backup solution until we rewrite the playwright code return self.run_fetch_browserless_puppeteer( diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 2773a3bb..97131a8a 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -19,6 +19,7 @@ from changedetectionio.notification import ( base_config = { 'body': None, + 'browser_steps': [], 'browser_steps_last_error_step': None, 'check_unique_lines': False, # On change-detected, compare against all history if its something new 'check_count': 0, @@ -234,6 +235,14 @@ class model(dict): fname = os.path.join(self.watch_data_dir, "history.txt") return os.path.isfile(fname) + @property + def has_browser_steps(self): + has_browser_steps = self.get('browser_steps') and list(filter( + lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), + self.get('browser_steps'))) + + return has_browser_steps + # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. 
@property def newest_history_key(self): diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 225e9c8a..ba6af25c 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -1,15 +1,95 @@ from abc import abstractmethod +import os import hashlib +from changedetectionio import content_fetcher class difference_detection_processor(): + datastore = None + fetcher = None + screenshot = None + xpath_data = None + browser_steps = None - def __init__(self, *args, **kwargs): + def __init__(self, *args, datastore, watch_uuid, **kwargs): super().__init__(*args, **kwargs) + self.datastore = datastore + + watch = self.datastore.data['watching'].get(watch_uuid) + url = watch.link + + # Requests, playwright, other browser via wss:// etc, fetch_extra_something + prefer_fetch_backend = watch.get('fetch_backend', 'system') + + # Proxy ID "key" + preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=watch_uuid) + + # Pluggable content self.fetcher + if not prefer_fetch_backend or prefer_fetch_backend == 'system': + prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') + + # Grab the right kind of 'fetcher', (playwright, requests, etc) + if hasattr(content_fetcher, prefer_fetch_backend): + fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) + else: + # If the klass doesnt exist, just use a default + fetcher_obj = getattr(content_fetcher, "html_requests") + + + proxy_url = None + if preferred_proxy_id: + proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url') + print(f"Using proxy Key: {preferred_proxy_id} as Proxy URL {proxy_url}") + + # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. + self.fetcher = fetcher_obj(proxy_override=proxy_url, + #browser_url_extra/configurable browser url=... + ) + + if watch.has_browser_steps: + self.fetcher.browser_steps = watch.get('browser_steps', []) + self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, watch_uuid) + + # Tweak the base config with the per-watch ones + request_headers = watch.get('headers', []) + request_headers.update(self.datastore.get_all_base_headers()) + request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=watch_uuid)) + + # https://github.com/psf/requests/issues/4525 + # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot + # do this by accident. 
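# An illustrative before/after of the rewrite that follows: an Accept-Encoding of 'gzip, deflate, br' becomes 'gzip, deflate'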
+ if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: + request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') + + timeout = self.datastore.data['settings']['requests'].get('timeout') + + request_body = watch.get('body') + request_method = watch.get('method') + ignore_status_codes = watch.get('ignore_status_codes', False) + + # Configurable per-watch or global extra delay before extracting text (for webDriver types) + system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) + if watch['webdriver_delay'] is not None: + self.fetcher.render_extract_delay = watch.get('webdriver_delay') + elif system_webdriver_delay is not None: + self.fetcher.render_extract_delay = system_webdriver_delay + + if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): + self.fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') + + # Requests for PDF's, images etc should be passwd the is_binary flag + is_binary = watch.is_pdf + + # And here we go! call the right browser with browser-specific settings + self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), + is_binary=is_binary) + self.fetcher.quit() + + # After init, call run() which will do the actual change-detection @abstractmethod - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): + def run(self, uuid, skip_when_checksum_same=True): update_obj = {'last_notification_error': False, 'last_error': False} some_data = 'xxxxx' update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest() diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py index 09cae6e7..2edaffde 100644 --- a/changedetectionio/processors/restock_diff.py +++ b/changedetectionio/processors/restock_diff.py @@ -1,10 +1,7 @@ import hashlib -import os -import re import urllib3 from . import difference_detection_processor -from changedetectionio import content_fetcher from copy import deepcopy urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -22,10 +19,6 @@ class perform_site_check(difference_detection_processor): screenshot = None xpath_data = None - def __init__(self, *args, datastore, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - def run(self, uuid, skip_when_checksum_same=True): # DeepCopy so we can be sure we don't accidently change anything by reference @@ -34,84 +27,24 @@ class perform_site_check(difference_detection_processor): if not watch: raise Exception("Watch no longer exists.") - # Protect against file:// access - if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): - raise Exception( - "file:// type access is denied for security reasons." - ) - # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} - request_headers = watch.get('headers', []) - request_headers.update(self.datastore.get_all_base_headers()) - request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) - - # https://github.com/psf/requests/issues/4525 - # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot - # do this by accident. 
- if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: - request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') - - timeout = self.datastore.data['settings']['requests'].get('timeout') - - url = watch.link - - request_body = self.datastore.data['watching'][uuid].get('body') - request_method = self.datastore.data['watching'][uuid].get('method') - ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False) - - # Pluggable content fetcher - prefer_backend = watch.get_fetch_backend - if not prefer_backend or prefer_backend == 'system': - prefer_backend = self.datastore.data['settings']['application']['fetch_backend'] - - if hasattr(content_fetcher, prefer_backend): - klass = getattr(content_fetcher, prefer_backend) - else: - # If the klass doesnt exist, just use a default - klass = getattr(content_fetcher, "html_requests") - - proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) - proxy_url = None - if proxy_id: - proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') - print("UUID {} Using proxy {}".format(uuid, proxy_url)) - - fetcher = klass(proxy_override=proxy_url) - - # Configurable per-watch or global extra delay before extracting text (for webDriver types) - system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) - if watch['webdriver_delay'] is not None: - fetcher.render_extract_delay = watch.get('webdriver_delay') - elif system_webdriver_delay is not None: - fetcher.render_extract_delay = system_webdriver_delay - - # Could be removed if requests/plaintext could also return some info? - if prefer_backend != 'html_webdriver': - raise Exception("Re-stock detection requires Chrome or compatible webdriver/playwright fetcher to work") - - if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): - fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') - - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters')) - fetcher.quit() - - self.screenshot = fetcher.screenshot - self.xpath_data = fetcher.xpath_data + self.screenshot = self.fetcher.screenshot + self.xpath_data = self.fetcher.xpath_data # Track the content type - update_obj['content_type'] = fetcher.headers.get('Content-Type', '') - update_obj["last_check_status"] = fetcher.get_last_status_code() + update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '') + update_obj["last_check_status"] = self.fetcher.get_last_status_code() # Main detection method fetched_md5 = None - if fetcher.instock_data: - fetched_md5 = hashlib.md5(fetcher.instock_data.encode('utf-8')).hexdigest() + if self.fetcher.instock_data: + fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest() # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. 
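# (Illustrative outcomes of the branch below: instock_data == 'Possibly in stock' -> in_stock True; any other non-empty value -> in_stock False; nothing extracted -> UnableToExtractRestockData is raised with the fetcher's HTTP status code)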
- update_obj["in_stock"] = True if fetcher.instock_data == 'Possibly in stock' else False + update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False else: - raise UnableToExtractRestockData(status_code=fetcher.status_code) + raise UnableToExtractRestockData(status_code=self.fetcher.status_code) # The main thing that all this at the moment comes down to :) changed_detected = False @@ -128,4 +61,4 @@ class perform_site_check(difference_detection_processor): # Always record the new checksum update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, fetcher.instock_data.encode('utf-8') + return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8') diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index afd8ec36..4a5eca0b 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -1,4 +1,4 @@ -# HTML to TEXT/JSON DIFFERENCE FETCHER +# HTML to TEXT/JSON DIFFERENCE self.fetcher import hashlib import json @@ -32,14 +32,8 @@ class PDFToHTMLToolNotFound(ValueError): # Some common stuff here that can be moved to a base class # (set_proxy_from_list) class perform_site_check(difference_detection_processor): - screenshot = None - xpath_data = None - def __init__(self, *args, datastore, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): + def run(self, uuid, skip_when_checksum_same=True): changed_detected = False screenshot = False # as bytes stripped_text_from_html = "" @@ -58,91 +52,28 @@ class perform_site_check(difference_detection_processor): # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} - # Tweak the base config with the per-watch ones - request_headers = watch.get('headers', []) - request_headers.update(self.datastore.get_all_base_headers()) - request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) - - # https://github.com/psf/requests/issues/4525 - # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot - # do this by accident. 
- if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: - request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') - - timeout = self.datastore.data['settings']['requests'].get('timeout') - url = watch.link - request_body = self.datastore.data['watching'][uuid].get('body') - request_method = self.datastore.data['watching'][uuid].get('method') - ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False) - # source: support is_source = False if url.startswith('source:'): url = url.replace('source:', '') is_source = True - # Pluggable content fetcher - prefer_backend = watch.get_fetch_backend - if not prefer_backend or prefer_backend == 'system': - prefer_backend = self.datastore.data['settings']['application']['fetch_backend'] - - if hasattr(content_fetcher, prefer_backend): - klass = getattr(content_fetcher, prefer_backend) - else: - # If the klass doesnt exist, just use a default - klass = getattr(content_fetcher, "html_requests") - - if preferred_proxy: - proxy_id = preferred_proxy - else: - proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) - - proxy_url = None - if proxy_id: - proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') - print("UUID {} Using proxy {}".format(uuid, proxy_url)) - - fetcher = klass(proxy_override=proxy_url) - - # Configurable per-watch or global extra delay before extracting text (for webDriver types) - system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) - if watch['webdriver_delay'] is not None: - fetcher.render_extract_delay = watch.get('webdriver_delay') - elif system_webdriver_delay is not None: - fetcher.render_extract_delay = system_webdriver_delay - - # Possible conflict - if prefer_backend == 'html_webdriver': - fetcher.browser_steps = watch.get('browser_steps', None) - fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, uuid) - - if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): - fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') - - # requests for PDF's, images etc should be passwd the is_binary flag - is_binary = watch.is_pdf - - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), - is_binary=is_binary) - fetcher.quit() - - self.screenshot = fetcher.screenshot - self.xpath_data = fetcher.xpath_data + self.screenshot = self.fetcher.screenshot + self.xpath_data = self.fetcher.xpath_data # Track the content type - update_obj['content_type'] = fetcher.get_all_headers().get('content-type', '').lower() + update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower() # Watches added automatically in the queue manager will skip if its the same checksum as the previous run # Saves a lot of CPU - update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest() + update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest() if skip_when_checksum_same: if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'): raise content_fetcher.checksumFromPreviousCheckWasTheSame() # Fetching complete, now filters - # @todo move to class / maybe inside of fetcher abstract base? 
# @note: I feel like the following should be in a more obvious chain system # - Check filter text @@ -151,15 +82,15 @@ class perform_site_check(difference_detection_processor): # https://stackoverflow.com/questions/41817578/basic-method-chaining ? # return content().textfilter().jsonextract().checksumcompare() ? - is_json = 'application/json' in fetcher.get_all_headers().get('content-type', '').lower() + is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower() is_html = not is_json is_rss = False - ctype_header = fetcher.get_all_headers().get('content-type', '').lower() + ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower() # Go into RSS preprocess for converting CDATA/comment to usable text if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']): - if '', metadata + '') + self.fetcher.content = self.fetcher.content.replace('', metadata + '') # Better would be if Watch.model could access the global data also # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__ @@ -217,7 +148,7 @@ class perform_site_check(difference_detection_processor): if is_json: # Sort the JSON so we dont get false alerts when the content is just re-ordered try: - fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True) + self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True) except Exception as e: # Might have just been a snippet, or otherwise bad JSON, continue pass @@ -225,22 +156,22 @@ class perform_site_check(difference_detection_processor): if has_filter_rule: for filter in include_filters_rule: if any(prefix in filter for prefix in json_filter_prefixes): - stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) + stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter) is_html = False if is_html or is_source: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content) - html_content = fetcher.content + self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content) + html_content = self.fetcher.content # If not JSON, and if it's not text/plain.. - if 'text/plain' in fetcher.get_all_headers().get('content-type', '').lower(): + if 'text/plain' in self.fetcher.get_all_headers().get('content-type', '').lower(): # Don't run get_text or xpath/css filters on plaintext stripped_text_from_html = html_content else: # Does it have some ld+json price data? used for easier monitoring - update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content) + update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content) # Then we assume HTML if has_filter_rule: @@ -250,13 +181,13 @@ class perform_site_check(difference_detection_processor): # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." 
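# (Hypothetical filter values for illustration: 'xpath://div[@id="price"]' and '/html/body//h1' both take the xpath_filter branch below, while a CSS selector such as 'div#price .amount' falls through to include_filters)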
if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), - html_content=fetcher.content, + html_content=self.fetcher.content, append_pretty_line_formatting=not is_source, is_rss=is_rss) else: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content += html_tools.include_filters(include_filters=filter_rule, - html_content=fetcher.content, + html_content=self.fetcher.content, append_pretty_line_formatting=not is_source) if not html_content.strip(): @@ -311,7 +242,7 @@ class perform_site_check(difference_detection_processor): empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: raise content_fetcher.ReplyWithContentButNoText(url=url, - status_code=fetcher.get_last_status_code(), + status_code=self.fetcher.get_last_status_code(), screenshot=screenshot, has_filters=has_filter_rule, html_content=html_content @@ -320,7 +251,7 @@ class perform_site_check(difference_detection_processor): # We rely on the actual text in the html output.. many sites have random script vars etc, # in the future we'll implement other mechanisms. - update_obj["last_check_status"] = fetcher.get_last_status_code() + update_obj["last_check_status"] = self.fetcher.get_last_status_code() # If there's text to skip # @todo we could abstract out the get_text() to handle this cleaner @@ -408,7 +339,7 @@ class perform_site_check(difference_detection_processor): if is_html: if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']: if not watch['title'] or not len(watch['title']): - update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) + update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content) if changed_detected: if watch.get('check_unique_lines', False): diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 792fe94a..668f4116 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -229,17 +229,33 @@ class update_worker(threading.Thread): now = time.time() try: - processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff') + # Processor is what we are using for detecting the "Change" + processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff') + # if system... + + # Abort processing when the content was the same as the last fetch + skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same') + # @todo some way to switch by name + # Init a new 'difference_detection_processor' + if processor == 'restock_diff': - update_handler = restock_diff.perform_site_check(datastore=self.datastore) + update_handler = restock_diff.perform_site_check(datastore=self.datastore, + watch_uuid=uuid + ) else: # Used as a default and also by some tests - update_handler = text_json_diff.perform_site_check(datastore=self.datastore) + update_handler = text_json_diff.perform_site_check(datastore=self.datastore, + watch_uuid=uuid + ) + # Clear last errors (move to preflight func?) 
self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None - changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same')) + + changed_detected, update_obj, contents = update_handler.run(uuid, + skip_when_checksum_same=skip_when_same_checksum, + ) # Re #342 # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. From 68d1e2736c51b5e23a05bc8192c928fa62d79c32 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 14:08:25 +0100 Subject: [PATCH 2/8] tidy up methods --- changedetectionio/content_fetcher.py | 1 + changedetectionio/processors/__init__.py | 41 ++++++++++--------- changedetectionio/processors/restock_diff.py | 2 +- .../processors/text_json_diff.py | 3 +- changedetectionio/update_worker.py | 7 +++- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 31080a97..d9c14590 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -667,6 +667,7 @@ class html_requests(Fetcher): fetcher_description = "Basic fast Plaintext/HTTP Client" def __init__(self, proxy_override=None): + super().__init__() self.proxy_override = proxy_override def run(self, diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index ba6af25c..8a404982 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -15,15 +15,18 @@ class difference_detection_processor(): def __init__(self, *args, datastore, watch_uuid, **kwargs): super().__init__(*args, **kwargs) self.datastore = datastore + self.watch = self.datastore.data['watching'].get(watch_uuid) - watch = self.datastore.data['watching'].get(watch_uuid) - url = watch.link + + def call_browser(self): + + url = self.watch.link # Requests, playwright, other browser via wss:// etc, fetch_extra_something - prefer_fetch_backend = watch.get('fetch_backend', 'system') + prefer_fetch_backend = self.watch.get('fetch_backend', 'system') # Proxy ID "key" - preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=watch_uuid) + preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid')) # Pluggable content self.fetcher if not prefer_fetch_backend or prefer_fetch_backend == 'system': @@ -47,14 +50,14 @@ class difference_detection_processor(): #browser_url_extra/configurable browser url=... 
) - if watch.has_browser_steps: - self.fetcher.browser_steps = watch.get('browser_steps', []) - self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, watch_uuid) + if self.watch.has_browser_steps: + self.fetcher.browser_steps = self.watch.get('browser_steps', []) + self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid')) # Tweak the base config with the per-watch ones - request_headers = watch.get('headers', []) + request_headers = self.watch.get('headers', []) request_headers.update(self.datastore.get_all_base_headers()) - request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=watch_uuid)) + request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid'))) # https://github.com/psf/requests/issues/4525 # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot @@ -64,32 +67,32 @@ class difference_detection_processor(): timeout = self.datastore.data['settings']['requests'].get('timeout') - request_body = watch.get('body') - request_method = watch.get('method') - ignore_status_codes = watch.get('ignore_status_codes', False) + request_body = self.watch.get('body') + request_method = self.watch.get('method') + ignore_status_codes = self.watch.get('ignore_status_codes', False) # Configurable per-watch or global extra delay before extracting text (for webDriver types) system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) - if watch['webdriver_delay'] is not None: - self.fetcher.render_extract_delay = watch.get('webdriver_delay') + if self.watch.get('webdriver_delay'): + self.fetcher.render_extract_delay = self.watch.get('webdriver_delay') elif system_webdriver_delay is not None: self.fetcher.render_extract_delay = system_webdriver_delay - if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): - self.fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') + if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip(): + self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code') # Requests for PDF's, images etc should be passwd the is_binary flag - is_binary = watch.is_pdf + is_binary = self.watch.is_pdf # And here we go! 
call the right browser with browser-specific settings - self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), + self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'), is_binary=is_binary) self.fetcher.quit() # After init, call run() which will do the actual change-detection @abstractmethod - def run(self, uuid, skip_when_checksum_same=True): + def run_changedetection(self, uuid, skip_when_checksum_same=True): update_obj = {'last_notification_error': False, 'last_error': False} some_data = 'xxxxx' update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest() diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py index 2edaffde..9751a195 100644 --- a/changedetectionio/processors/restock_diff.py +++ b/changedetectionio/processors/restock_diff.py @@ -19,7 +19,7 @@ class perform_site_check(difference_detection_processor): screenshot = None xpath_data = None - def run(self, uuid, skip_when_checksum_same=True): + def run_changedetection(self, uuid, skip_when_checksum_same=True): # DeepCopy so we can be sure we don't accidently change anything by reference watch = deepcopy(self.datastore.data['watching'].get(uuid)) diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 4a5eca0b..8e070fa7 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -33,8 +33,9 @@ class PDFToHTMLToolNotFound(ValueError): # (set_proxy_from_list) class perform_site_check(difference_detection_processor): - def run(self, uuid, skip_when_checksum_same=True): + def run_changedetection(self, uuid, skip_when_checksum_same=True): changed_detected = False + html_content = "" screenshot = False # as bytes stripped_text_from_html = "" diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 668f4116..f9976dd3 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -209,6 +209,7 @@ class update_worker(threading.Thread): from .processors import text_json_diff, restock_diff while not self.app.config.exit.is_set(): + update_handler = None try: queued_item_data = self.q.get(block=False) @@ -253,7 +254,9 @@ class update_worker(threading.Thread): # Clear last errors (move to preflight func?) self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None - changed_detected, update_obj, contents = update_handler.run(uuid, + update_handler.call_browser() + + changed_detected, update_obj, contents = update_handler.run_changedetection(uuid, skip_when_checksum_same=skip_when_same_checksum, ) @@ -407,6 +410,8 @@ class update_worker(threading.Thread): self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) # Other serious error process_changedetection_results = False + + # the thread is still running?? 
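# (The bare 'else:' that follows is the else clause of the worker's try/except -- it runs only when the fetch and change-detection above completed without raising)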
else: # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc) if not self.datastore.data['watching'].get(uuid): From 572a169a47a56e94469c52ac3e1335d58ec1b515 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 14:15:23 +0100 Subject: [PATCH 3/8] tidy-ups --- changedetectionio/blueprint/check_proxies/__init__.py | 4 ++-- changedetectionio/processors/__init__.py | 4 +++- changedetectionio/tests/test_ignore_regex_text.py | 2 -- changedetectionio/update_worker.py | 2 -- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py index 60c98436..06abdfee 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -40,8 +40,8 @@ def construct_blueprint(datastore: ChangeDetectionStore): contents = '' now = time.time() try: - update_handler = text_json_diff.perform_site_check(datastore=datastore) - changed_detected, update_obj, contents = update_handler.run(uuid, preferred_proxy=preferred_proxy, skip_when_checksum_same=False) + update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid) + changed_detected, update_obj, contents = update_handler.call_browser() # title, size is len contents not len xfer except content_fetcher.Non200ErrorCodeReceived as e: if e.status_code == 404: diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 8a404982..1cee9ae3 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -87,9 +87,11 @@ class difference_detection_processor(): # And here we go! call the right browser with browser-specific settings self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'), is_binary=is_binary) + + #@todo .quit here could go on close object, so we can run JS if change-detected self.fetcher.quit() - # After init, call run() which will do the actual change-detection + # After init, call run_changedetection() which will do the actual change-detection @abstractmethod def run_changedetection(self, uuid, skip_when_checksum_same=True): diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 49901f38..45f73392 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -33,8 +33,6 @@ def test_strip_regex_text_func(): "/not" ] - - fetcher = fetch_site_status.perform_site_check(datastore=False) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) assert b"but 1 lines" in stripped_content diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index f9976dd3..d1045230 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -410,8 +410,6 @@ class update_worker(threading.Thread): self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) # Other serious error process_changedetection_results = False - - # the thread is still running??
else: # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc) if not self.datastore.data['watching'].get(uuid): From 2bd32b261a376e7713a81ba4dbe3eddb2dc24617 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 14:21:23 +0100 Subject: [PATCH 4/8] remove unused --- changedetectionio/tests/test_ignore_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index f3918663..5d6d7149 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -24,7 +24,6 @@ def test_strip_text_func(): ignore_lines = ["sometimes"] - fetcher = fetch_site_status.perform_site_check(datastore=False) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) assert b"sometimes" not in stripped_content From d7bc2bd3f69eab37fadaeb4bb46fb551f7ce579d Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 14:42:34 +0100 Subject: [PATCH 5/8] cleanups --- changedetectionio/processors/__init__.py | 3 ++- changedetectionio/tests/test_extract_csv.py | 2 +- changedetectionio/tests/test_request.py | 6 +++++- changedetectionio/update_worker.py | 3 +++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 1cee9ae3..92ba46a8 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -1,6 +1,7 @@ from abc import abstractmethod import os import hashlib +from copy import deepcopy from changedetectionio import content_fetcher @@ -15,7 +16,7 @@ class difference_detection_processor(): def __init__(self, *args, datastore, watch_uuid, **kwargs): super().__init__(*args, **kwargs) self.datastore = datastore - self.watch = self.datastore.data['watching'].get(watch_uuid) + self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) def call_browser(self): diff --git a/changedetectionio/tests/test_extract_csv.py b/changedetectionio/tests/test_extract_csv.py index 220b12aa..52596a9e 100644 --- a/changedetectionio/tests/test_extract_csv.py +++ b/changedetectionio/tests/test_extract_csv.py @@ -24,7 +24,7 @@ def test_check_extract_text_from_diff(client, live_server): ) assert b"1 Imported" in res.data - time.sleep(1) + wait_for_all_checks(client) # Load in 5 different numbers/changes last_date="" diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 55c2e342..9586363f 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -80,8 +80,11 @@ def test_headers_in_request(client, live_server): # Should be only one with headers set assert watches_with_headers==1 + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data def test_body_in_request(client, live_server): + # Add our URL to the import page test_url = url_for('test_body', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): @@ -170,7 +173,8 @@ def test_body_in_request(client, live_server): follow_redirects=True ) assert b"Body must be empty when Request Method is set to GET" in res.data - + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data def test_method_in_request(client, live_server): # Add our URL to the import page diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index
d1045230..c5ab7de9 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -410,6 +410,9 @@ class update_worker(threading.Thread): self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) # Other serious error process_changedetection_results = False +# import traceback +# print(traceback.format_exc()) + else: # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc) if not self.datastore.data['watching'].get(uuid): From ebc7a7e568e460abd1eb54b8b1aa02dfdc72786b Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 14:51:46 +0100 Subject: [PATCH 6/8] fix security check --- changedetectionio/processors/__init__.py | 7 +++++++ changedetectionio/processors/text_json_diff.py | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 92ba46a8..944a0085 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -1,6 +1,7 @@ from abc import abstractmethod import os import hashlib +import re from copy import deepcopy from changedetectionio import content_fetcher @@ -18,6 +19,12 @@ class difference_detection_processor(): self.datastore = datastore self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) + # Protect against file:// access + if re.search(r'^file', self.watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): + raise Exception( + "file:// type access is denied for security reasons." + ) + def call_browser(self): diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 8e070fa7..8136374b 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -44,12 +44,6 @@ class perform_site_check(difference_detection_processor): if not watch: raise Exception("Watch no longer exists.") - # Protect against file:// access - if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): - raise Exception( - "file:// type access is denied for security reasons." - ) - # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} From b6bdc2738b2d38ff7f95655b81adf246ed6fa46a Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 15:03:31 +0100 Subject: [PATCH 7/8] tidy up security check --- changedetectionio/processors/__init__.py | 18 +++++++++--------- changedetectionio/tests/test_security.py | 5 +++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 944a0085..d2e5ee5c 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -2,9 +2,9 @@ from abc import abstractmethod import os import hashlib import re -from copy import deepcopy - from changedetectionio import content_fetcher +from copy import deepcopy +from distutils.util import strtobool class difference_detection_processor(): @@ -19,15 +19,15 @@ class difference_detection_processor(): self.datastore = datastore self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid)) - # Protect against file:// access - if re.search(r'^file', self.watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): - raise Exception( - "file:// type access is denied for security reasons."
- ) - - def call_browser(self): + # Protect against file:// access + if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE): + if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')): + raise Exception( + "file:// type access is denied for security reasons." + ) + url = self.watch.link # Requests, playwright, other browser via wss:// etc, fetch_extra_something diff --git a/changedetectionio/tests/test_security.py b/changedetectionio/tests/test_security.py index 08a69eeb..406a5401 100644 --- a/changedetectionio/tests/test_security.py +++ b/changedetectionio/tests/test_security.py @@ -1,5 +1,5 @@ from flask import url_for -from . util import set_original_response, set_modified_response, live_server_setup +from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks import time @@ -12,6 +12,7 @@ def test_bad_access(client, live_server): ) assert b"1 Imported" in res.data + wait_for_all_checks(client) # Attempt to add a body with a GET method res = client.post( @@ -59,7 +60,7 @@ def test_bad_access(client, live_server): data={"url": 'file:///tasty/disk/drive', "tags": ''}, follow_redirects=True ) - time.sleep(1) + wait_for_all_checks(client) res = client.get(url_for("index")) assert b'file:// type access is denied for security reasons.' in res.data \ No newline at end of file From 0bcbcb80f16f71b392fd3111ed3f9ffd6c30eeb3 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 7 Nov 2023 15:08:18 +0100 Subject: [PATCH 8/8] tidy up source: handling --- changedetectionio/model/Watch.py | 6 ++++++ changedetectionio/processors/text_json_diff.py | 16 +++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 97131a8a..90858e39 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -146,8 +146,14 @@ class model(dict): flash(message, 'error') return '' + if ready_url.startswith('source:'): + ready_url=ready_url.replace('source:', '') return ready_url + @property + def is_source_type_url(self): + return self.get('url', '').startswith('source:') + @property def get_fetch_backend(self): """ diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index 8136374b..fc35b135 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -49,12 +49,6 @@ class perform_site_check(difference_detection_processor): url = watch.link - # source: support - is_source = False - if url.startswith('source:'): - url = url.replace('source:', '') - is_source = True - self.screenshot = self.fetcher.screenshot self.xpath_data = self.fetcher.xpath_data @@ -89,7 +83,7 @@ class perform_site_check(difference_detection_processor): is_rss = True # source: support, basically treat it as plaintext - if is_source: + if watch.is_source_type_url: is_html = False is_json = False @@ -154,7 +148,7 @@ class perform_site_check(difference_detection_processor): stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter) is_html = False - if is_html or is_source: + if is_html or watch.is_source_type_url: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content) @@ -177,13 +171,13 @@ class perform_site_check(difference_detection_processor): if filter_rule[0] == '/' or 
filter_rule.startswith('xpath:'): html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), html_content=self.fetcher.content, - append_pretty_line_formatting=not is_source, + append_pretty_line_formatting=not watch.is_source_type_url, is_rss=is_rss) else: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content += html_tools.include_filters(include_filters=filter_rule, html_content=self.fetcher.content, - append_pretty_line_formatting=not is_source) + append_pretty_line_formatting=not watch.is_source_type_url) if not html_content.strip(): raise FilterNotFoundInResponse(include_filters_rule) @@ -191,7 +185,7 @@ class perform_site_check(difference_detection_processor): if has_subtractive_selectors: html_content = html_tools.element_removal(subtractive_selectors, html_content) - if is_source: + if watch.is_source_type_url: stripped_text_from_html = html_content else: # extract text
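
A minimal sketch of the processor lifecycle this patch series converges on, using only names that appear in the diffs above; datastore and uuid are assumed to already exist in the caller (as they do in update_worker.py) and all error handling is omitted:

    from changedetectionio.processors import text_json_diff

    # Build the processor; __init__ keeps the datastore and a deep copy of the watch
    update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)

    # Shared fetch step: the file:// guard, fetcher selection via getattr(content_fetcher, ...),
    # proxy/header/browser-steps setup, then fetcher.run() and fetcher.quit()
    update_handler.call_browser()

    # Processor-specific comparison against the previously stored state
    changed_detected, update_obj, contents = update_handler.run_changedetection(
        uuid, skip_when_checksum_same=False)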