diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index c17419af..31080a97 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -419,11 +419,7 @@ class base_html_playwright(Fetcher): is_binary=False): # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!) - has_browser_steps = self.browser_steps and list(filter( - lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), - self.browser_steps)) - - if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'): + if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'): if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')): # Temporary backup solution until we rewrite the playwright code return self.run_fetch_browserless_puppeteer( diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index 2773a3bb..97131a8a 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -19,6 +19,7 @@ from changedetectionio.notification import ( base_config = { 'body': None, + 'browser_steps': [], 'browser_steps_last_error_step': None, 'check_unique_lines': False, # On change-detected, compare against all history if its something new 'check_count': 0, @@ -234,6 +235,14 @@ class model(dict): fname = os.path.join(self.watch_data_dir, "history.txt") return os.path.isfile(fname) + @property + def has_browser_steps(self): + has_browser_steps = self.get('browser_steps') and list(filter( + lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), + self.get('browser_steps'))) + + return has_browser_steps + # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0. @property def newest_history_key(self): diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index 225e9c8a..ba6af25c 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -1,15 +1,95 @@ from abc import abstractmethod +import os import hashlib +from changedetectionio import content_fetcher class difference_detection_processor(): + datastore = None + fetcher = None + screenshot = None + xpath_data = None + browser_steps = None - def __init__(self, *args, **kwargs): + def __init__(self, *args, datastore, watch_uuid, **kwargs): super().__init__(*args, **kwargs) + self.datastore = datastore + + watch = self.datastore.data['watching'].get(watch_uuid) + url = watch.link + + # Requests, playwright, other browser via wss:// etc, fetch_extra_something + prefer_fetch_backend = watch.get('fetch_backend', 'system') + + # Proxy ID "key" + preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=watch_uuid) + + # Pluggable content self.fetcher + if not prefer_fetch_backend or prefer_fetch_backend == 'system': + prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend') + + # Grab the right kind of 'fetcher', (playwright, requests, etc) + if hasattr(content_fetcher, prefer_fetch_backend): + fetcher_obj = getattr(content_fetcher, prefer_fetch_backend) + else: + # If the klass doesnt exist, just use a default + fetcher_obj = getattr(content_fetcher, "html_requests") + + + proxy_url = None + if preferred_proxy_id: + proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url') + print(f"Using proxy Key: {preferred_proxy_id} as Proxy URL {proxy_url}") + + # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need. + self.fetcher = fetcher_obj(proxy_override=proxy_url, + #browser_url_extra/configurable browser url=... + ) + + if watch.has_browser_steps: + self.fetcher.browser_steps = watch.get('browser_steps', []) + self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, watch_uuid) + + # Tweak the base config with the per-watch ones + request_headers = watch.get('headers', []) + request_headers.update(self.datastore.get_all_base_headers()) + request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=watch_uuid)) + + # https://github.com/psf/requests/issues/4525 + # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot + # do this by accident. + if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: + request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') + + timeout = self.datastore.data['settings']['requests'].get('timeout') + + request_body = watch.get('body') + request_method = watch.get('method') + ignore_status_codes = watch.get('ignore_status_codes', False) + + # Configurable per-watch or global extra delay before extracting text (for webDriver types) + system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) + if watch['webdriver_delay'] is not None: + self.fetcher.render_extract_delay = watch.get('webdriver_delay') + elif system_webdriver_delay is not None: + self.fetcher.render_extract_delay = system_webdriver_delay + + if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): + self.fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') + + # Requests for PDF's, images etc should be passwd the is_binary flag + is_binary = watch.is_pdf + + # And here we go! call the right browser with browser-specific settings + self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), + is_binary=is_binary) + self.fetcher.quit() + + # After init, call run() which will do the actual change-detection @abstractmethod - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): + def run(self, uuid, skip_when_checksum_same=True): update_obj = {'last_notification_error': False, 'last_error': False} some_data = 'xxxxx' update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest() diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py index 09cae6e7..2edaffde 100644 --- a/changedetectionio/processors/restock_diff.py +++ b/changedetectionio/processors/restock_diff.py @@ -1,10 +1,7 @@ import hashlib -import os -import re import urllib3 from . import difference_detection_processor -from changedetectionio import content_fetcher from copy import deepcopy urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -22,10 +19,6 @@ class perform_site_check(difference_detection_processor): screenshot = None xpath_data = None - def __init__(self, *args, datastore, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - def run(self, uuid, skip_when_checksum_same=True): # DeepCopy so we can be sure we don't accidently change anything by reference @@ -34,84 +27,24 @@ class perform_site_check(difference_detection_processor): if not watch: raise Exception("Watch no longer exists.") - # Protect against file:// access - if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): - raise Exception( - "file:// type access is denied for security reasons." - ) - # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} - request_headers = watch.get('headers', []) - request_headers.update(self.datastore.get_all_base_headers()) - request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) - - # https://github.com/psf/requests/issues/4525 - # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot - # do this by accident. - if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: - request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') - - timeout = self.datastore.data['settings']['requests'].get('timeout') - - url = watch.link - - request_body = self.datastore.data['watching'][uuid].get('body') - request_method = self.datastore.data['watching'][uuid].get('method') - ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False) - - # Pluggable content fetcher - prefer_backend = watch.get_fetch_backend - if not prefer_backend or prefer_backend == 'system': - prefer_backend = self.datastore.data['settings']['application']['fetch_backend'] - - if hasattr(content_fetcher, prefer_backend): - klass = getattr(content_fetcher, prefer_backend) - else: - # If the klass doesnt exist, just use a default - klass = getattr(content_fetcher, "html_requests") - - proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) - proxy_url = None - if proxy_id: - proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') - print("UUID {} Using proxy {}".format(uuid, proxy_url)) - - fetcher = klass(proxy_override=proxy_url) - - # Configurable per-watch or global extra delay before extracting text (for webDriver types) - system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) - if watch['webdriver_delay'] is not None: - fetcher.render_extract_delay = watch.get('webdriver_delay') - elif system_webdriver_delay is not None: - fetcher.render_extract_delay = system_webdriver_delay - - # Could be removed if requests/plaintext could also return some info? - if prefer_backend != 'html_webdriver': - raise Exception("Re-stock detection requires Chrome or compatible webdriver/playwright fetcher to work") - - if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): - fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') - - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters')) - fetcher.quit() - - self.screenshot = fetcher.screenshot - self.xpath_data = fetcher.xpath_data + self.screenshot = self.fetcher.screenshot + self.xpath_data = self.fetcher.xpath_data # Track the content type - update_obj['content_type'] = fetcher.headers.get('Content-Type', '') - update_obj["last_check_status"] = fetcher.get_last_status_code() + update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '') + update_obj["last_check_status"] = self.fetcher.get_last_status_code() # Main detection method fetched_md5 = None - if fetcher.instock_data: - fetched_md5 = hashlib.md5(fetcher.instock_data.encode('utf-8')).hexdigest() + if self.fetcher.instock_data: + fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest() # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. - update_obj["in_stock"] = True if fetcher.instock_data == 'Possibly in stock' else False + update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False else: - raise UnableToExtractRestockData(status_code=fetcher.status_code) + raise UnableToExtractRestockData(status_code=self.fetcher.status_code) # The main thing that all this at the moment comes down to :) changed_detected = False @@ -128,4 +61,4 @@ class perform_site_check(difference_detection_processor): # Always record the new checksum update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, fetcher.instock_data.encode('utf-8') + return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8') diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index afd8ec36..4a5eca0b 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -1,4 +1,4 @@ -# HTML to TEXT/JSON DIFFERENCE FETCHER +# HTML to TEXT/JSON DIFFERENCE self.fetcher import hashlib import json @@ -32,14 +32,8 @@ class PDFToHTMLToolNotFound(ValueError): # Some common stuff here that can be moved to a base class # (set_proxy_from_list) class perform_site_check(difference_detection_processor): - screenshot = None - xpath_data = None - def __init__(self, *args, datastore, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): + def run(self, uuid, skip_when_checksum_same=True): changed_detected = False screenshot = False # as bytes stripped_text_from_html = "" @@ -58,91 +52,28 @@ class perform_site_check(difference_detection_processor): # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} - # Tweak the base config with the per-watch ones - request_headers = watch.get('headers', []) - request_headers.update(self.datastore.get_all_base_headers()) - request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) - - # https://github.com/psf/requests/issues/4525 - # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot - # do this by accident. - if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']: - request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '') - - timeout = self.datastore.data['settings']['requests'].get('timeout') - url = watch.link - request_body = self.datastore.data['watching'][uuid].get('body') - request_method = self.datastore.data['watching'][uuid].get('method') - ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False) - # source: support is_source = False if url.startswith('source:'): url = url.replace('source:', '') is_source = True - # Pluggable content fetcher - prefer_backend = watch.get_fetch_backend - if not prefer_backend or prefer_backend == 'system': - prefer_backend = self.datastore.data['settings']['application']['fetch_backend'] - - if hasattr(content_fetcher, prefer_backend): - klass = getattr(content_fetcher, prefer_backend) - else: - # If the klass doesnt exist, just use a default - klass = getattr(content_fetcher, "html_requests") - - if preferred_proxy: - proxy_id = preferred_proxy - else: - proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid) - - proxy_url = None - if proxy_id: - proxy_url = self.datastore.proxy_list.get(proxy_id).get('url') - print("UUID {} Using proxy {}".format(uuid, proxy_url)) - - fetcher = klass(proxy_override=proxy_url) - - # Configurable per-watch or global extra delay before extracting text (for webDriver types) - system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None) - if watch['webdriver_delay'] is not None: - fetcher.render_extract_delay = watch.get('webdriver_delay') - elif system_webdriver_delay is not None: - fetcher.render_extract_delay = system_webdriver_delay - - # Possible conflict - if prefer_backend == 'html_webdriver': - fetcher.browser_steps = watch.get('browser_steps', None) - fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, uuid) - - if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): - fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') - - # requests for PDF's, images etc should be passwd the is_binary flag - is_binary = watch.is_pdf - - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'), - is_binary=is_binary) - fetcher.quit() - - self.screenshot = fetcher.screenshot - self.xpath_data = fetcher.xpath_data + self.screenshot = self.fetcher.screenshot + self.xpath_data = self.fetcher.xpath_data # Track the content type - update_obj['content_type'] = fetcher.get_all_headers().get('content-type', '').lower() + update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower() # Watches added automatically in the queue manager will skip if its the same checksum as the previous run # Saves a lot of CPU - update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest() + update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest() if skip_when_checksum_same: if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'): raise content_fetcher.checksumFromPreviousCheckWasTheSame() # Fetching complete, now filters - # @todo move to class / maybe inside of fetcher abstract base? # @note: I feel like the following should be in a more obvious chain system # - Check filter text @@ -151,15 +82,15 @@ class perform_site_check(difference_detection_processor): # https://stackoverflow.com/questions/41817578/basic-method-chaining ? # return content().textfilter().jsonextract().checksumcompare() ? - is_json = 'application/json' in fetcher.get_all_headers().get('content-type', '').lower() + is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower() is_html = not is_json is_rss = False - ctype_header = fetcher.get_all_headers().get('content-type', '').lower() + ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower() # Go into RSS preprocess for converting CDATA/comment to usable text if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']): - if '', metadata + '') + self.fetcher.content = self.fetcher.content.replace('', metadata + '') # Better would be if Watch.model could access the global data also # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__ @@ -217,7 +148,7 @@ class perform_site_check(difference_detection_processor): if is_json: # Sort the JSON so we dont get false alerts when the content is just re-ordered try: - fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True) + self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True) except Exception as e: # Might have just been a snippet, or otherwise bad JSON, continue pass @@ -225,22 +156,22 @@ class perform_site_check(difference_detection_processor): if has_filter_rule: for filter in include_filters_rule: if any(prefix in filter for prefix in json_filter_prefixes): - stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) + stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter) is_html = False if is_html or is_source: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content) - html_content = fetcher.content + self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content) + html_content = self.fetcher.content # If not JSON, and if it's not text/plain.. - if 'text/plain' in fetcher.get_all_headers().get('content-type', '').lower(): + if 'text/plain' in self.fetcher.get_all_headers().get('content-type', '').lower(): # Don't run get_text or xpath/css filters on plaintext stripped_text_from_html = html_content else: # Does it have some ld+json price data? used for easier monitoring - update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content) + update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content) # Then we assume HTML if has_filter_rule: @@ -250,13 +181,13 @@ class perform_site_check(difference_detection_processor): # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), - html_content=fetcher.content, + html_content=self.fetcher.content, append_pretty_line_formatting=not is_source, is_rss=is_rss) else: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content += html_tools.include_filters(include_filters=filter_rule, - html_content=fetcher.content, + html_content=self.fetcher.content, append_pretty_line_formatting=not is_source) if not html_content.strip(): @@ -311,7 +242,7 @@ class perform_site_check(difference_detection_processor): empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: raise content_fetcher.ReplyWithContentButNoText(url=url, - status_code=fetcher.get_last_status_code(), + status_code=self.fetcher.get_last_status_code(), screenshot=screenshot, has_filters=has_filter_rule, html_content=html_content @@ -320,7 +251,7 @@ class perform_site_check(difference_detection_processor): # We rely on the actual text in the html output.. many sites have random script vars etc, # in the future we'll implement other mechanisms. - update_obj["last_check_status"] = fetcher.get_last_status_code() + update_obj["last_check_status"] = self.fetcher.get_last_status_code() # If there's text to skip # @todo we could abstract out the get_text() to handle this cleaner @@ -408,7 +339,7 @@ class perform_site_check(difference_detection_processor): if is_html: if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']: if not watch['title'] or not len(watch['title']): - update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) + update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content) if changed_detected: if watch.get('check_unique_lines', False): diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 792fe94a..668f4116 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -229,17 +229,33 @@ class update_worker(threading.Thread): now = time.time() try: - processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff') + # Processor is what we are using for detecting the "Change" + processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff') + # if system... + + # Abort processing when the content was the same as the last fetch + skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same') + # @todo some way to switch by name + # Init a new 'difference_detection_processor' + if processor == 'restock_diff': - update_handler = restock_diff.perform_site_check(datastore=self.datastore) + update_handler = restock_diff.perform_site_check(datastore=self.datastore, + watch_uuid=uuid + ) else: # Used as a default and also by some tests - update_handler = text_json_diff.perform_site_check(datastore=self.datastore) + update_handler = text_json_diff.perform_site_check(datastore=self.datastore, + watch_uuid=uuid + ) + # Clear last errors (move to preflight func?) self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None - changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same')) + + changed_detected, update_obj, contents = update_handler.run(uuid, + skip_when_checksum_same=skip_when_same_checksum, + ) # Re #342 # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.