diff --git a/changedetectionio/blueprint/check_proxies/__init__.py b/changedetectionio/blueprint/check_proxies/__init__.py
index 60c98436..06abdfee 100644
--- a/changedetectionio/blueprint/check_proxies/__init__.py
+++ b/changedetectionio/blueprint/check_proxies/__init__.py
@@ -40,8 +40,8 @@ def construct_blueprint(datastore: ChangeDetectionStore):
         contents = ''
         now = time.time()
         try:
-            update_handler = text_json_diff.perform_site_check(datastore=datastore)
-            changed_detected, update_obj, contents = update_handler.run(uuid, preferred_proxy=preferred_proxy, skip_when_checksum_same=False)
+            update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)
+            changed_detected, update_obj, contents = update_handler.call_browser()
             # title, size is len contents not len xfer
         except content_fetcher.Non200ErrorCodeReceived as e:
             if e.status_code == 404:
diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index c17419af..d9c14590 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -419,11 +419,7 @@ class base_html_playwright(Fetcher):
             is_binary=False):
 
         # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
-        has_browser_steps = self.browser_steps and list(filter(
-            lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
-            self.browser_steps))
-
-        if not has_browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+        if not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
             if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
                 # Temporary backup solution until we rewrite the playwright code
                 return self.run_fetch_browserless_puppeteer(
@@ -671,6 +667,7 @@ class html_requests(Fetcher):
     fetcher_description = "Basic fast Plaintext/HTTP Client"
 
     def __init__(self, proxy_override=None):
+        super().__init__()
         self.proxy_override = proxy_override
 
     def run(self,
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index 2773a3bb..90858e39 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -19,6 +19,7 @@ from changedetectionio.notification import (
 
 base_config = {
     'body': None,
+    'browser_steps': [],
     'browser_steps_last_error_step': None,
     'check_unique_lines': False,  # On change-detected, compare against all history if its something new
     'check_count': 0,
@@ -145,8 +146,14 @@ class model(dict):
                 flash(message, 'error')
                 return ''
 
+        if ready_url.startswith('source:'):
+            ready_url=ready_url.replace('source:', '')
         return ready_url
 
+    @property
+    def is_source_type_url(self):
+        return self.get('url', '').startswith('source:')
+
     @property
     def get_fetch_backend(self):
         """
@@ -234,6 +241,14 @@ class model(dict):
         fname = os.path.join(self.watch_data_dir, "history.txt")
         return os.path.isfile(fname)
 
+    @property
+    def has_browser_steps(self):
+        has_browser_steps = self.get('browser_steps') and list(filter(
+            lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
+            self.get('browser_steps')))
+
+        return has_browser_steps
+
     # Returns the newest key, but if theres only 1 record, then it's counted as not being new, so return 0.
     @property
     def newest_history_key(self):
diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py
index b4cb00c6..caed4a36 100644
--- a/changedetectionio/processors/__init__.py
+++ b/changedetectionio/processors/__init__.py
@@ -1,14 +1,108 @@
 from abc import abstractmethod
+import os
 import hashlib
-
+import re
+from changedetectionio import content_fetcher
+from copy import deepcopy
+from distutils.util import strtobool
 
 class difference_detection_processor():
 
+    datastore = None
+    fetcher = None
+    screenshot = None
+    xpath_data = None
+    browser_steps = None
+    watch = None
+
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, datastore, watch_uuid, **kwargs):
         super().__init__(*args, **kwargs)
+        self.datastore = datastore
+        self.watch = deepcopy(self.datastore.data['watching'].get(watch_uuid))
+
+    def call_browser(self):
+
+        # Protect against file:// access
+        if re.search(r'^file://', self.watch.get('url', '').strip(), re.IGNORECASE):
+            if not strtobool(os.getenv('ALLOW_FILE_URI', 'false')):
+                raise Exception(
+                    "file:// type access is denied for security reasons."
+                )
+
+        url = self.watch.link
+
+        # Requests, playwright, other browser via wss:// etc, fetch_extra_something
+        prefer_fetch_backend = self.watch.get('fetch_backend', 'system')
+
+        # Proxy ID "key"
+        preferred_proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=self.watch.get('uuid'))
+
+        # Pluggable content fetcher
+        if not prefer_fetch_backend or prefer_fetch_backend == 'system':
+            prefer_fetch_backend = self.datastore.data['settings']['application'].get('fetch_backend')
+
+        # Grab the right kind of 'fetcher' (playwright, requests, etc)
+        if hasattr(content_fetcher, prefer_fetch_backend):
+            fetcher_obj = getattr(content_fetcher, prefer_fetch_backend)
+        else:
+            # If the klass doesn't exist, just use a default
+            fetcher_obj = getattr(content_fetcher, "html_requests")
+
+
+        proxy_url = None
+        if preferred_proxy_id:
+            proxy_url = self.datastore.proxy_list.get(preferred_proxy_id).get('url')
+            print(f"Using proxy Key: {preferred_proxy_id} as Proxy URL {proxy_url}")
+
+        # Now call the fetcher (playwright/requests/etc) with arguments that only a fetcher would need.
+        self.fetcher = fetcher_obj(proxy_override=proxy_url,
+                                   #browser_url_extra/configurable browser url=...
+                                   )
+
+        if self.watch.has_browser_steps:
+            self.fetcher.browser_steps = self.watch.get('browser_steps', [])
+            self.fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, self.watch.get('uuid'))
+
+        # Tweak the base config with the per-watch ones
+        request_headers = self.watch.get('headers', [])
+        request_headers.update(self.datastore.get_all_base_headers())
+        request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid')))
+
+        # https://github.com/psf/requests/issues/4525
+        # Requests doesn't yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
+        # do this by accident.
+        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
+            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
+
+        timeout = self.datastore.data['settings']['requests'].get('timeout')
+
+        request_body = self.watch.get('body')
+        request_method = self.watch.get('method')
+        ignore_status_codes = self.watch.get('ignore_status_codes', False)
+
+        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
+        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
+        if self.watch.get('webdriver_delay'):
+            self.fetcher.render_extract_delay = self.watch.get('webdriver_delay')
+        elif system_webdriver_delay is not None:
+            self.fetcher.render_extract_delay = system_webdriver_delay
+
+        if self.watch.get('webdriver_js_execute_code') is not None and self.watch.get('webdriver_js_execute_code').strip():
+            self.fetcher.webdriver_js_execute_code = self.watch.get('webdriver_js_execute_code')
+
+        # Requests for PDF's, images etc should be passed the is_binary flag
+        is_binary = self.watch.is_pdf
+
+        # And here we go! call the right browser with browser-specific settings
+        self.fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, self.watch.get('include_filters'),
+                         is_binary=is_binary)
+
+        #@todo .quit here could go on close object, so we can run JS if change-detected
+        self.fetcher.quit()
+
+    # After init, call run_changedetection() which will do the actual change-detection
     @abstractmethod
-    def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None):
+    def run_changedetection(self, uuid, skip_when_checksum_same=True):
         update_obj = {'last_notification_error': False, 'last_error': False}
         some_data = 'xxxxx'
         update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()
diff --git a/changedetectionio/processors/restock_diff.py b/changedetectionio/processors/restock_diff.py
index 09cae6e7..9751a195 100644
--- a/changedetectionio/processors/restock_diff.py
+++ b/changedetectionio/processors/restock_diff.py
@@ -1,10 +1,7 @@
 
 import hashlib
-import os
-import re
 import urllib3
 from . import difference_detection_processor
-from changedetectionio import content_fetcher
 from copy import deepcopy
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -22,11 +19,7 @@ class perform_site_check(difference_detection_processor):
     screenshot = None
     xpath_data = None
 
-    def __init__(self, *args, datastore, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.datastore = datastore
-
-    def run(self, uuid, skip_when_checksum_same=True):
+    def run_changedetection(self, uuid, skip_when_checksum_same=True):
 
         # DeepCopy so we can be sure we don't accidently change anything by reference
         watch = deepcopy(self.datastore.data['watching'].get(uuid))
@@ -34,84 +27,24 @@ class perform_site_check(difference_detection_processor):
         if not watch:
             raise Exception("Watch no longer exists.")
 
-        # Protect against file:// access
-        if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False):
-            raise Exception(
-                "file:// type access is denied for security reasons."
-            )
-
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}
 
-        request_headers = watch.get('headers', [])
-        request_headers.update(self.datastore.get_all_base_headers())
-        request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid))
-
-        # https://github.com/psf/requests/issues/4525
-        # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot
-        # do this by accident.
-        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
-            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
-
-        timeout = self.datastore.data['settings']['requests'].get('timeout')
-
-        url = watch.link
-
-        request_body = self.datastore.data['watching'][uuid].get('body')
-        request_method = self.datastore.data['watching'][uuid].get('method')
-        ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False)
-
-        # Pluggable content fetcher
-        prefer_backend = watch.get_fetch_backend
-        if not prefer_backend or prefer_backend == 'system':
-            prefer_backend = self.datastore.data['settings']['application']['fetch_backend']
-
-        if hasattr(content_fetcher, prefer_backend):
-            klass = getattr(content_fetcher, prefer_backend)
-        else:
-            # If the klass doesnt exist, just use a default
-            klass = getattr(content_fetcher, "html_requests")
-
-        proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
-        proxy_url = None
-        if proxy_id:
-            proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
-            print("UUID {} Using proxy {}".format(uuid, proxy_url))
-
-        fetcher = klass(proxy_override=proxy_url)
-
-        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
-        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
-        if watch['webdriver_delay'] is not None:
-            fetcher.render_extract_delay = watch.get('webdriver_delay')
-        elif system_webdriver_delay is not None:
-            fetcher.render_extract_delay = system_webdriver_delay
-
-        # Could be removed if requests/plaintext could also return some info?
-        if prefer_backend != 'html_webdriver':
-            raise Exception("Re-stock detection requires Chrome or compatible webdriver/playwright fetcher to work")
-
-        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
-            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
-
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'))
-        fetcher.quit()
-
-        self.screenshot = fetcher.screenshot
-        self.xpath_data = fetcher.xpath_data
+        self.screenshot = self.fetcher.screenshot
+        self.xpath_data = self.fetcher.xpath_data
 
         # Track the content type
-        update_obj['content_type'] = fetcher.headers.get('Content-Type', '')
-        update_obj["last_check_status"] = fetcher.get_last_status_code()
+        update_obj['content_type'] = self.fetcher.headers.get('Content-Type', '')
+        update_obj["last_check_status"] = self.fetcher.get_last_status_code()
 
         # Main detection method
         fetched_md5 = None
-        if fetcher.instock_data:
-            fetched_md5 = hashlib.md5(fetcher.instock_data.encode('utf-8')).hexdigest()
+        if self.fetcher.instock_data:
+            fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
             # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
- update_obj["in_stock"] = True if fetcher.instock_data == 'Possibly in stock' else False + update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False else: - raise UnableToExtractRestockData(status_code=fetcher.status_code) + raise UnableToExtractRestockData(status_code=self.fetcher.status_code) # The main thing that all this at the moment comes down to :) changed_detected = False @@ -128,4 +61,4 @@ class perform_site_check(difference_detection_processor): # Always record the new checksum update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, fetcher.instock_data.encode('utf-8') + return changed_detected, update_obj, self.fetcher.instock_data.encode('utf-8') diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index afd8ec36..fc35b135 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -1,4 +1,4 @@ -# HTML to TEXT/JSON DIFFERENCE FETCHER +# HTML to TEXT/JSON DIFFERENCE self.fetcher import hashlib import json @@ -32,15 +32,10 @@ class PDFToHTMLToolNotFound(ValueError): # Some common stuff here that can be moved to a base class # (set_proxy_from_list) class perform_site_check(difference_detection_processor): - screenshot = None - xpath_data = None - def __init__(self, *args, datastore, **kwargs): - super().__init__(*args, **kwargs) - self.datastore = datastore - - def run(self, uuid, skip_when_checksum_same=True, preferred_proxy=None): + def run_changedetection(self, uuid, skip_when_checksum_same=True): changed_detected = False + html_content = "" screenshot = False # as bytes stripped_text_from_html = "" @@ -49,100 +44,25 @@ class perform_site_check(difference_detection_processor): if not watch: raise Exception("Watch no longer exists.") - # Protect against file:// access - if re.search(r'^file', watch.get('url', ''), re.IGNORECASE) and not os.getenv('ALLOW_FILE_URI', False): - raise Exception( - "file:// type access is denied for security reasons." - ) - # Unset any existing notification error update_obj = {'last_notification_error': False, 'last_error': False} - # Tweak the base config with the per-watch ones - request_headers = watch.get('headers', []) - request_headers.update(self.datastore.get_all_base_headers()) - request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=uuid)) - - # https://github.com/psf/requests/issues/4525 - # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot - # do this by accident. 
-        if 'Accept-Encoding' in request_headers and "br" in request_headers['Accept-Encoding']:
-            request_headers['Accept-Encoding'] = request_headers['Accept-Encoding'].replace(', br', '')
-
-        timeout = self.datastore.data['settings']['requests'].get('timeout')
-        url = watch.link
-        request_body = self.datastore.data['watching'][uuid].get('body')
-        request_method = self.datastore.data['watching'][uuid].get('method')
-        ignore_status_codes = self.datastore.data['watching'][uuid].get('ignore_status_codes', False)
-
-        # source: support
-        is_source = False
-        if url.startswith('source:'):
-            url = url.replace('source:', '')
-            is_source = True
-
-        # Pluggable content fetcher
-        prefer_backend = watch.get_fetch_backend
-        if not prefer_backend or prefer_backend == 'system':
-            prefer_backend = self.datastore.data['settings']['application']['fetch_backend']
-
-        if hasattr(content_fetcher, prefer_backend):
-            klass = getattr(content_fetcher, prefer_backend)
-        else:
-            # If the klass doesnt exist, just use a default
-            klass = getattr(content_fetcher, "html_requests")
-
-        if preferred_proxy:
-            proxy_id = preferred_proxy
-        else:
-            proxy_id = self.datastore.get_preferred_proxy_for_watch(uuid=uuid)
-
-        proxy_url = None
-        if proxy_id:
-            proxy_url = self.datastore.proxy_list.get(proxy_id).get('url')
-            print("UUID {} Using proxy {}".format(uuid, proxy_url))
-
-        fetcher = klass(proxy_override=proxy_url)
-
-        # Configurable per-watch or global extra delay before extracting text (for webDriver types)
-        system_webdriver_delay = self.datastore.data['settings']['application'].get('webdriver_delay', None)
-        if watch['webdriver_delay'] is not None:
-            fetcher.render_extract_delay = watch.get('webdriver_delay')
-        elif system_webdriver_delay is not None:
-            fetcher.render_extract_delay = system_webdriver_delay
-
-        # Possible conflict
-        if prefer_backend == 'html_webdriver':
-            fetcher.browser_steps = watch.get('browser_steps', None)
-            fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, uuid)
-
-        if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip():
-            fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code')
-
-        # requests for PDF's, images etc should be passwd the is_binary flag
-        is_binary = watch.is_pdf
-
-        fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch.get('include_filters'),
-                    is_binary=is_binary)
-        fetcher.quit()
-
-        self.screenshot = fetcher.screenshot
-        self.xpath_data = fetcher.xpath_data
+        self.screenshot = self.fetcher.screenshot
+        self.xpath_data = self.fetcher.xpath_data
 
         # Track the content type
-        update_obj['content_type'] = fetcher.get_all_headers().get('content-type', '').lower()
+        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
 
         # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
         # Saves a lot of CPU
-        update_obj['previous_md5_before_filters'] = hashlib.md5(fetcher.content.encode('utf-8')).hexdigest()
+        update_obj['previous_md5_before_filters'] = hashlib.md5(self.fetcher.content.encode('utf-8')).hexdigest()
         if skip_when_checksum_same:
            if update_obj['previous_md5_before_filters'] == watch.get('previous_md5_before_filters'):
                raise content_fetcher.checksumFromPreviousCheckWasTheSame()
 
         # Fetching complete, now filters
-        # @todo move to class / maybe inside of fetcher abstract base?
         # @note: I feel like the following should be in a more obvious chain system
         # - Check filter text
@@ -151,24 +71,24 @@ class perform_site_check(difference_detection_processor):
         # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
         # return content().textfilter().jsonextract().checksumcompare() ?
 
-        is_json = 'application/json' in fetcher.get_all_headers().get('content-type', '').lower()
+        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
         is_html = not is_json
         is_rss = False
 
-        ctype_header = fetcher.get_all_headers().get('content-type', '').lower()
+        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
         # Go into RSS preprocess for converting CDATA/comment to usable text
         if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
 
-            fetcher.content = fetcher.content.replace('</body>', metadata + '</body>')
+            self.fetcher.content = self.fetcher.content.replace('</body>', metadata + '</body>')
 
         # Better would be if Watch.model could access the global data also
         # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__
@@ -217,7 +137,7 @@ class perform_site_check(difference_detection_processor):
         if is_json:
             # Sort the JSON so we dont get false alerts when the content is just re-ordered
             try:
-                fetcher.content = json.dumps(json.loads(fetcher.content), sort_keys=True)
+                self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
             except Exception as e:
                 # Might have just been a snippet, or otherwise bad JSON, continue
                 pass
@@ -225,22 +145,22 @@ class perform_site_check(difference_detection_processor):
         if has_filter_rule:
             for filter in include_filters_rule:
                 if any(prefix in filter for prefix in json_filter_prefixes):
-                    stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
+                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
                     is_html = False
 
-        if is_html or is_source:
+        if is_html or watch.is_source_type_url:
 
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-            fetcher.content = html_tools.workarounds_for_obfuscations(fetcher.content)
-            html_content = fetcher.content
+            self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
+            html_content = self.fetcher.content
 
             # If not JSON, and if it's not text/plain..
-            if 'text/plain' in fetcher.get_all_headers().get('content-type', '').lower():
+            if 'text/plain' in self.fetcher.get_all_headers().get('content-type', '').lower():
                 # Don't run get_text or xpath/css filters on plaintext
                 stripped_text_from_html = html_content
             else:
                 # Does it have some ld+json price data? used for easier monitoring
-                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(fetcher.content)
+                update_obj['has_ldjson_price_data'] = html_tools.has_ldjson_product_info(self.fetcher.content)
 
                 # Then we assume HTML
                 if has_filter_rule:
@@ -250,14 +170,14 @@ class perform_site_check(difference_detection_processor):
                         # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
                         if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
                             html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
-                                                                    html_content=fetcher.content,
-                                                                    append_pretty_line_formatting=not is_source,
+                                                                    html_content=self.fetcher.content,
+                                                                    append_pretty_line_formatting=not watch.is_source_type_url,
                                                                     is_rss=is_rss)
                         else:
                             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                             html_content += html_tools.include_filters(include_filters=filter_rule,
-                                                                       html_content=fetcher.content,
-                                                                       append_pretty_line_formatting=not is_source)
+                                                                       html_content=self.fetcher.content,
+                                                                       append_pretty_line_formatting=not watch.is_source_type_url)
 
                 if not html_content.strip():
                     raise FilterNotFoundInResponse(include_filters_rule)
@@ -265,7 +185,7 @@ class perform_site_check(difference_detection_processor):
             if has_subtractive_selectors:
                 html_content = html_tools.element_removal(subtractive_selectors, html_content)
 
-            if is_source:
+            if watch.is_source_type_url:
                 stripped_text_from_html = html_content
             else:
                 # extract text
@@ -311,7 +231,7 @@ class perform_site_check(difference_detection_processor):
         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
         if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
             raise content_fetcher.ReplyWithContentButNoText(url=url,
-                                                            status_code=fetcher.get_last_status_code(),
+                                                            status_code=self.fetcher.get_last_status_code(),
                                                             screenshot=screenshot,
                                                             has_filters=has_filter_rule,
                                                             html_content=html_content
@@ -320,7 +240,7 @@ class perform_site_check(difference_detection_processor):
         # We rely on the actual text in the html output.. many sites have random script vars etc,
         # in the future we'll implement other mechanisms.
- update_obj["last_check_status"] = fetcher.get_last_status_code() + update_obj["last_check_status"] = self.fetcher.get_last_status_code() # If there's text to skip # @todo we could abstract out the get_text() to handle this cleaner @@ -408,7 +328,7 @@ class perform_site_check(difference_detection_processor): if is_html: if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']: if not watch['title'] or not len(watch['title']): - update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) + update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content) if changed_detected: if watch.get('check_unique_lines', False): diff --git a/changedetectionio/tests/test_extract_csv.py b/changedetectionio/tests/test_extract_csv.py index 220b12aa..52596a9e 100644 --- a/changedetectionio/tests/test_extract_csv.py +++ b/changedetectionio/tests/test_extract_csv.py @@ -24,7 +24,7 @@ def test_check_extract_text_from_diff(client, live_server): ) assert b"1 Imported" in res.data - time.sleep(1) + wait_for_all_checks(client) # Load in 5 different numbers/changes last_date="" diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 49901f38..45f73392 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -33,8 +33,6 @@ def test_strip_regex_text_func(): "/not" ] - - fetcher = fetch_site_status.perform_site_check(datastore=False) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) assert b"but 1 lines" in stripped_content diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index f3918663..5d6d7149 100644 --- a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -24,7 +24,6 @@ def test_strip_text_func(): ignore_lines = ["sometimes"] - fetcher = fetch_site_status.perform_site_check(datastore=False) stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) assert b"sometimes" not in stripped_content diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 55c2e342..9586363f 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -80,8 +80,11 @@ def test_headers_in_request(client, live_server): # Should be only one with headers set assert watches_with_headers==1 + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data def test_body_in_request(client, live_server): + # Add our URL to the import page test_url = url_for('test_body', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): @@ -170,7 +173,8 @@ def test_body_in_request(client, live_server): follow_redirects=True ) assert b"Body must be empty when Request Method is set to GET" in res.data - + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data def test_method_in_request(client, live_server): # Add our URL to the import page diff --git a/changedetectionio/tests/test_security.py b/changedetectionio/tests/test_security.py index 08a69eeb..406a5401 100644 --- a/changedetectionio/tests/test_security.py +++ b/changedetectionio/tests/test_security.py @@ -1,5 +1,5 @@ from flask import url_for -from . 
-from . util import set_original_response, set_modified_response, live_server_setup
+from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks
 
 import time
 
@@ -12,6 +12,7 @@ def test_bad_access(client, live_server):
     )
     assert b"1 Imported" in res.data
 
+    wait_for_all_checks(client)
 
     # Attempt to add a body with a GET method
     res = client.post(
@@ -59,7 +60,7 @@ def test_bad_access(client, live_server):
         data={"url": 'file:///tasty/disk/drive', "tags": ''},
         follow_redirects=True
     )
-    time.sleep(1)
+    wait_for_all_checks(client)
     res = client.get(url_for("index"))
 
     assert b'file:// type access is denied for security reasons.' in res.data
\ No newline at end of file
diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py
index 792fe94a..c5ab7de9 100644
--- a/changedetectionio/update_worker.py
+++ b/changedetectionio/update_worker.py
@@ -209,6 +209,7 @@ class update_worker(threading.Thread):
         from .processors import text_json_diff, restock_diff
 
         while not self.app.config.exit.is_set():
+            update_handler = None
 
             try:
                 queued_item_data = self.q.get(block=False)
@@ -229,17 +230,35 @@ class update_worker(threading.Thread):
                     now = time.time()
 
                     try:
-                        processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff')
+                        # Processor is what we are using for detecting the "Change"
+                        processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff')
+                        # if system...
+
+                        # Abort processing when the content was the same as the last fetch
+                        skip_when_same_checksum = queued_item_data.item.get('skip_when_checksum_same')
+
+                        # @todo some way to switch by name
+                        # Init a new 'difference_detection_processor'
+
                         if processor == 'restock_diff':
-                            update_handler = restock_diff.perform_site_check(datastore=self.datastore)
+                            update_handler = restock_diff.perform_site_check(datastore=self.datastore,
+                                                                             watch_uuid=uuid
+                                                                             )
                         else:
                             # Used as a default and also by some tests
-                            update_handler = text_json_diff.perform_site_check(datastore=self.datastore)
+                            update_handler = text_json_diff.perform_site_check(datastore=self.datastore,
+                                                                               watch_uuid=uuid
+                                                                               )
+
                         # Clear last errors (move to preflight func?)
                         self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
 
-                        changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
+
+                        update_handler.call_browser()
+
+                        changed_detected, update_obj, contents = update_handler.run_changedetection(uuid,
+                                                                                                    skip_when_checksum_same=skip_when_same_checksum,
+                                                                                                    )
 
                         # Re #342
                         # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
@@ -391,6 +410,9 @@ class update_worker(threading.Thread):
                             self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
                             # Other serious error
                             process_changedetection_results = False
+#                            import traceback
+#                            print(traceback.format_exc())
+
                         else:
                             # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc)
                             if not self.datastore.data['watching'].get(uuid):
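
Reviewer note: the net effect of this refactor is that fetching and change detection become two separate calls on the processor object. Below is a minimal sketch of the new calling convention, assuming an initialised `datastore` and a valid watch `uuid`; error handling is trimmed to the fetch-level exception for brevity.

    from changedetectionio import content_fetcher
    from changedetectionio.processors import text_json_diff

    # Build the processor for one watch; the constructor now takes the watch UUID
    # and deep-copies that watch's config out of the datastore.
    update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid)

    try:
        # Step 1: pick the fetcher (requests/playwright), apply proxy, headers,
        # browser steps and webdriver delays, then fetch the page.
        update_handler.call_browser()

        # Step 2: run the actual diffing against whatever the fetcher captured.
        changed_detected, update_obj, contents = update_handler.run_changedetection(
            uuid,
            skip_when_checksum_same=False,
        )
    except content_fetcher.Non200ErrorCodeReceived as e:
        # Fetch-level problems now surface before any change detection runs.
        print(f"Fetch failed with HTTP status {e.status_code}")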