From 5a768d7db38a46183947c0a8cd4ad9a382b1f42e Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 10 Oct 2024 14:59:39 +0200 Subject: [PATCH] UTF-8 handling fixes, Improvements to whitespace filtering (#2691) --- .../blueprint/tags/templates/edit-tag.html | 1 - changedetectionio/forms.py | 2 +- changedetectionio/html_tools.py | 35 +++++---- changedetectionio/model/Watch.py | 30 ++++++-- .../processors/restock_diff/processor.py | 2 +- .../processors/text_json_diff/__init__.py | 17 ++++- .../processors/text_json_diff/processor.py | 72 ++++++++++--------- changedetectionio/static/js/watch-settings.js | 10 ++- changedetectionio/store.py | 9 +-- changedetectionio/templates/edit.html | 9 ++- changedetectionio/templates/settings.html | 2 +- changedetectionio/tests/test_extract_regex.py | 2 +- .../tests/test_ignore_regex_text.py | 14 ++-- changedetectionio/tests/test_ignore_text.py | 42 ++++++----- .../tests/unit/test_watch_model.py | 13 ++-- changedetectionio/update_worker.py | 4 +- 16 files changed, 151 insertions(+), 113 deletions(-) diff --git a/changedetectionio/blueprint/tags/templates/edit-tag.html b/changedetectionio/blueprint/tags/templates/edit-tag.html index a713cf6a..e527ea52 100644 --- a/changedetectionio/blueprint/tags/templates/edit-tag.html +++ b/changedetectionio/blueprint/tags/templates/edit-tag.html @@ -17,7 +17,6 @@ -
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 28fdfeb9..19056b5d 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -476,7 +476,7 @@ class processor_text_json_diff_form(commonSettingsForm): title = StringField('Title', default='') - ignore_text = StringListField('Remove lines containing', [ValidateListRegex()]) + ignore_text = StringListField('Ignore lines containing', [ValidateListRegex()]) headers = StringDictKeyValue('Request headers') body = TextAreaField('Request body', [validators.Optional()]) method = SelectField('Request method', choices=valid_method, default=default_method) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 7c2e1eba..6e4ebca0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -3,11 +3,11 @@ from lxml import etree import json import re - # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" - +TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ') PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' + # 'price' , 'lowPrice', 'highPrice' are usually under here # All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] @@ -326,6 +326,7 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None # - "line numbers" return a list of line numbers that match (int list) # # wordlist - list of regex's (str) or words (str) +# Preserves all linefeeds and other whitespacing, its not the job of this to remove that def strip_ignore_text(content, wordlist, mode="content"): i = 0 output = [] @@ -341,32 +342,30 @@ def strip_ignore_text(content, wordlist, mode="content"): else: ignore_text.append(k.strip()) - for line in content.splitlines(): + for line in content.splitlines(keepends=True): i += 1 # Always ignore blank lines in this mode. (when this function gets called) got_match = False - if len(line.strip()): - for l in ignore_text: - if l.lower() in line.lower(): - got_match = True + for l in ignore_text: + if l.lower() in line.lower(): + got_match = True - if not got_match: - for r in ignore_regex: - if r.search(line): - got_match = True - - if not got_match: - # Not ignored - output.append(line.encode('utf8')) - else: - ignored_line_numbers.append(i) + if not got_match: + for r in ignore_regex: + if r.search(line): + got_match = True + if not got_match: + # Not ignored, and should preserve "keepends" + output.append(line) + else: + ignored_line_numbers.append(i) # Used for finding out what to highlight if mode == "line numbers": return ignored_line_numbers - return "\n".encode('utf8').join(output) + return ''.join(output) def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: from xml.sax.saxutils import escape as xml_escape diff --git a/changedetectionio/model/Watch.py 
b/changedetectionio/model/Watch.py index b52d37fb..a2e38ce1 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -6,6 +6,8 @@ import re from pathlib import Path from loguru import logger +from ..html_tools import TRANSLATE_WHITESPACE_TABLE + # Allowable protocols, protects against javascript: etc # file:// is further checked by ALLOW_FILE_URI SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):' @@ -312,13 +314,13 @@ class model(watch_base): dest = os.path.join(self.watch_data_dir, snapshot_fname) if not os.path.exists(dest): with open(dest, 'wb') as f: - f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) + f.write(brotli.compress(contents.encode('utf-8'), mode=brotli.MODE_TEXT)) else: snapshot_fname = f"{snapshot_id}.txt" dest = os.path.join(self.watch_data_dir, snapshot_fname) if not os.path.exists(dest): with open(dest, 'wb') as f: - f.write(contents) + f.write(contents.encode('utf-8')) # Append to index # @todo check last char was \n @@ -350,14 +352,32 @@ class model(watch_base): return seconds # Iterate over all history texts and see if something new exists - def lines_contain_something_unique_compared_to_history(self, lines: list): - local_lines = set([l.decode('utf-8').strip().lower() for l in lines]) + # Always applying .strip() to start/end but optionally replace any other whitespace + def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False): + local_lines = [] + if lines: + if ignore_whitespace: + if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk + local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines]) + else: + local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines]) + else: + if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk + local_lines = set([l.strip().lower() for l in lines]) + else: + local_lines = 
set([l.decode('utf-8').strip().lower() for l in lines]) + # Compare each lines (set) against each history text file (set) looking for something new.. existing_history = set({}) for k, v in self.history.items(): content = self.get_history_snapshot(k) - alist = set([line.strip().lower() for line in content.splitlines()]) + + if ignore_whitespace: + alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()]) + else: + alist = set([line.strip().lower() for line in content.splitlines()]) + existing_history = existing_history.union(alist) # Check that everything in local_lines(new stuff) already exists in existing_history - it should diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 0f490221..911e1838 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -307,4 +307,4 @@ class perform_site_check(difference_detection_processor): # Always record the new checksum update_obj["previous_md5"] = fetched_md5 - return changed_detected, update_obj, snapshot_content.encode('utf-8').strip() + return changed_detected, update_obj, snapshot_content.strip() diff --git a/changedetectionio/processors/text_json_diff/__init__.py b/changedetectionio/processors/text_json_diff/__init__.py index f87aa350..6a5efad9 100644 --- a/changedetectionio/processors/text_json_diff/__init__.py +++ b/changedetectionio/processors/text_json_diff/__init__.py @@ -46,6 +46,9 @@ def prepare_filter_prevew(datastore, watch_uuid): text_after_filter = '' text_before_filter = '' + trigger_line_numbers = [] + ignore_line_numbers = [] + tmp_watch = deepcopy(datastore.data['watching'].get(watch_uuid)) if tmp_watch and tmp_watch.history and os.path.isdir(tmp_watch.watch_data_dir): @@ -72,7 +75,7 @@ def prepare_filter_prevew(datastore, watch_uuid): ) # Use the last loaded HTML as the input update_handler.datastore = datastore - 
update_handler.fetcher.content = decompressed_data + update_handler.fetcher.content = str(decompressed_data) # str() because playwright/puppeteer/requests return string update_handler.fetcher.headers['content-type'] = tmp_watch.get('content-type') # Process our watch with filters and the HTML from disk, and also a blank watch with no filters but also with the same HTML from disk @@ -84,9 +87,7 @@ def prepare_filter_prevew(datastore, watch_uuid): text_after_filter = future1.result() text_before_filter = future2.result() - trigger_line_numbers = [] try: - trigger_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, wordlist=tmp_watch['trigger_text'], mode='line numbers' @@ -94,6 +95,15 @@ def prepare_filter_prevew(datastore, watch_uuid): except Exception as e: text_before_filter = f"Error: {str(e)}" + try: + text_to_ignore = tmp_watch.get('ignore_text', []) + datastore.data['settings']['application'].get('global_ignore_text', []) + ignore_line_numbers = html_tools.strip_ignore_text(content=text_after_filter, + wordlist=text_to_ignore, + mode='line numbers' + ) + except Exception as e: + text_before_filter = f"Error: {str(e)}" + logger.trace(f"Parsed in {time.time() - now:.3f}s") return jsonify( @@ -102,6 +112,7 @@ def prepare_filter_prevew(datastore, watch_uuid): 'before_filter': text_before_filter.decode('utf-8') if isinstance(text_before_filter, bytes) else text_before_filter, 'duration': time.time() - now, 'trigger_line_numbers': trigger_line_numbers, + 'ignore_line_numbers': ignore_line_numbers, } ) diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 43feb05f..c3752956 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -7,7 +7,7 @@ import re import urllib3 from changedetectionio.processors import difference_detection_processor -from changedetectionio.html_tools import 
PERL_STYLE_REGEX, cdata_in_document_to_text +from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE from changedetectionio import html_tools, content_fetchers from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from loguru import logger @@ -36,7 +36,6 @@ class PDFToHTMLToolNotFound(ValueError): class perform_site_check(difference_detection_processor): def run_changedetection(self, watch, skip_when_checksum_same=True): - changed_detected = False html_content = "" screenshot = False # as bytes @@ -205,18 +204,9 @@ class perform_site_check(difference_detection_processor): if watch.get('trim_text_whitespace'): stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines()) - if watch.get('remove_duplicate_lines'): - stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())) - - if watch.get('sort_text_alphabetically'): - # Note: Because a

something

will add an extra line feed to signify the paragraph gap - # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. - stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") - stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) - # Re #340 - return the content before the 'ignore text' was applied # Also used to calculate/show what was removed - text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') + text_content_before_ignored_filter = stripped_text_from_html # @todo whitespace coming from missing rtrim()? # stripped_text_from_html could be based on their preferences, replace the processed text with only that which they want to know about. @@ -236,12 +226,12 @@ class perform_site_check(difference_detection_processor): line_feed_sep="\n", include_change_type_prefix=False) - watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter) + watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter.encode('utf-8')) if not rendered_diff and stripped_text_from_html: # We had some content, but no differences were found # Store our new file as the MD5 so it will trigger in the future - c = hashlib.md5(stripped_text_from_html.encode('utf-8').translate(None, b'\r\n\t ')).hexdigest() + c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest() return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') else: stripped_text_from_html = rendered_diff @@ -262,14 +252,6 @@ class perform_site_check(difference_detection_processor): update_obj["last_check_status"] = self.fetcher.get_last_status_code() - # If there's text to skip - # @todo we could abstract out the get_text() to handle this cleaner - text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) - if 
len(text_to_ignore): - stripped_text_from_html = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore) - else: - stripped_text_from_html = stripped_text_from_html.encode('utf8') - # 615 Extract text by regex extract_text = watch.get('extract_text', []) if len(extract_text) > 0: @@ -278,39 +260,53 @@ class perform_site_check(difference_detection_processor): # incase they specified something in '/.../x' if re.search(PERL_STYLE_REGEX, s_re, re.IGNORECASE): regex = html_tools.perl_style_slash_enclosed_regex_to_options(s_re) - result = re.findall(regex.encode('utf-8'), stripped_text_from_html) + result = re.findall(regex, stripped_text_from_html) for l in result: if type(l) is tuple: # @todo - some formatter option default (between groups) - regex_matched_output += list(l) + [b'\n'] + regex_matched_output += list(l) + ['\n'] else: # @todo - some formatter option default (between each ungrouped result) - regex_matched_output += [l] + [b'\n'] + regex_matched_output += [l] + ['\n'] else: # Doesnt look like regex, just hunt for plaintext and return that which matches # `stripped_text_from_html` will be bytes, so we must encode s_re also to bytes - r = re.compile(re.escape(s_re.encode('utf-8')), re.IGNORECASE) + r = re.compile(re.escape(s_re), re.IGNORECASE) res = r.findall(stripped_text_from_html) if res: for match in res: - regex_matched_output += [match] + [b'\n'] + regex_matched_output += [match] + ['\n'] ########################################################## - stripped_text_from_html = b'' - text_content_before_ignored_filter = b'' + stripped_text_from_html = '' + if regex_matched_output: # @todo some formatter for presentation? 
- stripped_text_from_html = b''.join(regex_matched_output) - text_content_before_ignored_filter = stripped_text_from_html + stripped_text_from_html = ''.join(regex_matched_output) + if watch.get('remove_duplicate_lines'): + stripped_text_from_html = '\n'.join(dict.fromkeys(line for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())) + if watch.get('sort_text_alphabetically'): + # Note: Because a

something

will add an extra line feed to signify the paragraph gap + # we end up with 'Some text\n\n', sorting will add all those extra \n at the start, so we remove them here. + stripped_text_from_html = stripped_text_from_html.replace("\n\n", "\n") + stripped_text_from_html = '\n'.join(sorted(stripped_text_from_html.splitlines(), key=lambda x: x.lower())) + +### CALCULATE MD5 + # If there's text to ignore + text_to_ignore = watch.get('ignore_text', []) + self.datastore.data['settings']['application'].get('global_ignore_text', []) + text_for_checksuming = stripped_text_from_html + if text_to_ignore: + text_for_checksuming = html_tools.strip_ignore_text(stripped_text_from_html, text_to_ignore) + # Re #133 - if we should strip whitespaces from triggering the change detected comparison - if self.datastore.data['settings']['application'].get('ignore_whitespace', False): - fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest() + if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False): + fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest() else: - fetched_md5 = hashlib.md5(stripped_text_from_html).hexdigest() + fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest() ############ Blocking rules, after checksum ################# blocked = False @@ -350,7 +346,13 @@ class perform_site_check(difference_detection_processor): if changed_detected: if watch.get('check_unique_lines', False): - has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) + ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace') + + has_unique_lines = watch.lines_contain_something_unique_compared_to_history( + lines=stripped_text_from_html.splitlines(), + ignore_whitespace=ignore_whitespace + ) + # One or more lines? unsure? 
if not has_unique_lines: logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False") diff --git a/changedetectionio/static/js/watch-settings.js b/changedetectionio/static/js/watch-settings.js index f3360dbe..cb9f9c60 100644 --- a/changedetectionio/static/js/watch-settings.js +++ b/changedetectionio/static/js/watch-settings.js @@ -42,8 +42,12 @@ function request_textpreview_update() { { 'color': '#ee0000', 'lines': data['trigger_line_numbers'] + }, + { + 'color': '#757575', + 'lines': data['ignore_line_numbers'] } - ]); + ]) }).fail(function (error) { if (error.statusText === 'abort') { console.log('Request was aborted due to a new request being fired.'); @@ -76,8 +80,8 @@ $(document).ready(function () { $('body').toggleClass('preview-text-enabled') request_textpreview_update(); const method = $('body').hasClass('preview-text-enabled') ? 'on' : 'off'; - $('textarea:visible')[method]('keyup blur', request_textpreview_update.throttle(1000)); - $('input:visible')[method]('keyup blur change', request_textpreview_update.throttle(1000)); + $('#filters-and-triggers textarea')[method]('blur', request_textpreview_update.throttle(1000)); + $('#filters-and-triggers input')[method]('change', request_textpreview_update.throttle(1000)); $("#filters-and-triggers-tab")[method]('click', request_textpreview_update.throttle(1000)); }); $('.minitabs-wrapper').miniTabs({ diff --git a/changedetectionio/store.py b/changedetectionio/store.py index cc1b335f..697da5bc 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -4,6 +4,7 @@ from flask import ( flash ) +from .html_tools import TRANSLATE_WHITESPACE_TABLE from . 
model import App, Watch from copy import deepcopy, copy from os import path, unlink @@ -750,17 +751,17 @@ class ChangeDetectionStore: def update_5(self): # If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings # In other words - the watch notification_title and notification_body are not needed if they are the same as the default one - current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) - current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) + current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE) + current_system_title = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE) for uuid, watch in self.data['watching'].items(): try: watch_body = watch.get('notification_body', '') - if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body: + if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body: # Looks the same as the default one, so unset it watch['notification_body'] = None watch_title = watch.get('notification_title', '') - if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title: + if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title: # Looks the same as the default one, so unset it watch['notification_title'] = None except Exception as e: diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 9d9f48ff..5847962f 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -26,7 +26,6 @@ - {% if playwright_enabled %} @@ -330,9 +329,9 @@ nav {{ render_checkbox_field(form.filter_text_added) }} {{ 
render_checkbox_field(form.filter_text_replaced) }} {{ render_checkbox_field(form.filter_text_removed) }} - Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example. - So it's always better to select Added+Replaced when you're interested in new content.
- When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear + Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example.
+  So it's always better to select Added+Replaced when you're interested in new content.
+  When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear
{{ render_checkbox_field(form.check_unique_lines) }} @@ -371,7 +370,7 @@ nav ") }}
    -
  • Matching text will be removed from the text snapshot
  • +
  • Matching text will be ignored in the text snapshot (you can still see it but it won't trigger a change)
  • Each line processed separately, any line matching will be ignored (removed before creating the checksum)
  • Regular Expression support, wrap the entire line in forward slash /regex/
  • Changing this will affect the comparison checksum which may trigger an alert
  • diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index ad41e7b6..e39c4081 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -172,7 +172,7 @@ nav Note: This is applied globally in addition to the per-watch rules.
      -
    • Matching text will be removed from the text snapshot
    • +
    • Matching text will be ignored in the text snapshot (you can still see it but it won't trigger a change)
    • Note: This is applied globally in addition to the per-watch rules.
    • Each line processed separately, any line matching will be ignored (removed before creating the checksum)
    • Regular Expression support, wrap the entire line in forward slash /regex/
    • diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index 522cff0c..058b3411 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -71,7 +71,7 @@ def test_setup(client, live_server, measure_memory_usage): live_server_setup(live_server) def test_check_filter_multiline(client, live_server, measure_memory_usage): - #live_server_setup(live_server) + # live_server_setup(live_server) set_multiline_response() # Add our URL to the import page diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 06c60ea4..34883182 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -33,13 +33,17 @@ def test_strip_regex_text_func(): stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) - assert b"but 1 lines" in stripped_content - assert b"igNORe-cAse text" not in stripped_content - assert b"but 1234 lines" not in stripped_content - assert b"really" not in stripped_content - assert b"not this" not in stripped_content + assert "but 1 lines" in stripped_content + assert "igNORe-cAse text" not in stripped_content + assert "but 1234 lines" not in stripped_content + assert "really" not in stripped_content + assert "not this" not in stripped_content # Check line number reporting stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers") assert stripped_content == [2, 5, 6, 7, 8, 10] + # Check that linefeeds are preserved when there are is no matching ignores + content = "some text\n\nand other text\n" + stripped_content = html_tools.strip_ignore_text(content, ignore_lines) + assert content == stripped_content diff --git a/changedetectionio/tests/test_ignore_text.py b/changedetectionio/tests/test_ignore_text.py index 37d21d1b..4a5c86a1 100644 --- 
a/changedetectionio/tests/test_ignore_text.py +++ b/changedetectionio/tests/test_ignore_text.py @@ -22,10 +22,15 @@ def test_strip_text_func(): ignore_lines = ["sometimes"] stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) + assert "sometimes" not in stripped_content + assert "Some content" in stripped_content - assert b"sometimes" not in stripped_content - assert b"Some content" in stripped_content + # Check that line feeds dont get chewed up when something is found + test_content = "Some initial text\n\nWhich is across multiple lines\n\nZZZZz\n\n\nSo let's see what happens." + ignore = ['something irrelevent but just to check', 'XXXXX', 'YYYYY', 'ZZZZZ'] + stripped_content = html_tools.strip_ignore_text(test_content, ignore) + assert stripped_content == "Some initial text\n\nWhich is across multiple lines\n\n\n\nSo let's see what happens." def set_original_ignore_response(): test_return_data = """ @@ -141,8 +146,6 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa - - # Just to be sure.. set a regular modified change.. 
set_modified_original_ignore_response() client.get(url_for("form_watch_checknow"), follow_redirects=True) @@ -153,17 +156,17 @@ def test_check_ignore_text_functionality(client, live_server, measure_memory_usa res = client.get(url_for("preview_page", uuid="first")) - # Should no longer be in the preview - assert b'new ignore stuff' not in res.data + # SHOULD BE be in the preview, it was added in set_modified_original_ignore_response() + # and we have "new ignore stuff" in ignore_text + # it is only ignored, it is not removed (it will be highlighted too) + assert b'new ignore stuff' in res.data res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data +# When adding some ignore text, it should not trigger a change, even if something else on that line changes def test_check_global_ignore_text_functionality(client, live_server, measure_memory_usage): - - # Give the endpoint time to spin up - time.sleep(1) - + #live_server_setup(live_server) ignore_text = "XXXXX\r\nYYYYY\r\nZZZZZ" set_original_ignore_response() @@ -172,6 +175,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem url_for("settings_page"), data={ "requests-time_between_check-minutes": 180, + "application-ignore_whitespace": "y", "application-global_ignore_text": ignore_text, 'application-fetch_backend': "html_requests" }, @@ -192,9 +196,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem # Give the thread time to pick it up wait_for_all_checks(client) - - # Goto the edit page of the item, add our ignore text - # Add our URL to the import page + #Adding some ignore text should not trigger a change res = client.post( url_for("edit_page", uuid="first"), data={"ignore_text": "something irrelevent but just to check", "url": test_url, 'fetch_backend': "html_requests"}, @@ -210,20 +212,15 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem # Trigger a check 
client.get(url_for("form_watch_checknow"), follow_redirects=True) - - # Give the thread time to pick it up wait_for_all_checks(client) - - # so that we are sure everything is viewed and in a known 'nothing changed' state - res = client.get(url_for("diff_history_page", uuid="first")) - - # It should report nothing found (no new 'unviewed' class) + # It should report nothing found (no new 'unviewed' class), adding random ignore text should not cause a change res = client.get(url_for("index")) assert b'unviewed' not in res.data assert b'/test-endpoint' in res.data +##### - - # Make a change which includes the ignore text + # Make a change which includes the ignore text, it should be ignored and no 'change' triggered + # It adds text with "ZZZZzzzz" and "ZZZZ" is in the ignore list set_modified_ignore_response() # Trigger a check @@ -233,6 +230,7 @@ def test_check_global_ignore_text_functionality(client, live_server, measure_mem # It should report nothing found (no new 'unviewed' class) res = client.get(url_for("index")) + assert b'unviewed' not in res.data assert b'/test-endpoint' in res.data diff --git a/changedetectionio/tests/unit/test_watch_model.py b/changedetectionio/tests/unit/test_watch_model.py index a9e3df3c..a550cd0a 100644 --- a/changedetectionio/tests/unit/test_watch_model.py +++ b/changedetectionio/tests/unit/test_watch_model.py @@ -18,12 +18,13 @@ class TestDiffBuilder(unittest.TestCase): watch['last_viewed'] = 110 - watch.save_history_text(contents=b"hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4())) - watch.save_history_text(contents=b"hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4())) - 
watch.save_history_text(contents=b"hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4())) + # Contents from the browser are always returned from the browser/requests/etc as str, str is basically UTF-16 in python + watch.save_history_text(contents="hello world", timestamp=100, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=105, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=109, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=112, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=115, snapshot_id=str(uuid_builder.uuid4())) + watch.save_history_text(contents="hello world", timestamp=117, snapshot_id=str(uuid_builder.uuid4())) p = watch.get_next_snapshot_key_to_last_viewed assert p == "112", "Correct last-viewed timestamp was detected" diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 97e1ec27..ebb3ada7 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -286,8 +286,8 @@ class update_worker(threading.Thread): # Re #342 # In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes. # We then convert/.decode('utf-8') for the notification etc - if not isinstance(contents, (bytes, bytearray)): - raise Exception("Error - returned data from the fetch handler SHOULD be bytes") +# if not isinstance(contents, (bytes, bytearray)): +# raise Exception("Error - returned data from the fetch handler SHOULD be bytes") except PermissionError as e: logger.critical(f"File permission error updating file, watch: {uuid}") logger.critical(str(e))