diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 19d1bad7..6e4ebca0 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -3,11 +3,11 @@ from lxml import etree import json import re - # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis TEXT_FILTER_LIST_LINE_SUFFIX = "
" - +TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ') PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' + # 'price' , 'lowPrice', 'highPrice' are usually under here # All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index c6d71854..a2e38ce1 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -6,6 +6,8 @@ import re from pathlib import Path from loguru import logger +from ..html_tools import TRANSLATE_WHITESPACE_TABLE + # Allowable protocols, protects against javascript: etc # file:// is further checked by ALLOW_FILE_URI SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):' @@ -350,14 +352,32 @@ class model(watch_base): return seconds # Iterate over all history texts and see if something new exists - def lines_contain_something_unique_compared_to_history(self, lines: list): - local_lines = set([l.decode('utf-8').strip().lower() for l in lines]) + # Always applying .strip() to start/end but optionally replace any other whitespace + def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False): + local_lines = [] + if lines: + if ignore_whitespace: + if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk + local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines]) + else: + local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines]) + else: + if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk + local_lines = set([l.strip().lower() for l in lines]) + else: + local_lines = set([l.decode('utf-8').strip().lower() for l in lines]) + # Compare each lines (set) against each history text file (set) looking for something new.. 
existing_history = set({}) for k, v in self.history.items(): content = self.get_history_snapshot(k) - alist = set([line.strip().lower() for line in content.splitlines()]) + + if ignore_whitespace: + alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()]) + else: + alist = set([line.strip().lower() for line in content.splitlines()]) + existing_history = existing_history.union(alist) # Check that everything in local_lines(new stuff) already exists in existing_history - it should diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index 4c8f75ed..d6501390 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -7,7 +7,7 @@ import re import urllib3 from changedetectionio.processors import difference_detection_processor -from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text +from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE from changedetectionio import html_tools, content_fetchers from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from loguru import logger @@ -230,7 +230,7 @@ class perform_site_check(difference_detection_processor): if not rendered_diff and stripped_text_from_html: # We had some content, but no differences were found # Store our new file as the MD5 so it will trigger in the future - c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest() + c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest() return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') else: stripped_text_from_html = rendered_diff @@ -304,7 +304,7 @@ class perform_site_check(difference_detection_processor): # Re #133 - if we should strip 
whitespaces from triggering the change detected comparison if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False): - fetched_md5 = hashlib.md5(text_for_checksuming.translate(str.maketrans("", "", "\n\r\t ")).encode('utf-8')).hexdigest() + fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest() else: fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest() @@ -346,7 +346,13 @@ class perform_site_check(difference_detection_processor): if changed_detected: if watch.get('check_unique_lines', False): - has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) + ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace') + + has_unique_lines = watch.lines_contain_something_unique_compared_to_history( + lines=stripped_text_from_html.splitlines(), + ignore_whitespace=ignore_whitespace + ) + # One or more lines? unsure? if not has_unique_lines: logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False") diff --git a/changedetectionio/store.py b/changedetectionio/store.py index cc1b335f..697da5bc 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -4,6 +4,7 @@ from flask import ( flash ) +from .html_tools import TRANSLATE_WHITESPACE_TABLE from . 
model import App, Watch from copy import deepcopy, copy from os import path, unlink @@ -750,17 +751,17 @@ class ChangeDetectionStore: def update_5(self): # If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings # In other words - the watch notification_title and notification_body are not needed if they are the same as the default one - current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) - current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) + current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE) + current_system_title = self.data['settings']['application']['notification_title'].translate(TRANSLATE_WHITESPACE_TABLE) for uuid, watch in self.data['watching'].items(): try: watch_body = watch.get('notification_body', '') - if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body: + if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body: # Looks the same as the default one, so unset it watch['notification_body'] = None watch_title = watch.get('notification_title', '') - if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title: + if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title: # Looks the same as the default one, so unset it watch['notification_title'] = None except Exception as e: