Unique line test wasnt considering whitespace changes!

3 months ago · 5d753f59c4
parent 090f5d7725
commit 5d753f59c4
4 changed files with 40 additions and 13 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@ -3,11 +3,11 @@ from lxml import etree
 import json
 import re
 # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
 TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
-
+TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
 PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
 # 'price' , 'lowPrice', 'highPrice' are usually under here
 # All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
 LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@ -6,6 +6,8 @@ import re
 from pathlib import Path
 from loguru import logger
 from ..html_tools import TRANSLATE_WHITESPACE_TABLE
 # Allowable protocols, protects against javascript: etc
 # file:// is further checked by ALLOW_FILE_URI
 SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
@ -350,14 +352,32 @@ class model(watch_base):
        return seconds
    # Iterate over all history texts and see if something new exists
-    def lines_contain_something_unique_compared_to_history(self, lines: list):
+    # Always applying .strip() to start/end but optionally replace any other whitespace
    def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False):
        local_lines = []
        if lines:
            if ignore_whitespace:
                if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
                    local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
                else:
                    local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
            else:
                if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
                    local_lines = set([l.strip().lower() for l in lines])
                else:
                    local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
        # Compare each lines (set) against each history text file (set) looking for something new..
        existing_history = set({})
        for k, v in self.history.items():
            content = self.get_history_snapshot(k)
            if ignore_whitespace:
                alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()])
            else:
                alist = set([line.strip().lower() for line in content.splitlines()])
            existing_history = existing_history.union(alist)
        # Check that everything in local_lines(new stuff) already exists in existing_history - it should
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@ -7,7 +7,7 @@ import re
 import urllib3
 from changedetectionio.processors import difference_detection_processor
-from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
+from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
 from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger
@ -230,7 +230,7 @@ class perform_site_check(difference_detection_processor):
            if not rendered_diff and stripped_text_from_html:
                # We had some content, but no differences were found
                # Store our new file as the MD5 so it will trigger in the future
-                c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
+                c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
                return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
            else:
                stripped_text_from_html = rendered_diff
@ -304,7 +304,7 @@ class perform_site_check(difference_detection_processor):
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
        if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
-            fetched_md5 = hashlib.md5(text_for_checksuming.translate(str.maketrans("", "", "\n\r\t ")).encode('utf-8')).hexdigest()
+            fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
        else:
            fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()
@ -346,7 +346,13 @@ class perform_site_check(difference_detection_processor):
        if changed_detected:
            if watch.get('check_unique_lines', False):
-                has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines())
+                ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace')
                has_unique_lines = watch.lines_contain_something_unique_compared_to_history(
                    lines=stripped_text_from_html.splitlines(),
                    ignore_whitespace=ignore_whitespace
                )
                # One or more lines? unsure?
                if not has_unique_lines:
                    logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -4,6 +4,7 @@ from flask import (
    flash
 )
 from .html_tools import TRANSLATE_WHITESPACE_TABLE
 from . model import App, Watch
 from copy import deepcopy, copy
 from os import path, unlink
@ -750,17 +751,17 @@ class ChangeDetectionStore:
    def update_5(self):
        # If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings
        # In other words - the watch notification_title and notification_body are not needed if they are the same as the default one
-        current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n "))
+        current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
-        current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n "))
+        current_system_title = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
        for uuid, watch in self.data['watching'].items():
            try:
                watch_body = watch.get('notification_body', '')
-                if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body:
+                if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body:
                    # Looks the same as the default one, so unset it
                    watch['notification_body'] = None
                watch_title = watch.get('notification_title', '')
-                if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title:
+                if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title:
                    # Looks the same as the default one, so unset it
                    watch['notification_title'] = None
            except Exception as e: