diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 19d1bad7..6e4ebca0 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -3,11 +3,11 @@ from lxml import etree
import json
import re
-
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "
"
-
+TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
+
# 'price' , 'lowPrice', 'highPrice' are usually under here
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]
diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
index c6d71854..a2e38ce1 100644
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -6,6 +6,8 @@ import re
from pathlib import Path
from loguru import logger
+from ..html_tools import TRANSLATE_WHITESPACE_TABLE
+
# Allowable protocols, protects against javascript: etc
# file:// is further checked by ALLOW_FILE_URI
SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
@@ -350,14 +352,32 @@ class model(watch_base):
return seconds
# Iterate over all history texts and see if something new exists
- def lines_contain_something_unique_compared_to_history(self, lines: list):
- local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
+ # Lines are compared lower-cased; by default only leading/trailing whitespace is stripped, with ignore_whitespace=True every \r, \n, \t and space is removed instead
+ def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False):
+ local_lines = []
+ if lines:
+ if ignore_whitespace:
+ if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
+ local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
+ else:
+ local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
+ else:
+ if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
+ local_lines = set([l.strip().lower() for l in lines])
+ else:
+ local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
+
# Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({})
for k, v in self.history.items():
content = self.get_history_snapshot(k)
- alist = set([line.strip().lower() for line in content.splitlines()])
+
+ if ignore_whitespace:
+ alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()])
+ else:
+ alist = set([line.strip().lower() for line in content.splitlines()])
+
existing_history = existing_history.union(alist)
# Check that everything in local_lines(new stuff) already exists in existing_history - it should
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index 4c8f75ed..d6501390 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -7,7 +7,7 @@ import re
import urllib3
from changedetectionio.processors import difference_detection_processor
-from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
+from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger
@@ -230,7 +230,7 @@ class perform_site_check(difference_detection_processor):
if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found
# Store our new file as the MD5 so it will trigger in the future
- c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest()
+ c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
else:
stripped_text_from_html = rendered_diff
@@ -304,7 +304,7 @@ class perform_site_check(difference_detection_processor):
# Re #133 - if we should strip whitespaces from triggering the change detected comparison
if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
- fetched_md5 = hashlib.md5(text_for_checksuming.translate(str.maketrans("", "", "\n\r\t ")).encode('utf-8')).hexdigest()
+ fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
else:
fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()
@@ -346,7 +346,13 @@ class perform_site_check(difference_detection_processor):
if changed_detected:
if watch.get('check_unique_lines', False):
- has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines())
+ ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace')
+
+ has_unique_lines = watch.lines_contain_something_unique_compared_to_history(
+ lines=stripped_text_from_html.splitlines(),
+ ignore_whitespace=ignore_whitespace
+ )
+
# One or more lines? unsure?
if not has_unique_lines:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index cc1b335f..697da5bc 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -4,6 +4,7 @@ from flask import (
flash
)
+from .html_tools import TRANSLATE_WHITESPACE_TABLE
from . model import App, Watch
from copy import deepcopy, copy
from os import path, unlink
@@ -750,17 +751,17 @@ class ChangeDetectionStore:
def update_5(self):
# If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings
# In other words - the watch notification_title and notification_body are not needed if they are the same as the default one
- current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n "))
- current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n "))
+ current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
+ current_system_title = self.data['settings']['application']['notification_title'].translate(TRANSLATE_WHITESPACE_TABLE)
for uuid, watch in self.data['watching'].items():
try:
watch_body = watch.get('notification_body', '')
- if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body:
+ if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body:
# Looks the same as the default one, so unset it
watch['notification_body'] = None
watch_title = watch.get('notification_title', '')
- if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title:
+ if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title:
# Looks the same as the default one, so unset it
watch['notification_title'] = None
except Exception as e: