Unique line test wasn't considering whitespace changes!

refactor-filters
dgtlmoon 3 months ago
parent 090f5d7725
commit 5d753f59c4

@ -3,11 +3,11 @@ from lxml import etree
import json import json
import re import re
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis # HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>" TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$' PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'
# 'price' , 'lowPrice', 'highPrice' are usually under here # 'price' , 'lowPrice', 'highPrice' are usually under here
# All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here # All of those may or may not appear on different websites - I didn't find a way to do case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"] LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]

@ -6,6 +6,8 @@ import re
from pathlib import Path from pathlib import Path
from loguru import logger from loguru import logger
from ..html_tools import TRANSLATE_WHITESPACE_TABLE
# Allowable protocols, protects against javascript: etc # Allowable protocols, protects against javascript: etc
# file:// is further checked by ALLOW_FILE_URI # file:// is further checked by ALLOW_FILE_URI
SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):' SAFE_PROTOCOL_REGEX='^(http|https|ftp|file):'
@ -350,14 +352,32 @@ class model(watch_base):
return seconds return seconds
# Iterate over all history texts and see if something new exists # Iterate over all history texts and see if something new exists
def lines_contain_something_unique_compared_to_history(self, lines: list): # Always applying .strip() to start/end but optionally replace any other whitespace
def lines_contain_something_unique_compared_to_history(self, lines: list, ignore_whitespace=False):
local_lines = []
if lines:
if ignore_whitespace:
if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
local_lines = set([l.translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
else:
local_lines = set([l.decode('utf-8').translate(TRANSLATE_WHITESPACE_TABLE).lower() for l in lines])
else:
if isinstance(lines[0], str): # Can be either str or bytes depending on what was on the disk
local_lines = set([l.strip().lower() for l in lines])
else:
local_lines = set([l.decode('utf-8').strip().lower() for l in lines]) local_lines = set([l.decode('utf-8').strip().lower() for l in lines])
# Compare each lines (set) against each history text file (set) looking for something new.. # Compare each lines (set) against each history text file (set) looking for something new..
existing_history = set({}) existing_history = set({})
for k, v in self.history.items(): for k, v in self.history.items():
content = self.get_history_snapshot(k) content = self.get_history_snapshot(k)
if ignore_whitespace:
alist = set([line.translate(TRANSLATE_WHITESPACE_TABLE).lower() for line in content.splitlines()])
else:
alist = set([line.strip().lower() for line in content.splitlines()]) alist = set([line.strip().lower() for line in content.splitlines()])
existing_history = existing_history.union(alist) existing_history = existing_history.union(alist)
# Check that everything in local_lines(new stuff) already exists in existing_history - it should # Check that everything in local_lines(new stuff) already exists in existing_history - it should

@ -7,7 +7,7 @@ import re
import urllib3 import urllib3
from changedetectionio.processors import difference_detection_processor from changedetectionio.processors import difference_detection_processor
from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text from changedetectionio.html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text, TRANSLATE_WHITESPACE_TABLE
from changedetectionio import html_tools, content_fetchers from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
from loguru import logger from loguru import logger
@ -230,7 +230,7 @@ class perform_site_check(difference_detection_processor):
if not rendered_diff and stripped_text_from_html: if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found # We had some content, but no differences were found
# Store our new file as the MD5 so it will trigger in the future # Store our new file as the MD5 so it will trigger in the future
c = hashlib.md5(stripped_text_from_html.translate(b'\r\n\t ').encode('utf-8')).hexdigest() c = hashlib.md5(stripped_text_from_html.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8') return False, {'previous_md5': c}, stripped_text_from_html.encode('utf-8')
else: else:
stripped_text_from_html = rendered_diff stripped_text_from_html = rendered_diff
@ -304,7 +304,7 @@ class perform_site_check(difference_detection_processor):
# Re #133 - if we should strip whitespaces from triggering the change detected comparison # Re #133 - if we should strip whitespaces from triggering the change detected comparison
if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False): if text_for_checksuming and self.datastore.data['settings']['application'].get('ignore_whitespace', False):
fetched_md5 = hashlib.md5(text_for_checksuming.translate(str.maketrans("", "", "\n\r\t ")).encode('utf-8')).hexdigest() fetched_md5 = hashlib.md5(text_for_checksuming.translate(TRANSLATE_WHITESPACE_TABLE).encode('utf-8')).hexdigest()
else: else:
fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest() fetched_md5 = hashlib.md5(text_for_checksuming.encode('utf-8')).hexdigest()
@ -346,7 +346,13 @@ class perform_site_check(difference_detection_processor):
if changed_detected: if changed_detected:
if watch.get('check_unique_lines', False): if watch.get('check_unique_lines', False):
has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) ignore_whitespace = self.datastore.data['settings']['application'].get('ignore_whitespace')
has_unique_lines = watch.lines_contain_something_unique_compared_to_history(
lines=stripped_text_from_html.splitlines(),
ignore_whitespace=ignore_whitespace
)
# One or more lines? unsure? # One or more lines? unsure?
if not has_unique_lines: if not has_unique_lines:
logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False") logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")

@ -4,6 +4,7 @@ from flask import (
flash flash
) )
from .html_tools import TRANSLATE_WHITESPACE_TABLE
from . model import App, Watch from . model import App, Watch
from copy import deepcopy, copy from copy import deepcopy, copy
from os import path, unlink from os import path, unlink
@ -750,17 +751,17 @@ class ChangeDetectionStore:
def update_5(self): def update_5(self):
# If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings # If the watch notification body, title look the same as the global one, unset it, so the watch defaults back to using the main settings
# In other words - the watch notification_title and notification_body are not needed if they are the same as the default one # In other words - the watch notification_title and notification_body are not needed if they are the same as the default one
current_system_body = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) current_system_body = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
current_system_title = self.data['settings']['application']['notification_body'].translate(str.maketrans('', '', "\r\n ")) current_system_title = self.data['settings']['application']['notification_body'].translate(TRANSLATE_WHITESPACE_TABLE)
for uuid, watch in self.data['watching'].items(): for uuid, watch in self.data['watching'].items():
try: try:
watch_body = watch.get('notification_body', '') watch_body = watch.get('notification_body', '')
if watch_body and watch_body.translate(str.maketrans('', '', "\r\n ")) == current_system_body: if watch_body and watch_body.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_body:
# Looks the same as the default one, so unset it # Looks the same as the default one, so unset it
watch['notification_body'] = None watch['notification_body'] = None
watch_title = watch.get('notification_title', '') watch_title = watch.get('notification_title', '')
if watch_title and watch_title.translate(str.maketrans('', '', "\r\n ")) == current_system_title: if watch_title and watch_title.translate(TRANSLATE_WHITESPACE_TABLE) == current_system_title:
# Looks the same as the default one, so unset it # Looks the same as the default one, so unset it
watch['notification_title'] = None watch['notification_title'] = None
except Exception as e: except Exception as e:

Loading…
Cancel
Save