Now saving the last two HTML snapshots for future reference; refactor; don't write screenshots and xpath data to disk when no change is detected (saves disk IO) (#2431)

pull/2436/head
dgtlmoon 6 months ago committed by GitHub
parent ffd160ce0e
commit 343e359b39
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -170,23 +170,33 @@ class WatchSingleHistory(Resource):
curl http://localhost:5000/api/v1/watch/cc0cfffa-f449-477b-83ea-0caafd1dc091/history/1677092977 -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json" curl http://localhost:5000/api/v1/watch/cc0cfffa-f449-477b-83ea-0caafd1dc091/history/1677092977 -H"x-api-key:813031b16330fe25e3780cf0325daa45" -H "Content-Type: application/json"
@apiName Get single snapshot content @apiName Get single snapshot content
@apiGroup Watch History @apiGroup Watch History
@apiParam {String} [html] Optional Set to =1 to return the last HTML (only stores last 2 snapshots, use `latest` as timestamp)
@apiSuccess (200) {String} OK @apiSuccess (200) {String} OK
@apiSuccess (404) {String} ERR Not found @apiSuccess (404) {String} ERR Not found
""" """
watch = self.datastore.data['watching'].get(uuid) watch = self.datastore.data['watching'].get(uuid)
if not watch: if not watch:
abort(404, message='No watch exists with the UUID of {}'.format(uuid)) abort(404, message=f"No watch exists with the UUID of {uuid}")
if not len(watch.history): if not len(watch.history):
abort(404, message='Watch found but no history exists for the UUID {}'.format(uuid)) abort(404, message=f"Watch found but no history exists for the UUID {uuid}")
if timestamp == 'latest': if timestamp == 'latest':
timestamp = list(watch.history.keys())[-1] timestamp = list(watch.history.keys())[-1]
if request.args.get('html'):
content = watch.get_fetched_html(timestamp)
if content:
response = make_response(content, 200)
response.mimetype = "text/html"
else:
response = make_response("No content found", 404)
response.mimetype = "text/plain"
else:
content = watch.get_history_snapshot(timestamp) content = watch.get_history_snapshot(timestamp)
response = make_response(content, 200) response = make_response(content, 200)
response.mimetype = "text/plain" response.mimetype = "text/plain"
return response return response

@ -187,8 +187,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url u = browsersteps_sessions[browsersteps_session_id]['browserstepper'].page.url
if is_last_step and u: if is_last_step and u:
(screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data() (screenshot, xpath_data) = browsersteps_sessions[browsersteps_session_id]['browserstepper'].request_visualselector_data()
datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot) watch = datastore.data['watching'].get(uuid)
datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data) if watch:
watch.save_screenshot(screenshot=screenshot)
watch.save_xpath_data(data=xpath_data)
# if not this_session.page: # if not this_session.page:
# cleanup_playwright_session() # cleanup_playwright_session()

@ -328,14 +328,9 @@ class model(dict):
def save_history_text(self, contents, timestamp, snapshot_id): def save_history_text(self, contents, timestamp, snapshot_id):
import brotli import brotli
self.ensure_data_dir_exists() logger.trace(f"{self.get('uuid')} - Updating history.txt with timestamp {timestamp}")
# Small hack so that we sleep just enough to allow 1 second between history snapshots self.ensure_data_dir_exists()
# this is because history.txt indexes/keys snapshots by epoch seconds and we dont want dupe keys
if self.__newest_history_key and int(timestamp) == int(self.__newest_history_key):
logger.warning(f"Timestamp {timestamp} already exists, waiting 1 seconds so we have a unique key in history.txt")
timestamp = str(int(timestamp) + 1)
time.sleep(1)
threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024)) threshold = int(os.getenv('SNAPSHOT_BROTLI_COMPRESSION_THRESHOLD', 1024))
skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False')) skip_brotli = strtobool(os.getenv('DISABLE_BROTLI_TEXT_SNAPSHOT', 'False'))
@ -528,8 +523,42 @@ class model(dict):
# None is set # None is set
return False return False
def save_error_text(self, contents):
    """Write ``contents`` to ``last-error.txt`` inside this watch's data directory.

    ``ensure_data_dir_exists()`` is called first; an existing
    ``last-error.txt`` is overwritten.
    """
    self.ensure_data_dir_exists()
    error_file = os.path.join(self.watch_data_dir, "last-error.txt")
    with open(error_file, 'w') as handle:
        handle.write(contents)
def save_xpath_data(self, data, as_error=False):
    """Store ``data`` as JSON in this watch's data directory.

    :param data: JSON-serialisable object to store.
    :param as_error: when true the file written is ``elements-error.json``,
        otherwise ``elements.json``.
    :raises TypeError: if ``data`` cannot be serialised by ``json.dumps``; in
        that case nothing is written and any existing file is left untouched.
    """
    import json

    # Serialise *before* opening the target: open(..., 'w') truncates the file
    # immediately, so a payload json cannot encode would otherwise leave an
    # empty file behind in place of the previous data.
    payload = json.dumps(data)

    filename = "elements-error.json" if as_error else "elements.json"
    target_path = os.path.join(self.watch_data_dir, filename)

    self.ensure_data_dir_exists()
    # The context manager closes the file; no explicit close() is needed.
    with open(target_path, 'w') as f:
        f.write(payload)
# Save as PNG, PNG is larger but better for doing visual diff in the future
def save_screenshot(self, screenshot: bytes, as_error=False):
if as_error:
target_path = os.path.join(self.watch_data_dir, "last-error-screenshot.png")
else:
target_path = os.path.join(self.watch_data_dir, "last-screenshot.png")
self.ensure_data_dir_exists()
def get_last_fetched_before_filters(self): with open(target_path, 'wb') as f:
f.write(screenshot)
f.close()
def get_last_fetched_text_before_filters(self):
import brotli import brotli
filepath = os.path.join(self.watch_data_dir, 'last-fetched.br') filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
@ -544,12 +573,56 @@ class model(dict):
with open(filepath, 'rb') as f: with open(filepath, 'rb') as f:
return(brotli.decompress(f.read()).decode('utf-8')) return(brotli.decompress(f.read()).decode('utf-8'))
def save_last_fetched_before_filters(self, contents): def save_last_text_fetched_before_filters(self, contents):
import brotli import brotli
filepath = os.path.join(self.watch_data_dir, 'last-fetched.br') filepath = os.path.join(self.watch_data_dir, 'last-fetched.br')
with open(filepath, 'wb') as f: with open(filepath, 'wb') as f:
f.write(brotli.compress(contents, mode=brotli.MODE_TEXT)) f.write(brotli.compress(contents, mode=brotli.MODE_TEXT))
def save_last_fetched_html(self, timestamp, contents):
    """Write ``contents`` to ``<timestamp>.html.br`` in this watch's data directory.

    ``str`` input is encoded as UTF-8 first. The bytes are brotli-compressed;
    if compression raises, a warning is logged and the uncompressed bytes are
    written to the same file instead. Afterwards
    ``_prune_last_fetched_html_snapshots()`` is called.
    """
    import brotli

    self.ensure_data_dir_exists()
    filepath = os.path.join(self.watch_data_dir, f"{timestamp}.html.br")

    with open(filepath, 'wb') as out:
        if isinstance(contents, str):
            contents = contents.encode('utf-8')
        try:
            out.write(brotli.compress(contents))
        except Exception as err:
            logger.warning(f"{self.get('uuid')} - Unable to compress snapshot, saving as raw data to {filepath}")
            logger.warning(err)
            out.write(contents)

    self._prune_last_fetched_html_snapshots()
def get_fetched_html(self, timestamp):
    """Return the HTML snapshot stored for ``timestamp`` as text.

    :param timestamp: value used to build the file name ``<timestamp>.html.br``
        inside this watch's data directory.
    :return: the decoded (UTF-8) HTML, or ``False`` when no such file exists.
    """
    filepath = os.path.join(self.watch_data_dir, f"{timestamp}.html.br")
    if not os.path.isfile(filepath):
        return False

    # Imported only once there is something to decompress.
    import brotli

    with open(filepath, 'rb') as f:
        raw = f.read()

    try:
        return brotli.decompress(raw).decode('utf-8')
    except brotli.error:
        # save_last_fetched_html() writes the raw, uncompressed bytes to this
        # same file name when compression fails, so a file that is not valid
        # brotli is treated as plain UTF-8 instead of raising.
        return raw.decode('utf-8')
def _prune_last_fetched_html_snapshots(self):
    """Remove ``<timestamp>.html.br`` files for every history key except the last two.

    The keys of ``self.history`` are walked in reverse order; files belonging
    to the first two keys of that reversed list are kept, and any existing
    file for a later key is deleted.
    """
    reversed_keys = list(self.history.keys())[::-1]
    # Everything past the first two reversed entries is a candidate for removal.
    for stale_timestamp in reversed_keys[2:]:
        stale_path = os.path.join(self.watch_data_dir, f"{stale_timestamp}.html.br")
        if os.path.isfile(stale_path):
            os.remove(stale_path)
@property @property
def get_browsersteps_available_screenshots(self): def get_browsersteps_available_screenshots(self):
"For knowing which screenshots are available to show the user in BrowserSteps UI" "For knowing which screenshots are available to show the user in BrowserSteps UI"

@ -1,5 +1,6 @@
from abc import abstractmethod from abc import abstractmethod
from changedetectionio.strtobool import strtobool from changedetectionio.strtobool import strtobool
from changedetectionio.model import Watch
from copy import deepcopy from copy import deepcopy
from loguru import logger from loguru import logger
import hashlib import hashlib
@ -138,7 +139,7 @@ class difference_detection_processor():
# After init, call run_changedetection() which will do the actual change-detection # After init, call run_changedetection() which will do the actual change-detection
@abstractmethod @abstractmethod
def run_changedetection(self, uuid, skip_when_checksum_same=True): def run_changedetection(self, watch: Watch, skip_when_checksum_same=True):
update_obj = {'last_notification_error': False, 'last_error': False} update_obj = {'last_notification_error': False, 'last_error': False}
some_data = 'xxxxx' some_data = 'xxxxx'
update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest() update_obj["previous_md5"] = hashlib.md5(some_data.encode('utf-8')).hexdigest()

@ -1,6 +1,5 @@
from . import difference_detection_processor from . import difference_detection_processor
from copy import deepcopy
from loguru import logger from loguru import logger
import hashlib import hashlib
import urllib3 import urllib3
@ -20,10 +19,7 @@ class perform_site_check(difference_detection_processor):
screenshot = None screenshot = None
xpath_data = None xpath_data = None
def run_changedetection(self, uuid, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
# DeepCopy so we can be sure we don't accidently change anything by reference
watch = deepcopy(self.datastore.data['watching'].get(uuid))
if not watch: if not watch:
raise Exception("Watch no longer exists.") raise Exception("Watch no longer exists.")
@ -44,13 +40,13 @@ class perform_site_check(difference_detection_processor):
fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest() fetched_md5 = hashlib.md5(self.fetcher.instock_data.encode('utf-8')).hexdigest()
# 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold. # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False update_obj["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
logger.debug(f"Watch UUID {uuid} restock check returned '{self.fetcher.instock_data}' from JS scraper.") logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")
else: else:
raise UnableToExtractRestockData(status_code=self.fetcher.status_code) raise UnableToExtractRestockData(status_code=self.fetcher.status_code)
# The main thing that all this at the moment comes down to :) # The main thing that all this at the moment comes down to :)
changed_detected = False changed_detected = False
logger.debug(f"Watch UUID {uuid} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}") logger.debug(f"Watch UUID {watch.get('uuid')} restock check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5: if watch.get('previous_md5') and watch.get('previous_md5') != fetched_md5:
# Yes if we only care about it going to instock, AND we are in stock # Yes if we only care about it going to instock, AND we are in stock

@ -10,8 +10,6 @@ from . import difference_detection_processor
from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text from ..html_tools import PERL_STYLE_REGEX, cdata_in_document_to_text
from changedetectionio import html_tools, content_fetchers from changedetectionio import html_tools, content_fetchers
from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
import changedetectionio.content_fetchers
from copy import deepcopy
from loguru import logger from loguru import logger
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@ -21,7 +19,8 @@ description = 'Detects all text changes where possible'
json_filter_prefixes = ['json:', 'jq:', 'jqraw:'] json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
class FilterNotFoundInResponse(ValueError): class FilterNotFoundInResponse(ValueError):
def __init__(self, msg): def __init__(self, msg, screenshot=None):
self.screenshot = screenshot
ValueError.__init__(self, msg) ValueError.__init__(self, msg)
@ -34,14 +33,12 @@ class PDFToHTMLToolNotFound(ValueError):
# (set_proxy_from_list) # (set_proxy_from_list)
class perform_site_check(difference_detection_processor): class perform_site_check(difference_detection_processor):
def run_changedetection(self, uuid, skip_when_checksum_same=True): def run_changedetection(self, watch, skip_when_checksum_same=True):
changed_detected = False changed_detected = False
html_content = "" html_content = ""
screenshot = False # as bytes screenshot = False # as bytes
stripped_text_from_html = "" stripped_text_from_html = ""
# DeepCopy so we can be sure we don't accidently change anything by reference
watch = deepcopy(self.datastore.data['watching'].get(uuid))
if not watch: if not watch:
raise Exception("Watch no longer exists.") raise Exception("Watch no longer exists.")
@ -116,12 +113,12 @@ class perform_site_check(difference_detection_processor):
# Better would be if Watch.model could access the global data also # Better would be if Watch.model could access the global data also
# and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__ # and then use getattr https://docs.python.org/3/reference/datamodel.html#object.__getitem__
# https://realpython.com/inherit-python-dict/ instead of doing it procedurely # https://realpython.com/inherit-python-dict/ instead of doing it procedurely
include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='include_filters') include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='include_filters')
# 1845 - remove duplicated filters in both group and watch include filter # 1845 - remove duplicated filters in both group and watch include filter
include_filters_rule = list(dict.fromkeys(watch.get('include_filters', []) + include_filters_from_tags)) include_filters_rule = list(dict.fromkeys(watch.get('include_filters', []) + include_filters_from_tags))
subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='subtractive_selectors'), subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=watch.get('uuid'), attr='subtractive_selectors'),
*watch.get("subtractive_selectors", []), *watch.get("subtractive_selectors", []),
*self.datastore.data["settings"]["application"].get("global_subtractive_selectors", []) *self.datastore.data["settings"]["application"].get("global_subtractive_selectors", [])
] ]
@ -188,7 +185,7 @@ class perform_site_check(difference_detection_processor):
append_pretty_line_formatting=not watch.is_source_type_url) append_pretty_line_formatting=not watch.is_source_type_url)
if not html_content.strip(): if not html_content.strip():
raise FilterNotFoundInResponse(include_filters_rule) raise FilterNotFoundInResponse(msg=include_filters_rule, screenshot=self.fetcher.screenshot)
if has_subtractive_selectors: if has_subtractive_selectors:
html_content = html_tools.element_removal(subtractive_selectors, html_content) html_content = html_tools.element_removal(subtractive_selectors, html_content)
@ -222,7 +219,7 @@ class perform_site_check(difference_detection_processor):
from .. import diff from .. import diff
# needs to not include (added) etc or it may get used twice # needs to not include (added) etc or it may get used twice
# Replace the processed text with the preferred result # Replace the processed text with the preferred result
rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_before_filters(), rendered_diff = diff.render_diff(previous_version_file_contents=watch.get_last_fetched_text_before_filters(),
newest_version_file_contents=stripped_text_from_html, newest_version_file_contents=stripped_text_from_html,
include_equal=False, # not the same lines include_equal=False, # not the same lines
include_added=watch.get('filter_text_added', True), include_added=watch.get('filter_text_added', True),
@ -231,7 +228,7 @@ class perform_site_check(difference_detection_processor):
line_feed_sep="\n", line_feed_sep="\n",
include_change_type_prefix=False) include_change_type_prefix=False)
watch.save_last_fetched_before_filters(text_content_before_ignored_filter) watch.save_last_text_fetched_before_filters(text_content_before_ignored_filter)
if not rendered_diff and stripped_text_from_html: if not rendered_diff and stripped_text_from_html:
# We had some content, but no differences were found # We had some content, but no differences were found
@ -344,17 +341,17 @@ class perform_site_check(difference_detection_processor):
if not watch['title'] or not len(watch['title']): if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content) update_obj['title'] = html_tools.extract_element(find='title', html_content=self.fetcher.content)
logger.debug(f"Watch UUID {uuid} content check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}") logger.debug(f"Watch UUID {watch.get('uuid')} content check - Previous MD5: {watch.get('previous_md5')}, Fetched MD5 {fetched_md5}")
if changed_detected: if changed_detected:
if watch.get('check_unique_lines', False): if watch.get('check_unique_lines', False):
has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines()) has_unique_lines = watch.lines_contain_something_unique_compared_to_history(lines=stripped_text_from_html.splitlines())
# One or more lines? unsure? # One or more lines? unsure?
if not has_unique_lines: if not has_unique_lines:
logger.debug(f"check_unique_lines: UUID {uuid} didnt have anything new setting change_detected=False") logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} didnt have anything new setting change_detected=False")
changed_detected = False changed_detected = False
else: else:
logger.debug(f"check_unique_lines: UUID {uuid} had unique content") logger.debug(f"check_unique_lines: UUID {watch.get('uuid')} had unique content")
# Always record the new checksum # Always record the new checksum
update_obj["previous_md5"] = fetched_md5 update_obj["previous_md5"] = fetched_md5

@ -163,7 +163,6 @@ class ChangeDetectionStore:
del (update_obj[dict_key]) del (update_obj[dict_key])
self.__data['watching'][uuid].update(update_obj) self.__data['watching'][uuid].update(update_obj)
self.needs_write = True self.needs_write = True
@property @property
@ -376,46 +375,6 @@ class ChangeDetectionStore:
return False return False
# Save as PNG, PNG is larger but better for doing visual diff in the future
def save_screenshot(self, watch_uuid, screenshot: bytes, as_error=False):
if not self.data['watching'].get(watch_uuid):
return
if as_error:
target_path = os.path.join(self.datastore_path, watch_uuid, "last-error-screenshot.png")
else:
target_path = os.path.join(self.datastore_path, watch_uuid, "last-screenshot.png")
self.data['watching'][watch_uuid].ensure_data_dir_exists()
with open(target_path, 'wb') as f:
f.write(screenshot)
f.close()
def save_error_text(self, watch_uuid, contents):
if not self.data['watching'].get(watch_uuid):
return
self.data['watching'][watch_uuid].ensure_data_dir_exists()
target_path = os.path.join(self.datastore_path, watch_uuid, "last-error.txt")
with open(target_path, 'w') as f:
f.write(contents)
def save_xpath_data(self, watch_uuid, data, as_error=False):
if not self.data['watching'].get(watch_uuid):
return
if as_error:
target_path = os.path.join(self.datastore_path, watch_uuid, "elements-error.json")
else:
target_path = os.path.join(self.datastore_path, watch_uuid, "elements.json")
self.data['watching'][watch_uuid].ensure_data_dir_exists()
with open(target_path, 'w') as f:
f.write(json.dumps(data))
f.close()
def sync_to_json(self): def sync_to_json(self):
logger.info("Saving JSON..") logger.info("Saving JSON..")
try: try:

@ -149,6 +149,15 @@ def test_api_simple(client, live_server):
headers={'x-api-key': api_key}, headers={'x-api-key': api_key},
) )
assert b'which has this one new line' in res.data assert b'which has this one new line' in res.data
assert b'<div id' not in res.data
# Fetch the HTML of the latest one
res = client.get(
url_for("watchsinglehistory", uuid=watch_uuid, timestamp='latest')+"?html=1",
headers={'x-api-key': api_key},
)
assert b'which has this one new line' in res.data
assert b'<div id' in res.data
# Fetch the whole watch # Fetch the whole watch
res = client.get( res = client.get(

@ -5,15 +5,13 @@ import os
import json import json
import logging import logging
from flask import url_for from flask import url_for
from .util import live_server_setup from .util import live_server_setup, wait_for_all_checks
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server): def test_consistent_history(client, live_server):
live_server_setup(live_server) live_server_setup(live_server)
# Give the endpoint time to spin up r = range(1, 30)
time.sleep(1)
r = range(1, 50)
for one in r: for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True) test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
@ -25,15 +23,8 @@ def test_consistent_history(client, live_server):
assert b"1 Imported" in res.data assert b"1 Imported" in res.data
time.sleep(3) wait_for_all_checks(client)
while True:
res = client.get(url_for("index"))
logging.debug("Waiting for 'Checking now' to go away..")
if b'Checking now' not in res.data:
break
time.sleep(0.5)
time.sleep(3)
# Essentially just triggers the DB write/update # Essentially just triggers the DB write/update
res = client.post( res = client.post(
url_for("settings_page"), url_for("settings_page"),
@ -44,8 +35,9 @@ def test_consistent_history(client, live_server):
) )
assert b"Settings updated." in res.data assert b"Settings updated." in res.data
# Give it time to write it out
time.sleep(3) time.sleep(2)
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json') json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
json_obj = None json_obj = None
@ -58,7 +50,7 @@ def test_consistent_history(client, live_server):
# each one should have a history.txt containing just one line # each one should have a history.txt containing just one line
for w in json_obj['watching'].keys(): for w in json_obj['watching'].keys():
history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt') history_txt_index_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, 'history.txt')
assert os.path.isfile(history_txt_index_file), "History.txt should exist where I expect it - {}".format(history_txt_index_file) assert os.path.isfile(history_txt_index_file), f"History.txt should exist where I expect it at {history_txt_index_file}"
# Same like in model.Watch # Same like in model.Watch
with open(history_txt_index_file, "r") as f: with open(history_txt_index_file, "r") as f:
@ -70,15 +62,15 @@ def test_consistent_history(client, live_server):
w)) w))
# Find the snapshot one # Find the snapshot one
for fname in files_in_watch_dir: for fname in files_in_watch_dir:
if fname != 'history.txt': if fname != 'history.txt' and 'html' not in fname:
# contents should match what we requested as content returned from the test url # contents should match what we requested as content returned from the test url
with open(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, fname), 'r') as snapshot_f: with open(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w, fname), 'r') as snapshot_f:
contents = snapshot_f.read() contents = snapshot_f.read()
watch_url = json_obj['watching'][w]['url'] watch_url = json_obj['watching'][w]['url']
u = urlparse(watch_url) u = urlparse(watch_url)
q = parse_qs(u[4]) q = parse_qs(u[4])
assert q['content'][0] == contents.strip(), "Snapshot file {} should contain {}".format(fname, q['content'][0]) assert q['content'][0] == contents.strip(), f"Snapshot file {fname} should contain {q['content'][0]}"
assert len(files_in_watch_dir) == 2, "Should be just two files in the dir, history.txt and the snapshot" assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"

@ -1,11 +1,12 @@
from . import content_fetchers
from .processors.restock_diff import UnableToExtractRestockData
from .processors.text_json_diff import FilterNotFoundInResponse
from changedetectionio import html_tools
from copy import deepcopy
import os import os
import threading
import queue import queue
import threading
import time import time
from . import content_fetchers
from changedetectionio import html_tools
from .processors.text_json_diff import FilterNotFoundInResponse
from .processors.restock_diff import UnableToExtractRestockData
# A single update worker # A single update worker
# #
@ -245,14 +246,19 @@ class update_worker(threading.Thread):
contents = b'' contents = b''
process_changedetection_results = True process_changedetection_results = True
update_obj = {} update_obj = {}
logger.info(f"Processing watch UUID {uuid} "
f"Priority {queued_item_data.priority} " # Clear last errors (move to preflight func?)
f"URL {self.datastore.data['watching'][uuid]['url']}") self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
# DeepCopy so we can be sure we don't accidentally change anything by reference
watch = deepcopy(self.datastore.data['watching'].get(uuid))
logger.info(f"Processing watch UUID {uuid} Priority {queued_item_data.priority} URL {watch['url']}")
now = time.time() now = time.time()
try: try:
# Processor is what we are using for detecting the "Change" # Processor is what we are using for detecting the "Change"
processor = self.datastore.data['watching'][uuid].get('processor', 'text_json_diff') processor = watch.get('processor', 'text_json_diff')
# if system... # if system...
# Abort processing when the content was the same as the last fetch # Abort processing when the content was the same as the last fetch
@ -272,12 +278,10 @@ class update_worker(threading.Thread):
watch_uuid=uuid watch_uuid=uuid
) )
# Clear last errors (move to preflight func?)
self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
update_handler.call_browser() update_handler.call_browser()
changed_detected, update_obj, contents = update_handler.run_changedetection(uuid, changed_detected, update_obj, contents = update_handler.run_changedetection(
watch=watch,
skip_when_checksum_same=skip_when_same_checksum, skip_when_checksum_same=skip_when_same_checksum,
) )
@ -309,7 +313,8 @@ class update_worker(threading.Thread):
}) })
if e.screenshot: if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot) watch.save_screenshot(screenshot=e.screenshot, as_error=True)
process_changedetection_results = False process_changedetection_results = False
except content_fetchers.exceptions.Non200ErrorCodeReceived as e: except content_fetchers.exceptions.Non200ErrorCodeReceived as e:
@ -325,11 +330,11 @@ class update_worker(threading.Thread):
err_text = "Error - Request returned a HTTP error code {}".format(str(e.status_code)) err_text = "Error - Request returned a HTTP error code {}".format(str(e.status_code))
if e.screenshot: if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True) watch.save_screenshot(screenshot=e.screenshot, as_error=True)
if e.xpath_data: if e.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=e.xpath_data, as_error=True) watch.save_xpath_data(data=e.xpath_data, as_error=True)
if e.page_text: if e.page_text:
self.datastore.save_error_text(watch_uuid=uuid, contents=e.page_text) watch.save_error_text(contents=e.page_text)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False process_changedetection_results = False
@ -340,17 +345,19 @@ class update_worker(threading.Thread):
err_text = "Warning, no filters were found, no change detection ran - Did the page change layout? update your Visual Filter if necessary." err_text = "Warning, no filters were found, no change detection ran - Did the page change layout? update your Visual Filter if necessary."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
# Only when enabled, send the notification # Only when enabled, send the notification
if self.datastore.data['watching'][uuid].get('filter_failure_notification_send', False): if watch.get('filter_failure_notification_send', False):
c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5) c = watch.get('consecutive_filter_failures', 5)
c += 1 c += 1
# Send notification if we reached the threshold? # Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts',
0) 0)
logger.warning(f"Filter for {uuid} not found, consecutive_filter_failures: {c}") logger.warning(f"Filter for {uuid} not found, consecutive_filter_failures: {c}")
if threshold > 0 and c >= threshold: if threshold > 0 and c >= threshold:
if not self.datastore.data['watching'][uuid].get('notification_muted'): if not watch.get('notification_muted'):
self.send_filter_failure_notification(uuid) self.send_filter_failure_notification(uuid)
c = 0 c = 0
@ -400,15 +407,15 @@ class update_worker(threading.Thread):
} }
) )
if self.datastore.data['watching'][uuid].get('filter_failure_notification_send', False): if watch.get('filter_failure_notification_send', False):
c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5) c = watch.get('consecutive_filter_failures', 5)
c += 1 c += 1
# Send notification if we reached the threshold? # Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts',
0) 0)
logger.error(f"Step for {uuid} not found, consecutive_filter_failures: {c}") logger.error(f"Step for {uuid} not found, consecutive_filter_failures: {c}")
if threshold > 0 and c >= threshold: if threshold > 0 and c >= threshold:
if not self.datastore.data['watching'][uuid].get('notification_muted'): if not watch.get('notification_muted'):
self.send_step_failure_notification(watch_uuid=uuid, step_n=e.step_n) self.send_step_failure_notification(watch_uuid=uuid, step_n=e.step_n)
c = 0 c = 0
@ -430,7 +437,7 @@ class update_worker(threading.Thread):
except content_fetchers.exceptions.JSActionExceptions as e: except content_fetchers.exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot: if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True) watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code})
process_changedetection_results = False process_changedetection_results = False
@ -440,7 +447,7 @@ class update_worker(threading.Thread):
err_text = "{} - {}".format(err_text, e.message) err_text = "{} - {}".format(err_text, e.message)
if e.screenshot: if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True) watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code, 'last_check_status': e.status_code,
@ -464,8 +471,6 @@ class update_worker(threading.Thread):
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
# Other serious error # Other serious error
process_changedetection_results = False process_changedetection_results = False
# import traceback
# print(traceback.format_exc())
else: else:
# Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc) # Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc)
@ -473,7 +478,7 @@ class update_worker(threading.Thread):
continue continue
# Mark that we never had any failures # Mark that we never had any failures
if not self.datastore.data['watching'][uuid].get('ignore_status_codes'): if not watch.get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0 update_obj['consecutive_filter_failures'] = 0
# Everything ran OK, clean off any previous error # Everything ran OK, clean off any previous error
@ -481,25 +486,48 @@ class update_worker(threading.Thread):
self.cleanup_error_artifacts(uuid) self.cleanup_error_artifacts(uuid)
if not self.datastore.data['watching'].get(uuid):
continue
# #
# Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc # Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc
if process_changedetection_results: if process_changedetection_results:
# Always save the screenshot if it's available
if update_handler.screenshot:
watch.save_screenshot(screenshot=update_handler.screenshot)
if update_handler.xpath_data:
watch.save_xpath_data(data=update_handler.xpath_data)
try: try:
watch = self.datastore.data['watching'].get(uuid)
self.datastore.update_watch(uuid=uuid, update_obj=update_obj) self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
# Also save the snapshot on the first time checked # Also save the snapshot on the first time checked
if changed_detected or not watch['last_checked']: if changed_detected or not watch.get('last_checked'):
timestamp = round(time.time())
# Small hack so that we sleep just enough to allow 1 second between history snapshots
# this is because history.txt indexes/keys snapshots by epoch seconds and we don't want duplicate keys
if watch.newest_history_key and int(timestamp) == int(watch.newest_history_key):
logger.warning(
f"Timestamp {timestamp} already exists, waiting 1 seconds so we have a unique key in history.txt")
timestamp = str(int(timestamp) + 1)
time.sleep(1)
watch.save_history_text(contents=contents, watch.save_history_text(contents=contents,
timestamp=str(round(time.time())), timestamp=timestamp,
snapshot_id=update_obj.get('previous_md5', 'none')) snapshot_id=update_obj.get('previous_md5', 'none'))
if update_handler.fetcher.content:
watch.save_last_fetched_html(contents=update_handler.fetcher.content, timestamp=timestamp)
# A change was detected # A change was detected
if changed_detected: if changed_detected:
# Notifications should only trigger on the second time (first time, we gather the initial snapshot) # Notifications should only trigger on the second time (first time, we gather the initial snapshot)
if watch.history_n >= 2: if watch.history_n >= 2:
logger.info(f"Change detected in UUID {uuid} - {watch['url']}") logger.info(f"Change detected in UUID {uuid} - {watch['url']}")
if not self.datastore.data['watching'][uuid].get('notification_muted'): if not watch.get('notification_muted'):
self.send_content_changed_notification(watch_uuid=uuid) self.send_content_changed_notification(watch_uuid=uuid)
else: else:
logger.info(f"Change triggered in UUID {uuid} due to first history saving (no notifications sent) - {watch['url']}") logger.info(f"Change triggered in UUID {uuid} due to first history saving (no notifications sent) - {watch['url']}")
@ -510,9 +538,9 @@ class update_worker(threading.Thread):
logger.critical(str(e)) logger.critical(str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)}) self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
if self.datastore.data['watching'].get(uuid):
# Always record that we at least tried # Always record that we at least tried
count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1 count = watch.get('check_count', 0) + 1
# Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds # Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds
try: try:
@ -528,12 +556,6 @@ class update_worker(threading.Thread):
'check_count': count 'check_count': count
}) })
# Always save the screenshot if it's available
if update_handler.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
if update_handler.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
self.current_uuid = None # Done self.current_uuid = None # Done
self.q.task_done() self.q.task_done()

Loading…
Cancel
Save