Merge branch 'master' into 550-visual-selector

dgtlmoon 3 years ago
commit 573e92c5e5

@ -434,48 +434,21 @@ def changedetection_app(config=None, datastore_o=None):
def scrub_page():
import re
if request.method == 'POST':
confirmtext = request.form.get('confirmtext')
limit_date = request.form.get('limit_date')
limit_timestamp = 0
# Re #149 - allow empty/0 timestamp limit
if len(limit_date):
limit_date = limit_date.replace('T', ' ')
# I noticed chrome will show '/' but actually submit '-'
limit_date = limit_date.replace('-', '/')
# In the case that :ss seconds are supplied
limit_date = re.sub(r'(\d\d:\d\d)(:\d\d)', '\\1', limit_date)
str_to_dt = datetime.datetime.strptime(limit_date, '%Y/%m/%d %H:%M')
limit_timestamp = int(str_to_dt.timestamp())
if limit_timestamp > time.time():
flash("Timestamp is in the future, cannot continue.", 'error')
return redirect(url_for('scrub_page'))
except ValueError:
flash('Incorrect date format, cannot continue.', 'error')
return redirect(url_for('scrub_page'))
if confirmtext == 'scrub':
changes_removed = 0
for uuid, watch in['watching'].items():
if limit_timestamp:
changes_removed += datastore.scrub_watch(uuid, limit_timestamp=limit_timestamp)
changes_removed += datastore.scrub_watch(uuid)
for uuid in['watching'].keys():
flash("Cleared snapshot history ({} snapshots removed)".format(changes_removed))
flash("Cleared all snapshot history")
flash('Incorrect confirmation text.', 'error')
return redirect(url_for('index'))
output = render_template("scrub.html")
output = render_template("scrub.html")
return output

@ -24,24 +24,24 @@ class Fetcher():
content = None
headers = None
fetcher_description ="No description"
fetcher_description = "No description"
xpath_element_js = """
// Include the getXpath script directly, easier than fetching
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&'//*[@id="''"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
const findUpTag = (el) => {
let r = el
chained_css = [];
while (r.parentNode) {
if(r.classList.length >0) {
// limit to just using 2 class names of each, stops from getting really huge selector strings
current_css='.'+Array.from(r.classList).slice(0, 2).join('.');
var f=chained_css.join(' ');
var q=document.querySelectorAll(f);
if(q.length==1) return current_css;
@ -52,7 +52,7 @@ class Fetcher():
return null;
var elements = document.getElementsByTagName("*");
var size_pos=[];
// after page fetch, inject this JS
@ -60,16 +60,16 @@ class Fetcher():
var bbox;
for (var i = 0; i < elements.length; i++) {
bbox = elements[i].getBoundingClientRect();
// forget reallysmall ones
if (bbox['width'] <10 && bbox['height'] <10 ) {
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
// it should not traverse when we know we can anchor off just an ID one level up etc..
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
try {
@ -80,12 +80,12 @@ class Fetcher():
} catch (e) {
var x=1;
// default back to the less intelligent one
if (!xpath_result) {
xpath_result = getXPath(elements[i]);
xpath: xpath_result,
width: bbox['width'],
@ -95,8 +95,8 @@ class Fetcher():
childCount: elements[i].childElementCount
// inject the current one set in the css_filter, which may be a CSS rule
// used for displaying the current one in VisualSelector, where its not one we generated.
if (css_filter.length) {
@ -118,10 +118,10 @@ class Fetcher():
return size_pos;
xpath_data = None
# Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = False
@ -155,6 +155,7 @@ class Fetcher():
def is_ready(self):
return True
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff)
@ -180,10 +181,10 @@ class base_html_playwright(Fetcher):
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
# try:
# from playwright.sync_api import sync_playwright
# except ModuleNotFoundError:
# fetcher_enabled = False
# try:
# from playwright.sync_api import sync_playwright
# except ModuleNotFoundError:
# fetcher_enabled = False
browser_type = ''
command_executor = ''
@ -255,7 +256,7 @@ class base_html_playwright(Fetcher):
page.evaluate("var css_filter=''")
self.xpath_data = page.evaluate("async () => {"+ self.xpath_element_js+ "}")
self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large
page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
@ -324,7 +325,7 @@ class base_html_webdriver(Fetcher):
self.driver.set_window_size(1280, 1024)
self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.xpath_data = self.driver.execute_script("var css_filter='{}';".format(current_css_filter)+self.xpath_element_js)
self.xpath_data = self.driver.execute_script("var css_filter='{}';".format(current_css_filter) + self.xpath_element_js)
self.screenshot = self.driver.get_screenshot_as_png()
# @todo - how to check this? is it possible?
@ -350,8 +351,6 @@ class base_html_webdriver(Fetcher):
return True
def quit(self):
if self.driver:

@ -194,6 +194,4 @@ class perform_site_check():
if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data

@ -145,6 +145,10 @@ class ChangeDetectionStore:
def update_watch(self, uuid, update_obj):
# It's possible that the watch could be deleted before update
if not self.__data['watching'].get(uuid):
with self.lock:
# In python 3.9 we have the |= dict operator, but that still will lose data on nested structures...
@ -256,46 +260,14 @@ class ChangeDetectionStore:
# Remove a watch's data but keep the entry (URL etc.)
def scrub_watch(self, uuid, limit_timestamp = False):
import hashlib
del_timestamps = []
changes_removed = 0
for timestamp, path in['watching'][uuid]['history'].items():
if not limit_timestamp or (limit_timestamp is not False and int(timestamp) > limit_timestamp):
changes_removed += 1
if not limit_timestamp:['watching'][uuid]['last_checked'] = 0['watching'][uuid]['last_changed'] = 0['watching'][uuid]['previous_md5'] = ""
for timestamp in del_timestamps:
# If there was a limit timestamp, we need to reset some metadata about the entry
# This has to happen after we remove the others from the list
if limit_timestamp:
newest_key = self.get_newest_history_key(uuid)
if newest_key:['watching'][uuid]['last_checked'] = int(newest_key)
# @todo should be the original value if it was less than newest key['watching'][uuid]['last_changed'] = int(newest_key)
with open(['watching'][uuid]['history'][str(newest_key)], "rb") as fp:
content =['watching'][uuid]['previous_md5'] = hashlib.md5(content).hexdigest()
except (FileNotFoundError, IOError):['watching'][uuid]['previous_md5'] = ""
def scrub_watch(self, uuid):
import pathlib
self.needs_write = True
return changes_removed
self.__data['watching'][uuid].update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'newest_history_key': 0, 'previous_md5': False})
self.needs_write_urgent = True
for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
def add_watch(self, url, tag="", extras=None, write_to_disk_now=True):
if extras is None:
@ -457,10 +429,11 @@ class ChangeDetectionStore:
import pathlib
# Only in the sub-directories
for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
if not str(item) in index:
print ("Removing",item)
for uuid in['watching']:
for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
if not str(item) in index:
print ("Removing",item)
# Run all updates
# IMPORTANT - Each update could be run even when they have a new install and the schema is correct

@ -7,7 +7,7 @@
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
<div class="pure-control-group">
This will remove all version snapshots/data, but keep your list of URLs. <br/>
This will remove ALL version snapshots/data, but keep your list of URLs. <br/>
You may like to use the <strong>BACKUP</strong> link first.<br/>
@ -17,12 +17,6 @@
<span class="pure-form-message-inline">Type in the word <strong>scrub</strong> to confirm that you understand!</span>
<div class="pure-control-group">
<label for="confirmtext">Optional: Limit deletion of snapshots to snapshots <i>newer</i> than date/time</label>
<input type="datetime-local" id="limit_date" name="limit_date" />
<span class="pure-form-message-inline">dd/mm/yyyy hh:mm (24 hour format)</span>
<div class="pure-control-group">
<button type="submit" class="pure-button pure-button-primary">Scrub!</button>

@ -39,9 +39,5 @@ selenium ~= 4.1.0
# ImportError: cannot import name 'safe_str_cmp' from 'werkzeug.security'
# need to revisit flask login versions
werkzeug ~= 2.0.0
<<<<<<< HEAD
playwright ~= 1.21.0
# playwright is installed at Dockerfile build time because it's not available on all platforms
>>>>>>> playwright
