diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 525e8688..789c72a6 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -434,48 +434,21 @@ def changedetection_app(config=None, datastore_o=None):
     @login_required
     def scrub_page():
 
-        import re
-
         if request.method == 'POST':
             confirmtext = request.form.get('confirmtext')
-            limit_date = request.form.get('limit_date')
-            limit_timestamp = 0
-
-            # Re #149 - allow empty/0 timestamp limit
-            if len(limit_date):
-                try:
-                    limit_date = limit_date.replace('T', ' ')
-                    # I noticed chrome will show '/' but actually submit '-'
-                    limit_date = limit_date.replace('-', '/')
-                    # In the case that :ss seconds are supplied
-                    limit_date = re.sub(r'(\d\d:\d\d)(:\d\d)', '\\1', limit_date)
-
-                    str_to_dt = datetime.datetime.strptime(limit_date, '%Y/%m/%d %H:%M')
-                    limit_timestamp = int(str_to_dt.timestamp())
-
-                    if limit_timestamp > time.time():
-                        flash("Timestamp is in the future, cannot continue.", 'error')
-                        return redirect(url_for('scrub_page'))
-
-                except ValueError:
-                    flash('Incorrect date format, cannot continue.', 'error')
-                    return redirect(url_for('scrub_page'))
 
             if confirmtext == 'scrub':
                 changes_removed = 0
-                for uuid, watch in datastore.data['watching'].items():
-                    if limit_timestamp:
-                        changes_removed += datastore.scrub_watch(uuid, limit_timestamp=limit_timestamp)
-                    else:
-                        changes_removed += datastore.scrub_watch(uuid)
+                for uuid in datastore.data['watching'].keys():
+                    datastore.scrub_watch(uuid)
 
-                flash("Cleared snapshot history ({} snapshots removed)".format(changes_removed))
+                flash("Cleared all snapshot history")
             else:
                 flash('Incorrect confirmation text.', 'error')
 
             return redirect(url_for('index'))
 
-        output =  render_template("scrub.html")
+        output = render_template("scrub.html")
         return output
 
diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 320d4543..f547ce24 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -24,24 +24,24 @@ class Fetcher():
     content = None
     headers = None
 
-    fetcher_description ="No description"
-    xpath_element_js="""
+    fetcher_description = "No description"
+    xpath_element_js = """
                 // Include the getXpath script directly, easier than fetching
                 !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
                 //# sourceMappingURL=index.umd.js.map
-
+
                 const findUpTag = (el) => {
                     let r = el
                     chained_css = [];
-
+
                     while (r.parentNode) {
-
+
                         if(r.classList.length >0) {
                             // limit to just using 2 class names of each, stops from getting really huge selector strings
                             current_css='.'+Array.from(r.classList).slice(0, 2).join('.');
                             chained_css.unshift(current_css);
-
+
                             var f=chained_css.join(' ');
                             var q=document.querySelectorAll(f);
                             if(q.length==1) return current_css;
@@ -52,7 +52,7 @@ class Fetcher():
                     }
                     return null;
                 }
-
+
                 var elements = document.getElementsByTagName("*");
                 var size_pos=[];
                 // after page fetch, inject this JS
@@ -60,16 +60,16 @@ class Fetcher():
                 var bbox;
                 for (var i = 0; i < elements.length; i++) {
                     bbox = elements[i].getBoundingClientRect();
-
+
                     // forget reallysmall ones
                     if (bbox['width'] <10 && bbox['height'] <10 ) {
                         continue;
                     }
-
+
                     // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
                     // it should not traverse when we know we can anchor off just an ID one level up etc..
                     // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
-
+
                     // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
                     xpath_result=false;
                     try {
@@ -80,12 +80,12 @@ class Fetcher():
                     } catch (e) {
                         var x=1;
                     }
-
+
                     // default back to the less intelligent one
                     if (!xpath_result) {
                         xpath_result = getXPath(elements[i]);
                     }
-
+
                     size_pos.push({
                         xpath: xpath_result,
                         width: bbox['width'],
@@ -95,8 +95,8 @@ class Fetcher():
                         childCount: elements[i].childElementCount
                     });
                 }
-
-
+
+
                 // inject the current one set in the css_filter, which may be a CSS rule
                 // used for displaying the current one in VisualSelector, where its not one we generated.
                 if (css_filter.length) {
@@ -118,10 +118,10 @@ class Fetcher():
                     });
                 }
             }
-
+
                 return size_pos;
     """
-    xpath_data=None
+    xpath_data = None
 
     # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
@@ -155,6 +155,7 @@ class Fetcher():
     def is_ready(self):
         return True
 
+
     # Maybe for the future, each fetcher provides its own diff output, could be used for text, image
     # the current one would return javascript output (as we use JS to generate the diff)
     #
@@ -180,10 +181,10 @@ class base_html_playwright(Fetcher):
     if os.getenv("PLAYWRIGHT_DRIVER_URL"):
         fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
 
-# try:
-#     from playwright.sync_api import sync_playwright
-# except ModuleNotFoundError:
-#     fetcher_enabled = False
+    # try:
+    #     from playwright.sync_api import sync_playwright
+    # except ModuleNotFoundError:
+    #     fetcher_enabled = False
 
     browser_type = ''
     command_executor = ''
@@ -255,7 +256,7 @@ class base_html_playwright(Fetcher):
         else:
             page.evaluate("var css_filter=''")
 
-        self.xpath_data = page.evaluate("async () => {"+ self.xpath_element_js+ "}")
+        self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
         # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
         # JPEG is better here because the screenshots can be very very large
         page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
@@ -324,7 +325,7 @@ class base_html_webdriver(Fetcher):
         self.driver.set_window_size(1280, 1024)
         self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
 
-        self.xpath_data = self.driver.execute_script("var css_filter='{}';".format(current_css_filter)+self.xpath_element_js)
+        self.xpath_data = self.driver.execute_script("var css_filter='{}';".format(current_css_filter) + self.xpath_element_js)
         self.screenshot = self.driver.get_screenshot_as_png()
 
         # @todo - how to check this? is it possible?
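
For reference, the xpath_data line in the webdriver hunk above follows a simple pattern: prepend a "var css_filter='...';" prologue to the large xpath_element_js body and execute both as a single script, so the injected JS can read css_filter as an ordinary variable. A minimal standalone sketch of that pattern — the local chromedriver, the URL, and the trivial stand-in script are illustrative assumptions, not part of this patch:

from selenium import webdriver

# Stand-in for the large element-scraping JS block in content_fetcher.py (assumption)
xpath_element_js = "return document.title;"
current_css_filter = ""

driver = webdriver.Chrome()
driver.get("https://example.com")
# Prologue and body are concatenated and run as one script, mirroring the diff above;
# execute_script returns whatever the script's "return" statement produces
xpath_data = driver.execute_script("var css_filter='{}';".format(current_css_filter) + xpath_element_js)
driver.quit()
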
@@ -350,8 +351,6 @@ class base_html_webdriver(Fetcher):
             self.quit()
         return True
 
-
-
     def quit(self):
         if self.driver:
             try:
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index d97ee4bc..f7253440 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -194,6 +194,4 @@ class perform_site_check():
 
         if not watch['title'] or not len(watch['title']):
             update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
-
         return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data
-
diff --git a/changedetectionio/store.py b/changedetectionio/store.py
index c65f17fa..5e139294 100644
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@@ -145,6 +145,10 @@ class ChangeDetectionStore:
 
     def update_watch(self, uuid, update_obj):
 
+        # It's possible that the watch could be deleted before update
+        if not self.__data['watching'].get(uuid):
+            return
+
         with self.lock:
 
             # In python 3.9 we have the |= dict operator, but that still will lose data on nested structures...
@@ -256,46 +260,14 @@ class ChangeDetectionStore:
         return self.data['watching'][uuid].get(val)
 
     # Remove a watchs data but keep the entry (URL etc)
-    def scrub_watch(self, uuid, limit_timestamp = False):
-
-        import hashlib
-        del_timestamps = []
-
-        changes_removed = 0
-
-        for timestamp, path in self.data['watching'][uuid]['history'].items():
-            if not limit_timestamp or (limit_timestamp is not False and int(timestamp) > limit_timestamp):
-                self.unlink_history_file(path)
-                del_timestamps.append(timestamp)
-                changes_removed += 1
-
-        if not limit_timestamp:
-            self.data['watching'][uuid]['last_checked'] = 0
-            self.data['watching'][uuid]['last_changed'] = 0
-            self.data['watching'][uuid]['previous_md5'] = ""
-
-
-        for timestamp in del_timestamps:
-            del self.data['watching'][uuid]['history'][str(timestamp)]
-
-        # If there was a limitstamp, we need to reset some meta data about the entry
-        # This has to happen after we remove the others from the list
-        if limit_timestamp:
-            newest_key = self.get_newest_history_key(uuid)
-            if newest_key:
-                self.data['watching'][uuid]['last_checked'] = int(newest_key)
-                # @todo should be the original value if it was less than newest key
-                self.data['watching'][uuid]['last_changed'] = int(newest_key)
-                try:
-                    with open(self.data['watching'][uuid]['history'][str(newest_key)], "rb") as fp:
-                        content = fp.read()
-                    self.data['watching'][uuid]['previous_md5'] = hashlib.md5(content).hexdigest()
-                except (FileNotFoundError, IOError):
-                    self.data['watching'][uuid]['previous_md5'] = ""
-                    pass
+    def scrub_watch(self, uuid):
+        import pathlib
 
-        self.needs_write = True
-        return changes_removed
+        self.__data['watching'][uuid].update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'newest_history_key': 0, 'previous_md5': False})
+        self.needs_write_urgent = True
+
+        for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
+            unlink(item)
 
     def add_watch(self, url, tag="", extras=None, write_to_disk_now=True):
         if extras is None:
@@ -457,10 +429,11 @@ class ChangeDetectionStore:
         import pathlib
 
         # Only in the sub-directories
-        for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
-            if not str(item) in index:
-                print ("Removing",item)
-                unlink(item)
+        for uuid in self.data['watching']:
+            for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
+                if not str(item) in index:
+                    print ("Removing",item)
+                    unlink(item)
 
     # Run all updates
     # IMPORTANT - Each update could be run even when they have a new install and the schema is correct
diff --git a/changedetectionio/templates/scrub.html b/changedetectionio/templates/scrub.html
index bd006b31..5b9a15da 100644
--- a/changedetectionio/templates/scrub.html
+++ b/changedetectionio/templates/scrub.html
@@ -7,7 +7,7 @@
         <form class="pure-form pure-form-stacked" action="{{url_for('scrub_page')}}" method="POST">
             <fieldset>
                 <div class="pure-control-group">
-                    This will remove all version snapshots/data, but keep your list of URLs. <br/>
+                    This will remove ALL version snapshots/data, but keep your list of URLs. <br/>
                     You may like to use the <strong>BACKUP</strong> link first.<br/>
                 </div>
                 <br/>
@@ -17,12 +17,6 @@
                     <span class="pure-form-message-inline">Type in the word <strong>scrub</strong> to confirm that you understand!</span>
                 </div>
                 <br/>
-                <div class="pure-control-group">
-                    <input type="datetime-local" id="limit_date" name="limit_date"/>
-                    <span class="pure-form-message-inline">dd/mm/yyyy hh:mm (24 hour format)</span>
-                </div>
-                <br/>
-
                 <div class="pure-control-group">
                     <a href="{{url_for('index')}}" class="pure-button button-small button-cancel">Cancel</a>
                     <button type="submit" class="pure-button pure-button-primary">Scrub!</button>
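
The datetime-local field removed above fed the limit_date handling deleted from __init__.py at the top of this patch. For reference, that removed code normalised the browser's submission and converted it to a unix timestamp, roughly as in this sketch — the sample value is an assumption:

import datetime
import re

limit_date = "2022-04-01T13:30"            # example datetime-local submission (assumed)
limit_date = limit_date.replace('T', ' ')  # "2022-04-01 13:30"
limit_date = limit_date.replace('-', '/')  # chrome shows '/' but actually submits '-'
limit_date = re.sub(r'(\d\d:\d\d)(:\d\d)', r'\1', limit_date)  # drop :ss if seconds were supplied
limit_timestamp = int(datetime.datetime.strptime(limit_date, '%Y/%m/%d %H:%M').timestamp())
print(limit_timestamp)
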
diff --git a/requirements.txt b/requirements.txt
index 3e376796..468dca88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -39,9 +39,5 @@ selenium ~= 4.1.0
 
 # ImportError: cannot import name 'safe_str_cmp' from 'werkzeug.security'
 # need to revisit flask login versions
 werkzeug ~= 2.0.0
-<<<<<<< HEAD
-playwright ~= 1.21.0
-=======
 # playwright is installed at Dockerfile build time because it's not available on all platforms
->>>>>>> playwright
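
The store.py changes above share one idea: scope filesystem cleanup to a single watch's directory by building the rglob pattern from the watch's uuid, instead of globbing every sub-directory under the datastore. A standalone sketch of that cleanup loop — the datastore path and uuid here are hypothetical, not values from this patch:

import pathlib
from os import unlink

datastore_path = "/tmp/changedetection-data"    # assumed datastore location
uuid = "1d9a2b33-5c9e-4c1f-a2b1-example"        # hypothetical watch uuid

# Delete only this watch's snapshot .txt files; other watches' history is untouched
for item in pathlib.Path(datastore_path).rglob(uuid + "/*.txt"):
    unlink(item)
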