Merge branch 'master' into 550-visual-selector

3 years ago · 573e92c5e5
parent 57ba77e287 68db20168e
commit 573e92c5e5
6 changed files with 44 additions and 111 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@ -434,48 +434,21 @@ def changedetection_app(config=None, datastore_o=None):
    @login_required
    def scrub_page():

-        import re
-
        if request.method == 'POST':
            confirmtext = request.form.get('confirmtext')
-            limit_date = request.form.get('limit_date')
-            limit_timestamp = 0
-
-            # Re #149 - allow empty/0 timestamp limit
-            if len(limit_date):
-                try:
-                    limit_date = limit_date.replace('T', ' ')
-                    # I noticed chrome will show '/' but actually submit '-'
-                    limit_date = limit_date.replace('-', '/')
-                    # In the case that :ss seconds are supplied
-                    limit_date = re.sub(r'(\d\d:\d\d)(:\d\d)', '\\1', limit_date)
-
-                    str_to_dt = datetime.datetime.strptime(limit_date, '%Y/%m/%d %H:%M')
-                    limit_timestamp = int(str_to_dt.timestamp())
-
-                    if limit_timestamp > time.time():
-                        flash("Timestamp is in the future, cannot continue.", 'error')
-                        return redirect(url_for('scrub_page'))
-
-                except ValueError:
-                    flash('Incorrect date format, cannot continue.', 'error')
-                    return redirect(url_for('scrub_page'))

            if confirmtext == 'scrub':
                changes_removed = 0
-                for uuid, watch in datastore.data['watching'].items():
-                    if limit_timestamp:
-                        changes_removed += datastore.scrub_watch(uuid, limit_timestamp=limit_timestamp)
-                    else:
-                        changes_removed += datastore.scrub_watch(uuid)
+                for uuid in datastore.data['watching'].keys():
+                    datastore.scrub_watch(uuid)

-                flash("Cleared snapshot history ({} snapshots removed)".format(changes_removed))
+                flash("Cleared all snapshot history")
            else:
                flash('Incorrect confirmation text.', 'error')

            return redirect(url_for('index'))

-        output =  render_template("scrub.html")
+        output = render_template("scrub.html")
        return output


--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@ -24,24 +24,24 @@ class Fetcher():
    content = None
    headers = None

-    fetcher_description ="No description"
-    xpath_element_js="""               
+    fetcher_description = "No description"
+    xpath_element_js = """               
                // Include the getXpath script directly, easier than fetching
                !function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
                //# sourceMappingURL=index.umd.js.map             

-            
+
                const findUpTag = (el) => {
                  let r = el
                  chained_css = [];
-            
+
                  while (r.parentNode) {
-            
+
                    if(r.classList.length >0) {
                     // limit to just using 2 class names of each, stops from getting really huge selector strings
                      current_css='.'+Array.from(r.classList).slice(0, 2).join('.');
                      chained_css.unshift(current_css);
-            
+
                      var f=chained_css.join(' ');
                      var q=document.querySelectorAll(f);
                      if(q.length==1) return current_css;
@ -52,7 +52,7 @@ class Fetcher():
                  return null;
                }

-                
+
                var elements = document.getElementsByTagName("*");
                var size_pos=[];
                // after page fetch, inject this JS
@ -60,16 +60,16 @@ class Fetcher():
                var bbox;
                for (var i = 0; i < elements.length; i++) {   
                 bbox = elements[i].getBoundingClientRect();
-                
+
                 // forget reallysmall ones
                 if (bbox['width'] <10 && bbox['height'] <10 ) {
                   continue;
                 }
-                
+
                 // @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
                 // it should not traverse when we know we can anchor off just an ID one level up etc..
                 // maybe, get current class or id, keep traversing up looking for only class or id until there is just one match 
-                
+
                 // 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
                 xpath_result=false;
                 try {
@ -80,12 +80,12 @@ class Fetcher():
                 } catch (e) {
                   var x=1;
                 }
-                
+
                 // default back to the less intelligent one
                 if (!xpath_result) {
                   xpath_result = getXPath(elements[i]);                   
                 } 
-                
+
                 size_pos.push({
                   xpath: xpath_result,
                   width: bbox['width'], 
@ -95,8 +95,8 @@ class Fetcher():
                   childCount: elements[i].childElementCount
                 });                 
                }
-                
-                
+
+
                // inject the current one set in the css_filter, which may be a CSS rule
                // used for displaying the current one in VisualSelector, where its not one we generated.
                if (css_filter.length) {
@ -118,10 +118,10 @@ class Fetcher():
                         });
                     }
                }
-                
+
                return size_pos;
    """
-    xpath_data=None
+    xpath_data = None

    # Will be needed in the future by the VisualSelector, always get this where possible.
    screenshot = False
@ -155,6 +155,7 @@ class Fetcher():
    def is_ready(self):
        return True

+
 #   Maybe for the future, each fetcher provides its own diff output, could be used for text, image
 #   the current one would return javascript output (as we use JS to generate the diff)
 #
@ -180,10 +181,10 @@ class base_html_playwright(Fetcher):
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

-#    try:
-#        from playwright.sync_api import sync_playwright
-#    except ModuleNotFoundError:
-#        fetcher_enabled = False
+    #    try:
+    #        from playwright.sync_api import sync_playwright
+    #    except ModuleNotFoundError:
+    #        fetcher_enabled = False

    browser_type = ''
    command_executor = ''
@ -255,7 +256,7 @@ class base_html_playwright(Fetcher):
            else:
                page.evaluate("var css_filter=''")

-            self.xpath_data = page.evaluate("async () => {"+ self.xpath_element_js+ "}")
+            self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
            # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
            # JPEG is better here because the screenshots can be very very large
            page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
@ -324,7 +325,7 @@ class base_html_webdriver(Fetcher):

        self.driver.set_window_size(1280, 1024)
        self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
-        self.xpath_data = self.driver.execute_script("var css_filter='{}';".format(current_css_filter)+self.xpath_element_js)
+        self.xpath_data = self.driver.execute_script("var css_filter='{}';".format(current_css_filter) + self.xpath_element_js)
        self.screenshot = self.driver.get_screenshot_as_png()

        # @todo - how to check this? is it possible?
@ -350,8 +351,6 @@ class base_html_webdriver(Fetcher):
        self.quit()
        return True

-
-
    def quit(self):
        if self.driver:
            try:
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@ -194,6 +194,4 @@ class perform_site_check():
                if not watch['title'] or not len(watch['title']):
                    update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)

-
        return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot, fetcher.xpath_data
-
--- a/changedetectionio/store.py
+++ b/changedetectionio/store.py
@ -145,6 +145,10 @@ class ChangeDetectionStore:

    def update_watch(self, uuid, update_obj):

+        # It's possible that the watch could be deleted before update
+        if not self.__data['watching'].get(uuid):
+            return
+
        with self.lock:

            # In python 3.9 we have the |= dict operator, but that still will lose data on nested structures...
@ -256,46 +260,14 @@ class ChangeDetectionStore:
        return self.data['watching'][uuid].get(val)

    # Remove a watch's data but keep the entry (URL etc)
-    def scrub_watch(self, uuid, limit_timestamp = False):
-
-        import hashlib
-        del_timestamps = []
-
-        changes_removed = 0
-
-        for timestamp, path in self.data['watching'][uuid]['history'].items():
-            if not limit_timestamp or (limit_timestamp is not False and int(timestamp) > limit_timestamp):
-                self.unlink_history_file(path)
-                del_timestamps.append(timestamp)
-                changes_removed += 1
-
-        if not limit_timestamp:
-            self.data['watching'][uuid]['last_checked'] = 0
-            self.data['watching'][uuid]['last_changed'] = 0
-            self.data['watching'][uuid]['previous_md5'] = ""
-
-
-        for timestamp in del_timestamps:
-            del self.data['watching'][uuid]['history'][str(timestamp)]
-
-            # If there was a limitstamp, we need to reset some meta data about the entry
-            # This has to happen after we remove the others from the list
-            if limit_timestamp:
-                newest_key = self.get_newest_history_key(uuid)
-                if newest_key:
-                    self.data['watching'][uuid]['last_checked'] = int(newest_key)
-                    # @todo should be the original value if it was less than newest key
-                    self.data['watching'][uuid]['last_changed'] = int(newest_key)
-                    try:
-                        with open(self.data['watching'][uuid]['history'][str(newest_key)], "rb") as fp:
-                            content = fp.read()
-                        self.data['watching'][uuid]['previous_md5'] = hashlib.md5(content).hexdigest()
-                    except (FileNotFoundError, IOError):
-                        self.data['watching'][uuid]['previous_md5'] = ""
-                        pass
+    def scrub_watch(self, uuid):
+        import pathlib

-        self.needs_write = True
-        return changes_removed
+        self.__data['watching'][uuid].update({'history': {}, 'last_checked': 0, 'last_changed': 0, 'newest_history_key': 0, 'previous_md5': False})
+        self.needs_write_urgent = True
+
+        for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
+            unlink(item)

    def add_watch(self, url, tag="", extras=None, write_to_disk_now=True):
        if extras is None:
@ -457,10 +429,11 @@ class ChangeDetectionStore:
        import pathlib

        # Only in the sub-directories
-        for item in pathlib.Path(self.datastore_path).rglob("*/*txt"):
-            if not str(item) in index:
-                print ("Removing",item)
-                unlink(item)
+        for uuid in self.data['watching']:
+            for item in pathlib.Path(self.datastore_path).rglob(uuid+"/*.txt"):
+                if not str(item) in index:
+                    print ("Removing",item)
+                    unlink(item)

    # Run all updates
    # IMPORTANT - Each update could be run even when they have a new install and the schema is correct
--- a/changedetectionio/templates/scrub.html
+++ b/changedetectionio/templates/scrub.html
@ -7,7 +7,7 @@
        <input type="hidden" name="csrf_token" value="{{ csrf_token() }}"/>
        <fieldset>
            <div class="pure-control-group">
-                This will remove all version snapshots/data, but keep your list of URLs. <br/>
+                This will remove ALL version snapshots/data, but keep your list of URLs. <br/>
                You may like to use the <strong>BACKUP</strong> link first.<br/>
            </div>
            <br/>
@ -17,12 +17,6 @@
                <span class="pure-form-message-inline">Type in the word <strong>scrub</strong> to confirm that you understand!</span>
            </div>
            <br/>
-            <div class="pure-control-group">
-                <label for="confirmtext">Optional: Limit deletion of snapshots to snapshots <i>newer</i> than date/time</label>
-                <input type="datetime-local" id="limit_date" name="limit_date"  />
-                <span class="pure-form-message-inline">dd/mm/yyyy hh:mm (24 hour format)</span>
-            </div>
-            <br/>
            <div class="pure-control-group">
                <button type="submit" class="pure-button pure-button-primary">Scrub!</button>
            </div>
--- a/requirements.txt
+++ b/requirements.txt
@ -39,9 +39,5 @@ selenium ~= 4.1.0
 # ImportError: cannot import name 'safe_str_cmp' from 'werkzeug.security'
 # need to revisit flask login versions
 werkzeug ~= 2.0.0
-<<<<<<< HEAD
-playwright ~= 1.21.0
-=======

 # playwright is installed at Dockerfile build time because it's not available on all platforms
->>>>>>> playwright