just re-use the existing page fetch

550-visual-selector
dgtlmoon 3 years ago
parent a086991b54
commit 12aa77ee35

@ -1101,158 +1101,6 @@ def changedetection_app(config=None, datastore_o=None):
flash("{} watches are queued for rechecking.".format(i))
return redirect(url_for('index', tag=tag))
@app.route("/api/request-visual-selector-data/<string:uuid>", methods=['GET'])
@login_required
def visualselector_request_current_screenshot_and_metadata(uuid):
    """Fetch the watch's URL in a real browser, persist an element-position map
    (elements.json) for the Visual Selector UI, and return a full-page JPEG
    screenshot as the HTTP response.
    """
    import json
    watch = deepcopy(datastore.data['watching'][uuid])
    # Delete any stale element map first, so the UI can never pair a fresh
    # screenshot with outdated coordinates if the fetch below fails part-way.
    path_to_datafile = os.path.join(datastore_o.datastore_path, uuid, "elements.json")
    try:
        os.unlink(path_to_datafile)
    except FileNotFoundError:
        pass
    # docker run -p 3000:3000 browserless/chrome
    # @todo this needs abstracting out?
    from playwright.sync_api import sync_playwright
    with sync_playwright() as p:
        browser = p.chromium.connect_over_cdp("ws://127.0.0.1:3000")
        page = browser.new_page()
        try:
            # @todo handle timeouts for long pages >30sec
            try:
                page.goto(watch['url'])
            except Exception:
                # Best effort: still screenshot whatever rendered, rather than
                # failing the whole request on a navigation error.
                pass
            # https://github.com/microsoft/playwright/issues/620
            # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
            screenshot = page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
            screenshot = page.screenshot(type='jpeg', full_page=True, quality=90)
            # Could be made a lot faster
            # https://toruskit.com/blog/how-to-get-element-bounds-without-reflow/
            # json.dumps() emits a fully-escaped JS string literal (quotes,
            # backslashes, newlines) - safer than manual replace() quoting.
            page.evaluate("var css_filter={};".format(json.dumps(watch['css_filter'])))
            info = page.evaluate("""async () => {
// Include the getXpath script directly, easier than fetching
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
//# sourceMappingURL=index.umd.js.map
const findUpTag = (el) => {
let r = el
chained_css = [];
while (r.parentNode) {
if(r.classList.length >0) {
// limit to just using 2 class names of each, stops from getting really huge selector strings
current_css='.'+Array.from(r.classList).slice(0, 2).join('.');
chained_css.unshift(current_css);
var f=chained_css.join(' ');
var q=document.querySelectorAll(f);
if(q.length==1) return current_css;
if(f.length >120) return null;
}
r = r.parentNode;
}
return null;
}
var elements = document.getElementsByTagName("*");
var size_pos=[];
// after page fetch, inject this JS
// build a map of all elements and their positions (maybe that only include text?)
var bbox;
for (var i = 0; i < elements.length; i++) {
bbox = elements[i].getBoundingClientRect();
// forget reallysmall ones
if (bbox['width'] <10 && bbox['height'] <10 ) {
continue;
}
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
// it should not traverse when we know we can anchor off just an ID one level up etc..
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
xpath_result=false;
try {
var d= findUpTag(elements[i]);
if (d) {
xpath_result =d;
}
} catch (e) {
var x=1;
}
// default back to the less intelligent one
if (!xpath_result) {
xpath_result = getXPath(elements[i]);
}
size_pos.push({
xpath: xpath_result,
width: bbox['width'],
height: bbox['height'],
left: bbox['left'],
top: bbox['top'],
childCount: elements[i].childElementCount
});
}
// inject the current one set in the css_filter, which may be a CSS rule
// used for displaying the current one in VisualSelector, where its not one we generated.
if (css_filter.length) {
// is it xpath?
if (css_filter.startsWith('/') ) {
q=document.evaluate(css_filter, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
} else {
q=document.querySelector(css_filter);
}
if (q) {
bbox = q.getBoundingClientRect();
size_pos.push({
xpath: css_filter,
width: bbox['width'],
height: bbox['height'],
left: bbox['left'],
top: bbox['top'],
childCount: q.childElementCount
});
}
}
return size_pos;
}""")
        finally:
            # Always release the remote browser session, even when a
            # screenshot/evaluate call above raises - otherwise the
            # browserless/chrome container leaks a connection per request.
            browser.close()
    with open(path_to_datafile, 'w') as f:
        f.write(json.dumps(info, indent=4))
    response = make_response(screenshot)
    response.headers['Content-type'] = 'image/jpeg'
    response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    response.headers['Pragma'] = 'no-cache'
    response.headers['Expires'] = 0
    return response
# @todo handle ctrl break
# Thread.start() returns None, so the old one-liner
# `threading.Thread(...).start()` always bound ticker_thread to None and the
# thread could never be joined or signalled later. Keep a real reference.
ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks)
ticker_thread.start()

@ -7,6 +7,7 @@ from selenium.webdriver.common.proxy import Proxy as SeleniumProxy
from selenium.common.exceptions import WebDriverException
import requests
import time
import json
import urllib3.exceptions
@ -26,6 +27,102 @@ class Fetcher():
# Request headers to send when fetching (None = use the fetcher's defaults).
headers = None
# Human-readable description of this fetcher (shown in the UI fetcher choice).
fetcher_description ="No description"
# JavaScript injected into the fetched page: walks every element, builds a
# list of {xpath, width, height, left, top, childCount} records (skipping
# elements smaller than 10x10), and appends an entry for the currently-set
# filter. NOTE(review): this script reads a JS variable `css_filter` that the
# caller must define first by prepending "var css_filter='...';" - confirm
# every call site does so.
xpath_element_js="""
// Include the getXpath script directly, easier than fetching
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):(e=e||self).getXPath=n()}(this,function(){return function(e){var n=e;if(n&&n.id)return'//*[@id="'+n.id+'"]';for(var o=[];n&&Node.ELEMENT_NODE===n.nodeType;){for(var i=0,r=!1,d=n.previousSibling;d;)d.nodeType!==Node.DOCUMENT_TYPE_NODE&&d.nodeName===n.nodeName&&i++,d=d.previousSibling;for(d=n.nextSibling;d;){if(d.nodeName===n.nodeName){r=!0;break}d=d.nextSibling}o.push((n.prefix?n.prefix+":":"")+n.localName+(i||r?"["+(i+1)+"]":"")),n=n.parentNode}return o.length?"/"+o.reverse().join("/"):""}});
//# sourceMappingURL=index.umd.js.map
const findUpTag = (el) => {
let r = el
chained_css = [];
while (r.parentNode) {
if(r.classList.length >0) {
// limit to just using 2 class names of each, stops from getting really huge selector strings
current_css='.'+Array.from(r.classList).slice(0, 2).join('.');
chained_css.unshift(current_css);
var f=chained_css.join(' ');
var q=document.querySelectorAll(f);
if(q.length==1) return current_css;
if(f.length >120) return null;
}
r = r.parentNode;
}
return null;
}
var elements = document.getElementsByTagName("*");
var size_pos=[];
// after page fetch, inject this JS
// build a map of all elements and their positions (maybe that only include text?)
var bbox;
for (var i = 0; i < elements.length; i++) {
bbox = elements[i].getBoundingClientRect();
// forget reallysmall ones
if (bbox['width'] <10 && bbox['height'] <10 ) {
continue;
}
// @todo the getXpath kind of sucks, it doesnt know when there is for example just one ID sometimes
// it should not traverse when we know we can anchor off just an ID one level up etc..
// maybe, get current class or id, keep traversing up looking for only class or id until there is just one match
// 1st primitive - if it has class, try joining it all and select, if theres only one.. well thats us.
xpath_result=false;
try {
var d= findUpTag(elements[i]);
if (d) {
xpath_result =d;
}
} catch (e) {
var x=1;
}
// default back to the less intelligent one
if (!xpath_result) {
xpath_result = getXPath(elements[i]);
}
size_pos.push({
xpath: xpath_result,
width: bbox['width'],
height: bbox['height'],
left: bbox['left'],
top: bbox['top'],
childCount: elements[i].childElementCount
});
}
// inject the current one set in the css_filter, which may be a CSS rule
// used for displaying the current one in VisualSelector, where its not one we generated.
if (css_filter.length) {
// is it xpath?
if (css_filter.startsWith('/') ) {
q=document.evaluate(css_filter, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
} else {
q=document.querySelector(css_filter);
}
if (q) {
bbox = q.getBoundingClientRect();
size_pos.push({
xpath: css_filter,
width: bbox['width'],
height: bbox['height'],
left: bbox['left'],
top: bbox['top'],
childCount: q.childElementCount
});
}
}
return size_pos;
"""
@abstractmethod
def get_error(self):
@ -59,6 +156,11 @@ class Fetcher():
def is_ready(self):
    # Base implementation: a fetcher is always ready to accept a fetch job.
    return True
@abstractmethod
def get_xpath_data(self, current_css_xpath_filter):
    # Return the element/bounding-box map used by the Visual Selector, or
    # None for fetchers that have no live browser to extract it from.
    return None
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff)
#
@ -163,6 +265,15 @@ class html_webdriver(Fetcher):
self.quit()
return True
def get_xpath_data(self, current_css_xpath_filter):
    """Run the element-mapping JS in the live page and return its result.

    current_css_xpath_filter -- the watch's CSS/xpath filter; it is exposed
    to the injected xpath_element_js script as the JS variable `css_filter`.
    """
    import json
    # json.dumps() emits a valid, fully-escaped JS string literal - the
    # previous manual replace() of quote characters ("lazy quoting") broke
    # on backslashes and embedded newlines in the filter.
    script = "var css_filter={};".format(json.dumps(current_css_xpath_filter))
    info = self.driver.execute_script(script + self.xpath_element_js)
    return info
def quit(self):
if self.driver:
try:

@ -195,6 +195,7 @@ class perform_site_check():
if self.datastore.data['settings']['application'].get('real_browser_save_screenshot', True):
screenshot = fetcher.screenshot()
xpath_elements = fetcher.get_xpath_data(watch['css_filter'])
fetcher.quit()
return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
return changed_detected, update_obj, text_content_before_ignored_filter, screenshot, xpath_elements

@ -357,6 +357,14 @@ class ChangeDetectionStore:
f.write(screenshot)
f.close()
def save_xpath_data(self, watch_uuid, data):
    """Persist the element position map for a watch as
    <datastore_path>/<watch_uuid>/elements.json (read by the Visual Selector).
    """
    output_path = "{}/{}".format(self.datastore_path, watch_uuid)
    fname = "{}/elements.json".format(output_path)
    # json.dump streams straight into the file object; the old explicit
    # f.close() inside the with-block was redundant (with already closes).
    with open(fname, 'w') as f:
        json.dump(data, f)
def sync_to_json(self):
logging.info("Saving JSON..")
print("Saving JSON..")

@ -189,7 +189,7 @@ nav
<!-- use img src ready load to know everything is ready to map out -->
<img id="selector-background" />
<script>
$("img#selector-background").attr("src", "{{ url_for('visualselector_request_current_screenshot_and_metadata', uuid=uuid) }}");
$("img#selector-background").attr("src", "{{url_for('static_content', group='screenshot', filename=uuid)}}");
$("img#selector-background").bind('load', function () {
fetch_data();
});
@ -199,7 +199,6 @@ nav
<div id="selector-current-xpath"><strong>Currently:</strong>&nbsp;<span class="text">Loading...</span></div>
</div>
</fieldset>
</div>

@ -40,10 +40,11 @@ class update_worker(threading.Thread):
contents = ""
screenshot = False
update_obj= {}
xpath_data = False
now = time.time()
try:
changed_detected, update_obj, contents, screenshot = update_handler.run(uuid)
changed_detected, update_obj, contents, screenshot, xpath_data = update_handler.run(uuid)
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
@ -144,6 +145,9 @@ class update_worker(threading.Thread):
# Always save the screenshot if it's available
if screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot)
if xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data)
self.current_uuid = None # Done
self.q.task_done()

1532
f

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save