Puppeteer fetcher, adding disk cache and other fixes (#1563)

pull/1564/head
dgtlmoon committed 1 year ago
parent 8eee913438
commit 5325918f29

@@ -30,7 +30,7 @@ jobs:
# Selenium+browserless
docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome-debug:3.141.59
docker run --network changedet-network -d --hostname browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
- name: Build changedetection.io container for testing
run: |
@@ -58,6 +58,11 @@ jobs:
# restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
- name: Test with puppeteer fetcher
run: |
docker run --rm -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
# Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
- name: Test proxy interaction
run: |
cd changedetectionio
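The new puppeteer fetcher test only passes because the browserless container above now whitelists the Node built-ins the disk cache will require(). A quick way to sanity-check a container before running the suite (a minimal sketch, assuming browserless is reachable on localhost:3000 as started above):

    import requests

    code = """
    module.exports = async ({ page, context }) => {
        // These require() calls throw unless the container was started
        // with -e FUNCTION_BUILT_INS=["fs","crypto"]
        const fs = require('fs');
        const crypto = require('crypto');
        return { data: 'built-ins available', type: 'text/plain' };
    };
    """

    response = requests.post(
        "http://localhost:3000/function",
        json={"code": code, "context": {}},
        timeout=30,
    )
    print(response.status_code, response.text)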

@@ -291,7 +291,8 @@ class base_html_playwright(Fetcher):
xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
code = f"""module.exports = async ({{ page, context }}) => {{
var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password}} = context;
var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password, disk_cache_dir}} = context;
await page.setBypassCSP(true)
await page.setExtraHTTPHeaders(req_headers);
@@ -299,7 +300,7 @@ class base_html_playwright(Fetcher):
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
await page.setDefaultNavigationTimeout(0);
if(proxy_username) {{
await page.authenticate({{
username: proxy_username,
@@ -312,6 +313,82 @@ class base_html_playwright(Fetcher):
height: 768,
deviceScaleFactor: 1,
}});
// Very primitive disk cache - USE WITH EXTREME CAUTION
// Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
if ( disk_cache_dir ) {{
await page.setRequestInterception(true);
console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
const fs = require('fs');
const crypto = require('crypto');
function file_is_expired(file_path) {{
if (!fs.existsSync(file_path)) {{
return true;
}}
var stats = fs.statSync(file_path);
const now_date = new Date();
const expire_seconds = 300;
if ( (now_date/1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {{
console.log("CACHE EXPIRED: "+file_path);
return true;
}}
return false;
}}
page.on('request', async (request) => {{
// if (blockedExtensions.some((str) => req.url().endsWith(str))) return req.abort();
const url = request.url();
const key = crypto.createHash('md5').update(url).digest("hex");
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
// https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
// Serve from disk only when the entry exists and has not expired
if (fs.existsSync(dir_path+key) && !file_is_expired(dir_path+key)) {{
console.log("Cache hit "+dir_path+key+" - "+url);
const cached_data = fs.readFileSync(dir_path+key);
request.respond({{
status: 200,
//contentType: 'text/html', //@todo
body: cached_data
}});
return;
}}
request.continue();
}});
page.on('response', async (response) => {{
const url = response.url();
// @todo - check response size()
console.log("Cache - Got "+response.request().method()+" - "+url+" - "+response.request().resourceType());
if(response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200 ) {{
console.log("Skipping- "+url);
return;
}}
const key = crypto.createHash('md5').update(url).digest("hex");
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
const data = await response.text();
if (!fs.existsSync(dir_path)) {{
fs.mkdirSync(dir_path, {{ recursive: true }})
}}
// Write (or refresh) the cache entry when it is missing or expired
if (!fs.existsSync(dir_path+key) || file_is_expired(dir_path+key)) {{
fs.writeFileSync(dir_path+key, data);
}}
}});
}}
const r = await page.goto(url, {{
waitUntil: 'load'
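For reference, the cache scheme above stores each response under an md5 hash of its URL, sharded across three single-character directory levels, with a hard-coded 300-second expiry. The same layout in a minimal Python sketch (the function names here are illustrative, not part of the commit):

    import hashlib
    import os
    import time

    EXPIRE_SECONDS = 300  # same TTL as the puppeteer code above

    def cache_path(disk_cache_dir, url):
        # md5 of the URL, sharded as <cache>/<k0>/<k1>/<k2>/<key>
        key = hashlib.md5(url.encode()).hexdigest()
        return os.path.join(disk_cache_dir, key[0], key[1], key[2], key)

    def is_expired(file_path):
        if not os.path.exists(file_path):
            return True
        return time.time() - os.path.getmtime(file_path) > EXPIRE_SECONDS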
@@ -325,7 +402,6 @@ class base_html_playwright(Fetcher):
await page.waitForTimeout(200);
}}
var html = await page.content();
var xpath_data;
var instock_data;
try {{
@@ -337,13 +413,27 @@ class base_html_playwright(Fetcher):
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
// Wrap it here (for now)
var b64s;
try {{
var b64s = false;
try {{
b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
}} catch (e) {{
console.log(e);
}}
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
if (!b64s) {{
// @todo after text extract, we can place some overlay text with red background to say 'cropped'
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
try {{
b64s = await page.screenshot({{ encoding: "base64", quality: screenshot_quality, type: 'jpeg' }});
}} catch (e) {{
console.log(e);
}}
}}
var html = await page.content();
return {{
data: {{
'content': html,
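The screenshot change retries with a viewport-only capture when the full-page JPEG fails on very tall pages (the "tile memory limits exceeded" case). The equivalent fallback, sketched against Playwright's Python API for comparison (this helper is illustrative, not part of the commit):

    def screenshot_with_fallback(page, quality):
        # Try a full-page JPEG first; fall back to the viewport only
        # when the renderer cannot draw the whole page.
        try:
            return page.screenshot(full_page=True, type="jpeg", quality=quality)
        except Exception as e:
            print("Full-page screenshot failed, using viewport only:", e)
        return page.screenshot(type="jpeg", quality=quality)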
@@ -375,15 +465,17 @@ class base_html_playwright(Fetcher):
# Actual authentication handled by Puppeteer/node
o = urlparse(self.proxy.get('server'))
proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}"
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}&dumpio=true"
try:
amp = '&' if '?' in browserless_function_url else '?'
response = requests.request(
method="POST",
json={
"code": code,
"context": {
'disk_cache_dir': False, # or path to disk cache
'execute_js': self.webdriver_js_execute_code,
'extra_wait_ms': extra_wait_ms,
'include_filters': current_include_filters,
@@ -396,9 +488,10 @@ class base_html_playwright(Fetcher):
}
},
# @todo rebuild this URL properly; /function needs the ws:// scheme swapped for http://
url=browserless_function_url+"&--disable-features=AudioServiceOutOfProcess&dumpio=true",
url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
timeout=wait_browserless_seconds)
# 'ziparchive::addglob() will throw an instance of error instead of resulting in a fatal error if glob support is not available.'
except ReadTimeout:
raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
except ConnectTimeout:
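The amp guard above appends the extra launch flags with '?' or '&' depending on whether the function URL already carries a query string. A more defensive variant using only the standard library (a sketch; the flag names are the ones from this diff):

    from urllib.parse import urlsplit, urlunsplit

    def append_query(url, extra):
        # Insert '&' only when a query string is already present.
        parts = urlsplit(url)
        query = parts.query + "&" + extra if parts.query else extra
        return urlunsplit(parts._replace(query=query))

    # append_query(browserless_function_url,
    #              "--disable-features=AudioServiceOutOfProcess&dumpio=true")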
@@ -410,6 +503,10 @@ class base_html_playwright(Fetcher):
x = response.json()
if not x.get('screenshot'):
# https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
# https://github.com/puppeteer/puppeteer/issues/1834
# https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
# Check your memory is shared and big enough
raise ScreenshotUnavailable(url=url, status_code=None)
if not x.get('content', '').strip():

@@ -17,7 +17,6 @@ try {
// Include the getXpath script directly, easier than fetching
function getxpath(e) {
var n = e;
