diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml
index e06e619f..3011dd5f 100644
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -30,7 +30,7 @@ jobs:
         # Selenium+browserless
         docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome-debug:3.141.59
-        docker run --network changedet-network -d --hostname browserless -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable
+        docker run --network changedet-network -d --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.53-chrome-stable

     - name: Build changedetection.io container for testing
       run: |
@@ -58,6 +58,11 @@ jobs:
         # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
         docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'

+    - name: Test with puppeteer fetcher
+      run: |
+        docker run --rm -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+        # The browserless container above was already started with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
+
     - name: Test proxy interaction
       run: |
         cd changedetectionio
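Reviewer note, not part of the patch: `FUNCTION_BUILT_INS` matters because Browserless sandboxes the JavaScript submitted to its `/function` endpoint, and Node built-ins such as `fs` and `crypto` must be allow-listed before `require()` will succeed there. A minimal sketch of such a call, assuming a Browserless v1 container on `localhost:3000` and the documented `{code, context}` payload shape (no auth token):

```python
import requests

# JavaScript submitted to /function; require('fs') only succeeds when the
# container was started with -e 'FUNCTION_BUILT_INS=["fs","crypto"]'
code = """module.exports = async ({ page, context }) => {
    const fs = require('fs');
    return { data: { tmp_exists: fs.existsSync('/tmp') }, type: 'application/json' };
};"""

response = requests.post("http://localhost:3000/function",
                         json={"code": code, "context": {}},
                         timeout=30)
print(response.status_code, response.text)
```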
diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 262ac2b4..3d474dc3 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -291,7 +291,8 @@ class base_html_playwright(Fetcher):
         xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)

         code = f"""module.exports = async ({{ page, context }}) => {{
-            var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password}} = context;
+
+            var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password, disk_cache_dir}} = context;

             await page.setBypassCSP(true)
             await page.setExtraHTTPHeaders(req_headers);
@@ -299,7 +300,7 @@ class base_html_playwright(Fetcher):

             // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
             await page.setDefaultNavigationTimeout(0);
-
+
             if(proxy_username) {{
                 await page.authenticate({{
                     username: proxy_username,
@@ -312,6 +313,82 @@ class base_html_playwright(Fetcher):
                 height: 768,
                 deviceScaleFactor: 1,
             }});
+
+            // Very primitive disk cache - USE WITH EXTREME CAUTION
+            // Run the browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
+            if ( disk_cache_dir ) {{
+
+                await page.setRequestInterception(true);
+
+                console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
+                const fs = require('fs');
+                const crypto = require('crypto');
+
+                // True when the cache entry is missing or older than expire_seconds
+                function file_is_expired(file_path) {{
+                    if (!fs.existsSync(file_path)) {{
+                        return true;
+                    }}
+                    var stats = fs.statSync(file_path);
+                    const now_date = new Date();
+                    const expire_seconds = 300;
+                    if ( (now_date/1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {{
+                        console.log("CACHE EXPIRED: "+file_path);
+                        return true;
+                    }}
+                    return false;
+                }}
+
+                page.on('request', async (request) => {{
+                    // if (blockedExtensions.some((str) => req.url().endsWith(str))) return req.abort();
+                    const url = request.url();
+                    const key = crypto.createHash('md5').update(url).digest("hex");
+                    const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
+
+                    // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
+                    // Serve from disk only when the entry exists and has not expired
+                    if (fs.existsSync(dir_path+key) && !file_is_expired(dir_path+key)) {{
+                        console.log("Cache exists "+dir_path+key+" - "+url);
+                        const cached_data = fs.readFileSync(dir_path+key);
+                        request.respond({{
+                            status: 200,
+                            //contentType: 'text/html', //@todo
+                            body: cached_data
+                        }});
+                        return;
+                    }}
+                    request.continue();
+                }});
+
+                page.on('response', async (response) => {{
+                    const url = response.url();
+                    // @todo - check response size()
+                    console.log("Cache - Got "+response.request().method()+" - "+url+" - "+response.request().resourceType());
+
+                    if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) {{
+                        console.log("Skipping (non-GET, xhr, document or non-200) - "+url);
+                        return;
+                    }}
+
+                    const key = crypto.createHash('md5').update(url).digest("hex");
+                    const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
+                    const data = await response.text();
+                    if (!fs.existsSync(dir_path)) {{
+                        fs.mkdirSync(dir_path, {{ recursive: true }})
+                    }}
+
+                    // Write a new entry, or refresh one that has expired
+                    if (!fs.existsSync(dir_path+key) || file_is_expired(dir_path+key)) {{
+                        fs.writeFileSync(dir_path+key, data);
+                    }}
+                }});
+            }}

             const r = await page.goto(url, {{
                 waitUntil: 'load'
@@ -325,7 +402,6 @@ class base_html_playwright(Fetcher):
                 await page.waitForTimeout(200);
             }}

-            var html = await page.content();
             var xpath_data;
             var instock_data;
             try {{
@@ -337,13 +413,27 @@ class base_html_playwright(Fetcher):

             // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
             // Wrap it here (for now)
-            var b64s;
-            try {{
+            var b64s = false;
+            try {{
                 b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
             }} catch (e) {{
                 console.log(e);
             }}
-
+
+            // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
+            if (!b64s) {{
+                // @todo after text extract, we can place some overlay text with a red background to say 'cropped'
+                console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport-only screenshot');
+                try {{
+                    b64s = await page.screenshot({{ encoding: "base64", quality: screenshot_quality, type: 'jpeg' }});
+                }} catch (e) {{
+                    console.log(e);
+                }}
+            }}
+
+            var html = await page.content();
+
             return {{
                 data: {{
                     'content': html,
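Reviewer note, not part of the patch: the cache layout above hashes each URL with MD5 and uses the first three hex characters of the digest as nested directory names, so no single directory grows too large. A minimal Python sketch of the same layout (the function name and arguments are illustrative):

```python
import hashlib
import os

def cache_path_for(url: str, disk_cache_dir: str) -> str:
    # MD5 of the URL is the cache key, e.g. "a94a8fe5..."
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    # Shard into three single-character levels: <dir>/a/9/4/a94a8fe5...
    dir_path = os.path.join(disk_cache_dir, key[0], key[1], key[2])
    os.makedirs(dir_path, exist_ok=True)
    return os.path.join(dir_path, key)
```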
f"{browserless_function_url}&--proxy-server={proxy_url}&dumpio=true" try: + amp = '&' if '?' in browserless_function_url else '?' response = requests.request( method="POST", json={ "code": code, "context": { + 'disk_cache_dir': False, # or path to disk cache 'execute_js': self.webdriver_js_execute_code, 'extra_wait_ms': extra_wait_ms, 'include_filters': current_include_filters, @@ -396,9 +488,10 @@ class base_html_playwright(Fetcher): } }, # @todo /function needs adding ws:// to http:// rebuild this - url=browserless_function_url+"&--disable-features=AudioServiceOutOfProcess&dumpio=true", + url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts", timeout=wait_browserless_seconds) +# 'ziparchive::addglob() will throw an instance of error instead of resulting in a fatal error if glob support is not available.' except ReadTimeout: raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s") except ConnectTimeout: @@ -410,6 +503,10 @@ class base_html_playwright(Fetcher): x = response.json() if not x.get('screenshot'): + # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips + # https://github.com/puppeteer/puppeteer/issues/1834 + # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051 + # Check your memory is shared and big enough raise ScreenshotUnavailable(url=url, status_code=None) if not x.get('content', '').strip(): diff --git a/changedetectionio/res/xpath_element_scraper.js b/changedetectionio/res/xpath_element_scraper.js index 27a54c2c..db927ed6 100644 --- a/changedetectionio/res/xpath_element_scraper.js +++ b/changedetectionio/res/xpath_element_scraper.js @@ -17,7 +17,6 @@ try { - // Include the getXpath script directly, easier than fetching function getxpath(e) { var n = e;