From d939882dde750c7f7565bc198d30c205bf62ad89 Mon Sep 17 00:00:00 2001
From: dgtlmoon
Date: Thu, 11 May 2023 16:36:35 +0200
Subject: [PATCH] Fetcher - Experimental fetcher improvements (Code TidyUp, Improve tests, revert to old playwright when using BrowserSteps for now) (#1564)

---
 .github/workflows/test-only.yml          |   4 +-
 changedetectionio/content_fetcher.py     | 216 +++++------------------
 changedetectionio/res/puppeteer_fetch.js | 179 +++++++++++++++++++
 3 files changed, 224 insertions(+), 175 deletions(-)
 create mode 100644 changedetectionio/res/puppeteer_fetch.js

diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml
index 3011dd5f..e87e925a 100644
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -58,9 +58,9 @@ jobs:
           # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
           docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
 
-      - name: Test with puppeteer fetcher
+      - name: Test with puppeteer fetcher and disk cache
         run: |
-          docker run --rm -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+          docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
           # Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
 
       - name: Test proxy interaction

diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 3d474dc3..553fd536 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -287,168 +287,18 @@ class base_html_playwright(Fetcher):
             current_include_filters=None,
             is_binary=False):
 
+        from pkg_resources import resource_string
+
         extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
 
-        xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
-
-        code = f"""module.exports = async ({{ page, context }}) => {{
-
-            var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password, disk_cache_dir}} = context;
-
-            await page.setBypassCSP(true)
-            await page.setExtraHTTPHeaders(req_headers);
-            await page.setUserAgent(user_agent);
-            // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
-
-            await page.setDefaultNavigationTimeout(0);
-
-            if(proxy_username) {{
-                await page.authenticate({{
-                    username: proxy_username,
-                    password: proxy_password
-                }});
-            }}
-
-            await page.setViewport({{
-                width: 1024,
-                height: 768,
-                deviceScaleFactor: 1,
-            }});
-
-            // Very primitive disk cache - USE WITH EXTREME CAUTION
-            // Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
-            if ( disk_cache_dir ) {{
-
-                await page.setRequestInterception(true);
-
-                console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
-                const fs = require('fs');
-                const crypto = require('crypto');
-                function file_is_expired(file_path) {{
-                    if (!fs.existsSync(dir_path+key)) {{
-                        return true;
-                    }}
-                    var stats = fs.statSync(file_path);
-                    const now_date = new Date();
-                    const expire_seconds = 300;
-                    if ( (now_date/1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {{
-                        console.log("CACHE EXPIRED: "+file_path);
-                        return true;
-                    }}
-                    return false;
-
-                }}
-
-                page.on('request', async (request) => {{
-
-                    // if (blockedExtensions.some((str) => req.url().endsWith(str))) return req.abort();
-                    const url = request.url();
-                    const key = crypto.createHash('md5').update(url).digest("hex");
-                    const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
-
-                    // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
-
-                    if (fs.existsSync(dir_path+key)) {{
-                        file_is_expired(dir_path+key);
-                        console.log("Cache exists "+dir_path+key+ " - "+url);
-                        const cached_data = fs.readFileSync(dir_path+key);
-                        request.respond({{
-                            status: 200,
-                            //contentType: 'text/html', //@todo
-                            body: cached_data
-                        }});
-                        return;
-                    }}
-                    request.continue();
-                }});
-
-                page.on('response', async (response) => {{
-                    const url = response.url();
-                    // @todo - check response size()
-                    console.log("Cache - Got "+response.request().method()+" - "+url+" - "+response.request().resourceType());
-
-                    if(response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200 ) {{
-                        console.log("Skipping- "+url);
-                        return;
-                    }}
-
-                    const key = crypto.createHash('md5').update(url).digest("hex");
-                    const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
-                    const data = await response.text();
-                    if (!fs.existsSync(dir_path)) {{
-                        fs.mkdirSync(dir_path, {{ recursive: true }})
-                    }}
-
-                    var expired = false;
-                    if (fs.existsSync(dir_path+key)) {{
-                        if (file_is_expired(dir_path+key)) {{
-                            fs.writeFileSync(dir_path+key, data);
-                        }}
-                    }} else {{
-                        fs.writeFileSync(dir_path+key, data);
-                    }}
-                }});
-            }}
-
-
-            const r = await page.goto(url, {{
-                waitUntil: 'load'
-            }});
-
-            await page.waitForTimeout(1000);
-            await page.waitForTimeout(extra_wait_ms);
-
-            if(execute_js) {{
-                await page.evaluate(execute_js);
-                await page.waitForTimeout(200);
-            }}
-
-            var xpath_data;
-            var instock_data;
-            try {{
-                xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
-                instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
-            }} catch (e) {{
-                console.log(e);
-            }}
-
-            // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
-            // Wrap it here (for now)
-
-            var b64s = false;
-            try {{
-                b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
-            }} catch (e) {{
-                console.log(e);
-            }}
-
-            // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
-            if (!b64s) {{
-                // @todo after text extract, we can place some overlay text with red background to say 'croppped'
-                console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
-                try {{
-                    b64s = await page.screenshot({{ encoding: "base64", quality: screenshot_quality, type: 'jpeg' }});
-                }} catch (e) {{
-                    console.log(e);
-                }}
-            }}
-
-
-            var html = await page.content();
-            return {{
-                data: {{
-                    'content': html,
-                    'headers': r.headers(),
-                    'instock_data': instock_data,
-                    'screenshot': b64s,
-                    'status_code': r.status(),
-                    'xpath_data': xpath_data
-                }},
-                type: 'application/json',
-            }};
-        }};"""
+
+        self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
+        code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
+        # In the future, inject this as a proper JS package
+        code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
+        code = code.replace('%instock_scrape_code%', self.instock_data_js)
 
         from requests.exceptions import ConnectTimeout, ReadTimeout
-        wait_browserless_seconds = 120
+        wait_browserless_seconds = 240
 
         browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
         from urllib.parse import urlparse
@@ -475,7 +325,9 @@ class base_html_playwright(Fetcher):
                 json={
                     "code": code,
                     "context": {
-                        'disk_cache_dir': False, # or path to disk cache
+                        # Very primitive disk cache - USE WITH EXTREME CAUTION
+                        # Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
+                        'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, i.e. /tmp/cache/
                         'execute_js': self.webdriver_js_execute_code,
                         'extra_wait_ms': extra_wait_ms,
                         'include_filters': current_include_filters,
@@ -484,14 +336,26 @@ class base_html_playwright(Fetcher):
                         'url': url,
                         'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
                         'proxy_username': self.proxy.get('username','') if self.proxy else False,
-                        'proxy_password': self.proxy.get('password','') if self.proxy else False,
+                        'proxy_password': self.proxy.get('password', '') if self.proxy else False,
+                        'no_cache_list': [
+                            'twitter',
+                            '.pdf'
+                        ],
+                        # Could use https://github.com/easylist/easylist here, or install a plugin
+                        'block_url_list': [
+                            'adnxs.com',
+                            'analytics.twitter.com',
+                            'doubleclick.net',
+                            'google-analytics.com',
+                            'googletagmanager',
+                            'trustpilot.com'
+                        ]
                     }
                 },
                 # @todo /function needs adding ws:// to http:// rebuild this
                 url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
                 timeout=wait_browserless_seconds)
-# 'ziparchive::addglob() will throw an instance of error instead of resulting in a fatal error if glob support is not available.'
 
         except ReadTimeout:
             raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
         except ConnectTimeout:
@@ -535,17 +399,23 @@ class base_html_playwright(Fetcher):
             current_include_filters=None,
             is_binary=False):
 
-        if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
-            # Temporary backup solution until we rewrite the playwright code
-            return self.run_fetch_browserless_puppeteer(
-                url,
-                timeout,
-                request_headers,
-                request_body,
-                request_method,
-                ignore_status_codes,
-                current_include_filters,
-                is_binary)
+        # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps
+        has_browser_steps = self.browser_steps and list(filter(
+            lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
+            self.browser_steps))
+
+        if not has_browser_steps:
+            if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
+                # Temporary backup solution until we rewrite the playwright code
+                return self.run_fetch_browserless_puppeteer(
+                    url,
+                    timeout,
+                    request_headers,
+                    request_body,
+                    request_method,
+                    ignore_status_codes,
+                    current_include_filters,
+                    is_binary)
 
         from playwright.sync_api import sync_playwright
         import playwright._impl._api_types
diff --git a/changedetectionio/res/puppeteer_fetch.js b/changedetectionio/res/puppeteer_fetch.js
new file mode 100644
index 00000000..0c6a99fb
--- /dev/null
+++ b/changedetectionio/res/puppeteer_fetch.js
@@ -0,0 +1,179 @@
+module.exports = async ({page, context}) => {
+
+    var {
+        url,
+        execute_js,
+        user_agent,
+        extra_wait_ms,
+        req_headers,
+        include_filters,
+        xpath_element_js,
+        screenshot_quality,
+        proxy_username,
+        proxy_password,
+        disk_cache_dir,
+        no_cache_list,
+        block_url_list,
+    } = context;
+
+    await page.setBypassCSP(true);
+    await page.setExtraHTTPHeaders(req_headers);
+    await page.setUserAgent(user_agent);
+    // https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
+
+    await page.setDefaultNavigationTimeout(0);
+
+    if (proxy_username) {
+        await page.authenticate({
+            username: proxy_username,
+            password: proxy_password
+        });
+    }
+
+    await page.setViewport({
+        width: 1024,
+        height: 768,
+        deviceScaleFactor: 1,
+    });
+
+    await page.setRequestInterception(true);
+    if (disk_cache_dir) {
+        console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
+    }
+    const fs = require('fs');
+    const crypto = require('crypto');
+
+    function file_is_expired(file_path) {
+        if (!fs.existsSync(file_path)) {
+            return true;
+        }
+        var stats = fs.statSync(file_path);
+        const now_date = new Date();
+        const expire_seconds = 300;
+        if ((now_date / 1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {
+            console.log("CACHE EXPIRED: " + file_path);
+            return true;
+        }
+        return false;
+
+    }
+
+    page.on('request', async (request) => {
+        // General blocking of requests that waste traffic
+        if (block_url_list.some(substring => request.url().toLowerCase().includes(substring))) return request.abort();
+
+        if (disk_cache_dir) {
+            const url = request.url();
+            const key = crypto.createHash('md5').update(url).digest("hex");
+            const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
+
+            // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
+
+            if (fs.existsSync(dir_path + key)) {
+                console.log("* CACHE HIT, using - " + dir_path + key + " - " + url);
+                const cached_data = fs.readFileSync(dir_path + key);
+                // @todo headers can come from dir_path+key+".meta" json file
+                request.respond({
+                    status: 200,
+                    //contentType: 'text/html', //@todo
+                    body: cached_data
+                });
+                return;
+            }
+        }
+        request.continue();
+    });
+
+
+    if (disk_cache_dir) {
+        page.on('response', async (response) => {
+            const url = response.url();
+            // Basic filtering for sane responses
+            if (response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200) {
+                console.log("Skipping (not useful) - Status:" + response.status() + " Method:" + response.request().method() + " ResourceType:" + response.request().resourceType() + " " + url);
+                return;
+            }
+            if (no_cache_list.some(substring => url.toLowerCase().includes(substring))) {
+                console.log("Skipping (no_cache_list) - " + url);
+                return;
+            }
+            response.buffer().then(buffer => {
+                if (buffer.length > 100) {
+                    console.log("Cache - Saving " + response.request().method() + " - " + url + " - " + response.request().resourceType());
+
+                    const key = crypto.createHash('md5').update(url).digest("hex");
+                    const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
+
+                    if (!fs.existsSync(dir_path)) {
+                        fs.mkdirSync(dir_path, {recursive: true});
+                    }
+
+                    if (fs.existsSync(dir_path + key)) {
+                        if (file_is_expired(dir_path + key)) {
+                            fs.writeFileSync(dir_path + key, buffer);
+                        }
+                    } else {
+                        fs.writeFileSync(dir_path + key, buffer);
+                    }
+                }
+            });
+        });
+    }
+
+    const r = await page.goto(url, {
+        waitUntil: 'load'
+    });
+
+    await page.waitForTimeout(1000);
+    await page.waitForTimeout(extra_wait_ms);
+
+    if (execute_js) {
+        await page.evaluate(execute_js);
+        await page.waitForTimeout(200);
+    }
+
+    var xpath_data;
+    var instock_data;
+    try {
+        // Not sure of the best way here; in the future this should be a new package added to npm, then run in browserless
+        // (Once the old playwright is removed)
+        xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
+        instock_data = await page.evaluate(() => {%instock_scrape_code%});
+    } catch (e) {
+        console.log(e);
+    }
+
+    // Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
+    // Wrap it here (for now)
+
+    var b64s = false;
+    try {
+        b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
+    } catch (e) {
+        console.log(e);
+    }
+
+    // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
+    if (!b64s) {
+        // @todo after text extract, we can place some overlay text with red background to say 'cropped'
+        console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport-only screenshot');
+        try {
+            b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
+        } catch (e) {
+            console.log(e);
+        }
+    }
+
+    var html = await page.content();
+    return {
+        data: {
+            'content': html,
+            'headers': r.headers(),
+            'instock_data': instock_data,
+            'screenshot': b64s,
+            'status_code': r.status(),
+            'xpath_data': xpath_data
+        },
+        type: 'application/json',
+    };
+};
\ No newline at end of file
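
Note: the disk cache introduced above keys each saved response by the MD5 hex digest of its URL, sharded three directory levels deep under disk_cache_dir, and treats entries older than 300 seconds as expired. A minimal sketch of the same scheme in Python, handy for inspecting or pruning a cache written by puppeteer_fetch.js; the helper names cache_path and is_expired are illustrative only, not part of this patch:

    import hashlib
    import os
    import time

    def cache_path(disk_cache_dir, url):
        # Same layout as puppeteer_fetch.js: md5(url) hex digest,
        # sharded as <k[0]>/<k[1]>/<k[2]>/<k> under disk_cache_dir
        key = hashlib.md5(url.encode('utf-8')).hexdigest()
        return os.path.join(disk_cache_dir, key[0], key[1], key[2], key)

    def is_expired(path, expire_seconds=300):
        # Mirrors file_is_expired(): a missing file, or one whose mtime is
        # older than expire_seconds, means the entry should be refetched
        return not os.path.exists(path) or (time.time() - os.path.getmtime(path)) > expire_seconds

    # Example, assuming PUPPETEER_DISK_CACHE=/tmp/data/ as in the workflow test above
    p = cache_path('/tmp/data/', 'https://example.com/style.css')
    print(p, is_expired(p))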