More disk cache improvements

puppeteer-fixes
dgtlmoon 2 years ago
parent eb61dda30a
commit edd2f5b087

@@ -58,9 +58,9 @@ jobs:
           # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it
           docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py'
-      - name: Test with puppeteer fetcher
+      - name: Test with puppeteer fetcher and disk cache
         run: |
-          docker run --rm -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+          docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
           # Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above
       - name: Test proxy interaction
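
The new CI step exercises the disk cache by setting PUPPETEER_DISK_CACHE=/tmp/data/ inside the test container; the fetcher reads this via os.getenv() later in this commit and expects the path to end in a slash, because the JS side builds paths by string concatenation. A minimal Python sketch of that expectation (the helper name is hypothetical, not part of this change):

import os

def puppeteer_disk_cache_dir():
    # Hypothetical helper: the real change simply passes
    # os.getenv("PUPPETEER_DISK_CACHE", False) straight through to the
    # browserless context; a trailing slash is assumed to be required.
    cache_dir = os.getenv("PUPPETEER_DISK_CACHE", False)
    if cache_dir and not cache_dir.endswith('/'):
        cache_dir += '/'
    return cache_dir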

@@ -292,7 +292,20 @@ class base_html_playwright(Fetcher):
         code = f"""module.exports = async ({{ page, context }}) => {{
-            var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password, disk_cache_dir}} = context;
+            var {{ url,
+                execute_js,
+                user_agent,
+                extra_wait_ms,
+                req_headers,
+                include_filters,
+                xpath_element_js,
+                screenshot_quality,
+                proxy_username,
+                proxy_password,
+                disk_cache_dir,
+                no_cache_list,
+                block_url_list,
+            }} = context;
             await page.setBypassCSP(true)
             await page.setExtraHTTPHeaders(req_headers);
@@ -314,10 +327,6 @@ class base_html_playwright(Fetcher):
                 deviceScaleFactor: 1,
             }});
-            // Very primitive disk cache - USE WITH EXTREME CAUTION
-            // Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
-            if ( disk_cache_dir ) {{
             await page.setRequestInterception(true);
             console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
@@ -339,8 +348,10 @@ class base_html_playwright(Fetcher):
             }}
             page.on('request', async (request) => {{
-                // if (blockedExtensions.some((str) => req.url().endsWith(str))) return req.abort();
+                // General blocking of requests that waste traffic
+                if (block_url_list.some(substring=>request.url().toLowerCase().includes(substring))) return request.abort();
+                if ( disk_cache_dir ) {{
                     const url = request.url();
                     const key = crypto.createHash('md5').update(url).digest("hex");
                     const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
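
For reference, the cache layout built above (an MD5 hash of the URL, sharded across three one-character directory levels) can be reproduced with a short Python sketch; cache_dir stands in for disk_cache_dir and the helper is illustrative only:

import hashlib

def cache_path(cache_dir, url):
    # MD5 of the URL, sharded as <a>/<b>/<c>/<full digest>, mirroring the
    # key and dir_path construction in the JS above.
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    dir_path = cache_dir + key[0] + '/' + key[1] + '/' + key[2] + '/'
    return dir_path + key

# Example: cache_path('/tmp/data/', 'https://example.com/') returns something like
# '/tmp/data/<a>/<b>/<c>/<32-character md5 hex digest>'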
@@ -348,9 +359,9 @@ class base_html_playwright(Fetcher):
                     // https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
                     if (fs.existsSync(dir_path+key)) {{
-                        file_is_expired(dir_path+key);
-                        console.log("Cache exists "+dir_path+key+ " - "+url);
+                        console.log("* CACHE HIT , using - "+dir_path+key+ " - "+url);
                         const cached_data = fs.readFileSync(dir_path+key);
+                        // @todo headers can come from dir_path+key+".meta" json file
                         request.respond({{
                             status: 200,
                             //contentType: 'text/html', //@todo
@@ -358,22 +369,31 @@ class base_html_playwright(Fetcher):
                         }});
                         return;
                     }}
+                }}
                 request.continue();
             }});
+            if ( disk_cache_dir ) {{
                 page.on('response', async (response) => {{
                     const url = response.url();
                     // @todo - check response size()
-                    console.log("Cache - Got "+response.request().method()+" - "+url+" - "+response.request().resourceType());
                     if(response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200 ) {{
-                        console.log("Skipping- "+url);
+                        console.log("Skipping (not useful) - Status:"+response.status()+" Method:"+response.request().method()+" ResourceType:"+response.request().resourceType()+" "+url);
                         return;
                     }}
+                    if (no_cache_list.some(substring=>url.toLowerCase().includes(substring))) {{
+                        console.log("Skipping (no_cache_list) - "+url);
+                        return;
+                    }}
+                    response.buffer().then(buffer => {{
+                        if(buffer.length > 100) {{
+                            console.log("Cache - Saving "+response.request().method()+" - "+url+" - "+response.request().resourceType());
                     const key = crypto.createHash('md5').update(url).digest("hex");
                     const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
-                    const data = await response.text();
                     if (!fs.existsSync(dir_path)) {{
                         fs.mkdirSync(dir_path, {{ recursive: true }})
                     }}
@@ -381,15 +401,16 @@ class base_html_playwright(Fetcher):
                     var expired = false;
                     if (fs.existsSync(dir_path+key)) {{
                         if (file_is_expired(dir_path+key)) {{
-                            fs.writeFileSync(dir_path+key, data);
+                            fs.writeFileSync(dir_path+key, buffer);
                         }}
                     }} else {{
-                        fs.writeFileSync(dir_path+key, data);
+                        fs.writeFileSync(dir_path+key, buffer);
+                        }}
                     }}
                 }});
+                }});
             }}
             const r = await page.goto(url, {{
                 waitUntil: 'load'
             }});
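
file_is_expired() is defined elsewhere in this file and is not shown in this diff; as a rough, assumed illustration of an mtime-based expiry check, it could be approximated in Python like this (the threshold is illustrative, not taken from the source):

import os
import time

def file_is_expired(path, max_age_seconds=3600):
    # Assumed behaviour only: treat a cache file as stale once its mtime is
    # older than max_age_seconds; the real helper may differ.
    return (time.time() - os.path.getmtime(path)) > max_age_seconds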
@@ -432,7 +453,6 @@ class base_html_playwright(Fetcher):
             }}
         }}
         var html = await page.content();
         return {{
             data: {{
@@ -475,7 +495,9 @@ class base_html_playwright(Fetcher):
            json={
                "code": code,
                "context": {
-                    'disk_cache_dir': False, # or path to disk cache ending in /, ie /tmp/cache/
+                    # Very primitive disk cache - USE WITH EXTREME CAUTION
+                    # Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
+                    'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/
                    'execute_js': self.webdriver_js_execute_code,
                    'extra_wait_ms': extra_wait_ms,
                    'include_filters': current_include_filters,
@@ -484,7 +506,20 @@ class base_html_playwright(Fetcher):
                    'url': url,
                    'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
                    'proxy_username': self.proxy.get('username','') if self.proxy else False,
-                    'proxy_password': self.proxy.get('password','') if self.proxy else False,
+                    'proxy_password': self.proxy.get('password', '') if self.proxy else False,
+                    'no_cache_list': [
+                        'twitter',
+                        '.pdf'
+                    ],
+                    # Could use https://github.com/easylist/easylist here, or install a plugin
+                    'block_url_list': [
+                        'adnxs.com',
+                        'analytics.twitter.com',
+                        'doubleclick.net',
+                        'google-analytics.com',
+                        'googletagmanager',
+                        'trustpilot.com'
+                    ]
                }
            },
            # @todo /function needs adding ws:// to http:// rebuild this
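
Both new lists are matched by simple case-insensitive substring tests (url.toLowerCase().includes(substring)), so an entry such as 'googletagmanager' matches any URL containing that text. The equivalent check, sketched in Python for illustration only:

def url_matches(url, patterns):
    # Case-insensitive substring match, mirroring the
    # .some(substring => url.toLowerCase().includes(substring)) checks above.
    lowered = url.lower()
    return any(substring in lowered for substring in patterns)

# url_matches('https://www.googletagmanager.com/gtm.js', ['googletagmanager'])  -> True
# url_matches('https://example.com/report.pdf', ['twitter', '.pdf'])            -> True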
