|
|
|
@ -291,7 +291,8 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
|
|
|
|
|
|
|
|
|
|
code = f"""module.exports = async ({{ page, context }}) => {{
|
|
|
|
|
var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password}} = context;
|
|
|
|
|
|
|
|
|
|
var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password, disk_cache_dir}} = context;
|
|
|
|
|
|
|
|
|
|
await page.setBypassCSP(true)
|
|
|
|
|
await page.setExtraHTTPHeaders(req_headers);
|
|
|
|
@ -313,6 +314,82 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
deviceScaleFactor: 1,
|
|
|
|
|
}});
|
|
|
|
|
|
|
|
|
|
// Very primitive disk cache - USE WITH EXTREME CAUTION
|
|
|
|
|
// Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
|
|
|
|
|
if ( disk_cache_dir ) {{
|
|
|
|
|
|
|
|
|
|
await page.setRequestInterception(true);
|
|
|
|
|
|
|
|
|
|
console.log(">>>>>>>>>>>>>>> LOCAL DISK CACHE ENABLED <<<<<<<<<<<<<<<<<<<<<");
|
|
|
|
|
const fs = require('fs');
|
|
|
|
|
const crypto = require('crypto');
|
|
|
|
|
function file_is_expired(file_path) {{
|
|
|
|
|
if (!fs.existsSync(dir_path+key)) {{
|
|
|
|
|
return true;
|
|
|
|
|
}}
|
|
|
|
|
var stats = fs.statSync(file_path);
|
|
|
|
|
const now_date = new Date();
|
|
|
|
|
const expire_seconds = 300;
|
|
|
|
|
if ( (now_date/1000) - (stats.mtime.getTime() / 1000) > expire_seconds) {{
|
|
|
|
|
console.log("CACHE EXPIRED: "+file_path);
|
|
|
|
|
return true;
|
|
|
|
|
}}
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
page.on('request', async (request) => {{
|
|
|
|
|
|
|
|
|
|
// if (blockedExtensions.some((str) => req.url().endsWith(str))) return req.abort();
|
|
|
|
|
const url = request.url();
|
|
|
|
|
const key = crypto.createHash('md5').update(url).digest("hex");
|
|
|
|
|
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
|
|
|
|
|
|
|
|
|
|
// https://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
|
|
|
|
|
|
|
|
|
|
if (fs.existsSync(dir_path+key)) {{
|
|
|
|
|
file_is_expired(dir_path+key);
|
|
|
|
|
console.log("Cache exists "+dir_path+key+ " - "+url);
|
|
|
|
|
const cached_data = fs.readFileSync(dir_path+key);
|
|
|
|
|
request.respond({{
|
|
|
|
|
status: 200,
|
|
|
|
|
//contentType: 'text/html', //@todo
|
|
|
|
|
body: cached_data
|
|
|
|
|
}});
|
|
|
|
|
return;
|
|
|
|
|
}}
|
|
|
|
|
request.continue();
|
|
|
|
|
}});
|
|
|
|
|
|
|
|
|
|
page.on('response', async (response) => {{
|
|
|
|
|
const url = response.url();
|
|
|
|
|
// @todo - check response size()
|
|
|
|
|
console.log("Cache - Got "+response.request().method()+" - "+url+" - "+response.request().resourceType());
|
|
|
|
|
|
|
|
|
|
if(response.request().method() != 'GET' || response.request().resourceType() == 'xhr' || response.request().resourceType() == 'document' || response.status() != 200 ) {{
|
|
|
|
|
console.log("Skipping- "+url);
|
|
|
|
|
return;
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
const key = crypto.createHash('md5').update(url).digest("hex");
|
|
|
|
|
const dir_path = disk_cache_dir + key.slice(0, 1) + '/' + key.slice(1, 2) + '/' + key.slice(2, 3) + '/';
|
|
|
|
|
const data = await response.text();
|
|
|
|
|
if (!fs.existsSync(dir_path)) {{
|
|
|
|
|
fs.mkdirSync(dir_path, {{ recursive: true }})
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
var expired = false;
|
|
|
|
|
if (fs.existsSync(dir_path+key)) {{
|
|
|
|
|
if (file_is_expired(dir_path+key)) {{
|
|
|
|
|
fs.writeFileSync(dir_path+key, data);
|
|
|
|
|
}}
|
|
|
|
|
}} else {{
|
|
|
|
|
fs.writeFileSync(dir_path+key, data);
|
|
|
|
|
}}
|
|
|
|
|
}});
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const r = await page.goto(url, {{
|
|
|
|
|
waitUntil: 'load'
|
|
|
|
|
}});
|
|
|
|
@ -325,7 +402,6 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
await page.waitForTimeout(200);
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
var html = await page.content();
|
|
|
|
|
var xpath_data;
|
|
|
|
|
var instock_data;
|
|
|
|
|
try {{
|
|
|
|
@ -337,13 +413,27 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
|
|
|
|
|
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
|
|
|
|
|
// Wrap it here (for now)
|
|
|
|
|
var b64s;
|
|
|
|
|
|
|
|
|
|
var b64s = false;
|
|
|
|
|
try {{
|
|
|
|
|
b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
|
|
|
|
|
}} catch (e) {{
|
|
|
|
|
console.log(e);
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
|
|
|
|
|
if (!b64s) {{
|
|
|
|
|
// @todo after text extract, we can place some overlay text with red background to say 'croppped'
|
|
|
|
|
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
|
|
|
|
|
try {{
|
|
|
|
|
b64s = await page.screenshot({{ encoding: "base64", quality: screenshot_quality, type: 'jpeg' }});
|
|
|
|
|
}} catch (e) {{
|
|
|
|
|
console.log(e);
|
|
|
|
|
}}
|
|
|
|
|
}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
var html = await page.content();
|
|
|
|
|
return {{
|
|
|
|
|
data: {{
|
|
|
|
|
'content': html,
|
|
|
|
@ -375,15 +465,17 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
# Actual authentication handled by Puppeteer/node
|
|
|
|
|
o = urlparse(self.proxy.get('server'))
|
|
|
|
|
proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
|
|
|
|
|
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}"
|
|
|
|
|
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}&dumpio=true"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
amp = '&' if '?' in browserless_function_url else '?'
|
|
|
|
|
response = requests.request(
|
|
|
|
|
method="POST",
|
|
|
|
|
json={
|
|
|
|
|
"code": code,
|
|
|
|
|
"context": {
|
|
|
|
|
'disk_cache_dir': False, # or path to disk cache
|
|
|
|
|
'execute_js': self.webdriver_js_execute_code,
|
|
|
|
|
'extra_wait_ms': extra_wait_ms,
|
|
|
|
|
'include_filters': current_include_filters,
|
|
|
|
@ -396,9 +488,10 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
# @todo /function needs adding ws:// to http:// rebuild this
|
|
|
|
|
url=browserless_function_url+"&--disable-features=AudioServiceOutOfProcess&dumpio=true",
|
|
|
|
|
url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
|
|
|
|
|
timeout=wait_browserless_seconds)
|
|
|
|
|
|
|
|
|
|
# 'ziparchive::addglob() will throw an instance of error instead of resulting in a fatal error if glob support is not available.'
|
|
|
|
|
except ReadTimeout:
|
|
|
|
|
raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
|
|
|
|
|
except ConnectTimeout:
|
|
|
|
@ -410,6 +503,10 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
|
|
|
|
|
x = response.json()
|
|
|
|
|
if not x.get('screenshot'):
|
|
|
|
|
# https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
|
|
|
|
|
# https://github.com/puppeteer/puppeteer/issues/1834
|
|
|
|
|
# https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
|
|
|
|
|
# Check your memory is shared and big enough
|
|
|
|
|
raise ScreenshotUnavailable(url=url, status_code=None)
|
|
|
|
|
|
|
|
|
|
if not x.get('content', '').strip():
|
|
|
|
|