You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
182 lines
5.4 KiB
182 lines
5.4 KiB
module.exports = async ({page, context}) => {
|
|
|
|
var {
|
|
url,
|
|
execute_js,
|
|
user_agent,
|
|
extra_wait_ms,
|
|
req_headers,
|
|
include_filters,
|
|
xpath_element_js,
|
|
screenshot_quality,
|
|
proxy_username,
|
|
proxy_password,
|
|
disk_cache_dir,
|
|
no_cache_list,
|
|
block_url_list,
|
|
} = context;
|
|
|
|
await page.setBypassCSP(true)
|
|
await page.setExtraHTTPHeaders(req_headers);
|
|
var total_size = 0;
|
|
|
|
if (user_agent) {
|
|
await page.setUserAgent(user_agent);
|
|
}
|
|
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
|
|
|
|
await page.setDefaultNavigationTimeout(0);
|
|
|
|
if (proxy_username) {
|
|
// Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
|
|
// https://github.com/puppeteer/puppeteer/issues/676 ?
|
|
// https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
|
|
// https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
|
|
await page.authenticate({
|
|
username: proxy_username,
|
|
password: proxy_password
|
|
});
|
|
}
|
|
|
|
await page.setViewport({
|
|
width: 1024,
|
|
height: 768,
|
|
deviceScaleFactor: 1,
|
|
});
|
|
await page.setRequestInterception(true);
|
|
await page.setCacheEnabled(false);
|
|
|
|
|
|
await page.evaluateOnNewDocument('navigator.serviceWorker.register = () => { console.warn("Service Worker registration blocked by Playwright")}');
|
|
|
|
await page.evaluateOnNewDocument(`
|
|
|
|
const toBlob = HTMLCanvasElement.prototype.toBlob;
|
|
const toDataURL = HTMLCanvasElement.prototype.toDataURL;
|
|
|
|
HTMLCanvasElement.prototype.manipulate = function() {
|
|
console.warn("ma");
|
|
const {width, height} = this;
|
|
const context = this.getContext('2d');
|
|
var dt = new Date();
|
|
|
|
const shift = {
|
|
'r': dt.getDay()-3,
|
|
'g': dt.getDay()-3,
|
|
'b': dt.getDay()-3
|
|
};
|
|
console.log(shift);
|
|
const matt = context.getImageData(0, 0, width, height);
|
|
for (let i = 0; i < height; i += Math.max(1, parseInt(height / 10))) {
|
|
for (let j = 0; j < width; j += Math.max(1, parseInt(width / 10))) {
|
|
const n = ((i * (width * 4)) + (j * 4));
|
|
matt.data[n + 0] = matt.data[n + 0] + shift.r;
|
|
matt.data[n + 1] = matt.data[n + 1] + shift.g;
|
|
matt.data[n + 2] = matt.data[n + 2] + shift.b;
|
|
}
|
|
}
|
|
context.putImageData(matt, 0, 0);
|
|
};
|
|
|
|
Object.defineProperty(HTMLCanvasElement.prototype, 'toBlob', {
|
|
value: function() {
|
|
console.warn("toblob");
|
|
if (true) {
|
|
try {
|
|
this.manipulate();
|
|
}
|
|
catch(e) {
|
|
console.warn('manipulation failed', e);
|
|
}
|
|
}
|
|
return toBlob.apply(this, arguments);
|
|
}
|
|
});
|
|
Object.defineProperty(HTMLCanvasElement.prototype, 'toDataURL', {
|
|
value: function() {
|
|
console.warn("todata");
|
|
if (true) {
|
|
try {
|
|
this.manipulate();
|
|
}
|
|
catch(e) {
|
|
console.warn('manipulation failed', e);
|
|
}
|
|
}
|
|
return toDataURL.apply(this, arguments);
|
|
}
|
|
});
|
|
|
|
|
|
Object.defineProperty(navigator, 'webdriver', {get: () => false});
|
|
`)
|
|
|
|
await page.emulateTimezone('America/Chicago');
|
|
|
|
var r = await page.goto(url, {
|
|
waitUntil: 'load', timeout: 0
|
|
});
|
|
|
|
// https://github.com/puppeteer/puppeteer/issues/2479#issuecomment-408263504
|
|
if (r === null) {
|
|
r = await page.waitForResponse(() => true);
|
|
}
|
|
|
|
await page.waitForTimeout(4000);
|
|
await page.waitForTimeout(extra_wait_ms);
|
|
|
|
|
|
if (execute_js) {
|
|
await page.evaluate(execute_js);
|
|
await page.waitForTimeout(200);
|
|
}
|
|
|
|
var xpath_data;
|
|
var instock_data;
|
|
try {
|
|
// Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode
|
|
// (Once the old playwright is removed)
|
|
xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
|
|
instock_data = await page.evaluate(() => {%instock_scrape_code%});
|
|
} catch (e) {
|
|
console.log(e);
|
|
}
|
|
|
|
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
|
|
// Wrap it here (for now)
|
|
|
|
var b64s = false;
|
|
try {
|
|
b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
|
|
} catch (e) {
|
|
console.log(e);
|
|
}
|
|
|
|
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
|
|
if (!b64s) {
|
|
// @todo after text extract, we can place some overlay text with red background to say 'croppped'
|
|
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
|
|
try {
|
|
b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
|
|
} catch (e) {
|
|
console.log(e);
|
|
}
|
|
}
|
|
|
|
var html = await page.content();
|
|
page.close();
|
|
|
|
return {
|
|
data: {
|
|
'content': html,
|
|
'headers': r.headers(),
|
|
'instock_data': instock_data,
|
|
'screenshot': b64s,
|
|
'status_code': r.status(),
|
|
'xpath_data': xpath_data,
|
|
'total_size': total_size
|
|
},
|
|
type: 'application/json',
|
|
};
|
|
};
|