You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
changedetection.io/changedetectionio/content_fetchers/res/puppeteer_fetch.js

182 lines
5.4 KiB

module.exports = async ({page, context}) => {
var {
url,
execute_js,
user_agent,
extra_wait_ms,
req_headers,
include_filters,
xpath_element_js,
screenshot_quality,
proxy_username,
proxy_password,
disk_cache_dir,
no_cache_list,
block_url_list,
} = context;
await page.setBypassCSP(true)
await page.setExtraHTTPHeaders(req_headers);
9 months ago
var total_size = 0;
if (user_agent) {
await page.setUserAgent(user_agent);
}
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
await page.setDefaultNavigationTimeout(0);
if (proxy_username) {
// Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
// https://github.com/puppeteer/puppeteer/issues/676 ?
// https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
// https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
await page.authenticate({
username: proxy_username,
password: proxy_password
});
}
await page.setViewport({
width: 1024,
height: 768,
deviceScaleFactor: 1,
});
await page.setRequestInterception(true);
9 months ago
await page.setCacheEnabled(false);
await page.evaluateOnNewDocument('navigator.serviceWorker.register = () => { console.warn("Service Worker registration blocked by Playwright")}');
await page.evaluateOnNewDocument(`
const toBlob = HTMLCanvasElement.prototype.toBlob;
const toDataURL = HTMLCanvasElement.prototype.toDataURL;
HTMLCanvasElement.prototype.manipulate = function() {
console.warn("ma");
const {width, height} = this;
const context = this.getContext('2d');
var dt = new Date();
const shift = {
'r': dt.getDay()-3,
'g': dt.getDay()-3,
'b': dt.getDay()-3
};
console.log(shift);
const matt = context.getImageData(0, 0, width, height);
for (let i = 0; i < height; i += Math.max(1, parseInt(height / 10))) {
for (let j = 0; j < width; j += Math.max(1, parseInt(width / 10))) {
const n = ((i * (width * 4)) + (j * 4));
matt.data[n + 0] = matt.data[n + 0] + shift.r;
matt.data[n + 1] = matt.data[n + 1] + shift.g;
matt.data[n + 2] = matt.data[n + 2] + shift.b;
}
}
9 months ago
context.putImageData(matt, 0, 0);
};
9 months ago
Object.defineProperty(HTMLCanvasElement.prototype, 'toBlob', {
value: function() {
console.warn("toblob");
if (true) {
try {
this.manipulate();
}
9 months ago
catch(e) {
console.warn('manipulation failed', e);
}
9 months ago
}
return toBlob.apply(this, arguments);
}
9 months ago
});
Object.defineProperty(HTMLCanvasElement.prototype, 'toDataURL', {
value: function() {
console.warn("todata");
if (true) {
try {
this.manipulate();
}
9 months ago
catch(e) {
console.warn('manipulation failed', e);
}
}
return toDataURL.apply(this, arguments);
}
});
9 months ago
Object.defineProperty(navigator, 'webdriver', {get: () => false});
`)
await page.emulateTimezone('America/Chicago');
9 months ago
var r = await page.goto(url, {
waitUntil: 'load', timeout: 0
});
9 months ago
// https://github.com/puppeteer/puppeteer/issues/2479#issuecomment-408263504
if (r === null) {
r = await page.waitForResponse(() => true);
}
await page.waitForTimeout(4000);
await page.waitForTimeout(extra_wait_ms);
9 months ago
if (execute_js) {
await page.evaluate(execute_js);
await page.waitForTimeout(200);
}
var xpath_data;
var instock_data;
try {
// Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode
// (Once the old playwright is removed)
xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters);
instock_data = await page.evaluate(() => {%instock_scrape_code%});
} catch (e) {
console.log(e);
}
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
// Wrap it here (for now)
var b64s = false;
try {
b64s = await page.screenshot({encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg'});
} catch (e) {
console.log(e);
}
// May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
if (!b64s) {
// @todo after text extract, we can place some overlay text with red background to say 'croppped'
console.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot');
try {
b64s = await page.screenshot({encoding: "base64", quality: screenshot_quality, type: 'jpeg'});
} catch (e) {
console.log(e);
}
}
var html = await page.content();
9 months ago
page.close();
return {
data: {
'content': html,
'headers': r.headers(),
'instock_data': instock_data,
'screenshot': b64s,
'status_code': r.status(),
9 months ago
'xpath_data': xpath_data,
'total_size': total_size
},
type: 'application/json',
};
9 months ago
};