Fetcher - Experimental fetcher fixes, now only enabled with 'USE_EXPERIMENTAL_PUPPETEER_FETCH' env var (default off) (#1561)

pull/1563/head
dgtlmoon 2 years ago committed by GitHub
parent 3801d339f5
commit 316f28a0f2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -277,7 +277,7 @@ class base_html_playwright(Fetcher):
with open(destination, 'w') as f: with open(destination, 'w') as f:
f.write(content) f.write(content)
def run(self, def run_fetch_browserless_puppeteer(self,
url, url,
timeout, timeout,
request_headers, request_headers,
@ -287,63 +287,63 @@ class base_html_playwright(Fetcher):
current_include_filters=None, current_include_filters=None,
is_binary=False): is_binary=False):
# Fallback for now to the old way if browsersteps
# @todo - need to figure out how to get browsersteps with images on each step working
if self.browser_steps:
for step in self.browser_steps:
if step.get('operation'):
return self.run_playwright(
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
is_binary)
elif os.getenv('FORCE_PLAYWRIGHT_FETCH'):
# Temporary backup solution until we rewrite the playwright code
return self.run_playwright(
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
is_binary)
extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000 extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
code = f"""module.exports = async ({{ page, context }}) => {{ code = f"""module.exports = async ({{ page, context }}) => {{
var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy}} = context; var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy_username, proxy_password}} = context;
await page.setBypassCSP(true) await page.setBypassCSP(true)
await page.setExtraHTTPHeaders(req_headers); await page.setExtraHTTPHeaders(req_headers);
await page.setUserAgent(user_agent); await page.setUserAgent(user_agent);
// https://ourcodeworld.com/articles/read/1106/how-to-solve-puppeteer-timeouterror-navigation-timeout-of-30000-ms-exceeded
if(proxy) {{ await page.setDefaultNavigationTimeout(0);
if(proxy_username) {{
await page.authenticate({{ await page.authenticate({{
username: proxy['username'], username: proxy_username,
password: proxy['password'], password: proxy_password
}}); }});
}} }}
const r = await page.goto(url, wait_until='commit'); await page.setViewport({{
await page.waitForTimeout(extra_wait_ms) width: 1024,
height: 768,
deviceScaleFactor: 1,
}});
const r = await page.goto(url, {{
waitUntil: 'load'
}});
await page.waitForTimeout(1000);
await page.waitForTimeout(extra_wait_ms);
if(execute_js) {{ if(execute_js) {{
await page.evaluate(execute_js); await page.evaluate(execute_js);
await page.waitForTimeout(200); await page.waitForTimeout(200);
}} }}
const xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters); var html = await page.content();
const instock_data = await page.evaluate(() => {{ {self.instock_data_js} }}); var xpath_data;
var instock_data;
const html = await page.content(); try {{
const b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }}); xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
}} catch (e) {{
console.log(e);
}}
// Protocol error (Page.captureScreenshot): Cannot take screenshot with 0 width can come from a proxy auth failure
// Wrap it here (for now)
var b64s;
try {{
b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
}} catch (e) {{
console.log(e);
}}
return {{ return {{
data: {{ data: {{
'content': html, 'content': html,
@ -387,15 +387,16 @@ class base_html_playwright(Fetcher):
'execute_js': self.webdriver_js_execute_code, 'execute_js': self.webdriver_js_execute_code,
'extra_wait_ms': extra_wait_ms, 'extra_wait_ms': extra_wait_ms,
'include_filters': current_include_filters, 'include_filters': current_include_filters,
'proxy': self.proxy,
'req_headers': request_headers, 'req_headers': request_headers,
'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)), 'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
'url': url, 'url': url,
'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'), 'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
'proxy_username': self.proxy.get('username','') if self.proxy else False,
'proxy_password': self.proxy.get('password','') if self.proxy else False,
} }
}, },
# @todo /function needs adding ws:// to http:// rebuild this # @todo /function needs adding ws:// to http:// rebuild this
url=browserless_function_url, url=browserless_function_url+"&--disable-features=AudioServiceOutOfProcess&dumpio=true",
timeout=wait_browserless_seconds) timeout=wait_browserless_seconds)
except ReadTimeout: except ReadTimeout:
@ -427,15 +428,27 @@ class base_html_playwright(Fetcher):
# Some other error from browserless # Some other error from browserless
raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8')) raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
def run_playwright(self, def run(self,
url, url,
timeout, timeout,
request_headers, request_headers,
request_body, request_body,
request_method, request_method,
ignore_status_codes=False, ignore_status_codes=False,
current_include_filters=None, current_include_filters=None,
is_binary=False): is_binary=False):
if os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
# Temporary backup solution until we rewrite the playwright code
return self.run_fetch_browserless_puppeteer(
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
is_binary)
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
import playwright._impl._api_types import playwright._impl._api_types

@ -315,7 +315,8 @@ class update_worker(threading.Thread):
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True) self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code}) 'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False process_changedetection_results = False
except Exception as e: except Exception as e:
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e)) self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))

Loading…
Cancel
Save