Fetcher - playwright/browserless - Use builtin node puppeteer handler in browserless, scales way better, and is faster (#1559)

pull/1563/head
dgtlmoon 2 years ago committed by GitHub
parent 1710885fc4
commit 94f38f052e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -10,6 +10,7 @@ import time
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary' visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary'
class Non200ErrorCodeReceived(Exception): class Non200ErrorCodeReceived(Exception):
def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None): def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None):
# Set this so we can use it in other parts of the app # Set this so we can use it in other parts of the app
@ -24,10 +25,12 @@ class Non200ErrorCodeReceived(Exception):
self.page_text = html_tools.html_to_text(page_html) self.page_text = html_tools.html_to_text(page_html)
return return
class checksumFromPreviousCheckWasTheSame(Exception): class checksumFromPreviousCheckWasTheSame(Exception):
def __init__(self): def __init__(self):
return return
class JSActionExceptions(Exception): class JSActionExceptions(Exception):
def __init__(self, status_code, url, screenshot, message=''): def __init__(self, status_code, url, screenshot, message=''):
self.status_code = status_code self.status_code = status_code
@ -36,6 +39,7 @@ class JSActionExceptions(Exception):
self.message = message self.message = message
return return
class BrowserStepsStepTimout(Exception): class BrowserStepsStepTimout(Exception):
def __init__(self, step_n): def __init__(self, step_n):
self.step_n = step_n self.step_n = step_n
@ -51,6 +55,7 @@ class PageUnloadable(Exception):
self.message = message self.message = message
return return
class EmptyReply(Exception): class EmptyReply(Exception):
def __init__(self, status_code, url, screenshot=None): def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app # Set this so we can use it in other parts of the app
@ -59,6 +64,7 @@ class EmptyReply(Exception):
self.screenshot = screenshot self.screenshot = screenshot
return return
class ScreenshotUnavailable(Exception): class ScreenshotUnavailable(Exception):
def __init__(self, status_code, url, page_html=None): def __init__(self, status_code, url, page_html=None):
# Set this so we can use it in other parts of the app # Set this so we can use it in other parts of the app
@ -69,6 +75,7 @@ class ScreenshotUnavailable(Exception):
self.page_text = html_to_text(page_html) self.page_text = html_to_text(page_html)
return return
class ReplyWithContentButNoText(Exception): class ReplyWithContentButNoText(Exception):
def __init__(self, status_code, url, screenshot=None): def __init__(self, status_code, url, screenshot=None):
# Set this so we can use it in other parts of the app # Set this so we can use it in other parts of the app
@ -77,13 +84,14 @@ class ReplyWithContentButNoText(Exception):
self.screenshot = screenshot self.screenshot = screenshot
return return
class Fetcher(): class Fetcher():
browser_steps = None browser_steps = None
browser_steps_screenshot_path = None browser_steps_screenshot_path = None
content = None content = None
error = None error = None
fetcher_description = "No description" fetcher_description = "No description"
headers = None headers = {}
status_code = None status_code = None
webdriver_js_execute_code = None webdriver_js_execute_code = None
xpath_data = None xpath_data = None
@ -105,7 +113,6 @@ class Fetcher():
self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8') self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8')
self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8') self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8')
@abstractmethod @abstractmethod
def get_error(self): def get_error(self):
return self.error return self.error
@ -152,13 +159,15 @@ class Fetcher():
interface = steppable_browser_interface() interface = steppable_browser_interface()
interface.page = self.page interface.page = self.page
valid_steps = filter(lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), self.browser_steps) valid_steps = filter(
lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'),
self.browser_steps)
for step in valid_steps: for step in valid_steps:
step_n += 1 step_n += 1
print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation'])) print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation']))
self.screenshot_step("before-"+str(step_n)) self.screenshot_step("before-" + str(step_n))
self.save_step_html("before-"+str(step_n)) self.save_step_html("before-" + str(step_n))
try: try:
optional_value = step['optional_value'] optional_value = step['optional_value']
selector = step['selector'] selector = step['selector']
@ -177,8 +186,6 @@ class Fetcher():
# Stop processing here # Stop processing here
raise BrowserStepsStepTimout(step_n=step_n) raise BrowserStepsStepTimout(step_n=step_n)
# It's always good to reset these # It's always good to reset these
def delete_browser_steps_screenshots(self): def delete_browser_steps_screenshots(self):
import glob import glob
@ -188,6 +195,7 @@ class Fetcher():
for f in files: for f in files:
os.unlink(f) os.unlink(f)
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image # Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff) # the current one would return javascript output (as we use JS to generate the diff)
# #
@ -205,6 +213,7 @@ def available_fetchers():
return p return p
class base_html_playwright(Fetcher): class base_html_playwright(Fetcher):
fetcher_description = "Playwright {}/Javascript".format( fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
@ -278,6 +287,156 @@ class base_html_playwright(Fetcher):
current_include_filters=None, current_include_filters=None,
is_binary=False): is_binary=False):
# Fallback for now to the old way if browsersteps
# @todo - need to figure out how to get browsersteps with images on each step working
if self.browser_steps:
for step in self.browser_steps:
if step.get('operation'):
return self.run_playwright(
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
is_binary)
elif os.getenv('FORCE_PLAYWRIGHT_FETCH'):
# Temporary backup solution until we rewrite the playwright code
return self.run_playwright(
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes,
current_include_filters,
is_binary)
extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000
xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
code = f"""module.exports = async ({{ page, context }}) => {{
var {{ url, execute_js, user_agent, extra_wait_ms, req_headers, include_filters, xpath_element_js, screenshot_quality, proxy}} = context;
await page.setBypassCSP(true)
await page.setExtraHTTPHeaders(req_headers);
await page.setUserAgent(user_agent);
if(proxy) {{
await page.authenticate({{
username: proxy['username'],
password: proxy['password'],
}});
}}
const r = await page.goto(url, wait_until='commit');
await page.waitForTimeout(extra_wait_ms)
if(execute_js) {{
await page.evaluate(execute_js);
await page.waitForTimeout(200);
}}
const xpath_data = await page.evaluate((include_filters) => {{ {xpath_element_js} }}, include_filters);
const instock_data = await page.evaluate(() => {{ {self.instock_data_js} }});
const html = await page.content();
const b64s = await page.screenshot({{ encoding: "base64", fullPage: true, quality: screenshot_quality, type: 'jpeg' }});
return {{
data: {{
'content': html,
'headers': r.headers(),
'instock_data': instock_data,
'screenshot': b64s,
'status_code': r.status(),
'xpath_data': xpath_data
}},
type: 'application/json',
}};
}};"""
from requests.exceptions import ConnectTimeout, ReadTimeout
wait_browserless_seconds = 120
browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
from urllib.parse import urlparse
if not browserless_function_url:
# Convert/try to guess from PLAYWRIGHT_DRIVER_URL
o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()
# Append proxy connect string
if self.proxy:
import urllib.parse
# Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
# Actual authentication handled by Puppeteer/node
o = urlparse(self.proxy.get('server'))
proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl())
browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}"
try:
response = requests.request(
method="POST",
json={
"code": code,
"context": {
'execute_js': self.webdriver_js_execute_code,
'extra_wait_ms': extra_wait_ms,
'include_filters': current_include_filters,
'proxy': self.proxy,
'req_headers': request_headers,
'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
'url': url,
'user_agent': request_headers.get('User-Agent', 'Mozilla/5.0'),
}
},
# @todo /function needs adding ws:// to http:// rebuild this
url=browserless_function_url,
timeout=wait_browserless_seconds)
except ReadTimeout:
raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s")
except ConnectTimeout:
raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..")
else:
# 200 Here means that the communication to browserless worked only, not the page state
if response.status_code == 200:
import base64
x = response.json()
if not x.get('screenshot'):
raise ScreenshotUnavailable(url=url, status_code=None)
if not x.get('content', '').strip():
raise EmptyReply(url=url, status_code=None)
if x.get('status_code', 200) != 200 and not ignore_status_codes:
raise Non200ErrorCodeReceived(url=url, status_code=x.get('status_code', 200), page_html=x['content'])
self.content = x.get('content')
self.headers = x.get('headers')
self.instock_data = x.get('instock_data')
self.screenshot = base64.b64decode(x.get('screenshot'))
self.xpath_data = x.get('xpath_data')
else:
# Some other error from browserless
raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))
def run_playwright(self,
url,
timeout,
request_headers,
request_body,
request_method,
ignore_status_codes=False,
current_include_filters=None,
is_binary=False):
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
import playwright._impl._api_types import playwright._impl._api_types
@ -294,7 +453,7 @@ class base_html_playwright(Fetcher):
# Set user agent to prevent Cloudflare from blocking the browser # Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py # Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context( context = browser.new_context(
user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0', user_agent=request_headers.get('User-Agent', 'Mozilla/5.0'),
proxy=self.proxy, proxy=self.proxy,
# This is needed to enable JavaScript execution on GitHub and others # This is needed to enable JavaScript execution on GitHub and others
bypass_csp=True, bypass_csp=True,
@ -324,12 +483,12 @@ class base_html_playwright(Fetcher):
except playwright._impl._api_types.Error as e: except playwright._impl._api_types.Error as e:
# Retry once - https://github.com/browserless/chrome/issues/2485 # Retry once - https://github.com/browserless/chrome/issues/2485
# Sometimes errors related to invalid cert's and other can be random # Sometimes errors related to invalid cert's and other can be random
print ("Content Fetcher > retrying request got error - ", str(e)) print("Content Fetcher > retrying request got error - ", str(e))
time.sleep(1) time.sleep(1)
response = self.page.goto(url, wait_until='commit') response = self.page.goto(url, wait_until='commit')
except Exception as e: except Exception as e:
print ("Content Fetcher > Other exception when page.goto", str(e)) print("Content Fetcher > Other exception when page.goto", str(e))
context.close() context.close()
browser.close() browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e)) raise PageUnloadable(url=url, status_code=None, message=str(e))
@ -348,7 +507,7 @@ class base_html_playwright(Fetcher):
# This can be ok, we will try to grab what we could retrieve # This can be ok, we will try to grab what we could retrieve
pass pass
except Exception as e: except Exception as e:
print ("Content Fetcher > Other exception when executing custom JS code", str(e)) print("Content Fetcher > Other exception when executing custom JS code", str(e))
context.close() context.close()
browser.close() browser.close()
raise PageUnloadable(url=url, status_code=None, message=str(e)) raise PageUnloadable(url=url, status_code=None, message=str(e))
@ -356,7 +515,7 @@ class base_html_playwright(Fetcher):
if response is None: if response is None:
context.close() context.close()
browser.close() browser.close()
print ("Content Fetcher > Response object was none") print("Content Fetcher > Response object was none")
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)
# Run Browser Steps here # Run Browser Steps here
@ -370,7 +529,7 @@ class base_html_playwright(Fetcher):
if len(self.page.content().strip()) == 0: if len(self.page.content().strip()) == 0:
context.close() context.close()
browser.close() browser.close()
print ("Content Fetcher > Content was empty") print("Content Fetcher > Content was empty")
raise EmptyReply(url=url, status_code=response.status) raise EmptyReply(url=url, status_code=response.status)
self.status_code = response.status self.status_code = response.status
@ -382,7 +541,8 @@ class base_html_playwright(Fetcher):
else: else:
self.page.evaluate("var include_filters=''") self.page.evaluate("var include_filters=''")
self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") self.xpath_data = self.page.evaluate(
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
# Bug 3 in Playwright screenshot handling # Bug 3 in Playwright screenshot handling
@ -394,7 +554,8 @@ class base_html_playwright(Fetcher):
# acceptable screenshot quality here # acceptable screenshot quality here
try: try:
# The actual screenshot # The actual screenshot
self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) self.screenshot = self.page.screenshot(type='jpeg', full_page=True,
quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)))
except Exception as e: except Exception as e:
context.close() context.close()
browser.close() browser.close()
@ -403,6 +564,7 @@ class base_html_playwright(Fetcher):
context.close() context.close()
browser.close() browser.close()
class base_html_webdriver(Fetcher): class base_html_webdriver(Fetcher):
if os.getenv("WEBDRIVER_URL"): if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))

@ -38,15 +38,15 @@ const findUpTag = (el) => {
if (el.name !== undefined && el.name.length) { if (el.name !== undefined && el.name.length) {
var proposed = el.tagName + "[name=" + el.name + "]"; var proposed = el.tagName + "[name=" + el.name + "]";
var proposed_element = window.document.querySelectorAll(proposed); var proposed_element = window.document.querySelectorAll(proposed);
if(proposed_element.length) { if (proposed_element.length) {
if (proposed_element.length === 1) { if (proposed_element.length === 1) {
return proposed; return proposed;
} else { } else {
// Some sites change ID but name= stays the same, we can hit it if we know the index // Some sites change ID but name= stays the same, we can hit it if we know the index
// Find all the elements that match and work out the input[n] // Find all the elements that match and work out the input[n]
var n=Array.from(proposed_element).indexOf(el); var n = Array.from(proposed_element).indexOf(el);
// Return a Playwright selector for nthinput[name=zipcode] // Return a Playwright selector for nthinput[name=zipcode]
return proposed+" >> nth="+n; return proposed + " >> nth=" + n;
} }
} }
} }

Loading…
Cancel
Save