|
|
|
@ -311,125 +311,6 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
with open(destination, 'w') as f:
|
|
|
|
|
f.write(content)
|
|
|
|
|
|
|
|
|
|
def run_fetch_browserless_puppeteer(self,
                                    url,
                                    timeout,
                                    request_headers,
                                    request_body,
                                    request_method,
                                    ignore_status_codes=False,
                                    current_include_filters=None,
                                    is_binary=False):
    """Fetch ``url`` by POSTing a Puppeteer script to a browserless "function" endpoint.

    The script is loaded from ``res/puppeteer_fetch.js`` and has the
    ``%xpath_scrape_code%`` / ``%instock_scrape_code%`` placeholders filled in
    from ``self.xpath_element_js`` / ``self.instock_data_js`` before it is sent.
    On success the decoded results are stored on the instance
    (``status_code``, ``headers``, ``content``, ``instock_data``,
    ``screenshot``, ``xpath_data``); nothing is returned.

    Environment variables read here:
        WEBDRIVER_DELAY_BEFORE_CONTENT_READY: seconds added to
            ``self.render_extract_delay`` (default 5).
        BROWSERLESS_FUNCTION_URL: endpoint to POST to; when unset it is derived
            from PLAYWRIGHT_DRIVER_URL (scheme forced to ``http``, path to
            ``function``).
        PUPPETEER_DISK_CACHE: forwarded as ``disk_cache_dir`` (default False).
        PLAYWRIGHT_SCREENSHOT_QUALITY: forwarded as ``screenshot_quality``
            (default 72).

    Args:
        url: Page to fetch.
        timeout: Accepted for signature compatibility; not used by this method
            (the HTTP call uses a fixed 240 second timeout).
        request_headers: Mapping of headers forwarded to the script; its
            ``User-Agent`` entry (any case) is also passed as ``user_agent``.
        request_body: Accepted for signature compatibility; not used here.
        request_method: Accepted for signature compatibility; not used here.
        ignore_status_codes: When true, a non-200 reply does not raise
            ``Non200ErrorCodeReceived`` (it still ends in ``PageUnloadable``).
        current_include_filters: Forwarded to the script as ``include_filters``.
        is_binary: Accepted for signature compatibility; not used here.

    Raises:
        PageUnloadable: on connect/read timeout, an unparsable JSON reply, or
            a non-200 reply when ``ignore_status_codes`` is set.
        Non200ErrorCodeReceived: non-200 reply and ``ignore_status_codes`` is
            false.
        ScreenshotUnavailable: the reply carries no screenshot.
        EmptyReply: the reply content is empty or whitespace only.
    """
    # Imports are kept function-local, as in the original implementation.
    import base64
    from urllib.parse import quote, urlparse

    from pkg_resources import resource_string
    from requests.exceptions import ConnectTimeout, ReadTimeout

    wait_browser_seconds = 240
    extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000

    # NOTE: this rewrites the instance attribute, so the %ELEMENTS% placeholder
    # is only present the first time through.
    self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)

    code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
    # In the future inject this as a proper JS package
    code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
    code = code.replace('%instock_scrape_code%', self.instock_data_js)

    browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
    if not browserless_function_url:
        # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
        driver_url = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
        browserless_function_url = driver_url._replace(scheme="http")._replace(path="function").geturl()

    # Append proxy connect string
    if self.proxy:
        # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
        # Actual authentication handled by Puppeteer/node
        proxy_parts = urlparse(self.proxy.get('server'))
        # Only add ":port" when a port was actually given, otherwise the
        # netloc would become the literal "host:None".
        if proxy_parts.port is None:
            proxy_netloc = "{}".format(proxy_parts.hostname)
        else:
            proxy_netloc = "{}:{}".format(proxy_parts.hostname, proxy_parts.port)
        proxy_url = quote(proxy_parts._replace(netloc=proxy_netloc).geturl())
        # Start the query string with '?' if the endpoint URL does not have one
        # yet; blindly using '&' produced a malformed URL in that case.
        proxy_sep = '&' if '?' in browserless_function_url else '?'
        browserless_function_url = f"{browserless_function_url}{proxy_sep}--proxy-server={proxy_url}"

    amp = '&' if '?' in browserless_function_url else '?'
    payload = {
        "code": code,
        "context": {
            # Very primitive disk cache - USE WITH EXTREME CAUTION
            # Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
            'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False),  # or path to disk cache ending in /, ie /tmp/cache/
            'execute_js': self.webdriver_js_execute_code,
            'extra_wait_ms': extra_wait_ms,
            'include_filters': current_include_filters,
            'req_headers': request_headers,
            'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
            'url': url,
            'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
            'proxy_username': self.proxy.get('username', '') if self.proxy else False,
            'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
            'no_cache_list': [
                'twitter',
                '.pdf'
            ],
            # Could use https://github.com/easylist/easylist here, or install a plugin
            'block_url_list': [
                'adnxs.com',
                'analytics.twitter.com',
                'doubleclick.net',
                'google-analytics.com',
                'googletagmanager',
                'trustpilot.com'
            ]
        }
    }

    try:
        response = requests.request(
            method="POST",
            json=payload,
            # @todo /function needs adding ws:// to http:// rebuild this
            url=browserless_function_url + f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
            timeout=wait_browser_seconds)
    except ReadTimeout:
        raise PageUnloadable(url=url, status_code=None, message=f"No response from browser in {wait_browser_seconds}s")
    except ConnectTimeout:
        raise PageUnloadable(url=url, status_code=None, message="Timed out connecting to browser, retrying..")

    # 200 Here means that the communication to the browser worked only, not the page state
    try:
        x = response.json()
    except Exception as e:
        raise PageUnloadable(url=url, message="Error reading JSON response from browserless") from e

    try:
        self.status_code = response.status_code
    except Exception as e:
        raise PageUnloadable(url=url, message="Error reading status_code code response from browserless") from e

    self.headers = x.get('headers')

    if self.status_code != 200 and not ignore_status_codes:
        raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content', ''))

    if self.status_code != 200:
        # Some other error from browserless (only reachable with ignore_status_codes set)
        raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))

    if not x.get('screenshot'):
        # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
        # https://github.com/puppeteer/puppeteer/issues/1834
        # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
        # Check your memory is shared and big enough
        raise ScreenshotUnavailable(url=url, status_code=None)

    if not x.get('content', '').strip():
        raise EmptyReply(url=url, status_code=None)

    self.content = x.get('content')
    self.instock_data = x.get('instock_data')
    self.screenshot = base64.b64decode(x.get('screenshot'))
    self.xpath_data = x.get('xpath_data')
|
|
|
|
|
|
|
|
|
|
def run(self,
|
|
|
|
|
url,
|
|
|
|
|
timeout,
|
|
|
|
@ -441,21 +322,6 @@ class base_html_playwright(Fetcher):
|
|
|
|
|
is_binary=False):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
|
|
|
|
|
# browser_connection_is_custom doesnt work with puppeteer style fetch (use playwright native too in this case)
|
|
|
|
|
if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'):
|
|
|
|
|
if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')):
|
|
|
|
|
# Temporary backup solution until we rewrite the playwright code
|
|
|
|
|
return self.run_fetch_browserless_puppeteer(
|
|
|
|
|
url,
|
|
|
|
|
timeout,
|
|
|
|
|
request_headers,
|
|
|
|
|
request_body,
|
|
|
|
|
request_method,
|
|
|
|
|
ignore_status_codes,
|
|
|
|
|
current_include_filters,
|
|
|
|
|
is_binary)
|
|
|
|
|
|
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
|
import playwright._impl._errors
|
|
|
|
|
|
|
|
|
|