@ -311,125 +311,6 @@ class base_html_playwright(Fetcher):
with open ( destination , ' w ' ) as f :
f . write ( content )
def run_fetch_browserless_puppeteer(self,
                                    url,
                                    timeout,
                                    request_headers,
                                    request_body,
                                    request_method,
                                    ignore_status_codes=False,
                                    current_include_filters=None,
                                    is_binary=False):
    """Fetch `url` by POSTing a Puppeteer script to a browserless function endpoint.

    The script is loaded from ``res/puppeteer_fetch.js`` with the xpath and
    in-stock scraper snippets substituted in, then sent together with a
    context dict (headers, proxy credentials, wait time, block lists, ...).
    On success the results are stored on ``self``: ``status_code``,
    ``headers``, ``content``, ``instock_data``, ``screenshot`` (decoded from
    base64) and ``xpath_data``.

    The endpoint is taken from ``BROWSERLESS_FUNCTION_URL``; when that is not
    set it is derived from ``PLAYWRIGHT_DRIVER_URL`` by switching the scheme
    to ``http`` and the path to ``function``.

    Args:
        url: Page to fetch.
        timeout: Accepted for signature compatibility; not used here (the
            request to browserless uses a fixed 240 second timeout).
        request_headers: Mapping of headers passed to the script; its
            ``user-agent`` entry (case-insensitive) is also passed separately.
        request_body: Accepted for signature compatibility; not used here.
        request_method: Accepted for signature compatibility; not used here.
        ignore_status_codes: When true, a non-200 reply does not raise
            ``Non200ErrorCodeReceived`` (it still ends in ``PageUnloadable``).
        current_include_filters: Passed through to the script as
            ``include_filters``.
        is_binary: Accepted for signature compatibility; not used here.

    Raises:
        PageUnloadable: browserless timed out, returned non-JSON, or returned
            a non-200 status while ``ignore_status_codes`` is set.
        Non200ErrorCodeReceived: non-200 status and ``ignore_status_codes``
            is false.
        ScreenshotUnavailable: the reply carried no screenshot.
        EmptyReply: the reply content was empty or whitespace only.
    """
    import base64
    from urllib.parse import quote, urlparse

    from pkg_resources import resource_string
    from requests.exceptions import ConnectTimeout, ReadTimeout

    extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000

    self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors)
    code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8')
    # In the future inject this as a proper JS package
    code = code.replace('%xpath_scrape_code%', self.xpath_element_js)
    code = code.replace('%instock_scrape_code%', self.instock_data_js)

    wait_browserless_seconds = 240

    browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL')
    if not browserless_function_url:
        # Convert/try to guess from PLAYWRIGHT_DRIVER_URL
        o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL'))
        browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl()

    # Append proxy connect string
    if self.proxy:
        # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
        # Actual authentication handled by Puppeteer/node
        o = urlparse(self.proxy.get('server'))
        # Only add ":port" when the proxy URL actually has one, otherwise the
        # netloc would become "host:None".
        proxy_netloc = o.hostname if o.port is None else "{}:{}".format(o.hostname, o.port)
        proxy_url = quote(o._replace(netloc=proxy_netloc).geturl())
        # Start the query string with "?" when the endpoint has none yet;
        # a bare "&" would end up inside the path and the flag would be lost.
        proxy_sep = '&' if '?' in browserless_function_url else '?'
        browserless_function_url = f"{browserless_function_url}{proxy_sep}--proxy-server={proxy_url}"

    try:
        amp = '&' if '?' in browserless_function_url else '?'
        response = requests.request(
            method="POST",
            json={
                "code": code,
                "context": {
                    # Very primitive disk cache - USE WITH EXTREME CAUTION
                    # Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]"
                    'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False),  # or path to disk cache ending in /, ie /tmp/cache/
                    'execute_js': self.webdriver_js_execute_code,
                    'extra_wait_ms': extra_wait_ms,
                    'include_filters': current_include_filters,
                    'req_headers': request_headers,
                    'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)),
                    'url': url,
                    'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
                    'proxy_username': self.proxy.get('username', '') if self.proxy else False,
                    'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False,
                    'no_cache_list': [
                        'twitter',
                        '.pdf'
                    ],
                    # Could use https://github.com/easylist/easylist here, or install a plugin
                    'block_url_list': [
                        'adnxs.com',
                        'analytics.twitter.com',
                        'doubleclick.net',
                        'google-analytics.com',
                        'googletagmanager',
                        'trustpilot.com'
                    ]
                }
            },
            # @todo /function needs adding ws:// to http:// rebuild this
            url=browserless_function_url + f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts",
            timeout=wait_browserless_seconds)
    except ReadTimeout as e:
        raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s") from e
    except ConnectTimeout as e:
        raise PageUnloadable(url=url, status_code=None, message="Timed out connecting to browserless, retrying..") from e

    # 200 Here means that the communication to browserless worked only, not the page state
    try:
        x = response.json()
    except Exception as e:
        raise PageUnloadable(url=url, message="Error reading JSON response from browserless") from e

    self.status_code = response.status_code
    self.headers = x.get('headers')

    if self.status_code != 200 and not ignore_status_codes:
        raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content', ''))

    if self.status_code != 200:
        # Some other error from browserless
        raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8'))

    if not x.get('screenshot'):
        # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips
        # https://github.com/puppeteer/puppeteer/issues/1834
        # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051
        # Check your memory is shared and big enough
        raise ScreenshotUnavailable(url=url, status_code=None)

    if not x.get('content', '').strip():
        raise EmptyReply(url=url, status_code=None)

    self.content = x.get('content')
    self.instock_data = x.get('instock_data')
    self.screenshot = base64.b64decode(x.get('screenshot'))
    self.xpath_data = x.get('xpath_data')
def run ( self ,
url ,
timeout ,
@ -441,21 +322,6 @@ class base_html_playwright(Fetcher):
is_binary = False ) :
# For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!)
# browser_connection_is_custom doesnt work with puppeteer style fetch (use playwright native too in this case)
if not self . browser_connection_is_custom and not self . browser_steps and os . getenv ( ' USE_EXPERIMENTAL_PUPPETEER_FETCH ' ) :
if strtobool ( os . getenv ( ' USE_EXPERIMENTAL_PUPPETEER_FETCH ' ) ) :
# Temporary backup solution until we rewrite the playwright code
return self . run_fetch_browserless_puppeteer (
url ,
timeout ,
request_headers ,
request_body ,
request_method ,
ignore_status_codes ,
current_include_filters ,
is_binary )
from playwright . sync_api import sync_playwright
import playwright . _impl . _errors
@ -528,7 +394,7 @@ class base_html_playwright(Fetcher):
self . status_code = response . status
except Exception as e :
# https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
logger . critical ( f " Response from browserless/p laywright did not have a status_code! Response follows." )
logger . critical ( f " Response from the browser/P laywright did not have a status_code! Response follows." )
logger . critical ( response )
raise PageUnloadable ( url = url , status_code = None , message = str ( e ) )