@ -1,4 +1,5 @@
from abc import abstractmethod
from abc import abstractmethod
from distutils . util import strtobool
from urllib . parse import urlparse
from urllib . parse import urlparse
import chardet
import chardet
import hashlib
import hashlib
@ -8,6 +9,7 @@ import os
import requests
import requests
import sys
import sys
import time
import time
import urllib . parse
# Comma-separated list of HTML element names kept as a single string.
# Presumably the tags the visual selector is allowed to target — verify against the code that consumes it.
# NOTE(review): 'section' appears twice in the list; the assignment itself is also repeated with the same value.
visualselector_xpath_selectors = ' div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary '
visualselector_xpath_selectors = ' div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary '
@ -267,7 +269,6 @@ class base_html_playwright(Fetcher):
if self . proxy :
if self . proxy :
# Playwright needs separate username and password values
# Playwright needs separate username and password values
from urllib . parse import urlparse
parsed = urlparse ( self . proxy . get ( ' server ' ) )
parsed = urlparse ( self . proxy . get ( ' server ' ) )
if parsed . username :
if parsed . username :
self . proxy [ ' username ' ] = parsed . username
self . proxy [ ' username ' ] = parsed . username
@ -322,14 +323,13 @@ class base_html_playwright(Fetcher):
# Append proxy connect string
# Append proxy connect string
if self . proxy :
if self . proxy :
import urllib . parse
# Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
# Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error
# Actual authentication handled by Puppeteer/node
# Actual authentication handled by Puppeteer/node
o = urlparse ( self . proxy . get ( ' server ' ) )
o = urlparse ( self . proxy . get ( ' server ' ) )
proxy_url = urllib . parse . quote ( o . _replace ( netloc = " {} : {} " . format ( o . hostname , o . port ) ) . geturl ( ) )
# Remove scheme, socks5:// doesn't always work and it will autodetect anyway
proxy_url = urllib . parse . quote ( o . _replace ( netloc = " {} : {} " . format ( o . hostname , o . port ) ) . geturl ( ) . replace ( f " { o . scheme } :// " , ' ' , 1 ) )
browserless_function_url = f " { browserless_function_url } &--proxy-server= { proxy_url } &dumpio=true "
browserless_function_url = f " { browserless_function_url } &--proxy-server= { proxy_url } &dumpio=true "
try :
try :
amp = ' & ' if ' ? ' in browserless_function_url else ' ? '
amp = ' & ' if ' ? ' in browserless_function_url else ' ? '
response = requests . request (
response = requests . request (
@ -348,7 +348,7 @@ class base_html_playwright(Fetcher):
' url ' : url ,
' url ' : url ,
' user_agent ' : { k . lower ( ) : v for k , v in request_headers . items ( ) } . get ( ' user-agent ' , None ) ,
' user_agent ' : { k . lower ( ) : v for k , v in request_headers . items ( ) } . get ( ' user-agent ' , None ) ,
' proxy_username ' : self . proxy . get ( ' username ' , ' ' ) if self . proxy else False ,
' proxy_username ' : self . proxy . get ( ' username ' , ' ' ) if self . proxy else False ,
' proxy_password ' : self . proxy . get ( ' password ' , ' ' ) if self . proxy else False ,
' proxy_password ' : self . proxy . get ( ' password ' , ' ' ) if self . proxy and self . proxy . get ( ' username ' ) else False ,
' no_cache_list ' : [
' no_cache_list ' : [
' twitter ' ,
' twitter ' ,
' .pdf '
' .pdf '
@ -417,8 +417,8 @@ class base_html_playwright(Fetcher):
lambda s : ( s [ ' operation ' ] and len ( s [ ' operation ' ] ) and s [ ' operation ' ] != ' Choose one ' and s [ ' operation ' ] != ' Goto site ' ) ,
lambda s : ( s [ ' operation ' ] and len ( s [ ' operation ' ] ) and s [ ' operation ' ] != ' Choose one ' and s [ ' operation ' ] != ' Goto site ' ) ,
self . browser_steps ) )
self . browser_steps ) )
if not has_browser_steps :
if not has_browser_steps and os . getenv ( ' USE_EXPERIMENTAL_PUPPETEER_FETCH ' ) :
if os. getenv ( ' USE_EXPERIMENTAL_PUPPETEER_FETCH ' ) :
if strtobool( os. getenv ( ' USE_EXPERIMENTAL_PUPPETEER_FETCH ' ) ) :
# Temporary backup solution until we rewrite the playwright code
# Temporary backup solution until we rewrite the playwright code
return self . run_fetch_browserless_puppeteer (
return self . run_fetch_browserless_puppeteer (
url ,
url ,
@ -435,6 +435,7 @@ class base_html_playwright(Fetcher):
self . delete_browser_steps_screenshots ( )
self . delete_browser_steps_screenshots ( )
response = None
response = None
with sync_playwright ( ) as p :
with sync_playwright ( ) as p :
browser_type = getattr ( p , self . browser_type )
browser_type = getattr ( p , self . browser_type )
@ -443,6 +444,9 @@ class base_html_playwright(Fetcher):
# 60,000 connection timeout only
# 60,000 connection timeout only
browser = browser_type . connect_over_cdp ( self . command_executor , timeout = 60000 )
browser = browser_type . connect_over_cdp ( self . command_executor , timeout = 60000 )
# SOCKS5 with authentication is not supported (yet)
# https://github.com/microsoft/playwright/issues/10567
# Set user agent to prevent Cloudflare from blocking the browser
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser . new_context (
context = browser . new_context (
@ -479,7 +483,6 @@ class base_html_playwright(Fetcher):
print ( " Content Fetcher > retrying request got error - " , str ( e ) )
print ( " Content Fetcher > retrying request got error - " , str ( e ) )
time . sleep ( 1 )
time . sleep ( 1 )
response = self . page . goto ( url , wait_until = ' commit ' )
response = self . page . goto ( url , wait_until = ' commit ' )
except Exception as e :
except Exception as e :
print ( " Content Fetcher > Other exception when page.goto " , str ( e ) )
print ( " Content Fetcher > Other exception when page.goto " , str ( e ) )
context . close ( )
context . close ( )
@ -633,7 +636,6 @@ class base_html_webdriver(Fetcher):
from selenium . common . exceptions import WebDriverException
from selenium . common . exceptions import WebDriverException
# request_body, request_method unused for now, until some magic in the future happens.
# request_body, request_method unused for now, until some magic in the future happens.
# check env for WEBDRIVER_URL
self . driver = webdriver . Remote (
self . driver = webdriver . Remote (
command_executor = self . command_executor ,
command_executor = self . command_executor ,
desired_capabilities = DesiredCapabilities . CHROME ,
desired_capabilities = DesiredCapabilities . CHROME ,
@ -712,6 +714,10 @@ class html_requests(Fetcher):
proxies = { }
proxies = { }
# Allows overriding the proxy on a per-request basis
# Allows overriding the proxy on a per-request basis
# https://requests.readthedocs.io/en/latest/user/advanced/#socks
# Should also work with `socks5://user:pass@host:port` type syntax.
if self . proxy_override :
if self . proxy_override :
proxies = { ' http ' : self . proxy_override , ' https ' : self . proxy_override , ' ftp ' : self . proxy_override }
proxies = { ' http ' : self . proxy_override , ' https ' : self . proxy_override , ' ftp ' : self . proxy_override }
else :
else :