|
|
@ -44,10 +44,12 @@ class fetcher(Fetcher):
|
|
|
|
|
|
|
|
|
|
|
|
# @todo filter some injection attack?
|
|
|
|
# @todo filter some injection attack?
|
|
|
|
# check scheme when no scheme
|
|
|
|
# check scheme when no scheme
|
|
|
|
proxy_url = parsed.scheme + "://" if parsed.scheme else ''
|
|
|
|
proxy_url = parsed.scheme + "://" if parsed.scheme else 'http://'
|
|
|
|
proxy_url += f"{parsed.hostname}:{parsed.port}{parsed.path}?{parsed.query}"
|
|
|
|
|
|
|
|
r = "?" if not '?' in self.browser_connection_url else '&'
|
|
|
|
r = "?" if not '?' in self.browser_connection_url else '&'
|
|
|
|
self.browser_connection_url += f"{r}--proxy-server={proxy_override}"
|
|
|
|
port = ":"+str(parsed.port) if parsed.port else ''
|
|
|
|
|
|
|
|
q = "?"+parsed.query if parsed.query else ''
|
|
|
|
|
|
|
|
proxy_url += f"{parsed.hostname}{port}{parsed.path}{q}"
|
|
|
|
|
|
|
|
self.browser_connection_url += f"{r}--proxy-server={proxy_url}"
|
|
|
|
|
|
|
|
|
|
|
|
# def screenshot_step(self, step_n=''):
|
|
|
|
# def screenshot_step(self, step_n=''):
|
|
|
|
# screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)
|
|
|
|
# screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)
|
|
|
@ -105,8 +107,8 @@ class fetcher(Fetcher):
|
|
|
|
# SOCKS5 with authentication is not supported (yet)
|
|
|
|
# SOCKS5 with authentication is not supported (yet)
|
|
|
|
# https://github.com/microsoft/playwright/issues/10567
|
|
|
|
# https://github.com/microsoft/playwright/issues/10567
|
|
|
|
self.page.setDefaultNavigationTimeout(0)
|
|
|
|
self.page.setDefaultNavigationTimeout(0)
|
|
|
|
self.page.setCacheEnabled(True)
|
|
|
|
await self.page.setCacheEnabled(True)
|
|
|
|
if self.proxy:
|
|
|
|
if self.proxy and self.proxy.get('username'):
|
|
|
|
# Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
|
|
|
|
# Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
|
|
|
|
# https://github.com/puppeteer/puppeteer/issues/676 ?
|
|
|
|
# https://github.com/puppeteer/puppeteer/issues/676 ?
|
|
|
|
# https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
|
|
|
|
# https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
|
|
|
@ -121,7 +123,7 @@ class fetcher(Fetcher):
|
|
|
|
# browsersteps_interface.page = self.page
|
|
|
|
# browsersteps_interface.page = self.page
|
|
|
|
|
|
|
|
|
|
|
|
response = await self.page.goto(url, waitUntil="load")
|
|
|
|
response = await self.page.goto(url, waitUntil="load")
|
|
|
|
self.headers = response.headers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if response is None:
|
|
|
|
if response is None:
|
|
|
|
await self.page.close()
|
|
|
|
await self.page.close()
|
|
|
@ -129,6 +131,8 @@ class fetcher(Fetcher):
|
|
|
|
logger.warning("Content Fetcher > Response object was none")
|
|
|
|
logger.warning("Content Fetcher > Response object was none")
|
|
|
|
raise EmptyReply(url=url, status_code=None)
|
|
|
|
raise EmptyReply(url=url, status_code=None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.headers = response.headers
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
|
|
|
|
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
|
|
|
|
await self.page.evaluate(self.webdriver_js_execute_code)
|
|
|
|
await self.page.evaluate(self.webdriver_js_execute_code)
|
|
|
@ -140,9 +144,6 @@ class fetcher(Fetcher):
|
|
|
|
# This can be ok, we will try to grab what we could retrieve
|
|
|
|
# This can be ok, we will try to grab what we could retrieve
|
|
|
|
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
|
|
|
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
|
|
|
|
|
|
|
|
await asyncio.sleep(1 + extra_wait)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
self.status_code = response.status
|
|
|
|
self.status_code = response.status
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|