270 lines
14 KiB
270 lines
14 KiB
import asyncio
|
|
import json
|
|
import os
|
|
import websockets.exceptions
|
|
from urllib.parse import urlparse
|
|
|
|
from loguru import logger
|
|
|
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
|
|
|
|
class fetcher(Fetcher):
    """Content fetcher that drives a remote Chrome over a websocket via pyppeteer.

    The browser endpoint comes from ``custom_browser_connection_url`` or, when
    that is not given, from the ``PLAYWRIGHT_DRIVER_URL`` environment variable
    (default ``ws://playwright-chrome:3000``).
    """

    fetcher_description = "Puppeteer/direct {}/Javascript".format(
        os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
    )
    if os.getenv("PLAYWRIGHT_DRIVER_URL"):
        fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))

    browser_type = ''
    command_executor = ''

    # Proxy credentials ({'username': ..., 'password': ...}) or None when no proxy override is set
    proxy = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Work out the browser websocket URL and the optional per-watch proxy.

        :param proxy_override: Proxy URL for this watch. Its credentials are kept in
            ``self.proxy`` and the remaining parts are appended to the browser
            connection URL as a ``--proxy-server=`` argument.
        :param custom_browser_connection_url: Websocket URL to use instead of the
            one configured through ``PLAYWRIGHT_DRIVER_URL``.
        """
        super().__init__()

        if custom_browser_connection_url:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url
        else:
            # Fallback to fetching from system
            # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
            self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')

        # allow per-watch proxy selection override
        # @todo check global too?
        if proxy_override:
            # Playwright needs separate username and password values
            # (urlparse() always returns a non-empty tuple, so no truthiness check is needed)
            parsed = urlparse(proxy_override)
            self.proxy = {'username': parsed.username, 'password': parsed.password}
            # Add the proxy server chrome start option, the username and password never gets added here
            # (It always goes in via await self.page.authenticate(self.proxy))

            # @todo filter some injection attack?
            # check scheme when no scheme
            proxy_url = parsed.scheme + "://" if parsed.scheme else 'http://'
            # Join onto the connection URL with '?' or '&' depending on whether it already has a query
            r = "?" if '?' not in self.browser_connection_url else '&'
            port = ":" + str(parsed.port) if parsed.port else ''
            q = "?" + parsed.query if parsed.query else ''
            proxy_url += f"{parsed.hostname}{port}{parsed.path}{q}"
            self.browser_connection_url += f"{r}--proxy-server={proxy_url}"

    # def screenshot_step(self, step_n=''):
    #     screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85)
    #
    #     if self.browser_steps_screenshot_path is not None:
    #         destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
    #         logger.debug(f"Saving step screenshot to {destination}")
    #         with open(destination, 'wb') as f:
    #             f.write(screenshot)
    #
    # def save_step_html(self, step_n):
    #     content = self.page.content()
    #     destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
    #     logger.debug(f"Saving step HTML to {destination}")
    #     with open(destination, 'w') as f:
    #         f.write(content)

    async def _close_page_and_browser(self, browser):
        """Close the current page and then the browser connection, in that order."""
        await self.page.close()
        await browser.close()

    async def fetch_page(self,
                         url,
                         timeout,
                         request_headers,
                         request_body,
                         request_method,
                         ignore_status_codes,
                         current_include_filters,
                         is_binary
                         ):
        """Load ``url`` in the remote browser and store the results on ``self``.

        On success this sets ``self.headers``, ``self.status_code``, ``self.content``,
        ``self.xpath_data``, ``self.instock_data`` and (best effort) ``self.screenshot``.

        :param url: Address to navigate to.
        :param timeout: Accepted for interface compatibility; not used by this method.
        :param request_headers: Optional mapping of extra HTTP headers. A ``User-Agent``
            entry is popped from it (the mapping is mutated) and applied via
            ``setUserAgent`` instead.
        :param request_body: Accepted for interface compatibility; not used by this method.
        :param request_method: Accepted for interface compatibility; not used by this method.
        :param ignore_status_codes: When true, a non-200 status does not raise.
        :param current_include_filters: Filters exposed to the in-page scraper as the
            JavaScript variable ``include_filters`` (JSON encoded), or ``None``.
        :param is_binary: Accepted for interface compatibility; not used by this method.
        :raises BrowserConnectError: The websocket connection to the browser failed.
        :raises EmptyReply: Navigation returned no response, or the page content was empty.
        :raises PageUnloadable: The custom JS failed, or the response had no readable status.
        :raises Non200ErrorCodeReceived: Status was not 200 and ``ignore_status_codes`` is false.
        """

        from changedetectionio.content_fetchers import visualselector_xpath_selectors
        self.delete_browser_steps_screenshots()
        extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay

        from pyppeteer import Pyppeteer
        pyppeteer_instance = Pyppeteer()

        # Connect directly using the specified browser_ws_endpoint
        # @todo timeout
        try:
            browser = await pyppeteer_instance.connect(browserWSEndpoint=self.browser_connection_url,
                                                       ignoreHTTPSErrors=True
                                                       )
        except websockets.exceptions.InvalidStatusCode as e:
            raise BrowserConnectError(msg=f"Error while trying to connect the browser, Code {e.status_code} (check your access, whitelist IP, password etc)") from e
        except websockets.exceptions.InvalidURI as e:
            raise BrowserConnectError(msg=f"Error connecting to the browser, check your browser connection address (should be ws:// or wss://") from e
        except Exception as e:
            raise BrowserConnectError(msg=f"Error connecting to the browser {str(e)}") from e

        # Always ask for a fresh page.
        # The previous expression `(pages := await browser.pages) and len(pages) or await browser.newPage()`
        # stored the integer len(pages) in self.page whenever the browser already had tabs open,
        # which made every following self.page.* call fail.
        # NOTE(review): in non-headless mode this opens an extra tab next to the initial one;
        # an already-open tab is not reused because it may not belong to this fetch - confirm.
        self.page = await browser.newPage()

        try:
            from pyppeteerstealth import inject_evasions_into_page
        except ImportError:
            logger.debug("pyppeteerstealth module not available, skipping")
        else:
            # I tried hooking events via self.page.on(Events.Page.DOMContentLoaded, inject_evasions_requiring_obj_to_page)
            # But I could never get it to fire reliably, so we just inject it straight after
            await inject_evasions_into_page(self.page)

        # This user agent is similar to what was used when tweaking the evasions in inject_evasions_into_page(..)
        user_agent = None
        if request_headers and request_headers.get('User-Agent'):
            # Request_headers should now be CaseInsensitiveDict
            # Remove it so it's not sent again with headers after
            user_agent = request_headers.pop('User-Agent').strip()
            await self.page.setUserAgent(user_agent)

        if not user_agent:
            # Attempt to strip 'HeadlessChrome' etc
            await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))

        await self.page.setBypassCSP(True)
        if request_headers:
            await self.page.setExtraHTTPHeaders(request_headers)

        # SOCKS5 with authentication is not supported (yet)
        # https://github.com/microsoft/playwright/issues/10567
        self.page.setDefaultNavigationTimeout(0)
        await self.page.setCacheEnabled(True)
        if self.proxy and self.proxy.get('username'):
            # Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer
            # https://github.com/puppeteer/puppeteer/issues/676 ?
            # https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2
            # https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/
            await self.page.authenticate(self.proxy)

        # Re-use as much code from browser steps as possible so its the same
        # from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface

        # not yet used here, we fallback to playwright when browsersteps is required
        # browsersteps_interface = steppable_browser_interface()
        # browsersteps_interface.page = self.page

        response = await self.page.goto(url, waitUntil="load")

        if response is None:
            await self._close_page_and_browser(browser)
            logger.warning("Content Fetcher > Response object was none")
            raise EmptyReply(url=url, status_code=None)

        self.headers = response.headers

        try:
            if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
                await self.page.evaluate(self.webdriver_js_execute_code)
        except Exception as e:
            logger.warning("Got exception when running evaluate on custom JS code")
            logger.error(str(e))
            await self._close_page_and_browser(browser)
            # This can be ok, we will try to grab what we could retrieve
            raise PageUnloadable(url=url, status_code=None, message=str(e)) from e

        try:
            self.status_code = response.status
        except Exception as e:
            # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
            logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.")
            logger.critical(response)
            await self._close_page_and_browser(browser)
            raise PageUnloadable(url=url, status_code=None, message=str(e)) from e

        if self.status_code != 200 and not ignore_status_codes:
            # Grab the evidence first, then release the page/browser like every other
            # error path does (previously this path left the connection open).
            try:
                screenshot = await self.page.screenshot(type_='jpeg',
                                                        fullPage=True,
                                                        quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
            finally:
                await self._close_page_and_browser(browser)

            raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)

        content = await self.page.content
        if len(content.strip()) == 0:
            await self._close_page_and_browser(browser)
            logger.error("Content Fetcher > Content was empty")
            raise EmptyReply(url=url, status_code=response.status)

        # Run Browser Steps here
        # @todo not yet supported, we switch to playwright in this case
        #            if self.browser_steps_get_valid_steps():
        #                self.iterate_browser_steps()

        # Give the page time to settle before scraping (1s plus the configured extra delay)
        await asyncio.sleep(1 + extra_wait)

        # So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
        # Setup the xPath/VisualSelector scraper
        if current_include_filters is not None:
            js = json.dumps(current_include_filters)
            await self.page.evaluate(f"var include_filters={js}")
        else:
            await self.page.evaluate(f"var include_filters=''")

        self.xpath_data = await self.page.evaluate(
            "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
        self.instock_data = await self.page.evaluate("async () => {" + self.instock_data_js + "}")

        self.content = await self.page.content
        # Bug 3 in Playwright screenshot handling
        # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
        # JPEG is better here because the screenshots can be very very large

        # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
        # which will significantly increase the IO size between the server and client, it's recommended to use the lowest
        # acceptable screenshot quality here
        try:
            self.screenshot = await self.page.screenshot(type_='jpeg',
                                                         fullPage=True,
                                                         quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
        except Exception as e:
            logger.error("Error fetching screenshot")
            # // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw'
            # // @ todo after text extract, we can place some overlay text with red background to say 'cropped'
            logger.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot')
            try:
                self.screenshot = await self.page.screenshot(type_='jpeg',
                                                             fullPage=False,
                                                             quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
            except Exception as e:
                # Best effort only: the text content was already captured above
                logger.error('ERROR: Failed to get viewport-only reduced screenshot :(')
        finally:
            # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
            logger.success(f"Fetching '{url}' complete, closing page")
            await self.page.close()
            logger.success(f"Fetching '{url}' complete, closing browser")
            await browser.close()
        logger.success(f"Fetching '{url}' complete, exiting puppeteer fetch.")

    async def main(self, **kwargs):
        """Coroutine entry point; forwards all keyword arguments to ``fetch_page``."""
        await self.fetch_page(**kwargs)

    def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False,
            current_include_filters=None, is_binary=False):
        """Synchronously fetch ``url`` by running ``main`` in a new event loop.

        The whole fetch is limited to ``PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS``
        seconds (default 180).

        :raises BrowserFetchTimedOut: The fetch did not finish within that limit.
        """

        #@todo make update_worker async which could run any of these content_fetchers within memory and time constraints
        # os.getenv() returns a str when the variable is set; asyncio.wait_for() needs a number
        max_time = int(os.getenv('PUPPETEER_MAX_PROCESSING_TIMEOUT_SECONDS', 180))

        # This will work in 3.10 but not >= 3.11 because 3.11 wants tasks only
        # NOTE(review): asyncio.wait_for() is documented to accept a coroutine - confirm and drop the line above
        try:
            asyncio.run(asyncio.wait_for(self.main(
                url=url,
                timeout=timeout,
                request_headers=request_headers,
                request_body=request_body,
                request_method=request_method,
                ignore_status_codes=ignore_status_codes,
                current_include_filters=current_include_filters,
                is_binary=is_binary
            ), timeout=max_time))
        except asyncio.TimeoutError:
            raise BrowserFetchTimedOut(msg=f"Browser connected but was unable to process the page in {max_time} seconds.")
|
|
|