You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
272 lines
13 KiB
272 lines
13 KiB
import json
|
|
import os
|
|
from urllib.parse import urlparse
|
|
|
|
from loguru import logger
|
|
|
|
from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
|
|
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, PaginatedContentMisconfigured, ScreenshotUnavailable
|
|
|
|
class fetcher(Fetcher):
|
|
fetcher_description = "Playwright {}/Javascript".format(
|
|
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
|
|
)
|
|
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
|
|
fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
|
|
|
|
browser_type = ''
|
|
command_executor = ''
|
|
|
|
# Configs for Proxy setup
|
|
# In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server"
|
|
playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password']
|
|
|
|
proxy = None
|
|
|
|
def __init__(self, proxy_override=None, custom_browser_connection_url=None):
|
|
super().__init__()
|
|
|
|
self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
|
|
|
|
if custom_browser_connection_url:
|
|
self.browser_connection_is_custom = True
|
|
self.browser_connection_url = custom_browser_connection_url
|
|
else:
|
|
# Fallback to fetching from system
|
|
# .strip('"') is going to save someone a lot of time when they accidently wrap the env value
|
|
self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"')
|
|
|
|
# If any proxy settings are enabled, then we should setup the proxy object
|
|
proxy_args = {}
|
|
for k in self.playwright_proxy_settings_mappings:
|
|
v = os.getenv('playwright_proxy_' + k, False)
|
|
if v:
|
|
proxy_args[k] = v.strip('"')
|
|
|
|
if proxy_args:
|
|
self.proxy = proxy_args
|
|
|
|
# allow per-watch proxy selection override
|
|
if proxy_override:
|
|
self.proxy = {'server': proxy_override}
|
|
|
|
if self.proxy:
|
|
# Playwright needs separate username and password values
|
|
parsed = urlparse(self.proxy.get('server'))
|
|
if parsed.username:
|
|
self.proxy['username'] = parsed.username
|
|
self.proxy['password'] = parsed.password
|
|
|
|
def screenshot_step(self, step_n=''):
|
|
super().screenshot_step(step_n=step_n)
|
|
screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
|
|
|
if self.browser_steps_screenshot_path is not None:
|
|
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n))
|
|
logger.debug(f"Saving step screenshot to {destination}")
|
|
with open(destination, 'wb') as f:
|
|
f.write(screenshot)
|
|
|
|
def save_step_html(self, step_n):
|
|
super().save_step_html(step_n=step_n)
|
|
content = self.page.content()
|
|
destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n))
|
|
logger.debug(f"Saving step HTML to {destination}")
|
|
with open(destination, 'w') as f:
|
|
f.write(content)
|
|
|
|
def run(self,
|
|
url,
|
|
timeout,
|
|
request_headers,
|
|
request_body,
|
|
request_method,
|
|
ignore_status_codes=False,
|
|
current_include_filters=None,
|
|
is_binary=False,
|
|
empty_pages_are_a_change=False):
|
|
|
|
from playwright.sync_api import sync_playwright
|
|
import playwright._impl._errors
|
|
from changedetectionio.content_fetchers import visualselector_xpath_selectors
|
|
self.delete_browser_steps_screenshots()
|
|
response = None
|
|
|
|
with sync_playwright() as p:
|
|
browser_type = getattr(p, self.browser_type)
|
|
|
|
# Seemed to cause a connection Exception even tho I can see it connect
|
|
# self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
|
|
# 60,000 connection timeout only
|
|
browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000)
|
|
|
|
# SOCKS5 with authentication is not supported (yet)
|
|
# https://github.com/microsoft/playwright/issues/10567
|
|
|
|
# Set user agent to prevent Cloudflare from blocking the browser
|
|
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
|
|
context = browser.new_context(
|
|
accept_downloads=False, # Should never be needed
|
|
bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
|
|
extra_http_headers=request_headers,
|
|
ignore_https_errors=True,
|
|
proxy=self.proxy,
|
|
service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
|
|
user_agent=manage_user_agent(headers=request_headers),
|
|
)
|
|
|
|
self.page = context.new_page()
|
|
|
|
# Listen for all console events and handle errors
|
|
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
|
|
|
|
# Re-use as much code from browser steps as possible so its the same
|
|
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
|
|
browsersteps_interface = steppable_browser_interface(start_url=url)
|
|
browsersteps_interface.page = self.page
|
|
|
|
response = browsersteps_interface.action_goto_url(value=url)
|
|
self.headers = response.all_headers()
|
|
|
|
if response is None:
|
|
context.close()
|
|
browser.close()
|
|
logger.debug("Content Fetcher > Response object from the browser communication was none")
|
|
raise EmptyReply(url=url, status_code=None)
|
|
|
|
try:
|
|
if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code):
|
|
if self.webdriver_enable_pagination == True:
|
|
self.run_paginated(url=url)
|
|
else:
|
|
self.run_normal(browsersteps_interface=browsersteps_interface)
|
|
except playwright._impl._errors.TimeoutError as e:
|
|
context.close()
|
|
browser.close()
|
|
# This can be ok, we will try to grab what we could retrieve
|
|
pass
|
|
except Exception as e:
|
|
logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}")
|
|
context.close()
|
|
browser.close()
|
|
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
|
|
|
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay
|
|
self.page.wait_for_timeout(extra_wait * 1000)
|
|
|
|
try:
|
|
self.status_code = response.status
|
|
except Exception as e:
|
|
# https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962
|
|
logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.")
|
|
logger.critical(response)
|
|
context.close()
|
|
browser.close()
|
|
raise PageUnloadable(url=url, status_code=None, message=str(e))
|
|
|
|
if self.status_code != 200 and not ignore_status_codes:
|
|
screenshot = self.page.screenshot(type='jpeg', full_page=True,
|
|
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)))
|
|
|
|
raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot)
|
|
|
|
if not empty_pages_are_a_change and len(self.page.content().strip()) == 0:
|
|
logger.debug("Content Fetcher > Content was empty, empty_pages_are_a_change = False")
|
|
context.close()
|
|
browser.close()
|
|
raise EmptyReply(url=url, status_code=response.status)
|
|
|
|
# Run Browser Steps here
|
|
if self.browser_steps_get_valid_steps():
|
|
self.iterate_browser_steps(start_url=url)
|
|
|
|
self.page.wait_for_timeout(extra_wait * 1000)
|
|
|
|
# So we can find an element on the page where its selector was entered manually (maybe not xPath etc)
|
|
if current_include_filters is not None:
|
|
self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
|
|
else:
|
|
self.page.evaluate("var include_filters=''")
|
|
|
|
self.xpath_data = self.page.evaluate(
|
|
"async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}")
|
|
self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}")
|
|
|
|
self.content = self.page.content()
|
|
# Bug 3 in Playwright screenshot handling
|
|
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
|
|
# JPEG is better here because the screenshots can be very very large
|
|
|
|
# Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded
|
|
# which will significantly increase the IO size between the server and client, it's recommended to use the lowest
|
|
# acceptable screenshot quality here
|
|
try:
|
|
# The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage
|
|
self.screenshot = self.page.screenshot(type='jpeg',
|
|
full_page=True,
|
|
quality=int(os.getenv("SCREENSHOT_QUALITY", 72)),
|
|
)
|
|
except Exception as e:
|
|
# It's likely the screenshot was too long/big and something crashed
|
|
raise ScreenshotUnavailable(url=url, status_code=self.status_code)
|
|
finally:
|
|
context.close()
|
|
browser.close()
|
|
|
|
def run_normal(self, browsersteps_interface):
|
|
"""
|
|
Run normal content extraction.
|
|
"""
|
|
browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
|
|
|
|
def run_paginated(self, url):
|
|
"""
|
|
Run paginated content extraction in the following order:
|
|
1. Execute initial JS code after the page is loaded
|
|
2.
|
|
a. Execute JS code to extract content from the page\n
|
|
b. Look for a "next page" button and click it if it exists\n
|
|
c. Repeat step 2 until the "next page" button is not found
|
|
3. Write the extracted content to a hidden input element with ID "cd_data"
|
|
"""
|
|
if self.webdriver_paginated_js_execute_each_page is None or not len(self.webdriver_paginated_js_execute_each_page) \
|
|
or self.webdriver_paginated_next_selector is None or not len(self.webdriver_paginated_next_selector):
|
|
raise PaginatedContentMisconfigured()
|
|
|
|
from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface
|
|
from playwright._impl._errors import TimeoutError
|
|
|
|
browsersteps_interface = steppable_browser_interface(start_url=url)
|
|
browsersteps_interface.page = self.page
|
|
|
|
browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None)
|
|
browsersteps_interface.action_wait_for_load_state(selector=None)
|
|
|
|
data = ""
|
|
step_n = 1
|
|
while True:
|
|
if data != "":
|
|
data += ","
|
|
|
|
logger.debug(f"Paginated content > Page {step_n}")
|
|
data += browsersteps_interface.action_execute_js(value=self.webdriver_paginated_js_execute_each_page, selector=None)
|
|
|
|
try:
|
|
next_button = browsersteps_interface.get_locator(self.webdriver_paginated_next_selector)
|
|
next_button.wait_for()
|
|
next_button.click()
|
|
browsersteps_interface.action_wait_for_load_state(selector=None)
|
|
step_n += 1
|
|
except TimeoutError:
|
|
# This just means the button could not be found.
|
|
logger.debug(f"Paginated content > Next button could not be found")
|
|
break
|
|
|
|
self.page.evaluate('''(data) => {
|
|
const el = document.createElement('input');
|
|
el.id = 'cd_data';
|
|
el.type = 'hidden';
|
|
el.value = data;
|
|
document.body.appendChild(el);
|
|
}''', data)
|