|
|
|
import os
|
|
|
|
import time
|
|
|
|
|
|
|
|
from loguru import logger
|
|
|
|
from changedetectionio.content_fetchers.base import Fetcher
|
|
|
|
|
|
|
|
class fetcher(Fetcher):
    """Fetch page content by driving Chrome through a remote Selenium WebDriver.

    The WebDriver endpoint is taken from the ``WEBDRIVER_URL`` environment
    variable unless a custom connection URL is passed to the constructor.
    Proxy configuration is assembled from ``webdriver_*`` environment
    variables, the ``system_http_proxy`` / ``system_https_proxy`` attributes
    (not defined in this class; presumably provided by the ``Fetcher`` base)
    and an optional per-request override.
    """

    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    # Configs for Proxy setup
    # In the ENV vars, each key is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve the WebDriver endpoint and build the optional proxy object.

        Args:
            proxy_override: Proxy URL that replaces the HTTP proxy for this
                fetcher instance; ``None`` leaves the configured proxy as-is.
            custom_browser_connection_url: WebDriver endpoint to use instead
                of the one configured through ``WEBDRIVER_URL``.
        """
        super().__init__()
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        if not custom_browser_connection_url:
            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        # If any proxy settings are enabled, then we should setup the proxy object
        proxy_args = {}
        for k in self.selenium_proxy_settings_mappings:
            v = os.getenv('webdriver_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy.
        # proxy_args is keyed WITHOUT the "webdriver_" prefix, so the lookups must
        # use the bare names or an explicit webdriver_* value would be overwritten.
        if not proxy_args.get('httpProxy') and self.system_http_proxy:
            proxy_args['httpProxy'] = self.system_http_proxy
        # 'sslProxy' is the key listed in selenium_proxy_settings_mappings for HTTPS traffic
        if not proxy_args.get('sslProxy') and self.system_https_proxy:
            proxy_args['sslProxy'] = self.system_https_proxy

        # Allows override the proxy on a per-request basis
        if proxy_override is not None:
            proxy_args['httpProxy'] = proxy_override

        if proxy_args:
            self.proxy = SeleniumProxy(raw=proxy_args)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False,
            empty_pages_are_a_change=False):
        """Load ``url`` in a remote Chrome session and capture its content.

        On success ``self.content`` holds the page source, ``self.screenshot``
        a PNG screenshot, ``self.headers`` an empty dict and
        ``self.status_code`` is always 200 (the real status is not read here).
        The session is left open on success; ``quit()`` closes it.

        Only ``url`` is used by this implementation; the remaining parameters
        are accepted but not read.

        Raises:
            selenium.common.exceptions.WebDriverException: re-raised after the
                session has been closed when any WebDriver call fails.
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.common.exceptions import WebDriverException
        # request_body, request_method unused for now, until some magic in the future happens.

        options = ChromeOptions()
        if self.proxy:
            options.proxy = self.proxy

        # Parse once, before a session exists, so a malformed value cannot leave a session open
        content_ready_delay = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))

        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=options)

        try:
            self.driver.get(url)

            self.driver.set_window_size(1280, 1024)
            self.driver.implicitly_wait(content_ready_delay)

            if self.webdriver_js_execute_code is not None:
                self.driver.execute_script(self.webdriver_js_execute_code)
                # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
                self.driver.implicitly_wait(content_ready_delay)

            # @todo - how to check this? is it possible?
            self.status_code = 200
            # @todo somehow we should try to get this working for WebDriver
            # raise EmptyReply(url=url, status_code=r.status_code)

            # @todo - dom wait loaded?
            time.sleep(content_ready_delay + self.render_extract_delay)
            self.content = self.driver.page_source
            self.headers = {}

            self.screenshot = self.driver.get_screenshot_as_png()
        except WebDriverException:
            # Be sure we close the session window, whichever WebDriver call failed
            self.quit()
            raise

    # Does the connection to the webdriver work? run a test connection.
    def is_ready(self):
        """Open and immediately close a test session against the WebDriver endpoint.

        Returns:
            True when the session could be created; any exception raised while
            creating it propagates to the caller.
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions

        # The endpoint is stored as browser_connection_url by __init__
        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=ChromeOptions())

        # driver.quit() seems to cause better exceptions
        self.quit()
        return True

    def quit(self):
        """Close the WebDriver session if there is one; shutdown errors are only logged."""
        # getattr: this class assigns self.driver only in run()/is_ready()
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
            except Exception as e:
                logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
|