Make system prefer to use Playwright if PLAYWRIGHT_DRIVER_URL is set

playwright
dgtlmoon 3 years ago
parent dc8f20d104
commit 8134242b38

@ -8,7 +8,7 @@ import sys
import eventlet import eventlet
import eventlet.wsgi import eventlet.wsgi
from . import store, changedetection_app from . import store, changedetection_app, content_fetcher
from . import __version__ from . import __version__
def main(): def main():

@ -4,7 +4,7 @@ import os
import requests import requests
import time import time
import urllib3.exceptions import urllib3.exceptions
import sys
class EmptyReply(Exception): class EmptyReply(Exception):
def __init__(self, status_code, url): def __init__(self, status_code, url):
@ -20,9 +20,9 @@ class Fetcher():
status_code = None status_code = None
content = None content = None
headers = None headers = None
# Will be needed in the future by the VisualSelector; always capture this where possible.
fetcher_description ="No description" screenshot = None
fetcher_list_order = 0 fetcher_description = "No description"
@abstractmethod @abstractmethod
def get_error(self): def get_error(self):
@ -59,36 +59,28 @@ class Fetcher():
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image # Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff) # the current one would return javascript output (as we use JS to generate the diff)
# #
# Returns tuple(mime_type, stream)
# @abstractmethod
# def return_diff(self, stream_a, stream_b):
# return
def available_fetchers(): def available_fetchers():
import inspect
from changedetectionio import content_fetcher # See the if statement at the bottom of this file for how we switch between playwright and webdriver
p=[] import inspect
for name, obj in inspect.getmembers(content_fetcher): p=[]
if inspect.isclass(obj): for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
# @todo html_ is maybe better as fetcher_ or something if inspect.isclass(obj):
# In this case, make sure to edit the default one in store.py and fetch_site_status.py # @todo html_ is maybe better as fetcher_ or something
if "html_" in name: # In this case, make sure to edit the default one in store.py and fetch_site_status.py
t=tuple([name,obj.fetcher_description,obj.fetcher_list_order]) if name.startswith('html_'):
p.append(t) t=tuple([name,obj.fetcher_description])
# sort by obj.fetcher_list_order p.append(t)
p.sort(key=lambda x: x[2])
# strip obj.fetcher_list_order from each member in the tuple
p = list(map(lambda x: x[:2], p)) return p
return p class base_html_playwright(Fetcher):
class html_playwright(Fetcher):
fetcher_description = "Playwright {}/Javascript".format( fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
) )
if os.getenv("PLAYWRIGHT_DRIVER_URL"): if os.getenv("PLAYWRIGHT_DRIVER_URL"):
fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
fetcher_list_order = 3
browser_type = '' browser_type = ''
command_executor = '' command_executor = ''
@ -129,15 +121,22 @@ class html_playwright(Fetcher):
with sync_playwright() as p: with sync_playwright() as p:
browser_type = getattr(p, self.browser_type) browser_type = getattr(p, self.browser_type)
browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
# Seemed to cause a connection Exception even though I can see it connect
#self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000)
# Set user agent to prevent Cloudflare from blocking the browser # Set user agent to prevent Cloudflare from blocking the browser
context = browser.new_context( context = browser.new_context(
user_agent="Mozilla/5.0", user_agent="Mozilla/5.0",
proxy=self.proxy proxy=self.proxy
) )
page = context.new_page() page = context.new_page()
page.set_viewport_size({"width": 1280, "height": 1024})
response = page.goto(url, timeout=timeout*1000) response = page.goto(url, timeout=timeout*1000)
page.wait_for_timeout(5000)
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
page.wait_for_timeout(extra_wait * 1000)
if response is None: if response is None:
raise EmptyReply(url=url, status_code=None) raise EmptyReply(url=url, status_code=None)
@ -146,15 +145,19 @@ class html_playwright(Fetcher):
self.content = page.content() self.content = page.content()
self.headers = response.all_headers() self.headers = response.all_headers()
# Workaround for a bug where the screenshot comes back with the wrong size: taking a screenshot with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very large
screenshot = page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=90)
context.close() context.close()
browser.close() browser.close()
class html_webdriver(Fetcher):
class base_html_webdriver(Fetcher):
if os.getenv("WEBDRIVER_URL"): if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
else: else:
fetcher_description = "WebDriver Chrome/Javascript" fetcher_description = "WebDriver Chrome/Javascript"
fetcher_list_order = 2
command_executor = '' command_executor = ''
@ -220,9 +223,8 @@ class html_webdriver(Fetcher):
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.content = self.driver.page_source self.content = self.driver.page_source
self.headers = {} self.headers = {}
self.screenshot = self.driver.get_screenshot_as_png()
def screenshot(self): self.quit()
return self.driver.get_screenshot_as_png()
# Does the connection to the webdriver work? run a test connection. # Does the connection to the webdriver work? run a test connection.
def is_ready(self): def is_ready(self):
@ -248,7 +250,6 @@ class html_webdriver(Fetcher):
# "html_requests" is listed as the default fetcher in store.py! # "html_requests" is listed as the default fetcher in store.py!
class html_requests(Fetcher): class html_requests(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client" fetcher_description = "Basic fast Plaintext/HTTP Client"
fetcher_list_order = 1
def run(self, def run(self,
url, url,
@ -283,3 +284,12 @@ class html_requests(Fetcher):
self.content = r.text self.content = r.text
self.headers = r.headers self.headers = r.headers
# Decide which is the 'real' HTML webdriver; this is more of a system-wide config
# than a site-specific one.
use_playwright_as_chrome_fetcher= os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
html_webdriver = base_html_playwright
else:
html_webdriver = base_html_webdriver

@ -68,6 +68,7 @@ class perform_site_check():
fetcher = klass() fetcher = klass()
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code) fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code)
# Fetching complete, now filters # Fetching complete, now filters
# @todo move to class / maybe inside of fetcher abstract base? # @todo move to class / maybe inside of fetcher abstract base?
@ -192,9 +193,4 @@ class perform_site_check():
if not watch['title'] or not len(watch['title']): if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content) update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
if self.datastore.data['settings']['application'].get('real_browser_save_screenshot', True): return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
screenshot = fetcher.screenshot()
fetcher.quit()
return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
Loading…
Cancel
Save