Make system prefer to use Playwright if PLAYWRIGHT_DRIVER_URL is set

playwright
dgtlmoon 3 years ago
parent dc8f20d104
commit 8134242b38

@ -8,7 +8,7 @@ import sys
import eventlet
import eventlet.wsgi
from . import store, changedetection_app
from . import store, changedetection_app, content_fetcher
from . import __version__
def main():

@ -4,7 +4,7 @@ import os
import requests
import time
import urllib3.exceptions
import sys
class EmptyReply(Exception):
def __init__(self, status_code, url):
@ -20,9 +20,9 @@ class Fetcher():
status_code = None
content = None
headers = None
# Will be needed in the future by the VisualSelector, always get this where possible.
screenshot = None
fetcher_description = "No description"
fetcher_list_order = 0
@abstractmethod
def get_error(self):
@ -59,36 +59,28 @@ class Fetcher():
# Maybe for the future, each fetcher provides its own diff output, could be used for text, image
# the current one would return javascript output (as we use JS to generate the diff)
#
# Returns tuple(mime_type, stream)
# @abstractmethod
# def return_diff(self, stream_a, stream_b):
# return
def available_fetchers():
# See the if statement at the bottom of this file for how we switch between playwright and webdriver
import inspect
from changedetectionio import content_fetcher
p=[]
for name, obj in inspect.getmembers(content_fetcher):
for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass):
if inspect.isclass(obj):
# @todo html_ is maybe better as fetcher_ or something
# In this case, make sure to edit the default one in store.py and fetch_site_status.py
if "html_" in name:
t=tuple([name,obj.fetcher_description,obj.fetcher_list_order])
if name.startswith('html_'):
t=tuple([name,obj.fetcher_description])
p.append(t)
# sort by obj.fetcher_list_order
p.sort(key=lambda x: x[2])
# strip obj.fetcher_list_order from each member in the tuple
p = list(map(lambda x: x[:2], p))
return p
class html_playwright(Fetcher):
class base_html_playwright(Fetcher):
fetcher_description = "Playwright {}/Javascript".format(
os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize()
)
if os.getenv("PLAYWRIGHT_DRIVER_URL"):
fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL"))
fetcher_list_order = 3
browser_type = ''
command_executor = ''
@ -129,15 +121,22 @@ class html_playwright(Fetcher):
with sync_playwright() as p:
browser_type = getattr(p, self.browser_type)
browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
# This seemed to cause a connection exception even though I can see it connect
#self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000)
browser = browser_type.connect_over_cdp(self.command_executor, timeout=timeout * 1000)
# Set user agent to prevent Cloudflare from blocking the browser
context = browser.new_context(
user_agent="Mozilla/5.0",
proxy=self.proxy
)
page = context.new_page()
page.set_viewport_size({"width": 1280, "height": 1024})
response = page.goto(url, timeout=timeout*1000)
page.wait_for_timeout(5000)
extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))
page.wait_for_timeout(extra_wait * 1000)
if response is None:
raise EmptyReply(url=url, status_code=None)
@ -146,15 +145,19 @@ class html_playwright(Fetcher):
self.content = page.content()
self.headers = response.all_headers()
# Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it
# JPEG is better here because the screenshots can be very very large
screenshot = page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024})
self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=90)
context.close()
browser.close()
class html_webdriver(Fetcher):
class base_html_webdriver(Fetcher):
if os.getenv("WEBDRIVER_URL"):
fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
else:
fetcher_description = "WebDriver Chrome/Javascript"
fetcher_list_order = 2
command_executor = ''
@ -220,9 +223,8 @@ class html_webdriver(Fetcher):
time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))
self.content = self.driver.page_source
self.headers = {}
def screenshot(self):
return self.driver.get_screenshot_as_png()
self.screenshot = self.driver.get_screenshot_as_png()
self.quit()
# Does the connection to the webdriver work? run a test connection.
def is_ready(self):
@ -248,7 +250,6 @@ class html_webdriver(Fetcher):
# "html_requests" is listed as the default fetcher in store.py!
class html_requests(Fetcher):
fetcher_description = "Basic fast Plaintext/HTTP Client"
fetcher_list_order = 1
def run(self,
url,
@ -283,3 +284,12 @@ class html_requests(Fetcher):
self.content = r.text
self.headers = r.headers
# System-wide (rather than per-site) switch: the public name "html_webdriver"
# is bound to the Playwright-based fetcher when PLAYWRIGHT_DRIVER_URL is set
# to a non-empty value, and to the WebDriver-based fetcher otherwise.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
html_webdriver = (
    base_html_playwright
    if use_playwright_as_chrome_fetcher
    else base_html_webdriver
)

@ -68,6 +68,7 @@ class perform_site_check():
fetcher = klass()
fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_code)
# Fetching complete, now filters
# @todo move to class / maybe inside of fetcher abstract base?
@ -192,9 +193,4 @@ class perform_site_check():
if not watch['title'] or not len(watch['title']):
update_obj['title'] = html_tools.extract_element(find='title', html_content=fetcher.content)
if self.datastore.data['settings']['application'].get('real_browser_save_screenshot', True):
screenshot = fetcher.screenshot()
fetcher.quit()
return changed_detected, update_obj, text_content_before_ignored_filter, screenshot
return changed_detected, update_obj, text_content_before_ignored_filter, fetcher.screenshot
Loading…
Cancel
Save