From 5b530ff61c3b292e904ea800b10afcebccd23717 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 24 Nov 2022 20:53:01 +0100 Subject: [PATCH] Configurable "Browser Steps" when Playwright/Chrome is configured (enter text, scroll, wait for text, click button etc) (#478) --- .gitignore | 1 + Dockerfile | 2 +- MANIFEST.in | 5 +- README.md | 11 + changedetectionio/__init__.py | 26 +- changedetectionio/blueprint/__init__.py | 0 .../blueprint/browser_steps/__init__.py | 226 ++++++++++ .../blueprint/browser_steps/browser_steps.py | 268 +++++++++++ changedetectionio/content_fetcher.py | 202 ++++++--- changedetectionio/fetch_site_status.py | 5 + changedetectionio/forms.py | 28 +- .../res/xpath_element_scraper.js | 36 +- changedetectionio/static/js/browser-steps.js | 425 ++++++++++++++++++ changedetectionio/static/js/stepper.js | 34 ++ changedetectionio/static/js/tabs.js | 5 +- .../static/js/visual-selector.js | 241 +++++----- .../static/styles/parts/browser-steps.scss | 81 ++++ .../static/styles/parts/spinners.scss | 44 ++ changedetectionio/static/styles/styles.css | 185 +++++--- changedetectionio/static/styles/styles.scss | 370 +++++++-------- changedetectionio/store.py | 2 +- changedetectionio/templates/edit.html | 86 +++- .../tests/fetchers/test_content.py | 2 +- .../tests/visualselector/test_fetch_data.py | 4 +- changedetectionio/update_worker.py | 54 +++ docs/browsersteps-anim.gif | Bin 0 -> 307994 bytes requirements.txt | 1 - 27 files changed, 1860 insertions(+), 484 deletions(-) create mode 100644 changedetectionio/blueprint/__init__.py create mode 100644 changedetectionio/blueprint/browser_steps/__init__.py create mode 100644 changedetectionio/blueprint/browser_steps/browser_steps.py create mode 100644 changedetectionio/static/js/browser-steps.js create mode 100644 changedetectionio/static/js/stepper.js create mode 100644 changedetectionio/static/styles/parts/browser-steps.scss create mode 100644 changedetectionio/static/styles/parts/spinners.scss create mode 100644 docs/browsersteps-anim.gif diff --git a/.gitignore b/.gitignore index 0655eb90..39fc0dd0 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ __pycache__ build dist venv +test-datastore/* test-datastore *.egg-info* .vscode/settings.json diff --git a/Dockerfile b/Dockerfile index 2272ea01..6b067afe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ RUN pip install --target=/dependencies -r /requirements.txt # Playwright is an alternative to Selenium # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing # https://github.com/dgtlmoon/changedetection.io/pull/1067 also musl/alpine (not supported) -RUN pip install --target=/dependencies playwright~=1.26 \ +RUN pip install --target=/dependencies playwright~=1.27.1 \ || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled." # Final image stage diff --git a/MANIFEST.in b/MANIFEST.in index 707fcc6d..f6e723b5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,7 +4,10 @@ recursive-include changedetectionio/static * recursive-include changedetectionio/model * recursive-include changedetectionio/tests * recursive-include changedetectionio/res * +prune changedetectionio/static/package-lock.json +prune changedetectionio/static/styles/node_modules +prune changedetectionio/static/styles/package-lock.json include changedetection.py global-exclude *.pyc global-exclude node_modules -global-exclude venv \ No newline at end of file +global-exclude venv diff --git a/README.md b/README.md index a6facc3d..18fcb351 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,17 @@ Available when connected to a +### Perform interactive browser steps + +Fill in text boxes, click buttons and more, setup your changedetection scenario. + +Using the **Browser Steps** configuration, add basic steps before performing change detection, such as logging into websites, adding a product to a cart, refining searches. + +Self-hosted web page change monitoring context difference + +After **Browser Steps** have been run, then visit the **Visual Selector** tab to refine the content you're interested in. +Requires Playwright to be enabled. + ## Installation ### Docker diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 750c7a48..dceefcc6 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -1,18 +1,20 @@ #!/usr/bin/python3 import datetime +import flask_login +import logging import os +import pytz import queue import threading import time +import timeago + from copy import deepcopy +from distutils.util import strtobool +from feedgen.feed import FeedGenerator from threading import Event -import flask_login -import logging -import pytz -import timeago -from feedgen.feed import FeedGenerator from flask import ( Flask, abort, @@ -27,7 +29,6 @@ from flask import ( ) from flask_login import login_required from flask_restful import abort, Api - from flask_wtf import CSRFProtect from changedetectionio import html_tools @@ -44,7 +45,6 @@ ticker_thread = None extra_stylesheets = [] update_q = queue.PriorityQueue() - notification_q = queue.Queue() app = Flask(__name__, @@ -97,7 +97,7 @@ def _jinja2_filter_datetime(watch_obj, format="%Y-%m-%d %H:%M:%S"): # Worker thread tells us which UUID it is currently processing. for t in running_update_threads: if t.current_uuid == watch_obj['uuid']: - return ' Checking now' + return ' Checking now' if watch_obj['last_checked'] == 0: return 'Not yet' @@ -525,6 +525,7 @@ def changedetection_app(config=None, datastore_o=None): def edit_page(uuid): from changedetectionio import forms + from changedetectionio.blueprint.browser_steps.browser_steps import browser_step_ui_config using_default_check_time = True # More for testing, possible to return the first/only @@ -558,6 +559,8 @@ def changedetection_app(config=None, datastore_o=None): data=default, ) + # form.browser_steps[0] can be assumed that we 'goto url' first + if datastore.proxy_list is None: # @todo - Couldn't get setattr() etc dynamic addition working, so remove it instead del form.proxy @@ -650,6 +653,7 @@ def changedetection_app(config=None, datastore_o=None): watch.get('fetch_backend', None) is None and system_uses_webdriver) else False output = render_template("edit.html", + browser_steps_config=browser_step_ui_config, current_base_url=datastore.data['settings']['application']['base_url'], emailprefix=os.getenv('NOTIFICATION_MAIL_BUTTON_PREFIX', False), form=form, @@ -661,7 +665,6 @@ def changedetection_app(config=None, datastore_o=None): settings_application=datastore.data['settings']['application'], using_global_webdriver_wait=default['webdriver_delay'] is None, uuid=uuid, - visualselector_data_is_ready=visualselector_data_is_ready, visualselector_enabled=visualselector_enabled, watch=watch ) @@ -1190,7 +1193,6 @@ def changedetection_app(config=None, datastore_o=None): else: # No tag, no uuid, add everything. for watch_uuid, watch in datastore.data['watching'].items(): - if watch_uuid not in running_uuids and not datastore.data['watching'][watch_uuid]['paused']: update_q.put((1, watch_uuid)) i += 1 @@ -1308,9 +1310,11 @@ def changedetection_app(config=None, datastore_o=None): # paste in etc return redirect(url_for('index')) + import changedetectionio.blueprint.browser_steps as browser_steps + app.register_blueprint(browser_steps.construct_blueprint(datastore), url_prefix='/browser-steps') + # @todo handle ctrl break ticker_thread = threading.Thread(target=ticker_thread_check_time_launch_checks).start() - threading.Thread(target=notification_runner).start() # Check for new release version, but not when running in test/build or pytest diff --git a/changedetectionio/blueprint/__init__.py b/changedetectionio/blueprint/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py new file mode 100644 index 00000000..b877a20f --- /dev/null +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -0,0 +1,226 @@ + +# HORRIBLE HACK BUT WORKS :-) PR anyone? +# +# Why? +# `browsersteps_playwright_browser_interface.chromium.connect_over_cdp()` will only run once without async() +# - this flask app is not async() +# - browserless has a single timeout/keepalive which applies to the session made at .connect_over_cdp() +# +# So it means that we must unfortunately for now just keep a single timer since .connect_over_cdp() was run +# and know when that reaches timeout/keepalive :( when that time is up, restart the connection and tell the user +# that their time is up, insert another coin. (reload) +# +# Bigger picture +# - It's horrible that we have this click+wait deal, some nice socket.io solution using something similar +# to what the browserless debug UI already gives us would be smarter.. +# +# OR +# - Some API call that should be hacked into browserless or playwright that we can "/api/bump-keepalive/{session_id}/60" +# So we can tell it that we need more time (run this on each action) +# +# OR +# - use multiprocessing to bump this over to its own process and add some transport layer (queue/pipes) + + + + +from distutils.util import strtobool +from flask import Blueprint, request, make_response +from flask_login import login_required +import os +import logging +from changedetectionio.store import ChangeDetectionStore + +browsersteps_live_ui_o = {} +browsersteps_playwright_browser_interface = None +browsersteps_playwright_browser_interface_start_time = None +browsersteps_playwright_browser_interface_browser = None +browsersteps_playwright_browser_interface_end_time = None + + +def cleanup_playwright_session(): + print("Cleaning up old playwright session because time was up") + global browsersteps_playwright_browser_interface + global browsersteps_live_ui_o + global browsersteps_playwright_browser_interface_browser + global browsersteps_playwright_browser_interface + global browsersteps_playwright_browser_interface_start_time + global browsersteps_playwright_browser_interface_end_time + + import psutil + + current_process = psutil.Process() + children = current_process.children(recursive=True) + for child in children: + print (child) + print('Child pid is {}'.format(child.pid)) + + # .stop() hangs sometimes if its called when there are no children to process + # but how do we know this is our child? dunno + if children: + browsersteps_playwright_browser_interface.stop() + + browsersteps_live_ui_o = {} + browsersteps_playwright_browser_interface = None + browsersteps_playwright_browser_interface_start_time = None + browsersteps_playwright_browser_interface_browser = None + browsersteps_playwright_browser_interface_end_time = None + print ("Cleaning up old playwright session because time was up - done") + +def construct_blueprint(datastore: ChangeDetectionStore): + + browser_steps_blueprint = Blueprint('browser_steps', __name__, template_folder="templates") + + @login_required + @browser_steps_blueprint.route("/browsersteps_update", methods=['GET', 'POST']) + def browsersteps_ui_update(): + import base64 + import playwright._impl._api_types + import time + + from changedetectionio.blueprint.browser_steps import browser_steps + + global browsersteps_live_ui_o, browsersteps_playwright_browser_interface_end_time + global browsersteps_playwright_browser_interface_browser + global browsersteps_playwright_browser_interface + global browsersteps_playwright_browser_interface_start_time + + step_n = None + remaining =0 + uuid = request.args.get('uuid') + + browsersteps_session_id = request.args.get('browsersteps_session_id') + + if not browsersteps_session_id: + return make_response('No browsersteps_session_id specified', 500) + + # Because we don't "really" run in a context manager ( we make the playwright interface global/long-living ) + # We need to manage the shutdown when the time is up + if browsersteps_playwright_browser_interface_end_time: + remaining = browsersteps_playwright_browser_interface_end_time-time.time() + if browsersteps_playwright_browser_interface_end_time and remaining <= 0: + + + cleanup_playwright_session() + + return make_response('Browser session expired, please reload the Browser Steps interface', 500) + + + # Actions - step/apply/etc, do the thing and return state + if request.method == 'POST': + # @todo - should always be an existing session + step_operation = request.form.get('operation') + step_selector = request.form.get('selector') + step_optional_value = request.form.get('optional_value') + step_n = int(request.form.get('step_n')) + is_last_step = strtobool(request.form.get('is_last_step')) + + if step_operation == 'Goto site': + step_operation = 'goto_url' + step_optional_value = None + step_selector = datastore.data['watching'][uuid].get('url') + + # @todo try.. accept.. nice errors not popups.. + try: + + this_session = browsersteps_live_ui_o.get(browsersteps_session_id) + if not this_session: + print("Browser exited") + return make_response('Browser session ran out of time :( Please reload this page.', 401) + + this_session.call_action(action_name=step_operation, + selector=step_selector, + optional_value=step_optional_value) + except playwright._impl._api_types.TimeoutError as e: + print("Element wasnt found :-(", step_operation) + return make_response("Element was not found on page", 401) + + except playwright._impl._api_types.Error as e: + # Browser/playwright level error + print("Browser error - got playwright._impl._api_types.Error, try reloading the session/browser") + print (str(e)) + + # Try to find something of value to give back to the user + for l in str(e).splitlines(): + if 'DOMException' in l: + return make_response(l, 401) + + return make_response('Browser session ran out of time :( Please reload this page.', 401) + + # Get visual selector ready/update its data (also use the current filter info from the page?) + # When the last 'apply' button was pressed + # @todo this adds overhead because the xpath selection is happening twice + u = this_session.page.url + if is_last_step and u: + (screenshot, xpath_data) = this_session.request_visualselector_data() + datastore.save_screenshot(watch_uuid=uuid, screenshot=screenshot) + datastore.save_xpath_data(watch_uuid=uuid, data=xpath_data) + + # Setup interface + if request.method == 'GET': + + if not browsersteps_playwright_browser_interface: + print("Starting connection with playwright") + logging.debug("browser_steps.py connecting") + from playwright.sync_api import sync_playwright + + browsersteps_playwright_browser_interface = sync_playwright().start() + + + time.sleep(1) + # At 20 minutes, some other variable is closing it + # @todo find out what it is and set it + seconds_keepalive = int(os.getenv('BROWSERSTEPS_MINUTES_KEEPALIVE', 10)) * 60 + + # keep it alive for 10 seconds more than we advertise, sometimes it helps to keep it shutting down cleanly + keepalive = "&timeout={}".format(((seconds_keepalive+3) * 1000)) + try: + browsersteps_playwright_browser_interface_browser = browsersteps_playwright_browser_interface.chromium.connect_over_cdp( + os.getenv('PLAYWRIGHT_DRIVER_URL', '') + keepalive) + except Exception as e: + if 'ECONNREFUSED' in str(e): + return make_response('Unable to start the Playwright session properly, is it running?', 401) + + browsersteps_playwright_browser_interface_end_time = time.time() + (seconds_keepalive-3) + print("Starting connection with playwright - done") + + if not browsersteps_live_ui_o.get(browsersteps_session_id): + # Boot up a new session + proxy_id = datastore.get_preferred_proxy_for_watch(uuid=uuid) + proxy = None + if proxy_id: + proxy_url = datastore.proxy_list.get(proxy_id).get('url') + if proxy_url: + proxy = {'server': proxy_url} + print("Browser Steps: UUID {} Using proxy {}".format(uuid, proxy_url)) + + # Begin the new "Playwright Context" that re-uses the playwright interface + # Each session is a "Playwright Context" as a list, that uses the playwright interface + browsersteps_live_ui_o[browsersteps_session_id] = browser_steps.browsersteps_live_ui( + playwright_browser=browsersteps_playwright_browser_interface_browser, + proxy=proxy) + this_session = browsersteps_live_ui_o[browsersteps_session_id] + + if not this_session.page: + cleanup_playwright_session() + return make_response('Browser session ran out of time :( Please reload this page.', 401) + + try: + state = this_session.get_current_state() + except playwright._impl._api_types.Error as e: + return make_response("Browser session ran out of time :( Please reload this page."+str(e), 401) + + p = {'screenshot': "data:image/png;base64,{}".format( + base64.b64encode(state[0]).decode('ascii')), + 'xpath_data': state[1], + 'session_age_start': this_session.age_start, + 'browser_time_remaining': round(remaining) + } + + + # @todo BSON/binary JSON, faster xfer, OR pick it off the disk + return p + + return browser_steps_blueprint + + diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py new file mode 100644 index 00000000..1207d192 --- /dev/null +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -0,0 +1,268 @@ +#!/usr/bin/python3 + +import os +import time +import re +from random import randint + +# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end +# 0- off, 1- on +browser_step_ui_config = {'Choose one': '0 0', + # 'Check checkbox': '1 0', + # 'Click button containing text': '0 1', + # 'Scroll to bottom': '0 0', + # 'Scroll to element': '1 0', + # 'Scroll to top': '0 0', + # 'Switch to iFrame by index number': '0 1' + # 'Uncheck checkbox': '1 0', + # @todo + 'Check checkbox': '1 0', + 'Click X,Y': '0 1', + 'Click element if exists': '1 0', + 'Click element': '1 0', + 'Click element containing text': '0 1', + 'Enter text in field': '1 1', +# 'Extract text and use as filter': '1 0', + 'Goto site': '0 0', + 'Press Enter': '0 0', + 'Select by label': '1 1', + 'Scroll down': '0 0', + 'Uncheck checkbox': '1 0', + 'Wait for seconds': '0 1', + 'Wait for text': '0 1', + # 'Press Page Down': '0 0', + # 'Press Page Up': '0 0', + # weird bug, come back to it later + } + + +# Good reference - https://playwright.dev/python/docs/input +# https://pythonmana.com/2021/12/202112162236307035.html +# +# ONLY Works in Playwright because we need the fullscreen screenshot +class steppable_browser_interface(): + page = None + + # Convert and perform "Click Button" for example + def call_action(self, action_name, selector=None, optional_value=None): + now = time.time() + call_action_name = re.sub('[^0-9a-zA-Z]+', '_', action_name.lower()) + if call_action_name == 'choose_one': + return + + print("> action calling", call_action_name) + # https://playwright.dev/python/docs/selectors#xpath-selectors + if selector.startswith('/') and not selector.startswith('//'): + selector = "xpath=" + selector + + action_handler = getattr(self, "action_" + call_action_name) + + # Support for Jinja2 variables in the value and selector + from jinja2 import Environment + jinja2_env = Environment(extensions=['jinja2_time.TimeExtension']) + + if selector and ('{%' in selector or '{{' in selector): + selector = str(jinja2_env.from_string(selector).render()) + + if optional_value and ('{%' in optional_value or '{{' in optional_value): + optional_value = str(jinja2_env.from_string(optional_value).render()) + + action_handler(selector, optional_value) + self.page.wait_for_timeout(3 * 1000) + print("Call action done in", time.time() - now) + + def action_goto_url(self, url, optional_value): + # self.page.set_viewport_size({"width": 1280, "height": 5000}) + now = time.time() + response = self.page.goto(url, timeout=0, wait_until='domcontentloaded') + print("Time to goto URL", time.time() - now) + + # Wait_until = commit + # - `'commit'` - consider operation to be finished when network response is received and the document started loading. + # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds + # This seemed to solve nearly all 'TimeoutErrors' + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.page.wait_for_timeout(extra_wait * 1000) + + def action_click_element_containing_text(self, selector=None, value=''): + if not len(value.strip()): + return + elem = self.page.get_by_text(value) + if elem.count(): + elem.first.click(delay=randint(200, 500)) + + def action_enter_text_in_field(self, selector, value): + if not len(selector.strip()): + return + + self.page.fill(selector, value, timeout=10 * 1000) + + def action_click_element(self, selector, value): + print("Clicking element") + if not len(selector.strip()): + return + self.page.click(selector, timeout=10 * 1000, delay=randint(200, 500)) + + def action_click_element_if_exists(self, selector, value): + import playwright._impl._api_types as _api_types + print("Clicking element if exists") + if not len(selector.strip()): + return + try: + self.page.click(selector, timeout=10 * 1000, delay=randint(200, 500)) + except _api_types.TimeoutError as e: + return + except _api_types.Error as e: + # Element was there, but page redrew and now its long long gone + return + + def action_click_x_y(self, selector, value): + x, y = value.strip().split(',') + x = int(float(x.strip())) + y = int(float(y.strip())) + self.page.mouse.click(x=x, y=y, delay=randint(200, 500)) + + def action_scroll_down(self, selector, value): + # Some sites this doesnt work on for some reason + self.page.mouse.wheel(0, 600) + self.page.wait_for_timeout(1000) + + def action_wait_for_seconds(self, selector, value): + self.page.wait_for_timeout(int(value) * 1000) + + # @todo - in the future make some popout interface to capture what needs to be set + # https://playwright.dev/python/docs/api/class-keyboard + def action_press_enter(self, selector, value): + self.page.keyboard.press("Enter", delay=randint(200, 500)) + + def action_press_page_up(self, selector, value): + self.page.keyboard.press("PageUp", delay=randint(200, 500)) + + def action_press_page_down(self, selector, value): + self.page.keyboard.press("PageDown", delay=randint(200, 500)) + + def action_check_checkbox(self, selector, value): + self.page.locator(selector).check() + + def action_uncheck_checkbox(self, selector, value): + self.page.locator(selector).uncheck() + + +# Responsible for maintaining a live 'context' with browserless +# @todo - how long do contexts live for anyway? +class browsersteps_live_ui(steppable_browser_interface): + context = None + page = None + render_extra_delay = 1 + stale = False + # bump and kill this if idle after X sec + age_start = 0 + + # use a special driver, maybe locally etc + command_executor = os.getenv( + "PLAYWRIGHT_BROWSERSTEPS_DRIVER_URL" + ) + # if not.. + if not command_executor: + command_executor = os.getenv( + "PLAYWRIGHT_DRIVER_URL", + 'ws://playwright-chrome:3000' + ).strip('"') + + browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') + + def __init__(self, playwright_browser, proxy=None): + self.age_start = time.time() + self.playwright_browser = playwright_browser + if self.context is None: + self.connect(proxy=proxy) + + # Connect and setup a new context + def connect(self, proxy=None): + # Should only get called once - test that + keep_open = 1000 * 60 * 5 + now = time.time() + + # @todo handle multiple contexts, bind a unique id from the browser on each req? + self.context = self.playwright_browser.new_context( + # @todo + # user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0', + # proxy=self.proxy, + # This is needed to enable JavaScript execution on GitHub and others + bypass_csp=True, + # Should never be needed + accept_downloads=False, + proxy=proxy + ) + + self.page = self.context.new_page() + + # self.page.set_default_navigation_timeout(keep_open) + self.page.set_default_timeout(keep_open) + # @todo probably this doesnt work + self.page.on( + "close", + self.mark_as_closed, + ) + # Listen for all console events and handle errors + self.page.on("console", lambda msg: print(f"Browser steps console - {msg.type}: {msg.text} {msg.args}")) + + print("time to browser setup", time.time() - now) + self.page.wait_for_timeout(1 * 1000) + + def mark_as_closed(self): + print("Page closed, cleaning up..") + + @property + def has_expired(self): + if not self.page: + return True + + + def get_current_state(self): + """Return the screenshot and interactive elements mapping, generally always called after action_()""" + from pkg_resources import resource_string + xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8') + now = time.time() + self.page.wait_for_timeout(1 * 1000) + + # The actual screenshot + screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=40) + + self.page.evaluate("var include_filters=''") + # Go find the interactive elements + # @todo in the future, something smarter that can scan for elements with .click/focus etc event handlers? + elements = 'a,button,input,select,textarea,i,th,td,p,li,h1,h2,h3,h4' + xpath_element_js = xpath_element_js.replace('%ELEMENTS%', elements) + xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") + # So the JS will find the smallest one first + xpath_data['size_pos'] = sorted(xpath_data['size_pos'], key=lambda k: k['width'] * k['height'], reverse=True) + print("Time to complete get_current_state of browser", time.time() - now) + # except + # playwright._impl._api_types.Error: Browser closed. + # @todo show some countdown timer? + return (screenshot, xpath_data) + + def request_visualselector_data(self): + """ + Does the same that the playwright operation in content_fetcher does + This is used to just bump the VisualSelector data so it' ready to go if they click on the tab + @todo refactor and remove duplicate code, add include_filters + :param xpath_data: + :param screenshot: + :param current_include_filters: + :return: + """ + + self.page.evaluate("var include_filters=''") + from pkg_resources import resource_string + # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector + # @todo dont duplicate these selectors, or just let them both use the same data? + xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8') + xpath_element_js = xpath_element_js.replace('%ELEMENTS%', + 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section') + xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") + + screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) + + return (screenshot, xpath_data) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index fdcd9988..18d40ad2 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -1,7 +1,7 @@ from abc import abstractmethod -from pkg_resources import resource_string import chardet import json +import logging import os import requests import sys @@ -30,6 +30,12 @@ class JSActionExceptions(Exception): self.message = message return +class BrowserStepsStepTimout(Exception): + def __init__(self, step_n): + self.step_n = step_n + return + + class PageUnloadable(Exception): def __init__(self, status_code, url, screenshot=False, message=False): # Set this so we can use it in other parts of the app @@ -70,6 +76,8 @@ class Fetcher(): status_code = None content = None headers = None + browser_steps = None + browser_steps_screenshot_path = None fetcher_description = "No description" webdriver_js_execute_code = None @@ -86,8 +94,10 @@ class Fetcher(): render_extract_delay = 0 def __init__(self): + from pkg_resources import resource_string # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8') + self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section') @abstractmethod def get_error(self): @@ -113,11 +123,62 @@ class Fetcher(): def get_last_status_code(self): return self.status_code + @abstractmethod + def screenshot_step(self, step_n): + return None + @abstractmethod # Return true/false if this checker is ready to run, in the case it needs todo some special config check etc def is_ready(self): return True + def iterate_browser_steps(self): + from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface + from playwright._impl._api_types import TimeoutError + from jinja2 import Environment + jinja2_env = Environment(extensions=['jinja2_time.TimeExtension']) + + step_n = 0 + + if self.browser_steps is not None and len(self.browser_steps): + interface = steppable_browser_interface() + interface.page = self.page + + valid_steps = filter(lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), self.browser_steps) + + for step in valid_steps: + step_n += 1 + print(">> Iterating check - browser Step n {} - {}...".format(step_n, step['operation'])) + self.screenshot_step("before-"+str(step_n)) + self.save_step_html("before-"+str(step_n)) + try: + optional_value = step['optional_value'] + selector = step['selector'] + # Support for jinja2 template in step values, with date module added + if '{%' in step['optional_value'] or '{{' in step['optional_value']: + optional_value = str(jinja2_env.from_string(step['optional_value']).render()) + if '{%' in step['selector'] or '{{' in step['selector']: + selector = str(jinja2_env.from_string(step['selector']).render()) + + getattr(interface, "call_action")(action_name=step['operation'], + selector=selector, + optional_value=optional_value) + self.screenshot_step(step_n) + self.save_step_html(step_n) + except TimeoutError: + # Stop processing here + raise BrowserStepsStepTimout(step_n=step_n) + + + + # It's always good to reset these + def delete_browser_steps_screenshots(self): + import glob + if self.browser_steps_screenshot_path is not None: + dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg') + files = glob.glob(dest) + for f in files: + os.unlink(f) # Maybe for the future, each fetcher provides its own diff output, could be used for text, image # the current one would return javascript output (as we use JS to generate the diff) @@ -136,7 +197,6 @@ def available_fetchers(): return p - class base_html_playwright(Fetcher): fetcher_description = "Playwright {}/Javascript".format( os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() @@ -174,15 +234,26 @@ class base_html_playwright(Fetcher): # allow per-watch proxy selection override if proxy_override: - # https://playwright.dev/docs/network#http-proxy - from urllib.parse import urlparse - parsed = urlparse(proxy_override) - proxy_url = "{}://{}:{}".format(parsed.scheme, parsed.hostname, parsed.port) - self.proxy = {'server': proxy_url} - if parsed.username: - self.proxy['username'] = parsed.username - if parsed.password: - self.proxy['password'] = parsed.password + self.proxy = {'server': proxy_override} + + def screenshot_step(self, step_n=''): + + # There's a bug where we need to do it twice or it doesnt take the whole page, dont know why. + self.page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}) + screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85) + + if self.browser_steps_screenshot_path is not None: + destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) + logging.debug("Saving step screenshot to {}".format(destination)) + with open(destination, 'wb') as f: + f.write(screenshot) + + def save_step_html(self, step_n): + content = self.page.content() + destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) + logging.debug("Saving step HTML to {}".format(destination)) + with open(destination, 'w') as f: + f.write(content) def run(self, url, @@ -195,9 +266,9 @@ class base_html_playwright(Fetcher): from playwright.sync_api import sync_playwright import playwright._impl._api_types - from playwright._impl._api_types import Error, TimeoutError - response = None + self.delete_browser_steps_screenshots() + response = None with sync_playwright() as p: browser_type = getattr(p, self.browser_type) @@ -217,89 +288,86 @@ class base_html_playwright(Fetcher): accept_downloads=False ) + self.page = context.new_page() if len(request_headers): context.set_extra_http_headers(request_headers) - page = context.new_page() try: - page.set_default_navigation_timeout(90000) - page.set_default_timeout(90000) + self.page.set_default_navigation_timeout(90000) + self.page.set_default_timeout(90000) # Listen for all console events and handle errors - page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) + self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) # Bug - never set viewport size BEFORE page.goto + # Waits for the next navigation. Using Python context manager # prevents a race condition between clicking and waiting for a navigation. - with page.expect_navigation(): - response = page.goto(url, wait_until='load') - + with self.page.expect_navigation(): + response = self.page.goto(url, wait_until='load') + # Wait_until = commit + # - `'commit'` - consider operation to be finished when network response is received and the document started loading. + # Better to not use any smarts from Playwright and just wait an arbitrary number of seconds + # This seemed to solve nearly all 'TimeoutErrors' + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay + self.page.wait_for_timeout(extra_wait * 1000) + + if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): + self.page.evaluate(self.webdriver_js_execute_code) except playwright._impl._api_types.TimeoutError as e: context.close() browser.close() # This can be ok, we will try to grab what we could retrieve pass - except Exception as e: - print("other exception when page.goto") - print(str(e)) + print ("other exception when page.goto") + print (str(e)) context.close() browser.close() - raise PageUnloadable(url=url, status_code=None, message=e.message) + raise PageUnloadable(url=url, status_code=None) + if response is None: context.close() browser.close() - print("response object was none") + print ("response object was none") raise EmptyReply(url=url, status_code=None) + # Bug 2(?) Set the viewport size AFTER loading the page + self.page.set_viewport_size({"width": 1280, "height": 1024}) + + # Run Browser Steps here + self.iterate_browser_steps() - # Removed browser-set-size, seemed to be needed to make screenshots work reliably in older playwright versions - # Was causing exceptions like 'waiting for page but content is changing' etc - # https://www.browserstack.com/docs/automate/playwright/change-browser-window-size 1280x720 should be the default - extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay time.sleep(extra_wait) - if self.webdriver_js_execute_code is not None: - try: - page.evaluate(self.webdriver_js_execute_code) - except Exception as e: - # Is it possible to get a screenshot? - error_screenshot = False - try: - page.screenshot(type='jpeg', - clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}, - quality=1) - - # The actual screenshot - error_screenshot = page.screenshot(type='jpeg', - full_page=True, - quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) - except Exception as s: - pass - - raise JSActionExceptions(status_code=response.status, screenshot=error_screenshot, message=str(e), url=url) - - else: - # JS eval was run, now we also wait some time if possible to let the page settle - if self.render_extract_delay: - page.wait_for_timeout(self.render_extract_delay * 1000) - - page.wait_for_timeout(500) - - self.content = page.content() + + self.content = self.page.content() + self.status_code = response.status + + if len(self.page.content().strip()) == 0: + context.close() + browser.close() + print ("Content was empty") + raise EmptyReply(url=url, status_code=None) + + # Bug 2(?) Set the viewport size AFTER loading the page + self.page.set_viewport_size({"width": 1280, "height": 1024}) + self.status_code = response.status + self.content = self.page.content() self.headers = response.all_headers() + # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) if current_include_filters is not None: - page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) + self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) else: - page.evaluate("var include_filters=''") + self.page.evaluate("var include_filters=''") - self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}") + self.xpath_data = self.page.evaluate("async () => {" + self.xpath_element_js.replace('%ELEMENTS%', 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary') + "}") # Bug 3 in Playwright screenshot handling # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it @@ -310,26 +378,17 @@ class base_html_playwright(Fetcher): # acceptable screenshot quality here try: # Quality set to 1 because it's not used, just used as a work-around for a bug, no need to change this. - page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}, quality=1) + self.page.screenshot(type='jpeg', clip={'x': 1.0, 'y': 1.0, 'width': 1280, 'height': 1024}, quality=1) # The actual screenshot - self.screenshot = page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) + self.screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) except Exception as e: context.close() browser.close() raise ScreenshotUnavailable(url=url, status_code=None) - if len(self.content.strip()) == 0: - context.close() - browser.close() - print("Content was empty") - raise EmptyReply(url=url, status_code=None, screenshot=self.screenshot) - context.close() browser.close() - if not ignore_status_codes and self.status_code!=200: - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=self.content, screenshot=self.screenshot) - class base_html_webdriver(Fetcher): if os.getenv("WEBDRIVER_URL"): fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) @@ -423,7 +482,6 @@ class base_html_webdriver(Fetcher): def is_ready(self): from selenium import webdriver from selenium.webdriver.common.desired_capabilities import DesiredCapabilities - from selenium.common.exceptions import WebDriverException self.driver = webdriver.Remote( command_executor=self.command_executor, diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 31c0bb7f..68762f45 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -108,6 +108,11 @@ class perform_site_check(): elif system_webdriver_delay is not None: fetcher.render_extract_delay = system_webdriver_delay + # Possible conflict + if prefer_backend == 'html_webdriver': + fetcher.browser_steps = watch.get('browser_steps', None) + fetcher.browser_steps_screenshot_path = os.path.join(self.datastore.datastore_path, uuid) + if watch.get('webdriver_js_execute_code') is not None and watch.get('webdriver_js_execute_code').strip(): fetcher.webdriver_js_execute_code = watch.get('webdriver_js_execute_code') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 2904f461..57dd7c77 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -1,11 +1,10 @@ +import os import re from wtforms import ( BooleanField, - Field, Form, IntegerField, - PasswordField, RadioField, SelectField, StringField, @@ -13,15 +12,17 @@ from wtforms import ( TextAreaField, fields, validators, - widgets, + widgets ) +from wtforms.fields import FieldList from wtforms.validators import ValidationError +# default +# each select