From 823a0c99f4eca80bc116e4a0ac61685c7e8eb57c Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 11 Feb 2024 00:09:12 +0100 Subject: [PATCH] Code - Split content fetcher code up (playwright, puppeteer and requests), fix puppeteer direct chrome support (#2169) --- .github/workflows/pypi-release.yml | 7 +- .github/workflows/test-only.yml | 79 ++- Dockerfile | 9 +- changedetectionio/__init__.py | 4 +- .../blueprint/browser_steps/TODO.txt | 7 + .../blueprint/browser_steps/browser_steps.py | 8 +- .../blueprint/check_proxies/__init__.py | 14 +- changedetectionio/content_fetcher.py | 658 ------------------ .../content_fetchers/__init__.py | 38 + changedetectionio/content_fetchers/base.py | 137 ++++ .../content_fetchers/exceptions/__init__.py | 90 +++ .../content_fetchers/playwright.py | 210 ++++++ .../content_fetchers/puppeteer.py | 234 +++++++ .../content_fetchers/requests.py | 91 +++ .../res/puppeteer_fetch.js | 0 .../res/stock-not-in-stock.js | 0 .../res/xpath_element_scraper.js | 0 .../content_fetchers/webdriver_selenium.py | 119 ++++ changedetectionio/flask_app.py | 29 +- changedetectionio/forms.py | 50 +- changedetectionio/processors/__init__.py | 20 +- .../processors/text_json_diff.py | 7 +- .../run_custom_browser_url_tests.sh | 14 +- changedetectionio/run_proxy_tests.sh | 50 +- changedetectionio/run_socks_proxy_tests.sh | 43 ++ .../tests/proxy_list/test_multiple_proxy.py | 18 +- .../proxy_list/test_select_custom_proxy.py | 3 +- changedetectionio/tests/test_backend.py | 2 +- changedetectionio/tests/test_errorhandling.py | 12 +- .../tests/visualselector/test_fetch_data.py | 1 - changedetectionio/update_worker.py | 33 +- requirements.txt | 6 + 32 files changed, 1166 insertions(+), 827 deletions(-) create mode 100644 changedetectionio/blueprint/browser_steps/TODO.txt delete mode 100644 changedetectionio/content_fetcher.py create mode 100644 changedetectionio/content_fetchers/__init__.py create mode 100644 changedetectionio/content_fetchers/base.py create mode 100644 
changedetectionio/content_fetchers/exceptions/__init__.py create mode 100644 changedetectionio/content_fetchers/playwright.py create mode 100644 changedetectionio/content_fetchers/puppeteer.py create mode 100644 changedetectionio/content_fetchers/requests.py rename changedetectionio/{ => content_fetchers}/res/puppeteer_fetch.js (100%) rename changedetectionio/{ => content_fetchers}/res/stock-not-in-stock.js (100%) rename changedetectionio/{ => content_fetchers}/res/xpath_element_scraper.js (100%) create mode 100644 changedetectionio/content_fetchers/webdriver_selenium.py create mode 100755 changedetectionio/run_socks_proxy_tests.sh diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 1b2ffb76..b708c9a8 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -11,7 +11,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.x" + python-version: "3.11" - name: Install pypa/build run: >- python3 -m @@ -38,9 +38,14 @@ jobs: with: name: python-package-distributions path: dist/ + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' - name: Test that the basic pip built package runs without error run: | set -ex + sudo pip3 install --upgrade pip pip3 install dist/changedetection.io*.whl changedetection.io -d /tmp -p 10000 & sleep 3 diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index 231105a8..1b2af49f 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -28,12 +28,12 @@ jobs: docker network create changedet-network - # Selenium and sockpuppetbrowser + # Selenium docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4 - docker run --network changedet-network -d --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest - # For 
accessing custom browser tests - docker run --network changedet-network -d --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url --rm dgtlmoon/sockpuppetbrowser:latest + # SocketPuppetBrowser + Extra for custom browser test + docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest + docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest - name: Build changedetection.io container for testing run: | @@ -65,42 +65,69 @@ jobs: # The default pytest logger_level is TRACE # To change logger_level for pytest(test/conftest.py), # append the docker option. e.g. '-e LOGGER_LEVEL=DEBUG' - docker run --network changedet-network test-changedetectionio bash -c 'cd changedetectionio && ./run_basic_tests.sh' + docker run --name test-cdio-basic-tests --network changedet-network test-changedetectionio bash -c 'cd changedetectionio && ./run_basic_tests.sh' - - name: Specific tests in built container for Selenium - run: | - # Selenium fetch - docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py' - - - name: Specific tests in built container for Playwright and SocketPuppetBrowser +# PLAYWRIGHT/NODE-> CDP + - name: Playwright and SocketPuppetBrowser - Specific tests in built container run: | # Playwright via Sockpuppetbrowser fetch + # tests/visualselector/test_fetch_data.py will do browser steps docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest 
tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py' - - name: Specific tests in built container for headers and requests checks with Playwright + - name: Playwright and SocketPuppetBrowser - Headers and requests run: | # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' - - name: Specific tests in built container for headers and requests checks with Selenium + - name: Playwright and SocketPuppetBrowser - Restock detection + run: | + # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it + docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py' + +# STRAIGHT TO CDP + - name: Pyppeteer and SocketPuppetBrowser - Specific tests in built container run: | - docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' + # Playwright via Sockpuppetbrowser fetch + docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "FAST_PUPPETEER_CHROME_FETCHER=True" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest 
tests/visualselector/test_fetch_data.py' - - name: Test built container restock detection via Playwright + - name: Pyppeteer and SocketPuppetBrowser - Headers and requests checks + run: | + # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers + docker run --name "changedet" --hostname changedet --rm -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' + + - name: Pyppeteer and SocketPuppetBrowser - Restock detection run: | # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it - docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py' + docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py' +# SELENIUM + - name: Specific tests in built container for Selenium + run: | + # Selenium fetch + docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py' + + - name: Specific tests in built container for headers and requests checks with Selenium + run: | + docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e 
"WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' + +# OTHER STUFF - name: Test SMTP notification mime types run: | # SMTP content types - needs the 'Debug SMTP server/echo message back server' container from above docker run --rm --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/smtp/test_notification_smtp.py' - - name: Test proxy interaction + # @todo Add a test via playwright/puppeteer + # squid with auth is tested in run_proxy_tests.sh -> tests/proxy_list/test_select_custom_proxy.py + - name: Test proxy squid style interaction run: | cd changedetectionio ./run_proxy_tests.sh - # And again with PLAYWRIGHT_DRIVER_URL=.. + cd .. + + - name: Test proxy SOCKS5 style interaction + run: | + cd changedetectionio + ./run_socks_proxy_tests.sh cd .. - name: Test custom browser URL @@ -174,6 +201,16 @@ jobs: # @todo - scan the container log to see the right "graceful shutdown" text exists docker rm sig-test -#export WEBDRIVER_URL=http://localhost:4444/wd/hub -#pytest tests/fetchers/test_content.py -#pytest tests/test_errorhandling.py + - name: Dump container log + if: always() + run: | + mkdir output-logs + docker logs test-cdio-basic-tests > output-logs/test-cdio-basic-tests-stdout.txt + docker logs test-cdio-basic-tests 2> output-logs/test-cdio-basic-tests-stderr.txt + + - name: Store container log + if: always() + uses: actions/upload-artifact@v1 + with: + name: test-cdio-basic-tests-output + path: output-logs diff --git a/Dockerfile b/Dockerfile index 68994e89..e592c9bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,8 @@ # pip dependencies install stage -FROM python:3.11-slim-bookworm as builder + +# @NOTE! I would love to move to 3.11 but it breaks the async handler in changedetectionio/content_fetchers/puppeteer.py +# If you know how to fix it, please do! 
and test it for both 3.10 and 3.11 +FROM python:3.10-slim-bookworm as builder # See `cryptography` pin comment in requirements.txt ARG CRYPTOGRAPHY_DONT_BUILD_RUST=1 @@ -25,11 +28,11 @@ RUN pip install --target=/dependencies -r /requirements.txt # Playwright is an alternative to Selenium # Excluded this package from requirements.txt to prevent arm/v6 and arm/v7 builds from failing # https://github.com/dgtlmoon/changedetection.io/pull/1067 also musl/alpine (not supported) -RUN pip install --target=/dependencies playwright~=1.40 \ +RUN pip install --target=/dependencies playwright~=1.41.2 \ || echo "WARN: Failed to install Playwright. The application can still run, but the Playwright option will be disabled." # Final image stage -FROM python:3.11-slim-bookworm +FROM python:3.10-slim-bookworm RUN apt-get update && apt-get install -y --no-install-recommends \ libxslt1.1 \ diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 677fd232..e2bf5ddc 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -6,11 +6,11 @@ __version__ = '0.45.14' from distutils.util import strtobool from json.decoder import JSONDecodeError - +import os +#os.environ['EVENTLET_NO_GREENDNS'] = 'yes' import eventlet import eventlet.wsgi import getopt -import os import signal import socket import sys diff --git a/changedetectionio/blueprint/browser_steps/TODO.txt b/changedetectionio/blueprint/browser_steps/TODO.txt new file mode 100644 index 00000000..7e586cfe --- /dev/null +++ b/changedetectionio/blueprint/browser_steps/TODO.txt @@ -0,0 +1,7 @@ +- This needs an abstraction to directly handle the puppeteer connection methods +- Then remove the playwright stuff +- Remove hack redirect at line 65 changedetectionio/processors/__init__.py + +The screenshots are base64 encoded/decoded which is very CPU intensive for large screenshots (in playwright) but not +in the direct puppeteer connection (they are binary end to end) + diff --git 
a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py index cfc96a20..22710e99 100644 --- a/changedetectionio/blueprint/browser_steps/browser_steps.py +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -243,7 +243,7 @@ class browsersteps_live_ui(steppable_browser_interface): def get_current_state(self): """Return the screenshot and interactive elements mapping, generally always called after action_()""" from pkg_resources import resource_string - xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8') + xpath_element_js = resource_string(__name__, "../../content_fetchers/res/xpath_element_scraper.js").decode('utf-8') now = time.time() self.page.wait_for_timeout(1 * 1000) @@ -278,10 +278,10 @@ class browsersteps_live_ui(steppable_browser_interface): self.page.evaluate("var include_filters=''") from pkg_resources import resource_string # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector - xpath_element_js = resource_string(__name__, "../../res/xpath_element_scraper.js").decode('utf-8') - from changedetectionio.content_fetcher import visualselector_xpath_selectors + xpath_element_js = resource_string(__name__, "../../content_fetchers/res/xpath_element_scraper.js").decode('utf-8') + from changedetectionio.content_fetchers import visualselector_xpath_selectors xpath_element_js = xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) xpath_data = self.page.evaluate("async () => {" + xpath_element_js + "}") - screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) + screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) return (screenshot, xpath_data) diff --git a/changedetectionio/blueprint/check_proxies/__init__.py 
b/changedetectionio/blueprint/check_proxies/__init__.py index ea68376a..db4bbe62 100644 --- a/changedetectionio/blueprint/check_proxies/__init__.py +++ b/changedetectionio/blueprint/check_proxies/__init__.py @@ -1,14 +1,11 @@ from concurrent.futures import ThreadPoolExecutor +from changedetectionio.store import ChangeDetectionStore from functools import wraps from flask import Blueprint from flask_login import login_required -from changedetectionio.processors import text_json_diff -from changedetectionio.store import ChangeDetectionStore - - STATUS_CHECKING = 0 STATUS_FAILED = 1 STATUS_OK = 2 @@ -32,7 +29,8 @@ def construct_blueprint(datastore: ChangeDetectionStore): @threadpool def long_task(uuid, preferred_proxy): import time - from changedetectionio import content_fetcher + from changedetectionio.content_fetchers import exceptions as content_fetcher_exceptions + from changedetectionio.processors import text_json_diff status = {'status': '', 'length': 0, 'text': ''} from jinja2 import Environment, BaseLoader @@ -43,7 +41,7 @@ def construct_blueprint(datastore: ChangeDetectionStore): update_handler = text_json_diff.perform_site_check(datastore=datastore, watch_uuid=uuid) update_handler.call_browser() # title, size is len contents not len xfer - except content_fetcher.Non200ErrorCodeReceived as e: + except content_fetcher_exceptions.Non200ErrorCodeReceived as e: if e.status_code == 404: status.update({'status': 'OK', 'length': len(contents), 'text': f"OK but 404 (page not found)"}) elif e.status_code == 403 or e.status_code == 401: @@ -52,12 +50,12 @@ def construct_blueprint(datastore: ChangeDetectionStore): status.update({'status': 'ERROR', 'length': len(contents), 'text': f"Status code: {e.status_code}"}) except text_json_diff.FilterNotFoundInResponse: status.update({'status': 'OK', 'length': len(contents), 'text': f"OK but CSS/xPath filter not found (page changed layout?)"}) - except content_fetcher.EmptyReply as e: + except content_fetcher_exceptions.EmptyReply 
as e: if e.status_code == 403 or e.status_code == 401: status.update({'status': 'ERROR OTHER', 'length': len(contents), 'text': f"Got empty reply with code {e.status_code} - Access denied"}) else: status.update({'status': 'ERROR OTHER', 'length': len(contents) if contents else 0, 'text': f"Empty reply with code {e.status_code}, needs chrome?"}) - except content_fetcher.ReplyWithContentButNoText as e: + except content_fetcher_exceptions.ReplyWithContentButNoText as e: txt = f"Got reply but with no content - Status code {e.status_code} - It's possible that the filters were found, but contained no usable text (or contained only an image)." status.update({'status': 'ERROR', 'text': txt}) except Exception as e: diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py deleted file mode 100644 index 2b703caa..00000000 --- a/changedetectionio/content_fetcher.py +++ /dev/null @@ -1,658 +0,0 @@ -from abc import abstractmethod -from distutils.util import strtobool -from urllib.parse import urlparse -import chardet -import hashlib -import json -import os -import requests -import sys -import time -import urllib.parse -from loguru import logger - -visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary' - - -class Non200ErrorCodeReceived(Exception): - def __init__(self, status_code, url, screenshot=None, xpath_data=None, page_html=None): - # Set this so we can use it in other parts of the app - self.status_code = status_code - self.url = url - self.screenshot = screenshot - self.xpath_data = xpath_data - self.page_text = None - - if page_html: - from changedetectionio import html_tools - self.page_text = html_tools.html_to_text(page_html) - return - - -class checksumFromPreviousCheckWasTheSame(Exception): - def __init__(self): - return - - -class JSActionExceptions(Exception): - def __init__(self, status_code, url, screenshot, 
message=''): - self.status_code = status_code - self.url = url - self.screenshot = screenshot - self.message = message - return - - -class BrowserStepsStepException(Exception): - def __init__(self, step_n, original_e): - self.step_n = step_n - self.original_e = original_e - logger.debug(f"Browser Steps exception at step {self.step_n} {str(original_e)}") - return - - -# @todo - make base Exception class that announces via logger() -class PageUnloadable(Exception): - def __init__(self, status_code=None, url='', message='', screenshot=False): - # Set this so we can use it in other parts of the app - self.status_code = status_code - self.url = url - self.screenshot = screenshot - self.message = message - return - -class BrowserStepsInUnsupportedFetcher(Exception): - def __init__(self, url): - self.url = url - return - -class EmptyReply(Exception): - def __init__(self, status_code, url, screenshot=None): - # Set this so we can use it in other parts of the app - self.status_code = status_code - self.url = url - self.screenshot = screenshot - return - - -class ScreenshotUnavailable(Exception): - def __init__(self, status_code, url, page_html=None): - # Set this so we can use it in other parts of the app - self.status_code = status_code - self.url = url - if page_html: - from html_tools import html_to_text - self.page_text = html_to_text(page_html) - return - - -class ReplyWithContentButNoText(Exception): - def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''): - # Set this so we can use it in other parts of the app - self.status_code = status_code - self.url = url - self.screenshot = screenshot - self.has_filters = has_filters - self.html_content = html_content - return - - -class Fetcher(): - browser_connection_is_custom = None - browser_connection_url = None - browser_steps = None - browser_steps_screenshot_path = None - content = None - error = None - fetcher_description = "No description" - headers = {} - instock_data = None - 
instock_data_js = "" - status_code = None - webdriver_js_execute_code = None - xpath_data = None - xpath_element_js = "" - - # Will be needed in the future by the VisualSelector, always get this where possible. - screenshot = False - system_http_proxy = os.getenv('HTTP_PROXY') - system_https_proxy = os.getenv('HTTPS_PROXY') - - # Time ONTOP of the system defined env minimum time - render_extract_delay = 0 - - def __init__(self): - from pkg_resources import resource_string - # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector - self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8') - self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8') - - @abstractmethod - def get_error(self): - return self.error - - @abstractmethod - def run(self, - url, - timeout, - request_headers, - request_body, - request_method, - ignore_status_codes=False, - current_include_filters=None, - is_binary=False): - # Should set self.error, self.status_code and self.content - pass - - @abstractmethod - def quit(self): - return - - @abstractmethod - def get_last_status_code(self): - return self.status_code - - @abstractmethod - def screenshot_step(self, step_n): - return None - - @abstractmethod - # Return true/false if this checker is ready to run, in the case it needs todo some special config check etc - def is_ready(self): - return True - - def get_all_headers(self): - """ - Get all headers but ensure all keys are lowercase - :return: - """ - return {k.lower(): v for k, v in self.headers.items()} - - def browser_steps_get_valid_steps(self): - if self.browser_steps is not None and len(self.browser_steps): - valid_steps = filter( - lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), - self.browser_steps) - - return valid_steps - - return None - - def iterate_browser_steps(self): - from 
changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface - from playwright._impl._errors import TimeoutError, Error - from jinja2 import Environment - jinja2_env = Environment(extensions=['jinja2_time.TimeExtension']) - - step_n = 0 - - if self.browser_steps is not None and len(self.browser_steps): - interface = steppable_browser_interface() - interface.page = self.page - valid_steps = self.browser_steps_get_valid_steps() - - for step in valid_steps: - step_n += 1 - logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...") - self.screenshot_step("before-" + str(step_n)) - self.save_step_html("before-" + str(step_n)) - try: - optional_value = step['optional_value'] - selector = step['selector'] - # Support for jinja2 template in step values, with date module added - if '{%' in step['optional_value'] or '{{' in step['optional_value']: - optional_value = str(jinja2_env.from_string(step['optional_value']).render()) - if '{%' in step['selector'] or '{{' in step['selector']: - selector = str(jinja2_env.from_string(step['selector']).render()) - - getattr(interface, "call_action")(action_name=step['operation'], - selector=selector, - optional_value=optional_value) - self.screenshot_step(step_n) - self.save_step_html(step_n) - except (Error, TimeoutError) as e: - logger.debug(str(e)) - # Stop processing here - raise BrowserStepsStepException(step_n=step_n, original_e=e) - - # It's always good to reset these - def delete_browser_steps_screenshots(self): - import glob - if self.browser_steps_screenshot_path is not None: - dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg') - files = glob.glob(dest) - for f in files: - if os.path.isfile(f): - os.unlink(f) - - -# Maybe for the future, each fetcher provides its own diff output, could be used for text, image -# the current one would return javascript output (as we use JS to generate the diff) -# -def available_fetchers(): - # See the if statement at 
the bottom of this file for how we switch between playwright and webdriver - import inspect - p = [] - for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): - if inspect.isclass(obj): - # @todo html_ is maybe better as fetcher_ or something - # In this case, make sure to edit the default one in store.py and fetch_site_status.py - if name.startswith('html_'): - t = tuple([name, obj.fetcher_description]) - p.append(t) - - return p - - -class base_html_playwright(Fetcher): - fetcher_description = "Playwright {}/Javascript".format( - os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() - ) - if os.getenv("PLAYWRIGHT_DRIVER_URL"): - fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) - - browser_type = '' - command_executor = '' - - # Configs for Proxy setup - # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" - playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] - - proxy = None - - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() - - self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') - - if custom_browser_connection_url: - self.browser_connection_is_custom = True - self.browser_connection_url = custom_browser_connection_url - else: - # Fallback to fetching from system - # .strip('"') is going to save someone a lot of time when they accidently wrap the env value - self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') - - - # If any proxy settings are enabled, then we should setup the proxy object - proxy_args = {} - for k in self.playwright_proxy_settings_mappings: - v = os.getenv('playwright_proxy_' + k, False) - if v: - proxy_args[k] = v.strip('"') - - if proxy_args: - self.proxy = proxy_args - - # allow per-watch proxy selection override - if proxy_override: - self.proxy = {'server': proxy_override} 
- - if self.proxy: - # Playwright needs separate username and password values - parsed = urlparse(self.proxy.get('server')) - if parsed.username: - self.proxy['username'] = parsed.username - self.proxy['password'] = parsed.password - - def screenshot_step(self, step_n=''): - screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85) - - if self.browser_steps_screenshot_path is not None: - destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) - logger.debug(f"Saving step screenshot to {destination}") - with open(destination, 'wb') as f: - f.write(screenshot) - - def save_step_html(self, step_n): - content = self.page.content() - destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) - logger.debug(f"Saving step HTML to {destination}") - with open(destination, 'w') as f: - f.write(content) - - def run(self, - url, - timeout, - request_headers, - request_body, - request_method, - ignore_status_codes=False, - current_include_filters=None, - is_binary=False): - - - from playwright.sync_api import sync_playwright - import playwright._impl._errors - - self.delete_browser_steps_screenshots() - response = None - - with sync_playwright() as p: - browser_type = getattr(p, self.browser_type) - - # Seemed to cause a connection Exception even tho I can see it connect - # self.browser = browser_type.connect(self.command_executor, timeout=timeout*1000) - # 60,000 connection timeout only - browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000) - - # SOCKS5 with authentication is not supported (yet) - # https://github.com/microsoft/playwright/issues/10567 - - # Set user agent to prevent Cloudflare from blocking the browser - # Use the default one configured in the App.py model that's passed from fetch_site_status.py - context = browser.new_context( - user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None), - proxy=self.proxy, - # 
This is needed to enable JavaScript execution on GitHub and others - bypass_csp=True, - # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers - service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), - # Should never be needed - accept_downloads=False - ) - - self.page = context.new_page() - if len(request_headers): - context.set_extra_http_headers(request_headers) - - # Listen for all console events and handle errors - self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) - - # Re-use as much code from browser steps as possible so its the same - from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface - browsersteps_interface = steppable_browser_interface() - browsersteps_interface.page = self.page - - response = browsersteps_interface.action_goto_url(value=url) - self.headers = response.all_headers() - - if response is None: - context.close() - browser.close() - logger.debug("Content Fetcher > Response object was none") - raise EmptyReply(url=url, status_code=None) - - try: - if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): - browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None) - except playwright._impl._errors.TimeoutError as e: - context.close() - browser.close() - # This can be ok, we will try to grab what we could retrieve - pass - except Exception as e: - logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}") - context.close() - browser.close() - raise PageUnloadable(url=url, status_code=None, message=str(e)) - - extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay - self.page.wait_for_timeout(extra_wait * 1000) - - try: - self.status_code = response.status - except Exception as e: - # 
https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962 - logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.") - logger.critical(response) - context.close() - browser.close() - raise PageUnloadable(url=url, status_code=None, message=str(e)) - - if self.status_code != 200 and not ignore_status_codes: - - screenshot=self.page.screenshot(type='jpeg', full_page=True, - quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72))) - - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) - - if len(self.page.content().strip()) == 0: - context.close() - browser.close() - logger.debug("Content Fetcher > Content was empty") - raise EmptyReply(url=url, status_code=response.status) - - # Run Browser Steps here - if self.browser_steps_get_valid_steps(): - self.iterate_browser_steps() - - self.page.wait_for_timeout(extra_wait * 1000) - - # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) - if current_include_filters is not None: - self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) - else: - self.page.evaluate("var include_filters=''") - - self.xpath_data = self.page.evaluate( - "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") - self.instock_data = self.page.evaluate("async () => {" + self.instock_data_js + "}") - - self.content = self.page.content() - # Bug 3 in Playwright screenshot handling - # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it - # JPEG is better here because the screenshots can be very very large - - # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded - # which will significantly increase the IO size between the server and client, it's recommended to use the lowest - # acceptable screenshot 
quality here - try: - # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage - self.screenshot = self.page.screenshot(type='jpeg', - full_page=True, - quality=int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)), - ) - except Exception as e: - # It's likely the screenshot was too long/big and something crashed - raise ScreenshotUnavailable(url=url, status_code=self.status_code) - finally: - context.close() - browser.close() - - -class base_html_webdriver(Fetcher): - if os.getenv("WEBDRIVER_URL"): - fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL")) - else: - fetcher_description = "WebDriver Chrome/Javascript" - - # Configs for Proxy setup - # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy" - selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy', - 'proxyAutoconfigUrl', 'sslProxy', 'autodetect', - 'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword'] - proxy = None - - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() - from selenium.webdriver.common.proxy import Proxy as SeleniumProxy - - # .strip('"') is going to save someone a lot of time when they accidently wrap the env value - if not custom_browser_connection_url: - self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"') - else: - self.browser_connection_is_custom = True - self.browser_connection_url = custom_browser_connection_url - - # If any proxy settings are enabled, then we should setup the proxy object - proxy_args = {} - for k in self.selenium_proxy_settings_mappings: - v = os.getenv('webdriver_' + k, False) - if v: - proxy_args[k] = v.strip('"') - - # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy - if not proxy_args.get('webdriver_httpProxy') and self.system_http_proxy: - proxy_args['httpProxy'] = self.system_http_proxy - 
if not proxy_args.get('webdriver_sslProxy') and self.system_https_proxy: - proxy_args['httpsProxy'] = self.system_https_proxy - - # Allows override the proxy on a per-request basis - if proxy_override is not None: - proxy_args['httpProxy'] = proxy_override - - if proxy_args: - self.proxy = SeleniumProxy(raw=proxy_args) - - def run(self, - url, - timeout, - request_headers, - request_body, - request_method, - ignore_status_codes=False, - current_include_filters=None, - is_binary=False): - - from selenium import webdriver - from selenium.webdriver.chrome.options import Options as ChromeOptions - from selenium.common.exceptions import WebDriverException - # request_body, request_method unused for now, until some magic in the future happens. - - options = ChromeOptions() - if self.proxy: - options.proxy = self.proxy - - self.driver = webdriver.Remote( - command_executor=self.browser_connection_url, - options=options) - - try: - self.driver.get(url) - except WebDriverException as e: - # Be sure we close the session window - self.quit() - raise - - self.driver.set_window_size(1280, 1024) - self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) - - if self.webdriver_js_execute_code is not None: - self.driver.execute_script(self.webdriver_js_execute_code) - # Selenium doesn't automatically wait for actions as good as Playwright, so wait again - self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5))) - - # @todo - how to check this? is it possible? - self.status_code = 200 - # @todo somehow we should try to get this working for WebDriver - # raise EmptyReply(url=url, status_code=r.status_code) - - # @todo - dom wait loaded? - time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) - self.content = self.driver.page_source - self.headers = {} - - self.screenshot = self.driver.get_screenshot_as_png() - - # Does the connection to the webdriver work? run a test connection. 
- def is_ready(self): - from selenium import webdriver - from selenium.webdriver.chrome.options import Options as ChromeOptions - - self.driver = webdriver.Remote( - command_executor=self.command_executor, - options=ChromeOptions()) - - # driver.quit() seems to cause better exceptions - self.quit() - return True - - def quit(self): - if self.driver: - try: - self.driver.quit() - except Exception as e: - logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}") - - -# "html_requests" is listed as the default fetcher in store.py! -class html_requests(Fetcher): - fetcher_description = "Basic fast Plaintext/HTTP Client" - - def __init__(self, proxy_override=None, custom_browser_connection_url=None): - super().__init__() - self.proxy_override = proxy_override - # browser_connection_url is none because its always 'launched locally' - - def run(self, - url, - timeout, - request_headers, - request_body, - request_method, - ignore_status_codes=False, - current_include_filters=None, - is_binary=False): - - if self.browser_steps_get_valid_steps(): - raise BrowserStepsInUnsupportedFetcher(url=url) - - # Make requests use a more modern looking user-agent - if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None): - request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36') - - proxies = {} - - # Allows override the proxy on a per-request basis - - # https://requests.readthedocs.io/en/latest/user/advanced/#socks - # Should also work with `socks5://user:pass@host:port` type syntax. 
- - if self.proxy_override: - proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override} - else: - if self.system_http_proxy: - proxies['http'] = self.system_http_proxy - if self.system_https_proxy: - proxies['https'] = self.system_https_proxy - - r = requests.request(method=request_method, - data=request_body, - url=url, - headers=request_headers, - timeout=timeout, - proxies=proxies, - verify=False) - - # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks. - # For example - some sites don't tell us it's utf-8, but return utf-8 content - # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably. - # https://github.com/psf/requests/issues/1604 good info about requests encoding detection - if not is_binary: - # Don't run this for PDF (and requests identified as binary) takes a _long_ time - if not r.headers.get('content-type') or not 'charset=' in r.headers.get('content-type'): - encoding = chardet.detect(r.content)['encoding'] - if encoding: - r.encoding = encoding - - self.headers = r.headers - - if not r.content or not len(r.content): - raise EmptyReply(url=url, status_code=r.status_code) - - # @todo test this - # @todo maybe you really want to test zero-byte return pages? - if r.status_code != 200 and not ignore_status_codes: - # maybe check with content works? - raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text) - - self.status_code = r.status_code - if is_binary: - # Binary files just return their checksum until we add something smarter - self.content = hashlib.md5(r.content).hexdigest() - else: - self.content = r.text - - - self.raw_content = r.content - - -# Decide which is the 'real' HTML webdriver, this is more a system wide config -# rather than site-specific. 
-use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) -if use_playwright_as_chrome_fetcher: - html_webdriver = base_html_playwright -else: - html_webdriver = base_html_webdriver diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py new file mode 100644 index 00000000..a6798dd7 --- /dev/null +++ b/changedetectionio/content_fetchers/__init__.py @@ -0,0 +1,38 @@ +import sys +from distutils.util import strtobool + +from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException +import os + +visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4, header, footer, section, article, aside, details, main, nav, section, summary' + +# available_fetchers() will scan this implementation looking for anything starting with html_ +# this information is used in the form selections +from changedetectionio.content_fetchers.requests import fetcher as html_requests + +def available_fetchers(): + # See the if statement at the bottom of this file for how we switch between playwright and webdriver + import inspect + p = [] + for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass): + if inspect.isclass(obj): + # @todo html_ is maybe better as fetcher_ or something + # In this case, make sure to edit the default one in store.py and fetch_site_status.py + if name.startswith('html_'): + t = tuple([name, obj.fetcher_description]) + p.append(t) + + return p + + +# Decide which is the 'real' HTML webdriver, this is more a system wide config +# rather than site-specific. 
+use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False) +if use_playwright_as_chrome_fetcher: + if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')): + from .playwright import fetcher as html_webdriver + else: + from .puppeteer import fetcher as html_webdriver + +else: + from .webdriver_selenium import fetcher as html_webdriver diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py new file mode 100644 index 00000000..71500d61 --- /dev/null +++ b/changedetectionio/content_fetchers/base.py @@ -0,0 +1,137 @@ +import os +from abc import abstractmethod +from loguru import logger + +from changedetectionio.content_fetchers import BrowserStepsStepException + + +class Fetcher(): + browser_connection_is_custom = None + browser_connection_url = None + browser_steps = None + browser_steps_screenshot_path = None + content = None + error = None + fetcher_description = "No description" + headers = {} + instock_data = None + instock_data_js = "" + status_code = None + webdriver_js_execute_code = None + xpath_data = None + xpath_element_js = "" + + # Will be needed in the future by the VisualSelector, always get this where possible. 
+ screenshot = False + system_http_proxy = os.getenv('HTTP_PROXY') + system_https_proxy = os.getenv('HTTPS_PROXY') + + # Time ONTOP of the system defined env minimum time + render_extract_delay = 0 + + def __init__(self): + from pkg_resources import resource_string + # The code that scrapes elements and makes a list of elements/size/position to click on in the VisualSelector + self.xpath_element_js = resource_string(__name__, "res/xpath_element_scraper.js").decode('utf-8') + self.instock_data_js = resource_string(__name__, "res/stock-not-in-stock.js").decode('utf-8') + + @abstractmethod + def get_error(self): + return self.error + + @abstractmethod + def run(self, + url, + timeout, + request_headers, + request_body, + request_method, + ignore_status_codes=False, + current_include_filters=None, + is_binary=False): + # Should set self.error, self.status_code and self.content + pass + + @abstractmethod + def quit(self): + return + + @abstractmethod + def get_last_status_code(self): + return self.status_code + + @abstractmethod + def screenshot_step(self, step_n): + return None + + @abstractmethod + # Return true/false if this checker is ready to run, in the case it needs todo some special config check etc + def is_ready(self): + return True + + def get_all_headers(self): + """ + Get all headers but ensure all keys are lowercase + :return: + """ + return {k.lower(): v for k, v in self.headers.items()} + + def browser_steps_get_valid_steps(self): + if self.browser_steps is not None and len(self.browser_steps): + valid_steps = filter( + lambda s: (s['operation'] and len(s['operation']) and s['operation'] != 'Choose one' and s['operation'] != 'Goto site'), + self.browser_steps) + + return valid_steps + + return None + + def iterate_browser_steps(self): + from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface + from playwright._impl._errors import TimeoutError, Error + from jinja2 import Environment + jinja2_env = 
Environment(extensions=['jinja2_time.TimeExtension']) + + step_n = 0 + + if self.browser_steps is not None and len(self.browser_steps): + interface = steppable_browser_interface() + interface.page = self.page + valid_steps = self.browser_steps_get_valid_steps() + + for step in valid_steps: + step_n += 1 + logger.debug(f">> Iterating check - browser Step n {step_n} - {step['operation']}...") + self.screenshot_step("before-" + str(step_n)) + self.save_step_html("before-" + str(step_n)) + try: + optional_value = step['optional_value'] + selector = step['selector'] + # Support for jinja2 template in step values, with date module added + if '{%' in step['optional_value'] or '{{' in step['optional_value']: + optional_value = str(jinja2_env.from_string(step['optional_value']).render()) + if '{%' in step['selector'] or '{{' in step['selector']: + selector = str(jinja2_env.from_string(step['selector']).render()) + + getattr(interface, "call_action")(action_name=step['operation'], + selector=selector, + optional_value=optional_value) + self.screenshot_step(step_n) + self.save_step_html(step_n) + except (Error, TimeoutError) as e: + logger.debug(str(e)) + # Stop processing here + raise BrowserStepsStepException(step_n=step_n, original_e=e) + + # It's always good to reset these + def delete_browser_steps_screenshots(self): + import glob + if self.browser_steps_screenshot_path is not None: + dest = os.path.join(self.browser_steps_screenshot_path, 'step_*.jpeg') + files = glob.glob(dest) + for f in files: + if os.path.isfile(f): + os.unlink(f) + + def save_step_html(self, param): + pass diff --git a/changedetectionio/content_fetchers/exceptions/__init__.py b/changedetectionio/content_fetchers/exceptions/__init__.py new file mode 100644 index 00000000..fa9f3df7 --- /dev/null +++ b/changedetectionio/content_fetchers/exceptions/__init__.py @@ -0,0 +1,90 @@ +from loguru import logger + + +class Non200ErrorCodeReceived(Exception): + def __init__(self, status_code, url, 
screenshot=None, xpath_data=None, page_html=None): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + self.screenshot = screenshot + self.xpath_data = xpath_data + self.page_text = None + + if page_html: + from changedetectionio import html_tools + self.page_text = html_tools.html_to_text(page_html) + return + + +class checksumFromPreviousCheckWasTheSame(Exception): + def __init__(self): + return + + +class JSActionExceptions(Exception): + def __init__(self, status_code, url, screenshot, message=''): + self.status_code = status_code + self.url = url + self.screenshot = screenshot + self.message = message + return + +class BrowserConnectError(Exception): + msg = '' + def __init__(self, msg): + self.msg = msg + logger.error(f"Browser connection error {msg}") + return + +class BrowserStepsStepException(Exception): + def __init__(self, step_n, original_e): + self.step_n = step_n + self.original_e = original_e + logger.debug(f"Browser Steps exception at step {self.step_n} {str(original_e)}") + return + + +# @todo - make base Exception class that announces via logger() +class PageUnloadable(Exception): + def __init__(self, status_code=None, url='', message='', screenshot=False): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + self.screenshot = screenshot + self.message = message + return + +class BrowserStepsInUnsupportedFetcher(Exception): + def __init__(self, url): + self.url = url + return + +class EmptyReply(Exception): + def __init__(self, status_code, url, screenshot=None): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + self.screenshot = screenshot + return + + +class ScreenshotUnavailable(Exception): + def __init__(self, status_code, url, page_html=None): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + if page_html: + from 
html_tools import html_to_text + self.page_text = html_to_text(page_html) + return + + +class ReplyWithContentButNoText(Exception): + def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''): + # Set this so we can use it in other parts of the app + self.status_code = status_code + self.url = url + self.screenshot = screenshot + self.has_filters = has_filters + self.html_content = html_content + return diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py new file mode 100644 index 00000000..7faa2032 --- /dev/null +++ b/changedetectionio/content_fetchers/playwright.py @@ -0,0 +1,210 @@ +import json +import os +from urllib.parse import urlparse + +from loguru import logger +from changedetectionio.content_fetchers.base import Fetcher +from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable + +class fetcher(Fetcher): + fetcher_description = "Playwright {}/Javascript".format( + os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() + ) + if os.getenv("PLAYWRIGHT_DRIVER_URL"): + fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) + + browser_type = '' + command_executor = '' + + # Configs for Proxy setup + # In the ENV vars, is prefixed with "playwright_proxy_", so it is for example "playwright_proxy_server" + playwright_proxy_settings_mappings = ['bypass', 'server', 'username', 'password'] + + proxy = None + + def __init__(self, proxy_override=None, custom_browser_connection_url=None): + super().__init__() + + self.browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"') + + if custom_browser_connection_url: + self.browser_connection_is_custom = True + self.browser_connection_url = custom_browser_connection_url + else: + # Fallback to fetching from system + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value + 
self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') + + # If any proxy settings are enabled, then we should setup the proxy object + proxy_args = {} + for k in self.playwright_proxy_settings_mappings: + v = os.getenv('playwright_proxy_' + k, False) + if v: + proxy_args[k] = v.strip('"') + + if proxy_args: + self.proxy = proxy_args + + # allow per-watch proxy selection override + if proxy_override: + self.proxy = {'server': proxy_override} + + if self.proxy: + # Playwright needs separate username and password values + parsed = urlparse(self.proxy.get('server')) + if parsed.username: + self.proxy['username'] = parsed.username + self.proxy['password'] = parsed.password + + def screenshot_step(self, step_n=''): + screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + + if self.browser_steps_screenshot_path is not None: + destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) + logger.debug(f"Saving step screenshot to {destination}") + with open(destination, 'wb') as f: + f.write(screenshot) + + def save_step_html(self, step_n): + content = self.page.content() + destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) + logger.debug(f"Saving step HTML to {destination}") + with open(destination, 'w') as f: + f.write(content) + + def run(self, + url, + timeout, + request_headers, + request_body, + request_method, + ignore_status_codes=False, + current_include_filters=None, + is_binary=False): + + from playwright.sync_api import sync_playwright + import playwright._impl._errors + from changedetectionio.content_fetchers import visualselector_xpath_selectors + self.delete_browser_steps_screenshots() + response = None + + with sync_playwright() as p: + browser_type = getattr(p, self.browser_type) + + # Seemed to cause a connection Exception even tho I can see it connect + # self.browser = 
browser_type.connect(self.command_executor, timeout=timeout*1000) + # 60,000 connection timeout only + browser = browser_type.connect_over_cdp(self.browser_connection_url, timeout=60000) + + # SOCKS5 with authentication is not supported (yet) + # https://github.com/microsoft/playwright/issues/10567 + + # Set user agent to prevent Cloudflare from blocking the browser + # Use the default one configured in the App.py model that's passed from fetch_site_status.py + context = browser.new_context( + user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None), + proxy=self.proxy, + # This is needed to enable JavaScript execution on GitHub and others + bypass_csp=True, + # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers + service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), + # Should never be needed + accept_downloads=False + ) + + self.page = context.new_page() + if len(request_headers): + context.set_extra_http_headers(request_headers) + + # Listen for all console events and handle errors + self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}")) + + # Re-use as much code from browser steps as possible so its the same + from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface + browsersteps_interface = steppable_browser_interface() + browsersteps_interface.page = self.page + + response = browsersteps_interface.action_goto_url(value=url) + self.headers = response.all_headers() + + if response is None: + context.close() + browser.close() + logger.debug("Content Fetcher > Response object was none") + raise EmptyReply(url=url, status_code=None) + + try: + if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): + browsersteps_interface.action_execute_js(value=self.webdriver_js_execute_code, selector=None) + except playwright._impl._errors.TimeoutError 
as e: + context.close() + browser.close() + # This can be ok, we will try to grab what we could retrieve + pass + except Exception as e: + logger.debug(f"Content Fetcher > Other exception when executing custom JS code {str(e)}") + context.close() + browser.close() + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay + self.page.wait_for_timeout(extra_wait * 1000) + + try: + self.status_code = response.status + except Exception as e: + # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962 + logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.") + logger.critical(response) + context.close() + browser.close() + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + if self.status_code != 200 and not ignore_status_codes: + screenshot = self.page.screenshot(type='jpeg', full_page=True, + quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) + + if len(self.page.content().strip()) == 0: + context.close() + browser.close() + logger.debug("Content Fetcher > Content was empty") + raise EmptyReply(url=url, status_code=response.status) + + # Run Browser Steps here + if self.browser_steps_get_valid_steps(): + self.iterate_browser_steps() + + self.page.wait_for_timeout(extra_wait * 1000) + + # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) + if current_include_filters is not None: + self.page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) + else: + self.page.evaluate("var include_filters=''") + + self.xpath_data = self.page.evaluate( + "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") + self.instock_data = self.page.evaluate("async () => {" + 
self.instock_data_js + "}") + + self.content = self.page.content() + # Bug 3 in Playwright screenshot handling + # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it + # JPEG is better here because the screenshots can be very very large + + # Screenshots also travel via the ws:// (websocket) meaning that the binary data is base64 encoded + # which will significantly increase the IO size between the server and client, it's recommended to use the lowest + # acceptable screenshot quality here + try: + # The actual screenshot - this always base64 and needs decoding! horrible! huge CPU usage + self.screenshot = self.page.screenshot(type='jpeg', + full_page=True, + quality=int(os.getenv("SCREENSHOT_QUALITY", 72)), + ) + except Exception as e: + # It's likely the screenshot was too long/big and something crashed + raise ScreenshotUnavailable(url=url, status_code=self.status_code) + finally: + context.close() + browser.close() diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py new file mode 100644 index 00000000..9d455284 --- /dev/null +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -0,0 +1,234 @@ +import asyncio +import json +import os +import websockets.exceptions +from urllib.parse import urlparse + +from loguru import logger +from changedetectionio.content_fetchers.base import Fetcher +from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable, BrowserConnectError + + +class fetcher(Fetcher): + fetcher_description = "Puppeteer/direct {}/Javascript".format( + os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() + ) + if os.getenv("PLAYWRIGHT_DRIVER_URL"): + fetcher_description += " via '{}'".format(os.getenv("PLAYWRIGHT_DRIVER_URL")) + + browser_type = '' + command_executor = '' + + proxy = None + + def __init__(self, proxy_override=None, 
custom_browser_connection_url=None): + super().__init__() + + if custom_browser_connection_url: + self.browser_connection_is_custom = True + self.browser_connection_url = custom_browser_connection_url + else: + # Fallback to fetching from system + # .strip('"') is going to save someone a lot of time when they accidently wrap the env value + self.browser_connection_url = os.getenv("PLAYWRIGHT_DRIVER_URL", 'ws://playwright-chrome:3000').strip('"') + + # allow per-watch proxy selection override + # @todo check global too? + if proxy_override: + # Playwright needs separate username and password values + parsed = urlparse(proxy_override) + if parsed: + self.proxy = {'username': parsed.username, 'password': parsed.password} + # Add the proxy server chrome start option, the username and password never gets added here + # (It always goes in via await self.page.authenticate(self.proxy)) + import urllib.parse + # @todo filter some injection attack? + # check /somepath?thisandthat + # check scheme when no scheme + h = urllib.parse.quote(parsed.scheme + "://") if parsed.scheme else '' + h += urllib.parse.quote(f"{parsed.hostname}:{parsed.port}{parsed.path}?{parsed.query}", safe='') + + r = "?" if not '?' 
in self.browser_connection_url else '&' + self.browser_connection_url += f"{r}--proxy-server={h}" + + # def screenshot_step(self, step_n=''): + # screenshot = self.page.screenshot(type='jpeg', full_page=True, quality=85) + # + # if self.browser_steps_screenshot_path is not None: + # destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.jpeg'.format(step_n)) + # logger.debug(f"Saving step screenshot to {destination}") + # with open(destination, 'wb') as f: + # f.write(screenshot) + # + # def save_step_html(self, step_n): + # content = self.page.content() + # destination = os.path.join(self.browser_steps_screenshot_path, 'step_{}.html'.format(step_n)) + # logger.debug(f"Saving step HTML to {destination}") + # with open(destination, 'w') as f: + # f.write(content) + + async def fetch_page(self, + url, + timeout, + request_headers, + request_body, + request_method, + ignore_status_codes, + current_include_filters, + is_binary + ): + + from changedetectionio.content_fetchers import visualselector_xpath_selectors + self.delete_browser_steps_screenshots() + + from pyppeteer import Pyppeteer + pyppeteer_instance = Pyppeteer() + + # Connect directly using the specified browser_ws_endpoint + # @todo timeout + try: + browser = await pyppeteer_instance.connect(browserWSEndpoint=self.browser_connection_url, + defaultViewport={"width": 1024, "height": 768} + ) + except websockets.exceptions.InvalidStatusCode as e: + raise BrowserConnectError(msg=f"Error while trying to connect the browser, Code {e.status_code} (check your access)") + except websockets.exceptions.InvalidURI: + raise BrowserConnectError(msg=f"Error connecting to the browser, check your browser connection address (should be ws:// or wss://") + except Exception as e: + raise BrowserConnectError(msg=f"Error connecting to the browser {str(e)}") + else: + self.page = await browser.newPage() + + await self.page.setBypassCSP(True) + if request_headers: + await 
self.page.setExtraHTTPHeaders(request_headers) + # @todo check user-agent worked + + # SOCKS5 with authentication is not supported (yet) + # https://github.com/microsoft/playwright/issues/10567 + self.page.setDefaultNavigationTimeout(0) + + if self.proxy: + # Setting Proxy-Authentication header is deprecated, and doing so can trigger header change errors from Puppeteer + # https://github.com/puppeteer/puppeteer/issues/676 ? + # https://help.brightdata.com/hc/en-us/articles/12632549957649-Proxy-Manager-How-to-Guides#h_01HAKWR4Q0AFS8RZTNYWRDFJC2 + # https://cri.dev/posts/2020-03-30-How-to-solve-Puppeteer-Chrome-Error-ERR_INVALID_ARGUMENT/ + await self.page.authenticate(self.proxy) + + # Re-use as much code from browser steps as possible so its the same + # from changedetectionio.blueprint.browser_steps.browser_steps import steppable_browser_interface + + # not yet used here, we fallback to playwright when browsersteps is required + # browsersteps_interface = steppable_browser_interface() + # browsersteps_interface.page = self.page + + response = await self.page.goto(url, waitUntil="load") + self.headers = response.headers + + if response is None: + await self.page.close() + await browser.close() + logger.warning("Content Fetcher > Response object was none") + raise EmptyReply(url=url, status_code=None) + + try: + if self.webdriver_js_execute_code is not None and len(self.webdriver_js_execute_code): + await self.page.evaluate(self.webdriver_js_execute_code) + except Exception as e: + logger.warning("Got exception when running evaluate on custom JS code") + logger.error(str(e)) + await self.page.close() + await browser.close() + # This can be ok, we will try to grab what we could retrieve + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + extra_wait = int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay + await asyncio.sleep(1 + extra_wait) + + try: + self.status_code = response.status + except Exception as e: + # 
https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962 + logger.critical(f"Response from the browser/Playwright did not have a status_code! Response follows.") + logger.critical(response) + await self.page.close() + await browser.close() + raise PageUnloadable(url=url, status_code=None, message=str(e)) + + if self.status_code != 200 and not ignore_status_codes: + screenshot = await self.page.screenshot(type_='jpeg', + fullPage=True, + quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + + raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, screenshot=screenshot) + content = await self.page.content + if len(content.strip()) == 0: + await self.page.close() + await browser.close() + logger.error("Content Fetcher > Content was empty") + raise EmptyReply(url=url, status_code=response.status) + + # Run Browser Steps here + # @todo not yet supported, we switch to playwright in this case + # if self.browser_steps_get_valid_steps(): + # self.iterate_browser_steps() + + await asyncio.sleep(1 + extra_wait) + + # So we can find an element on the page where its selector was entered manually (maybe not xPath etc) + # Setup the xPath/VisualSelector scraper + if current_include_filters is not None: + js = json.dumps(current_include_filters) + await self.page.evaluate(f"var include_filters={js}") + else: + await self.page.evaluate(f"var include_filters=''") + + self.xpath_data = await self.page.evaluate( + "async () => {" + self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) + "}") + self.instock_data = await self.page.evaluate("async () => {" + self.instock_data_js + "}") + + self.content = await self.page.content + # Bug 3 in Playwright screenshot handling + # Some bug where it gives the wrong screenshot size, but making a request with the clip set first seems to solve it + # JPEG is better here because the screenshots can be very very large + + # Screenshots also travel via the ws:// (websocket) meaning 
that the binary data is base64 encoded + # which will significantly increase the IO size between the server and client, it's recommended to use the lowest + # acceptable screenshot quality here + try: + self.screenshot = await self.page.screenshot(type_='jpeg', + fullPage=True, + quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + except Exception as e: + logger.error("Error fetching screenshot") + # // May fail on very large pages with 'WARNING: tile memory limits exceeded, some content may not draw' + # // @ todo after text extract, we can place some overlay text with red background to say 'croppped' + logger.error('ERROR: content-fetcher page was maybe too large for a screenshot, reverting to viewport only screenshot') + try: + self.screenshot = await self.page.screenshot(type_='jpeg', + fullPage=False, + quality=int(os.getenv("SCREENSHOT_QUALITY", 72))) + except Exception as e: + logger.error('ERROR: Failed to get viewport-only reduced screenshot :(') + pass + finally: + await self.page.close() + await browser.close() + + async def main(self, **kwargs): + await self.fetch_page(**kwargs) + + def run(self, url, timeout, request_headers, request_body, request_method, ignore_status_codes=False, + current_include_filters=None, is_binary=False): + + # This will work in 3.10 but not >= 3.11 because 3.11 wants tasks only + asyncio.run(self.main( + url=url, + timeout=timeout, + request_headers=request_headers, + request_body=request_body, + request_method=request_method, + ignore_status_codes=ignore_status_codes, + current_include_filters=current_include_filters, + is_binary=is_binary + )) diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py new file mode 100644 index 00000000..b743dbce --- /dev/null +++ b/changedetectionio/content_fetchers/requests.py @@ -0,0 +1,91 @@ +import hashlib +import os + +import chardet +import requests + +from changedetectionio.content_fetchers.exceptions import 
from changedetectionio.content_fetchers.exceptions import (
    BrowserStepsInUnsupportedFetcher,
    EmptyReply,
    Non200ErrorCodeReceived,
)
from changedetectionio.content_fetchers.base import Fetcher


# "html_requests" is listed as the default fetcher in store.py!
class fetcher(Fetcher):
    """Content fetcher that retrieves a URL with a single `requests` call (no browser involved)."""

    fetcher_description = "Basic fast Plaintext/HTTP Client"

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Store the optional per-watch proxy.

        `custom_browser_connection_url` is accepted so every fetcher can be constructed
        with the same arguments; it is not used here.
        """
        super().__init__()
        self.proxy_override = proxy_override
        # browser_connection_url is none because its always 'launched locally'

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Fetch `url` and populate headers, status_code, content and raw_content on this fetcher.

        Args:
            url: Address to request.
            timeout: Passed straight through to `requests.request(timeout=...)`.
            request_headers: Mapping of header name -> value. It is copied, never modified.
            request_body: Passed as `data=` to `requests.request`.
            request_method: HTTP method name passed as `method=`.
            ignore_status_codes: When true, a non-200 status does not raise.
            current_include_filters: Unused by this fetcher.
            is_binary: When true, skip charset detection and store an MD5 hex digest of the
                body as `self.content` instead of the decoded text.

        Raises:
            BrowserStepsInUnsupportedFetcher: browser steps are configured for this watch.
            EmptyReply: the response body is empty.
            Non200ErrorCodeReceived: status is not 200 and `ignore_status_codes` is false.
        """
        if self.browser_steps_get_valid_steps():
            raise BrowserStepsInUnsupportedFetcher(url=url)

        # Work on a copy so the default User-Agent below is never written back into the
        # caller's mapping (and so a missing/None mapping does not crash).
        headers = dict(request_headers or {})

        # Make requests use a more modern looking user-agent
        if not {k.lower(): v for k, v in headers.items()}.get('user-agent'):
            headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT",
                                              'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36')

        # Allows override the proxy on a per-request basis
        # https://requests.readthedocs.io/en/latest/user/advanced/#socks
        # Should also work with `socks5://user:pass@host:port` type syntax.
        if self.proxy_override:
            proxies = {'http': self.proxy_override, 'https': self.proxy_override, 'ftp': self.proxy_override}
        else:
            proxies = {}
            if self.system_http_proxy:
                proxies['http'] = self.system_http_proxy
            if self.system_https_proxy:
                proxies['https'] = self.system_https_proxy

        # NOTE(review): verify=False turns off TLS certificate verification for every fetch.
        # Presumably deliberate so that sites with broken certificates can still be watched - confirm.
        r = requests.request(method=request_method,
                             data=request_body,
                             url=url,
                             headers=headers,
                             timeout=timeout,
                             proxies=proxies,
                             verify=False)

        # If the response did not tell us what encoding format to expect, then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
        # https://github.com/psf/requests/issues/1604 good info about requests encoding detection
        if not is_binary:
            # Don't run this for PDF (and requests identified as binary) takes a _long_ time
            content_type = r.headers.get('content-type')
            if not content_type or 'charset=' not in content_type:
                encoding = chardet.detect(r.content)['encoding']
                if encoding:
                    r.encoding = encoding

        # Headers are recorded before the checks below so they are available even when we raise
        self.headers = r.headers

        if not r.content:
            raise EmptyReply(url=url, status_code=r.status_code)

        # @todo test this
        # @todo maybe you really want to test zero-byte return pages?
        if r.status_code != 200 and not ignore_status_codes:
            # maybe check with content works?
            raise Non200ErrorCodeReceived(url=url, status_code=r.status_code, page_html=r.text)

        self.status_code = r.status_code
        if is_binary:
            # Binary files just return their checksum until we add something smarter
            self.content = hashlib.md5(r.content).hexdigest()
        else:
            self.content = r.text

        self.raw_content = r.content
import os
import time

from loguru import logger
from changedetectionio.content_fetchers.base import Fetcher


class fetcher(Fetcher):
    """Content fetcher that drives Chrome through a remote Selenium WebDriver endpoint."""

    if os.getenv("WEBDRIVER_URL"):
        fetcher_description = "WebDriver Chrome/Javascript via '{}'".format(os.getenv("WEBDRIVER_URL"))
    else:
        fetcher_description = "WebDriver Chrome/Javascript"

    # Configs for Proxy setup
    # In the ENV vars, is prefixed with "webdriver_", so it is for example "webdriver_sslProxy"
    selenium_proxy_settings_mappings = ['proxyType', 'ftpProxy', 'httpProxy', 'noProxy',
                                        'proxyAutoconfigUrl', 'sslProxy', 'autodetect',
                                        'socksProxy', 'socksVersion', 'socksUsername', 'socksPassword']
    proxy = None

    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        """Resolve the WebDriver endpoint and build the Selenium proxy object, if any.

        Args:
            proxy_override: Per-watch proxy URL; when given it replaces `httpProxy`.
            custom_browser_connection_url: WebDriver endpoint to use instead of the
                `WEBDRIVER_URL` environment variable / built-in default.
        """
        super().__init__()
        from selenium.webdriver.common.proxy import Proxy as SeleniumProxy

        # .strip('"') is going to save someone a lot of time when they accidentally wrap the env value
        if not custom_browser_connection_url:
            self.browser_connection_url = os.getenv("WEBDRIVER_URL", 'http://browser-chrome:4444/wd/hub').strip('"')
        else:
            self.browser_connection_is_custom = True
            self.browser_connection_url = custom_browser_connection_url

        # If any proxy settings are enabled, then we should setup the proxy object.
        # Keys in proxy_args are the bare Selenium names (no "webdriver_" prefix).
        proxy_args = {}
        for k in self.selenium_proxy_settings_mappings:
            v = os.getenv('webdriver_' + k, False)
            if v:
                proxy_args[k] = v.strip('"')

        # Map back standard HTTP_ and HTTPS_PROXY to webDriver httpProxy/sslProxy, but only
        # when the webdriver_-specific variable did not already provide a value.
        if not proxy_args.get('httpProxy') and self.system_http_proxy:
            proxy_args['httpProxy'] = self.system_http_proxy
        if not proxy_args.get('sslProxy') and self.system_https_proxy:
            proxy_args['sslProxy'] = self.system_https_proxy

        # Allows override the proxy on a per-request basis
        # NOTE(review): only httpProxy is overridden here, sslProxy is left as-is - confirm that is intended.
        if proxy_override is not None:
            proxy_args['httpProxy'] = proxy_override

        if proxy_args:
            self.proxy = SeleniumProxy(raw=proxy_args)

    def run(self,
            url,
            timeout,
            request_headers,
            request_body,
            request_method,
            ignore_status_codes=False,
            current_include_filters=None,
            is_binary=False):
        """Open `url` in a new remote Chrome session and record page source and screenshot.

        Sets `self.status_code` (always 200, see @todo below), `self.content`,
        `self.headers` (always empty) and `self.screenshot` (PNG bytes).
        The session is left open on success; it is closed via `quit()`.

        Raises:
            WebDriverException: re-raised from `driver.get()` after the session is closed.
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions
        from selenium.common.exceptions import WebDriverException
        # request_body, request_method unused for now, until some magic in the future happens.

        options = ChromeOptions()
        if self.proxy:
            options.proxy = self.proxy

        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=options)

        try:
            self.driver.get(url)
        except WebDriverException:
            # Be sure we close the session window
            self.quit()
            raise

        self.driver.set_window_size(1280, 1024)
        self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

        if self.webdriver_js_execute_code is not None:
            self.driver.execute_script(self.webdriver_js_execute_code)
            # Selenium doesn't automatically wait for actions as good as Playwright, so wait again
            self.driver.implicitly_wait(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)))

        # @todo - how to check this? is it possible?
        self.status_code = 200
        # @todo somehow we should try to get this working for WebDriver
        # raise EmptyReply(url=url, status_code=r.status_code)

        # @todo - dom wait loaded?
        time.sleep(int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay)
        self.content = self.driver.page_source
        self.headers = {}

        self.screenshot = self.driver.get_screenshot_as_png()

    # Does the connection to the webdriver work? run a test connection.
    def is_ready(self):
        """Open and immediately close a session against the configured endpoint.

        Returns True when the session could be created; any connection error propagates.
        """
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options as ChromeOptions

        # Use the endpoint resolved in __init__ (this class never sets a `command_executor` attribute)
        self.driver = webdriver.Remote(
            command_executor=self.browser_connection_url,
            options=ChromeOptions())

        # driver.quit() seems to cause better exceptions
        self.quit()
        return True

    def quit(self):
        """Close the WebDriver session if one was opened; shutdown errors are only logged."""
        # getattr: `driver` is only assigned by run()/is_ready(), so it may not exist yet
        driver = getattr(self, 'driver', None)
        if driver:
            try:
                driver.quit()
            except Exception as e:
                logger.debug(f"Content Fetcher > Exception in chrome shutdown/quit {str(e)}")
"POST"]) @login_optionally_required def settings_page(): - from changedetectionio import content_fetcher, forms + from changedetectionio import forms default = deepcopy(datastore.data['settings']) if datastore.proxy_list is not None: diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index a4480cc1..cdcb8ee0 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -27,7 +27,7 @@ from validators.url import url as url_validator # each select