From c5a4e0aaa34b938e53bf3b15dcdf92c05ee155fa Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Wed, 7 Feb 2024 20:58:21 +0100 Subject: [PATCH] Fetching - Prefer to use SockPuppetBrowser (#2163) --- .github/workflows/test-only.yml | 42 +++--- .../blueprint/browser_steps/__init__.py | 11 +- .../blueprint/browser_steps/browser_steps.py | 2 +- changedetectionio/content_fetcher.py | 136 +----------------- changedetectionio/importer.py | 2 +- changedetectionio/res/puppeteer_fetch.js | 2 +- .../run_custom_browser_url_tests.sh | 16 +-- changedetectionio/run_proxy_tests.sh | 2 +- changedetectionio/static/js/browser-steps.js | 14 +- changedetectionio/templates/edit.html | 2 +- .../test_custom_browser_url.py | 12 +- .../tests/test_jsonpath_jq_selector.py | 2 +- changedetectionio/tests/test_request.py | 8 +- docker-compose.yml | 21 +-- 14 files changed, 58 insertions(+), 214 deletions(-) diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index f3fde056..231105a8 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -28,12 +28,12 @@ jobs: docker network create changedet-network - # Selenium+browserless + # Selenium and sockpuppetbrowser docker run --network changedet-network -d --hostname selenium -p 4444:4444 --rm --shm-size="2g" selenium/standalone-chrome:4 - docker run --network changedet-network -d --name browserless --hostname browserless -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm -p 3000:3000 --shm-size="2g" browserless/chrome:1.60-chrome-stable + docker run --network changedet-network -d --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest # For accessing custom browser tests - docker run --network changedet-network -d --name browserless-custom-url --hostname browserless-custom-url -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" -e "DEFAULT_LAUNCH_ARGS=[\"--window-size=1920,1080\"]" --rm 
--shm-size="2g" browserless/chrome:1.60-chrome-stable + docker run --network changedet-network -d --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url --rm dgtlmoon/sockpuppetbrowser:latest - name: Build changedetection.io container for testing run: | @@ -47,6 +47,12 @@ jobs: # Debug SMTP server/echo message back server docker run --network changedet-network -d -p 11025:11025 -p 11080:11080 --hostname mailserver test-changedetectionio bash -c 'python changedetectionio/tests/smtp/smtp-test-server.py' + - name: Show docker container state and other debug info + run: | + set -x + echo "Running processes in docker..." + docker ps + - name: Test built container with Pytest (generally as requests/plaintext fetching) run: | # Unit tests @@ -63,43 +69,33 @@ jobs: - name: Specific tests in built container for Selenium run: | - # Selenium fetch docker run --rm -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py' - - name: Specific tests in built container for Playwright - run: | - # Playwright/Browserless fetch - docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py' + - name: Specific tests in built container for Playwright and SocketPuppetBrowser + run: | + # Playwright via Sockpuppetbrowser fetch + docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py' - name: Specific tests in built container for headers and requests checks with Playwright - run: 
| - # Settings headers playwright tests - Call back in from Browserless, check headers - docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' + run: | + # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers + docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' - name: Specific tests in built container for headers and requests checks with Selenium - run: | + run: | docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "WEBDRIVER_URL=http://selenium:4444/wd/hub" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' - - name: Specific tests in built container with Playwright as Puppeteer experimental fetcher - run: | - docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py' - - name: Test built container restock detection via Playwright run: | - # restock detection via playwright - added name=changedet here so that playwright/browserless can connect to it - docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" 
--network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py' + # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it + docker run --rm --name "changedet" -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-port=5004 --live-server-host=0.0.0.0 tests/restock/test_restock.py' - name: Test SMTP notification mime types run: | # SMTP content types - needs the 'Debug SMTP server/echo message back server' container from above docker run --rm --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/smtp/test_notification_smtp.py' - - name: Test with puppeteer fetcher and disk cache - run: | - docker run --rm -e "PUPPETEER_DISK_CACHE=/tmp/data/" -e "USE_EXPERIMENTAL_PUPPETEER_FETCH=yes" -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py' - # Browserless would have had -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" added above - - name: Test proxy interaction run: | cd changedetectionio diff --git a/changedetectionio/blueprint/browser_steps/__init__.py b/changedetectionio/blueprint/browser_steps/__init__.py index 999b81bc..2ee0d44f 100644 --- a/changedetectionio/blueprint/browser_steps/__init__.py +++ b/changedetectionio/blueprint/browser_steps/__init__.py @@ -4,22 +4,13 @@ # Why? 
# `browsersteps_playwright_browser_interface.chromium.connect_over_cdp()` will only run once without async() # - this flask app is not async() -# - browserless has a single timeout/keepalive which applies to the session made at .connect_over_cdp() +# - A single timeout/keepalive which applies to the session made at .connect_over_cdp() # # So it means that we must unfortunately for now just keep a single timer since .connect_over_cdp() was run # and know when that reaches timeout/keepalive :( when that time is up, restart the connection and tell the user # that their time is up, insert another coin. (reload) # -# Bigger picture -# - It's horrible that we have this click+wait deal, some nice socket.io solution using something similar -# to what the browserless debug UI already gives us would be smarter.. # -# OR -# - Some API call that should be hacked into browserless or playwright that we can "/api/bump-keepalive/{session_id}/60" -# So we can tell it that we need more time (run this on each action) -# -# OR -# - use multiprocessing to bump this over to its own process and add some transport layer (queue/pipes) from distutils.util import strtobool from flask import Blueprint, request, make_response diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py index 40925a3f..cfc96a20 100644 --- a/changedetectionio/blueprint/browser_steps/browser_steps.py +++ b/changedetectionio/blueprint/browser_steps/browser_steps.py @@ -169,7 +169,7 @@ class steppable_browser_interface(): self.page.locator(selector, timeout=1000).uncheck(timeout=1000) -# Responsible for maintaining a live 'context' with browserless +# Responsible for maintaining a live 'context' with the chrome CDP # @todo - how long do contexts live for anyway? 
class browsersteps_live_ui(steppable_browser_interface): context = None diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 4a2b7c6b..a8752e85 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -311,125 +311,6 @@ class base_html_playwright(Fetcher): with open(destination, 'w') as f: f.write(content) - def run_fetch_browserless_puppeteer(self, - url, - timeout, - request_headers, - request_body, - request_method, - ignore_status_codes=False, - current_include_filters=None, - is_binary=False): - - from pkg_resources import resource_string - - extra_wait_ms = (int(os.getenv("WEBDRIVER_DELAY_BEFORE_CONTENT_READY", 5)) + self.render_extract_delay) * 1000 - - self.xpath_element_js = self.xpath_element_js.replace('%ELEMENTS%', visualselector_xpath_selectors) - code = resource_string(__name__, "res/puppeteer_fetch.js").decode('utf-8') - # In the future inject this is a proper JS package - code = code.replace('%xpath_scrape_code%', self.xpath_element_js) - code = code.replace('%instock_scrape_code%', self.instock_data_js) - - from requests.exceptions import ConnectTimeout, ReadTimeout - wait_browserless_seconds = 240 - - browserless_function_url = os.getenv('BROWSERLESS_FUNCTION_URL') - from urllib.parse import urlparse - if not browserless_function_url: - # Convert/try to guess from PLAYWRIGHT_DRIVER_URL - o = urlparse(os.getenv('PLAYWRIGHT_DRIVER_URL')) - browserless_function_url = o._replace(scheme="http")._replace(path="function").geturl() - - - # Append proxy connect string - if self.proxy: - # Remove username/password if it exists in the URL or you will receive "ERR_NO_SUPPORTED_PROXIES" error - # Actual authentication handled by Puppeteer/node - o = urlparse(self.proxy.get('server')) - proxy_url = urllib.parse.quote(o._replace(netloc="{}:{}".format(o.hostname, o.port)).geturl()) - browserless_function_url = f"{browserless_function_url}&--proxy-server={proxy_url}" - - try: - amp = 
'&' if '?' in browserless_function_url else '?' - response = requests.request( - method="POST", - json={ - "code": code, - "context": { - # Very primitive disk cache - USE WITH EXTREME CAUTION - # Run browserless container with -e "FUNCTION_BUILT_INS=[\"fs\",\"crypto\"]" - 'disk_cache_dir': os.getenv("PUPPETEER_DISK_CACHE", False), # or path to disk cache ending in /, ie /tmp/cache/ - 'execute_js': self.webdriver_js_execute_code, - 'extra_wait_ms': extra_wait_ms, - 'include_filters': current_include_filters, - 'req_headers': request_headers, - 'screenshot_quality': int(os.getenv("PLAYWRIGHT_SCREENSHOT_QUALITY", 72)), - 'url': url, - 'user_agent': {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None), - 'proxy_username': self.proxy.get('username', '') if self.proxy else False, - 'proxy_password': self.proxy.get('password', '') if self.proxy and self.proxy.get('username') else False, - 'no_cache_list': [ - 'twitter', - '.pdf' - ], - # Could use https://github.com/easylist/easylist here, or install a plugin - 'block_url_list': [ - 'adnxs.com', - 'analytics.twitter.com', - 'doubleclick.net', - 'google-analytics.com', - 'googletagmanager', - 'trustpilot.com' - ] - } - }, - # @todo /function needs adding ws:// to http:// rebuild this - url=browserless_function_url+f"{amp}--disable-features=AudioServiceOutOfProcess&dumpio=true&--disable-remote-fonts", - timeout=wait_browserless_seconds) - - except ReadTimeout: - raise PageUnloadable(url=url, status_code=None, message=f"No response from browserless in {wait_browserless_seconds}s") - except ConnectTimeout: - raise PageUnloadable(url=url, status_code=None, message=f"Timed out connecting to browserless, retrying..") - else: - # 200 Here means that the communication to browserless worked only, not the page state - try: - x = response.json() - except Exception as e: - raise PageUnloadable(url=url, message="Error reading JSON response from browserless") - - try: - self.status_code = response.status_code - 
except Exception as e: - raise PageUnloadable(url=url, message="Error reading status_code code response from browserless") - - self.headers = x.get('headers') - - if self.status_code != 200 and not ignore_status_codes: - raise Non200ErrorCodeReceived(url=url, status_code=self.status_code, page_html=x.get('content','')) - - if self.status_code == 200: - import base64 - - if not x.get('screenshot'): - # https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips - # https://github.com/puppeteer/puppeteer/issues/1834 - # https://github.com/puppeteer/puppeteer/issues/1834#issuecomment-381047051 - # Check your memory is shared and big enough - raise ScreenshotUnavailable(url=url, status_code=None) - - if not x.get('content', '').strip(): - raise EmptyReply(url=url, status_code=None) - - self.content = x.get('content') - self.instock_data = x.get('instock_data') - self.screenshot = base64.b64decode(x.get('screenshot')) - self.xpath_data = x.get('xpath_data') - else: - # Some other error from browserless - raise PageUnloadable(url=url, status_code=None, message=response.content.decode('utf-8')) - def run(self, url, timeout, @@ -441,21 +322,6 @@ class base_html_playwright(Fetcher): is_binary=False): - # For now, USE_EXPERIMENTAL_PUPPETEER_FETCH is not supported by watches with BrowserSteps (for now!) 
- # browser_connection_is_custom doesnt work with puppeteer style fetch (use playwright native too in this case) - if not self.browser_connection_is_custom and not self.browser_steps and os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH'): - if strtobool(os.getenv('USE_EXPERIMENTAL_PUPPETEER_FETCH')): - # Temporary backup solution until we rewrite the playwright code - return self.run_fetch_browserless_puppeteer( - url, - timeout, - request_headers, - request_body, - request_method, - ignore_status_codes, - current_include_filters, - is_binary) - from playwright.sync_api import sync_playwright import playwright._impl._errors @@ -528,7 +394,7 @@ class base_html_playwright(Fetcher): self.status_code = response.status except Exception as e: # https://github.com/dgtlmoon/changedetection.io/discussions/2122#discussioncomment-8241962 - logger.critical(f"Response from browserless/playwright did not have a status_code! Response follows.") + logger.critical(f"Response from the browser/Playwright did not have a status_code! 
Response follows.") logger.critical(response) raise PageUnloadable(url=url, status_code=None, message=str(e)) diff --git a/changedetectionio/importer.py b/changedetectionio/importer.py index ba7b1139..42a062be 100644 --- a/changedetectionio/importer.py +++ b/changedetectionio/importer.py @@ -57,7 +57,7 @@ class import_url_list(Importer): # Flask wtform validators wont work with basic auth, use validators package # Up to 5000 per batch so we dont flood the server - # @todo validators.url failed on local hostnames (such as referring to ourself when using browserless) + # @todo validators.url will fail when you add your own IP etc if len(url) and 'http' in url.lower() and good < 5000: extras = None if processor: diff --git a/changedetectionio/res/puppeteer_fetch.js b/changedetectionio/res/puppeteer_fetch.js index 7bb9e17b..21c5abc8 100644 --- a/changedetectionio/res/puppeteer_fetch.js +++ b/changedetectionio/res/puppeteer_fetch.js @@ -146,7 +146,7 @@ module.exports = async ({page, context}) => { var xpath_data; var instock_data; try { - // Not sure the best way here, in the future this should be a new package added to npm then run in browserless + // Not sure the best way here, in the future this should be a new package added to npm then run in evaluatedCode // (Once the old playwright is removed) xpath_data = await page.evaluate((include_filters) => {%xpath_scrape_code%}, include_filters); instock_data = await page.evaluate(() => {%instock_scrape_code%}); diff --git a/changedetectionio/run_custom_browser_url_tests.sh b/changedetectionio/run_custom_browser_url_tests.sh index 10cea9c5..aef8b533 100755 --- a/changedetectionio/run_custom_browser_url_tests.sh +++ b/changedetectionio/run_custom_browser_url_tests.sh @@ -6,16 +6,16 @@ set -x # A extra browser is configured, but we never chose to use it, so it should NOT show in the logs -docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd 
changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url' -docker logs browserless-custom-url &>log.txt +docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_not_via_custom_browser_url' +docker logs sockpuppetbrowser-custom-url &>log.txt grep 'custom-browser-search-string=1' log.txt if [ $? -ne 1 ] then - echo "Saw a request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should not" + echo "Saw a request in 'sockpuppetbrowser-custom-url' container with 'custom-browser-search-string=1' when I should not" exit 1 fi -docker logs browserless &>log.txt +docker logs sockpuppetbrowser &>log.txt grep 'custom-browser-search-string=1' log.txt if [ $? -ne 1 ] then @@ -24,16 +24,16 @@ then fi # Special connect string should appear in the custom-url container, but not in the 'default' one -docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url' -docker logs browserless-custom-url &>log.txt +docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/custom_browser_url/test_custom_browser_url.py::test_request_via_custom_browser_url' +docker logs sockpuppetbrowser-custom-url &>log.txt grep 'custom-browser-search-string=1' log.txt if [ $? 
-ne 0 ] then - echo "Did not see request in 'browserless-custom-url' container with 'custom-browser-search-string=1' when I should" + echo "Did not see request in 'sockpuppetbrowser-custom-url' container with 'custom-browser-search-string=1' when I should" exit 1 fi -docker logs browserless &>log.txt +docker logs sockpuppetbrowser &>log.txt grep 'custom-browser-search-string=1' log.txt if [ $? -ne 1 ] then diff --git a/changedetectionio/run_proxy_tests.sh b/changedetectionio/run_proxy_tests.sh index 97f81ed5..279cd85c 100755 --- a/changedetectionio/run_proxy_tests.sh +++ b/changedetectionio/run_proxy_tests.sh @@ -35,7 +35,7 @@ docker run --network changedet-network \ docker run --network changedet-network \ -e "SOCKSTEST=manual-playwright" \ -v `pwd`/tests/proxy_socks5/proxies.json-example-noauth:/app/changedetectionio/test-datastore/proxies.json \ - -e "PLAYWRIGHT_DRIVER_URL=ws://browserless:3000" \ + -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" \ --rm \ test-changedetectionio \ bash -c 'cd changedetectionio && pytest tests/proxy_socks5/test_socks5_proxy_sources.py' diff --git a/changedetectionio/static/js/browser-steps.js b/changedetectionio/static/js/browser-steps.js index 6da56b6a..90198d32 100644 --- a/changedetectionio/static/js/browser-steps.js +++ b/changedetectionio/static/js/browser-steps.js @@ -10,7 +10,7 @@ $(document).ready(function () { } }) var browsersteps_session_id; - var browserless_seconds_remaining = 0; + var browser_interface_seconds_remaining = 0; var apply_buttons_disabled = false; var include_text_elements = $("#include_text_elements"); var xpath_data = false; @@ -49,7 +49,7 @@ $(document).ready(function () { $('#browsersteps-img').removeAttr('src'); $("#browsersteps-click-start").show(); $("#browsersteps-selector-wrapper .spinner").hide(); - browserless_seconds_remaining = 0; + browser_interface_seconds_remaining = 0; browsersteps_session_id = false; apply_buttons_disabled = false; ctx.clearRect(0, 0, c.width, c.height); @@ 
-61,12 +61,12 @@ $(document).ready(function () { $('#browser_steps >li:first-child').css('opacity', '0.5'); } - // Show seconds remaining until playwright/browserless needs to restart the session + // Show seconds remaining until the browser interface needs to restart the session // (See comment at the top of changedetectionio/blueprint/browser_steps/__init__.py ) setInterval(() => { - if (browserless_seconds_remaining >= 1) { - document.getElementById('browserless-seconds-remaining').innerText = browserless_seconds_remaining + " seconds remaining in session"; - browserless_seconds_remaining -= 1; + if (browser_interface_seconds_remaining >= 1) { + document.getElementById('browser-seconds-remaining').innerText = browser_interface_seconds_remaining + " seconds remaining in session"; + browser_interface_seconds_remaining -= 1; } }, "1000") @@ -261,7 +261,7 @@ $(document).ready(function () { // This should trigger 'Goto site' console.log("Got startup response, requesting Goto-Site (first) step fake click"); $('#browser_steps >li:first-child .apply').click(); - browserless_seconds_remaining = 500; + browser_interface_seconds_remaining = 500; set_first_gotosite_disabled(); }).fail(function (data) { console.log(data); diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 812ddb2b..ecce2128 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -228,7 +228,7 @@ User-Agent: wonderbra 1.0") }}
- Loading (?) + Loading (?) {{ render_field(form.browser_steps) }}
diff --git a/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py index bfc2c95e..212d2e27 100644 --- a/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py +++ b/changedetectionio/tests/custom_browser_url/test_custom_browser_url.py @@ -7,10 +7,11 @@ from ..util import live_server_setup, wait_for_all_checks def do_test(client, live_server, make_test_use_extra_browser=False): # Grep for this string in the logs? - test_url = f"https://changedetection.io/ci-test.html" + test_url = f"https://changedetection.io/ci-test.html?non-custom-default=true" + # "non-custom-default" should not appear in the custom browser connection custom_browser_name = 'custom browser URL' - # needs to be set and something like 'ws://127.0.0.1:3000?stealth=1&--disable-web-security=true' + # needs to be set and something like 'ws://127.0.0.1:3000' assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" ##################### @@ -19,9 +20,7 @@ def do_test(client, live_server, make_test_use_extra_browser=False): data={"application-empty_pages_are_a_change": "", "requests-time_between_check-minutes": 180, 'application-fetch_backend': "html_webdriver", - # browserless-custom-url is setup in .github/workflows/test-only.yml - # the test script run_custom_browser_url_test.sh will look for 'custom-browser-search-string' in the container logs - 'requests-extra_browsers-0-browser_connection_url': 'ws://browserless-custom-url:3000?stealth=1&--disable-web-security=true&custom-browser-search-string=1', + 'requests-extra_browsers-0-browser_connection_url': 'ws://sockpuppetbrowser-custom-url:3000', 'requests-extra_browsers-0-browser_name': custom_browser_name }, follow_redirects=True @@ -51,7 +50,8 @@ def do_test(client, live_server, make_test_use_extra_browser=False): res = client.post( url_for("edit_page", uuid="first"), data={ - "url": test_url, + # 
'run_custom_browser_url_tests.sh' will search for this string to know if we hit the right browser container or not + "url": f"https://changedetection.io/ci-test.html?custom-browser-search-string=1", "tags": "", "headers": "", 'fetch_backend': f"extra_browser_{custom_browser_name}", diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index 7dc4d68f..5dfdfef2 100644 --- a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -456,7 +456,7 @@ def test_ignore_json_order(client, live_server): def test_correct_header_detect(client, live_server): # Like in https://github.com/dgtlmoon/changedetection.io/pull/1593 - # Specify extra html that JSON is sometimes wrapped in - when using Browserless/Puppeteer etc + # Specify extra html that JSON is sometimes wrapped in - when using SockpuppetBrowser / Puppeteer / Playwright etc with open("test-datastore/endpoint-content.txt", "w") as f: f.write('{"hello" : 123, "world": 123}') diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 5deaafa8..869ea349 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -14,7 +14,7 @@ def test_headers_in_request(client, live_server): # Add our URL to the import page test_url = url_for('test_headers', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): - # Because its no longer calling back to localhost but from browserless, set in test-only.yml + # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'changedet') # Add the test URL twice, we will check @@ -89,7 +89,7 @@ def test_body_in_request(client, live_server): # Add our URL to the import page test_url = url_for('test_body', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): - # Because its no longer calling back to localhost 
but from browserless, set in test-only.yml + # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'cdio') res = client.post( @@ -181,7 +181,7 @@ def test_method_in_request(client, live_server): # Add our URL to the import page test_url = url_for('test_method', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): - # Because its no longer calling back to localhost but from browserless, set in test-only.yml + # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'cdio') # Add the test URL twice, we will check @@ -258,7 +258,7 @@ def test_headers_textfile_in_request(client, live_server): # Add our URL to the import page test_url = url_for('test_headers', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): - # Because its no longer calling back to localhost but from browserless, set in test-only.yml + # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'cdio') print ("TEST URL IS ",test_url) diff --git a/docker-compose.yml b/docker-compose.yml index e9bdf85e..1b5bd9af 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,7 +30,7 @@ services: # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy # # Alternative Playwright URL, do not use "'s or 's! - # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000/?stealth=1&--disable-web-security=true + # - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000 # # Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password # @@ -71,32 +71,23 @@ services: # condition: service_started # Used for fetching pages via Playwright+Chrome where you need Javascript support. 
- # Note: Playwright/browserless not supported on ARM type devices (rPi etc) # RECOMMENDED FOR FETCHING PAGES WITH CHROME # playwright-chrome: # hostname: playwright-chrome -# image: browserless/chrome:1.60-chrome-stable +# image: dgtlmoon/sockpuppetbrowser:latest +# cap_add: +# - SYS_ADMIN +## SYS_ADMIN might be too much, but it can be needed on your platform https://github.com/puppeteer/puppeteer/blob/main/docs/troubleshooting.md#running-puppeteer-on-gitlabci # restart: unless-stopped # environment: # - SCREEN_WIDTH=1920 # - SCREEN_HEIGHT=1024 # - SCREEN_DEPTH=16 -# - ENABLE_DEBUGGER=false -# - PREBOOT_CHROME=true -# - CONNECTION_TIMEOUT=300000 -# - MAX_CONCURRENT_SESSIONS=10 -# - CHROME_REFRESH_TIME=600000 -# - DEFAULT_BLOCK_ADS=true -# - DEFAULT_STEALTH=true -# -# Ignore HTTPS errors, like for self-signed certs -# - DEFAULT_IGNORE_HTTPS_ERRORS=true -# +# - MAX_CONCURRENT_CHROME_PROCESSES=10 # Used for fetching pages via Playwright+Chrome where you need Javascript support. # Note: Works well but is deprecated, does not fetch full page screenshots (doesnt work with Visual Selector) # Does not report status codes (200, 404, 403) and other issues - # More information about the advantages of playwright/browserless https://www.browserless.io/blog/2023/12/13/migrating-selenium-to-playwright/ # browser-chrome: # hostname: browser-chrome # image: selenium/standalone-chrome:4