diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml
index 1c89f2fa..8fb89d62 100644
--- a/.github/workflows/test-only.yml
+++ b/.github/workflows/test-only.yml
@@ -72,7 +72,11 @@ jobs:
run: |
# Playwright via Sockpuppetbrowser fetch
# tests/visualselector/test_fetch_data.py will do browser steps
- docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py'
+
- name: Playwright and SocketPuppetBrowser - Headers and requests
run: |
@@ -87,8 +91,11 @@ jobs:
# STRAIGHT TO CDP
- name: Pyppeteer and SocketPuppetBrowser - Specific tests in built container
run: |
- # Playwright via Sockpuppetbrowser fetch
- docker run --rm -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "FAST_PUPPETEER_CHROME_FETCHER=True" --network changedet-network test-changedetectionio bash -c 'cd changedetectionio;pytest tests/fetchers/test_content.py && pytest tests/test_errorhandling.py && pytest tests/visualselector/test_fetch_data.py'
+ # Pyppeteer (FAST_PUPPETEER_CHROME_FETCHER=True) via Sockpuppetbrowser fetch
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_content.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_errorhandling.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/visualselector/test_fetch_data.py'
+ docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "FAST_PUPPETEER_CHROME_FETCHER=True" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/fetchers/test_custom_js_before_content.py'
- name: Pyppeteer and SocketPuppetBrowser - Headers and requests checks
run: |
diff --git a/changedetectionio/blueprint/browser_steps/browser_steps.py b/changedetectionio/blueprint/browser_steps/browser_steps.py
index 22710e99..6bb58b38 100644
--- a/changedetectionio/blueprint/browser_steps/browser_steps.py
+++ b/changedetectionio/blueprint/browser_steps/browser_steps.py
@@ -6,6 +6,8 @@ import re
from random import randint
from loguru import logger
+from changedetectionio.content_fetchers.base import manage_user_agent
+
# Two flags, tell the JS which of the "Selector" or "Value" field should be enabled in the front end
# 0- off, 1- on
browser_step_ui_config = {'Choose one': '0 0',
@@ -178,6 +180,7 @@ class browsersteps_live_ui(steppable_browser_interface):
stale = False
# bump and kill this if idle after X sec
age_start = 0
+ headers = {}
# use a special driver, maybe locally etc
command_executor = os.getenv(
@@ -192,7 +195,8 @@ class browsersteps_live_ui(steppable_browser_interface):
browser_type = os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').strip('"')
- def __init__(self, playwright_browser, proxy=None):
+ def __init__(self, playwright_browser, proxy=None, headers=None):
+ self.headers = headers or {}
self.age_start = time.time()
self.playwright_browser = playwright_browser
if self.context is None:
@@ -206,16 +210,17 @@ class browsersteps_live_ui(steppable_browser_interface):
# @todo handle multiple contexts, bind a unique id from the browser on each req?
self.context = self.playwright_browser.new_context(
- # @todo
- # user_agent=request_headers['User-Agent'] if request_headers.get('User-Agent') else 'Mozilla/5.0',
- # proxy=self.proxy,
- # This is needed to enable JavaScript execution on GitHub and others
- bypass_csp=True,
- # Should never be needed
- accept_downloads=False,
- proxy=proxy
+ accept_downloads=False, # Should never be needed
+ bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ extra_http_headers=self.headers,
+ ignore_https_errors=True,
+ proxy=proxy,
+ service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
+ # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+ user_agent=manage_user_agent(headers=self.headers),
)
+
self.page = self.context.new_page()
# self.page.set_default_navigation_timeout(keep_open)
diff --git a/changedetectionio/blueprint/browser_steps/nonContext.py b/changedetectionio/blueprint/browser_steps/nonContext.py
index 5345f306..93abe269 100644
--- a/changedetectionio/blueprint/browser_steps/nonContext.py
+++ b/changedetectionio/blueprint/browser_steps/nonContext.py
@@ -1,5 +1,4 @@
from playwright.sync_api import PlaywrightContextManager
-import asyncio
# So playwright wants to run as a context manager, but we do something horrible and hacky
# we are holding the session open for as long as possible, then shutting it down, and opening a new one
diff --git a/changedetectionio/blueprint/tags/__init__.py b/changedetectionio/blueprint/tags/__init__.py
index 10e226d4..ba20cb4a 100644
--- a/changedetectionio/blueprint/tags/__init__.py
+++ b/changedetectionio/blueprint/tags/__init__.py
@@ -11,9 +11,10 @@ def construct_blueprint(datastore: ChangeDetectionStore):
def tags_overview_page():
from .form import SingleTag
add_form = SingleTag(request.form)
+ # Tags sorted by title for display; default to {} / '' so a missing 'tags' key or an untitled tag does not raise
+ sorted_tags = sorted(datastore.data['settings']['application'].get('tags', {}).items(), key=lambda x: x[1].get('title', ''))
output = render_template("groups-overview.html",
form=add_form,
- available_tags=datastore.data['settings']['application'].get('tags', {}),
+ available_tags=sorted_tags,
)
return output
diff --git a/changedetectionio/blueprint/tags/templates/groups-overview.html b/changedetectionio/blueprint/tags/templates/groups-overview.html
index cab8d5e6..7d942f43 100644
--- a/changedetectionio/blueprint/tags/templates/groups-overview.html
+++ b/changedetectionio/blueprint/tags/templates/groups-overview.html
@@ -40,7 +40,7 @@
diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py
index d54b9bb2..3ad5f5f7 100644
--- a/changedetectionio/content_fetchers/__init__.py
+++ b/changedetectionio/content_fetchers/__init__.py
@@ -1,6 +1,6 @@
import sys
from distutils.util import strtobool
-
+from loguru import logger
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
import os
@@ -29,10 +29,15 @@ def available_fetchers():
# rather than site-specific.
use_playwright_as_chrome_fetcher = os.getenv('PLAYWRIGHT_DRIVER_URL', False)
if use_playwright_as_chrome_fetcher:
+ # @note - For now, browser steps always uses playwright
if not strtobool(os.getenv('FAST_PUPPETEER_CHROME_FETCHER', 'False')):
+ logger.debug('Using Playwright library as fetcher')
from .playwright import fetcher as html_webdriver
else:
+ logger.debug('Using direct Python Puppeteer library as fetcher')
from .puppeteer import fetcher as html_webdriver
else:
+ logger.debug("Falling back to selenium as fetcher")
from .webdriver_selenium import fetcher as html_webdriver
+
diff --git a/changedetectionio/content_fetchers/base.py b/changedetectionio/content_fetchers/base.py
index 71500d61..756a9bef 100644
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -5,6 +5,40 @@ from loguru import logger
from changedetectionio.content_fetchers import BrowserStepsStepException
+def manage_user_agent(headers, current_ua=''):
+    """
+    Choose the User-Agent string a browser-based fetcher should present.
+
+    NOTE: The service that does the actual Chrome fetching should handle any anti-robot techniques,
+    there are many ways that it can still be detected as a robot. This does not take care of
+    - Scraping of 'navigator' (platform, productSub, vendor, oscpu etc etc) browser object (navigator.appVersion) etc
+    - TCP/IP fingerprint JA3 etc
+    - Graphic rendering fingerprinting
+    - Your IP being obviously in a pool of bad actors
+    - Too many requests
+    - Scraping of Sec-CH-UA (client hint) browser replies
+    - Scraping of ServiceWorker, new window calls etc
+
+    See https://filipvitas.medium.com/how-to-set-user-agent-header-with-puppeteer-js-and-not-fail-28c7a02165da
+    Puppeteer requests https://github.com/dgtlmoon/pyppeteerstealth
+
+    :param headers: Mapping of custom request headers, the name is matched case-insensitively; None is treated as empty
+    :param current_ua: User-Agent the browser currently reports, '' when it is not known
+    :return: The custom User-Agent if set, else current_ua with 'HeadlessChrome' replaced by 'Chrome', else None
+    """
+    # A User-Agent supplied in the custom headers always wins, whatever the capitalisation of its key
+    ua_in_custom_headers = next((v for k, v in (headers or {}).items() if k.lower() == "user-agent"), None)
+    if ua_in_custom_headers:
+        return ua_in_custom_headers
+
+    # Otherwise take what the browser reported and drop the obvious 'HeadlessChrome' marker
+    if current_ua:
+        return current_ua.replace('HeadlessChrome', 'Chrome')
+
+    # Neither a custom nor a reported User-Agent is available
+    return None
+
+
class Fetcher():
browser_connection_is_custom = None
browser_connection_url = None
diff --git a/changedetectionio/content_fetchers/playwright.py b/changedetectionio/content_fetchers/playwright.py
index 7faa2032..7950e033 100644
--- a/changedetectionio/content_fetchers/playwright.py
+++ b/changedetectionio/content_fetchers/playwright.py
@@ -3,7 +3,8 @@ import os
from urllib.parse import urlparse
from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, ScreenshotUnavailable
class fetcher(Fetcher):
@@ -102,19 +103,16 @@ class fetcher(Fetcher):
# Set user agent to prevent Cloudflare from blocking the browser
# Use the default one configured in the App.py model that's passed from fetch_site_status.py
context = browser.new_context(
- user_agent={k.lower(): v for k, v in request_headers.items()}.get('user-agent', None),
+ accept_downloads=False, # Should never be needed
+ bypass_csp=True, # This is needed to enable JavaScript execution on GitHub and others
+ extra_http_headers=request_headers,
+ ignore_https_errors=True,
proxy=self.proxy,
- # This is needed to enable JavaScript execution on GitHub and others
- bypass_csp=True,
- # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
- service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'),
- # Should never be needed
- accept_downloads=False
+ service_workers=os.getenv('PLAYWRIGHT_SERVICE_WORKERS', 'allow'), # Should be `allow` or `block` - sites like YouTube can transmit large amounts of data via Service Workers
+ user_agent=manage_user_agent(headers=request_headers),
)
self.page = context.new_page()
- if len(request_headers):
- context.set_extra_http_headers(request_headers)
# Listen for all console events and handle errors
self.page.on("console", lambda msg: print(f"Playwright console: Watch URL: {url} {msg.type}: {msg.text} {msg.args}"))
diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py
index 64d06ee7..cad1b6b8 100644
--- a/changedetectionio/content_fetchers/puppeteer.py
+++ b/changedetectionio/content_fetchers/puppeteer.py
@@ -5,7 +5,8 @@ import websockets.exceptions
from urllib.parse import urlparse
from loguru import logger
-from changedetectionio.content_fetchers.base import Fetcher
+
+from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent
from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError
@@ -100,10 +101,11 @@ class fetcher(Fetcher):
else:
self.page = await browser.newPage()
+ await self.page.setUserAgent(manage_user_agent(headers=request_headers, current_ua=await self.page.evaluate('navigator.userAgent')))
+
await self.page.setBypassCSP(True)
if request_headers:
await self.page.setExtraHTTPHeaders(request_headers)
- # @todo check user-agent worked
# SOCKS5 with authentication is not supported (yet)
# https://github.com/microsoft/playwright/issues/10567
@@ -212,8 +214,12 @@ class fetcher(Fetcher):
logger.error('ERROR: Failed to get viewport-only reduced screenshot :(')
pass
finally:
+ # It's good to log here in the case that the browser crashes on shutting down but we still get the data we need
+ logger.success(f"Fetching '{url}' complete, closing page")
await self.page.close()
+ logger.success(f"Fetching '{url}' complete, closing browser")
await browser.close()
+ logger.success(f"Fetching '{url}' complete, exiting puppeteer fetch.")
async def main(self, **kwargs):
await self.fetch_page(**kwargs)
diff --git a/changedetectionio/content_fetchers/res/stock-not-in-stock.js b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
index fba2398e..ad86c034 100644
--- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js
+++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
@@ -10,7 +10,7 @@ function isItemInStock() {
const outOfStockTexts = [
' أخبرني عندما يتوفر',
'0 in stock',
- 'actuellement indisponible',
+ 'actuellement indisponible',
'agotado',
'article épuisé',
'artikel zurzeit vergriffen',
@@ -144,7 +144,7 @@ function isItemInStock() {
if (elementText.length) {
// try which ones could mean its in stock
- if (negateOutOfStockRegex.test(elementText)) {
+ if (negateOutOfStockRegex.test(elementText) && !elementText.includes('(0 products)')) {
console.log(`Negating/overriding 'Out of Stock' back to "Possibly in stock" found "${elementText}"`)
return 'Possibly in stock';
}
@@ -156,7 +156,9 @@ function isItemInStock() {
const element = elementsToScan[i];
// outside the 'fold' or some weird text in the heading area
// .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
- if (element.getBoundingClientRect().top + window.scrollY >= vh || element.getBoundingClientRect().top + window.scrollY <= 100) {
+
+ // Should be in the "above the fold" plus about 150px
+ if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
continue
}
elementText = "";
diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py
index 84e804f7..d9004a98 100644
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -404,17 +404,21 @@ def changedetection_app(config=None, datastore_o=None):
global datastore
from changedetectionio import forms
- limit_tag = request.args.get('tag', '').lower().strip()
+ active_tag_req = request.args.get('tag', '').lower().strip()
+ active_tag_uuid = active_tag = None
# Be sure limit_tag is a uuid
- for uuid, tag in datastore.data['settings']['application'].get('tags', {}).items():
- if limit_tag == tag.get('title', '').lower().strip():
- limit_tag = uuid
+ if active_tag_req:
+ for uuid, tag in datastore.data['settings']['application'].get('tags', {}).items():
+ if active_tag_req == tag.get('title', '').lower().strip() or active_tag_req == uuid:
+ active_tag = tag
+ active_tag_uuid = uuid
+ break
# Redirect for the old rss path which used the /?rss=true
if request.args.get('rss'):
- return redirect(url_for('rss', tag=limit_tag))
+ return redirect(url_for('rss', tag=active_tag_uuid))
op = request.args.get('op')
if op:
@@ -425,7 +429,7 @@ def changedetection_app(config=None, datastore_o=None):
datastore.data['watching'][uuid].toggle_mute()
datastore.needs_write = True
- return redirect(url_for('index', tag = limit_tag))
+ return redirect(url_for('index', tag = active_tag_uuid))
# Sort by last_changed and add the uuid which is usually the key..
sorted_watches = []
@@ -436,7 +440,7 @@ def changedetection_app(config=None, datastore_o=None):
if with_errors and not watch.get('last_error'):
continue
- if limit_tag and not limit_tag in watch['tags']:
+ if active_tag_uuid and not active_tag_uuid in watch['tags']:
continue
if watch.get('last_error'):
errored_count += 1
@@ -455,11 +459,12 @@ def changedetection_app(config=None, datastore_o=None):
total=total_count,
per_page=datastore.data['settings']['application'].get('pager_size', 50), css_framework="semantic")
-
+ # Tags sorted by title for the overview; same {} default as the tag lookup earlier in this view so a missing 'tags' key does not raise
+ sorted_tags = sorted(datastore.data['settings']['application'].get('tags', {}).items(), key=lambda x: x[1].get('title', ''))
output = render_template(
"watch-overview.html",
# Don't link to hosting when we're on the hosting environment
- active_tag=limit_tag,
+ active_tag=active_tag,
+ active_tag_uuid=active_tag_uuid,
app_rss_token=datastore.data['settings']['application']['rss_access_token'],
datastore=datastore,
errored_count=errored_count,
@@ -474,7 +479,7 @@ def changedetection_app(config=None, datastore_o=None):
sort_attribute=request.args.get('sort') if request.args.get('sort') else request.cookies.get('sort'),
sort_order=request.args.get('order') if request.args.get('order') else request.cookies.get('order'),
system_default_fetcher=datastore.data['settings']['application'].get('fetch_backend'),
- tags=datastore.data['settings']['application'].get('tags'),
+ tags=sorted_tags,
watches=sorted_watches
)
diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py
index 8e341432..e89e469d 100644
--- a/changedetectionio/processors/text_json_diff.py
+++ b/changedetectionio/processors/text_json_diff.py
@@ -119,7 +119,7 @@ class perform_site_check(difference_detection_processor):
include_filters_from_tags = self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='include_filters')
# 1845 - remove duplicated filters in both group and watch include filter
- include_filters_rule = list({*watch.get('include_filters', []), *include_filters_from_tags})
+ include_filters_rule = list(dict.fromkeys(watch.get('include_filters', []) + include_filters_from_tags))
subtractive_selectors = [*self.datastore.get_tag_overrides_for_watch(uuid=uuid, attr='subtractive_selectors'),
*watch.get("subtractive_selectors", []),
diff --git a/changedetectionio/static/js/browser-steps.js b/changedetectionio/static/js/browser-steps.js
index 90198d32..7c9c38d8 100644
--- a/changedetectionio/static/js/browser-steps.js
+++ b/changedetectionio/static/js/browser-steps.js
@@ -160,6 +160,12 @@ $(document).ready(function () {
e.offsetX > item.left * y_scale && e.offsetX < item.left * y_scale + item.width * y_scale
) {
+ // Ignore really large ones, because we are scraping 'div' also from xpath_element_scraper but
+ // that div or whatever could be some wrapper and would generally make you select the whole page
+ if (item.width > 800 && item.height > 400) {
+ return
+ }
+
// There could be many elements here, record them all and then we'll find out which is the most 'useful'
// (input, textarea, button, A etc)
if (item.width < xpath_data['browser_width']) {
diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html
index 186efe42..16a56294 100644
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -1,6 +1,6 @@
{% extends 'base.html' %}
{% block content %}
-{% from '_helpers.jinja' import render_simple_field, render_field, render_nolabel_field %}
+{% from '_helpers.jinja' import render_simple_field, render_field, render_nolabel_field, sort_by_title %}
@@ -13,7 +13,7 @@
{{ render_nolabel_field(form.url, placeholder="https://...", required=true) }}
- {{ render_nolabel_field(form.tags, value=tags[active_tag].title if active_tag else '', placeholder="watch label / tag") }}
+ {{ render_nolabel_field(form.tags, value=active_tag.title if active_tag else '', placeholder="watch label / tag") }}
{{ render_nolabel_field(form.watch_submit_button, title="Watch this URL!" ) }}
{{ render_nolabel_field(form.edit_and_watch_submit_button, title="Edit first then Watch") }}
@@ -46,11 +46,13 @@
{% if search_q %}Searching "{{search_q}}" {% endif %}
All
- {% for uuid, tag in tags.items() %}
- {% if tag != "" %}
- {{ tag.title }}
- {% endif %}
- {% endfor %}
+
+
+ {% for uuid, tag in tags %}
+ {% if tag != "" %}
+ {{ tag.title }}
+ {% endif %}
+ {% endfor %}
{% set sort_order = sort_order or 'asc' %}
@@ -197,8 +199,8 @@
{% endif %}
- Recheck
- all {% if active_tag%} in "{{tags[active_tag].title}}"{%endif%}
+ Recheck
+ all {% if active_tag_uuid %} in "{{active_tag.title}}"{%endif%}
diff --git a/changedetectionio/tests/fetchers/test_custom_js_before_content.py b/changedetectionio/tests/fetchers/test_custom_js_before_content.py
new file mode 100644
index 00000000..bec4334a
--- /dev/null
+++ b/changedetectionio/tests/fetchers/test_custom_js_before_content.py
@@ -0,0 +1,56 @@
+import os
+from flask import url_for
+from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client
+
+
+def test_execute_custom_js(client, live_server):
+    """Fetch a page via the browser fetcher with custom JS and custom headers, then check both took effect in the preview."""
+    live_server_setup(live_server)
+    assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test"
+
+    # Swap localhost for 'cdio' - presumably the hostname the remote browser uses to reach this live server (confirm against the CI workflow)
+    test_url = url_for('test_interactive_html_endpoint', _external=True)
+    test_url = test_url.replace('localhost.localdomain', 'cdio')
+    test_url = test_url.replace('localhost', 'cdio')
+
+    res = client.post(
+        url_for("form_quick_watch_add"),
+        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
+        follow_redirects=True
+    )
+
+    assert b"Watch added in Paused state, saving will unpause" in res.data
+
+    res = client.post(
+        url_for("edit_page", uuid="first", unpause_on_save=1),
+        data={
+            "url": test_url,
+            "tags": "",
+            'fetch_backend': "html_webdriver",
+            'webdriver_js_execute_code': 'document.querySelector("button[name=test-button]").click();',
+            # "\n" puts each header on its own line; the previous "\b" (backspace) left both in a single line
+            'headers': "testheader: yes\nuser-agent: MyCustomAgent",
+        },
+        follow_redirects=True
+    )
+    assert b"unpaused" in res.data
+    wait_for_all_checks(client)
+
+    uuid = extract_UUID_from_client(client)
+    assert live_server.app.config['DATASTORE'].data['watching'][uuid].history_n >= 1, "Watch history had atleast 1 (everything fetched OK)"
+
+    assert b"This text should be removed" not in res.data
+
+    # Check HTML conversion detected and worked
+    res = client.get(
+        url_for("preview_page", uuid=uuid),
+        follow_redirects=True
+    )
+    # The custom JS clicked the button, so the placeholder text is gone and the JS-generated text is present
+    assert b"This text should be removed" not in res.data
+    assert b"I smell JavaScript because the button was pressed" in res.data
+
+    # The endpoint echoes the request headers it received back into the page
+    assert b"testheader: yes" in res.data
+    assert b"user-agent: mycustomagent" in res.data
+
+    client.get(
+        url_for("form_delete", uuid="all"),
+        follow_redirects=True
+    )
\ No newline at end of file
diff --git a/changedetectionio/tests/test_group.py b/changedetectionio/tests/test_group.py
index ed38cb98..d9912a06 100644
--- a/changedetectionio/tests/test_group.py
+++ b/changedetectionio/tests/test_group.py
@@ -321,3 +321,154 @@ def test_clone_tag_on_quickwatchform_add(client, live_server):
res = client.get(url_for("tags.delete_all"), follow_redirects=True)
assert b'All tags deleted' in res.data
+
+def test_order_of_filters_tag_filter_and_watch_filter(client, live_server):
+ # Purpose: duplicated include_filters (from the tag and from the watch) must appear once in the preview, in filter order
+ # Create the tag; its include_filters (with duplicates) are set in the next request
+ res = client.post(
+ url_for("tags.form_tag_add"),
+ data={"name": "test-tag-keep-order"},
+ follow_redirects=True
+ )
+ assert b"Tag added" in res.data
+ assert b"test-tag-keep-order" in res.data
+ tag_filters = [
+ '#only-this', # duplicated filters
+ '#only-this',
+ '#only-this',
+ '#only-this',
+ ]
+
+ res = client.post(
+ url_for("tags.form_tag_edit_submit", uuid="first"),
+ data={"name": "test-tag-keep-order",
+ "include_filters": '\n'.join(tag_filters) },
+ follow_redirects=True
+ )
+ assert b"Updated" in res.data
+ tag_uuid = get_UUID_for_tag_name(client, name="test-tag-keep-order")  # NOTE(review): tag_uuid is not used later in this test
+ res = client.get(
+ url_for("tags.form_tag_edit", uuid="first")
+ )
+ assert b"#only-this" in res.data
+
+
+ d = """
+
+ Some initial text
+ And 1 this
+
+ And 2 this
+ And 3 this
+ And 4 this
+ And 5 this
+ And 6 this
+ And 7 this
+ And 8 this
+ And 9 this
+ And 10 this
+ And 11 this
+ And 12 this
+ And 13 this
+ And 14 this
+ And 15 this
+
+
+ """
+
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write(d)
+
+ test_url = url_for('test_endpoint', _external=True)
+ res = client.post(
+ url_for("import_page"),
+ data={"urls": test_url},
+ follow_redirects=True
+ )
+ assert b"1 Imported" in res.data
+ wait_for_all_checks(client)
+
+ filters = [
+ '/html/body/p[3]',
+ '/html/body/p[4]',
+ '/html/body/p[5]',
+ '/html/body/p[6]',
+ '/html/body/p[7]',
+ '/html/body/p[8]',
+ '/html/body/p[9]',
+ '/html/body/p[10]',
+ '/html/body/p[11]',
+ '/html/body/p[12]',
+ '/html/body/p[13]', # duplicated filters
+ '/html/body/p[13]',
+ '/html/body/p[13]',
+ '/html/body/p[13]',
+ '/html/body/p[13]',
+ '/html/body/p[14]',
+ ]
+
+ res = client.post(
+ url_for("edit_page", uuid="first"),
+ data={"include_filters": '\n'.join(filters),
+ "url": test_url,
+ "tags": "test-tag-keep-order",
+ "headers": "",
+ 'fetch_backend': "html_requests"},
+ follow_redirects=True
+ )
+ assert b"Updated watch." in res.data
+ wait_for_all_checks(client)
+
+ res = client.get(
+ url_for("preview_page", uuid="first"),
+ follow_redirects=True
+ )
+
+ assert b"And 1 this" in res.data # expected via the filter of tag "test-tag-keep-order"
+
+ a_tag_filter_check = b'And 1 this' # text expected from the '#only-this' entry of tag_filters
+ # the tag filter was listed four times but its text must appear only once
+ assert res.data.count(a_tag_filter_check) == 1, f"duplicated filters didn't removed {res.data.count(a_tag_filter_check)} of {a_tag_filter_check} in {res.data=}"
+
+ a_filter_check = b"And 13 this" # '/html/body/p[13]'
+ # the watch filter was listed five times but its text must appear only once
+ assert res.data.count(a_filter_check) == 1, f"duplicated filters didn't removed. {res.data.count(a_filter_check)} of {a_filter_check} in {res.data=}"
+
+ a_filter_check_not_include = b"And 2 this" # '/html/body/p[2]' - not listed in any filter
+ assert a_filter_check_not_include not in res.data
+
+ checklist = [
+ b"And 3 this",
+ b"And 4 this",
+ b"And 5 this",
+ b"And 6 this",
+ b"And 7 this",
+ b"And 8 this",
+ b"And 9 this",
+ b"And 10 this",
+ b"And 11 this",
+ b"And 12 this",
+ b"And 13 this",
+ b"And 14 this",
+ b"And 1 this", # result of filter from tag.
+ ]
+ # check whether everything a user requested is there
+ for test in checklist:
+ assert test in res.data
+
+ # check that the results appear in the same order as checklist: n is the offset just past the previous match
+ n = 0
+ for test in checklist:
+ t_index = res.data[n:].find(test)
+ # find() returns -1 when the text does not occur in the remaining slice
+ assert t_index >= 0, f"""failed because {test=} not in {res.data[n:]=}
+#####################
+Looks like some feature changed the order of result of filters.
+#####################
+the {test} appeared before. {test in res.data[:n]=}
+{res.data[:n]=}
+ """
+ n += t_index + len(test)
+
+ res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
+ assert b'Deleted' in res.data
diff --git a/changedetectionio/tests/test_pdf.py b/changedetectionio/tests/test_pdf.py
index d8694a57..2d4fb6a9 100644
--- a/changedetectionio/tests/test_pdf.py
+++ b/changedetectionio/tests/test_pdf.py
@@ -29,7 +29,8 @@ def test_fetch_pdf(client, live_server):
follow_redirects=True
)
- assert b'PDF-1.5' not in res.data
+ # PDF header should not be there (it was converted to text)
+ assert b'PDF' not in res.data[:10]
assert b'hello world' in res.data
# So we know if the file changes in other ways
diff --git a/changedetectionio/tests/util.py b/changedetectionio/tests/util.py
index 5974e47a..aab79163 100644
--- a/changedetectionio/tests/util.py
+++ b/changedetectionio/tests/util.py
@@ -242,5 +242,28 @@ def live_server_setup(live_server):
resp.headers['Content-Type'] = 'application/pdf'
return resp
+ @live_server.app.route('/test-interactive-html-endpoint')
+ def test_interactive_html_endpoint():
+ header_text=""
+ for k,v in request.headers.items():
+ header_text += f"{k}: {v} "
+
+ resp = make_response(f"""
+
+
+ Primitive JS check for changedetectionio/tests/visualselector/test_fetch_data.py
+ This text should be removed
+ |