diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py
index 41747cd5..096c7752 100644
--- a/changedetectionio/__init__.py
+++ b/changedetectionio/__init__.py
@@ -599,7 +599,7 @@ def changedetection_app(config=None, datastore_o=None):
extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
# Reset the previous_md5 so we process a new snapshot including stripping ignore text.
- if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']:
+ if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []):
if len(datastore.data['watching'][uuid].history):
extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid)
diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py
index 416ed6df..bc5615ca 100644
--- a/changedetectionio/content_fetcher.py
+++ b/changedetectionio/content_fetcher.py
@@ -164,16 +164,16 @@ class Fetcher():
}
- // inject the current one set in the css_filter, which may be a CSS rule
+ // inject the current one set in the include_filters, which may be a CSS rule
// used for displaying the current one in VisualSelector, where its not one we generated.
- if (css_filter.length) {
+ if (include_filters.length) {
q=false;
try {
// is it xpath?
- if (css_filter.startsWith('/') || css_filter.startsWith('xpath:')) {
- q=document.evaluate(css_filter.replace('xpath:',''), document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+ if (include_filters.startsWith('/') || include_filters.startsWith('xpath:')) {
+ q=document.evaluate(include_filters.replace('xpath:',''), document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
} else {
- q=document.querySelector(css_filter);
+ q=document.querySelector(include_filters);
}
} catch (e) {
// Maybe catch DOMException and alert?
@@ -186,7 +186,7 @@ class Fetcher():
if (bbox && bbox['width'] >0 && bbox['height']>0) {
size_pos.push({
- xpath: css_filter,
+ xpath: include_filters,
width: bbox['width'],
height: bbox['height'],
left: bbox['left'],
@@ -220,7 +220,7 @@ class Fetcher():
request_body,
request_method,
ignore_status_codes=False,
- current_css_filter=None):
+ current_include_filters=None):
# Should set self.error, self.status_code and self.content
pass
@@ -310,7 +310,7 @@ class base_html_playwright(Fetcher):
request_body,
request_method,
ignore_status_codes=False,
- current_css_filter=None):
+ current_include_filters=None):
from playwright.sync_api import sync_playwright
import playwright._impl._api_types
@@ -413,10 +413,10 @@ class base_html_playwright(Fetcher):
self.status_code = response.status
self.headers = response.all_headers()
- if current_css_filter is not None:
- page.evaluate("var css_filter={}".format(json.dumps(current_css_filter)))
+ if current_include_filters is not None:
+ page.evaluate("var include_filters={}".format(json.dumps(current_include_filters)))
else:
- page.evaluate("var css_filter=''")
+ page.evaluate("var include_filters=''")
self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}")
@@ -497,7 +497,7 @@ class base_html_webdriver(Fetcher):
request_body,
request_method,
ignore_status_codes=False,
- current_css_filter=None):
+ current_include_filters=None):
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
@@ -573,7 +573,7 @@ class html_requests(Fetcher):
request_body,
request_method,
ignore_status_codes=False,
- current_css_filter=None):
+ current_include_filters=None):
# Make requests use a more modern looking user-agent
if not 'User-Agent' in request_headers:
diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 03f4579d..12894b78 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -10,6 +10,12 @@ from changedetectionio import content_fetcher, html_tools
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+class FilterNotFoundInResponse(ValueError):
+ def __init__(self, msg):
+ ValueError.__init__(self, msg)
+
+
+
# Some common stuff here that can be moved to a base class
# (set_proxy_from_list)
class perform_site_check():
@@ -104,7 +110,7 @@ class perform_site_check():
if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip():
fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code']
- fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch['css_filter'])
+ fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch['include_filters'])
fetcher.quit()
self.screenshot = fetcher.screenshot
@@ -128,25 +134,26 @@ class perform_site_check():
is_html = False
is_json = False
- css_filter_rule = watch['css_filter']
+ include_filters_rule = watch['include_filters']
subtractive_selectors = watch.get(
"subtractive_selectors", []
) + self.datastore.data["settings"]["application"].get(
"global_subtractive_selectors", []
)
- has_filter_rule = css_filter_rule and len(css_filter_rule.strip())
+ has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip())
has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip())
if is_json and not has_filter_rule:
- css_filter_rule = "json:$"
+ include_filters_rule.append("json:$")
has_filter_rule = True
if has_filter_rule:
json_filter_prefixes = ['json:', 'jq:']
- if any(prefix in css_filter_rule for prefix in json_filter_prefixes):
- stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule)
- is_html = False
+ for filter in include_filters_rule:
+ if any(prefix in filter for prefix in json_filter_prefixes):
+ stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter)
+ is_html = False
if is_html or is_source:
@@ -161,18 +168,28 @@ class perform_site_check():
else:
# Then we assume HTML
if has_filter_rule:
- # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
- if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'):
- html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''),
- html_content=fetcher.content)
- else:
- # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
- html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+ html_content = ""
+ for filter_rule in include_filters_rule:
+ # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
+ if filter_rule[0] == '/' or filter_rule.startswith('xpath:'):
+ html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
+ html_content=fetcher.content,
+ append_pretty_line_formatting=not is_source)
+ else:
+ # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+ html_content += html_tools.include_filters(include_filters=filter_rule,
+ html_content=fetcher.content,
+ append_pretty_line_formatting=not is_source)
+
+ if not html_content.strip():
+ raise FilterNotFoundInResponse(include_filters_rule)
if has_subtractive_selectors:
html_content = html_tools.element_removal(subtractive_selectors, html_content)
- if not is_source:
+ if is_source:
+ stripped_text_from_html = html_content
+ else:
# extract text
stripped_text_from_html = \
html_tools.html_to_text(
@@ -182,9 +199,6 @@ class perform_site_check():
"render_anchor_tag_content", False)
)
- elif is_source:
- stripped_text_from_html = html_content
-
# Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
index 51e02884..7f857d0c 100644
--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -349,7 +349,7 @@ class watchForm(commonSettingsForm):
time_between_check = FormField(TimeBetweenCheckForm)
- css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='')
+ include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='')
subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)])
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 167d0f77..06b14958 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -7,26 +7,30 @@ from typing import List
import json
import re
-class FilterNotFoundInResponse(ValueError):
- def __init__(self, msg):
- ValueError.__init__(self, msg)
+# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
+TEXT_FILTER_LIST_LINE_SUFFIX = "<br/>"
class JSONNotFound(ValueError):
def __init__(self, msg):
ValueError.__init__(self, msg)
-
-
+
# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
-def css_filter(css_filter, html_content):
+def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
soup = BeautifulSoup(html_content, "html.parser")
html_block = ""
- r = soup.select(css_filter, separator="")
- if len(html_content) > 0 and len(r) == 0:
- raise FilterNotFoundInResponse(css_filter)
- for item in r:
- html_block += str(item)
+ r = soup.select(include_filters, separator="")
+
+ for element in r:
+ # When there's more than 1 match, then add the suffix to separate each line
+ # And where the matched result doesn't include something that will cause Inscriptis to add a newline
+ # (This way each 'match' reliably has a new-line in the diff)
+ # Divs are converted to 4 whitespaces by inscriptis
+ if append_pretty_line_formatting and len(html_block) and not element.name in (['br', 'hr', 'div', 'p']):
+ html_block += TEXT_FILTER_LIST_LINE_SUFFIX
+
+ html_block += str(element)
- return html_block + "\n"
+ return html_block
def subtractive_css_selector(css_selector, html_content):
soup = BeautifulSoup(html_content, "html.parser")
@@ -42,25 +46,29 @@ def element_removal(selectors: List[str], html_content):
# Return str Utf-8 of matched rules
-def xpath_filter(xpath_filter, html_content):
+def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False):
from lxml import etree, html
tree = html.fromstring(bytes(html_content, encoding='utf-8'))
html_block = ""
r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
- if len(html_content) > 0 and len(r) == 0:
- raise FilterNotFoundInResponse(xpath_filter)
-
#@note: //title/text() wont work where