diff --git a/changedetectionio/__init__.py b/changedetectionio/__init__.py index 41747cd5..096c7752 100644 --- a/changedetectionio/__init__.py +++ b/changedetectionio/__init__.py @@ -599,7 +599,7 @@ def changedetection_app(config=None, datastore_o=None): extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid) # Reset the previous_md5 so we process a new snapshot including stripping ignore text. - if form.css_filter.data.strip() != datastore.data['watching'][uuid]['css_filter']: + if form.include_filters.data != datastore.data['watching'][uuid].get('include_filters', []): if len(datastore.data['watching'][uuid].history): extra_update_obj['previous_md5'] = get_current_checksum_include_ignore_text(uuid=uuid) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index 416ed6df..bc5615ca 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -164,16 +164,16 @@ class Fetcher(): } - // inject the current one set in the css_filter, which may be a CSS rule + // inject the current one set in the include_filters, which may be a CSS rule // used for displaying the current one in VisualSelector, where its not one we generated. - if (css_filter.length) { + if (include_filters.length) { q=false; try { // is it xpath? - if (css_filter.startsWith('/') || css_filter.startsWith('xpath:')) { - q=document.evaluate(css_filter.replace('xpath:',''), document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; + if (include_filters.startsWith('/') || include_filters.startsWith('xpath:')) { + q=document.evaluate(include_filters.replace('xpath:',''), document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; } else { - q=document.querySelector(css_filter); + q=document.querySelector(include_filters); } } catch (e) { // Maybe catch DOMException and alert? 
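For reference, a minimal sketch (a hypothetical helper, not part of this diff) of the prefix convention the injected JS above relies on, and which fetch_site_status.py uses further down: a rule is treated as XPath when it begins with "/" or "xpath:", otherwise as a CSS selector.

```python
# Hypothetical helper mirroring the xpath-vs-CSS prefix check used in the
# injected selector JS and in the server-side filter handling.
def classify_filter(rule: str) -> str:
    rule = rule.strip()
    if rule.startswith('/') or rule.startswith('xpath:'):
        return 'xpath'
    return 'css'

assert classify_filter("xpath://*[contains(@id, 'blob-b')]") == 'xpath'
assert classify_filter("#blob-a") == 'css'
```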
@@ -186,7 +186,7 @@ class Fetcher(): if (bbox && bbox['width'] >0 && bbox['height']>0) { size_pos.push({ - xpath: css_filter, + xpath: include_filters, width: bbox['width'], height: bbox['height'], left: bbox['left'], @@ -220,7 +220,7 @@ class Fetcher(): request_body, request_method, ignore_status_codes=False, - current_css_filter=None): + current_include_filters=None): # Should set self.error, self.status_code and self.content pass @@ -310,7 +310,7 @@ class base_html_playwright(Fetcher): request_body, request_method, ignore_status_codes=False, - current_css_filter=None): + current_include_filters=None): from playwright.sync_api import sync_playwright import playwright._impl._api_types @@ -413,10 +413,10 @@ class base_html_playwright(Fetcher): self.status_code = response.status self.headers = response.all_headers() - if current_css_filter is not None: - page.evaluate("var css_filter={}".format(json.dumps(current_css_filter))) + if current_include_filters is not None: + page.evaluate("var include_filters={}".format(json.dumps(current_include_filters))) else: - page.evaluate("var css_filter=''") + page.evaluate("var include_filters=''") self.xpath_data = page.evaluate("async () => {" + self.xpath_element_js + "}") @@ -497,7 +497,7 @@ class base_html_webdriver(Fetcher): request_body, request_method, ignore_status_codes=False, - current_css_filter=None): + current_include_filters=None): from selenium import webdriver from selenium.webdriver.common.desired_capabilities import DesiredCapabilities @@ -573,7 +573,7 @@ class html_requests(Fetcher): request_body, request_method, ignore_status_codes=False, - current_css_filter=None): + current_include_filters=None): # Make requests use a more modern looking user-agent if not 'User-Agent' in request_headers: diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 03f4579d..12894b78 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -10,6 +10,12 @@ from changedetectionio import content_fetcher, html_tools urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +class FilterNotFoundInResponse(ValueError): + def __init__(self, msg): + ValueError.__init__(self, msg) + + + # Some common stuff here that can be moved to a base class # (set_proxy_from_list) class perform_site_check(): @@ -104,7 +110,7 @@ class perform_site_check(): if watch['webdriver_js_execute_code'] is not None and watch['webdriver_js_execute_code'].strip(): fetcher.webdriver_js_execute_code = watch['webdriver_js_execute_code'] - fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch['css_filter']) + fetcher.run(url, timeout, request_headers, request_body, request_method, ignore_status_codes, watch['include_filters']) fetcher.quit() self.screenshot = fetcher.screenshot @@ -128,25 +134,26 @@ class perform_site_check(): is_html = False is_json = False - css_filter_rule = watch['css_filter'] + include_filters_rule = watch['include_filters'] subtractive_selectors = watch.get( "subtractive_selectors", [] ) + self.datastore.data["settings"]["application"].get( "global_subtractive_selectors", [] ) - has_filter_rule = css_filter_rule and len(css_filter_rule.strip()) + has_filter_rule = include_filters_rule and len("".join(include_filters_rule).strip()) has_subtractive_selectors = subtractive_selectors and len(subtractive_selectors[0].strip()) if is_json and not has_filter_rule: - css_filter_rule = "json:$" + include_filters_rule.append("json:$") 
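A standalone sketch (assumed, not copied from the diff) of the new list-based check above: a watch now stores include_filters as a list, it only counts as filtered when at least one entry is non-blank, and JSON responses with no filter fall back to selecting the whole document.

```python
# Mirrors has_filter_rule = include_filters_rule and len("".join(...).strip())
def has_filter_rule(include_filters):
    return bool(include_filters) and bool("".join(include_filters).strip())

assert has_filter_rule(["#price", ""]) is True
assert has_filter_rule(["", "   "]) is False
assert has_filter_rule([]) is False

# JSON default: with no filter configured, select the entire JSON document.
filters, is_json = [], True
if is_json and not has_filter_rule(filters):
    filters.append("json:$")
```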
has_filter_rule = True if has_filter_rule: json_filter_prefixes = ['json:', 'jq:'] - if any(prefix in css_filter_rule for prefix in json_filter_prefixes): - stripped_text_from_html = html_tools.extract_json_as_string(content=fetcher.content, json_filter=css_filter_rule) - is_html = False + for filter in include_filters_rule: + if any(prefix in filter for prefix in json_filter_prefixes): + stripped_text_from_html += html_tools.extract_json_as_string(content=fetcher.content, json_filter=filter) + is_html = False if is_html or is_source: @@ -161,18 +168,28 @@ class perform_site_check(): else: # Then we assume HTML if has_filter_rule: - # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." - if css_filter_rule[0] == '/' or css_filter_rule.startswith('xpath:'): - html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule.replace('xpath:', ''), - html_content=fetcher.content) - else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) + html_content = "" + for filter_rule in include_filters_rule: + # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." + if filter_rule[0] == '/' or filter_rule.startswith('xpath:'): + html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''), + html_content=fetcher.content, + append_pretty_line_formatting=not is_source) + else: + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + html_content += html_tools.include_filters(include_filters=filter_rule, + html_content=fetcher.content, + append_pretty_line_formatting=not is_source) + + if not html_content.strip(): + raise FilterNotFoundInResponse(include_filters_rule) if has_subtractive_selectors: html_content = html_tools.element_removal(subtractive_selectors, html_content) - if not is_source: + if is_source: + stripped_text_from_html = html_content + else: # extract text stripped_text_from_html = \ html_tools.html_to_text( @@ -182,9 +199,6 @@ class perform_site_check(): "render_anchor_tag_content", False) ) - elif is_source: - stripped_text_from_html = html_content - # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 51e02884..7f857d0c 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -349,7 +349,7 @@ class watchForm(commonSettingsForm): time_between_check = FormField(TimeBetweenCheckForm) - css_filter = StringField('CSS/JSON/XPATH Filter', [ValidateCSSJSONXPATHInput()], default='') + include_filters = StringListField('CSS/JSONPath/JQ/XPath Filters', [ValidateCSSJSONXPATHInput()], default='') subtractive_selectors = StringListField('Remove elements', [ValidateCSSJSONXPATHInput(allow_xpath=False, allow_json=False)]) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 167d0f77..06b14958 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -7,26 +7,30 @@ from typing import List import json import re -class FilterNotFoundInResponse(ValueError): - def __init__(self, msg): - ValueError.__init__(self, msg) +# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis +TEXT_FILTER_LIST_LINE_SUFFIX = "
" class JSONNotFound(ValueError): def __init__(self, msg): ValueError.__init__(self, msg) - - + # Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches -def css_filter(css_filter, html_content): +def include_filters(include_filters, html_content, append_pretty_line_formatting=False): soup = BeautifulSoup(html_content, "html.parser") html_block = "" - r = soup.select(css_filter, separator="") - if len(html_content) > 0 and len(r) == 0: - raise FilterNotFoundInResponse(css_filter) - for item in r: - html_block += str(item) + r = soup.select(include_filters, separator="") + + for element in r: + # When there's more than 1 match, then add the suffix to separate each line + # And where the matched result doesn't include something that will cause Inscriptis to add a newline + # (This way each 'match' reliably has a new-line in the diff) + # Divs are converted to 4 whitespaces by inscriptis + if append_pretty_line_formatting and len(html_block) and not element.name in (['br', 'hr', 'div', 'p']): + html_block += TEXT_FILTER_LIST_LINE_SUFFIX + + html_block += str(element) - return html_block + "\n" + return html_block def subtractive_css_selector(css_selector, html_content): soup = BeautifulSoup(html_content, "html.parser") @@ -42,25 +46,29 @@ def element_removal(selectors: List[str], html_content): # Return str Utf-8 of matched rules -def xpath_filter(xpath_filter, html_content): +def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False): from lxml import etree, html tree = html.fromstring(bytes(html_content, encoding='utf-8')) html_block = "" r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}) - if len(html_content) > 0 and len(r) == 0: - raise FilterNotFoundInResponse(xpath_filter) - #@note: //title/text() wont work where CDATA.. 
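A rough illustration of why the per-match suffix matters (assumptions: the suffix is some line-breaking HTML such as "<br>", and exact whitespace depends on Inscriptis' rendering rules): without a separator, two inline matches collapse onto one line when the filtered HTML is converted back to text, so each match would not get its own line in the diff.

```python
from inscriptis import get_text

matches = ['<span>Block A</span>', '<span>Block B</span>']

print(get_text("".join(matches)))      # roughly "Block ABlock B" - a single line
print(get_text("<br>".join(matches)))  # "Block A" and "Block B" on separate lines
```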
for element in r: + # When there's more than 1 match, then add the suffix to separate each line + # And where the matched result doesn't include something that will cause Inscriptis to add a newline + # (This way each 'match' reliably has a new-line in the diff) + # Divs are converted to 4 whitespaces by inscriptis + if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])): + html_block += TEXT_FILTER_LIST_LINE_SUFFIX + if type(element) == etree._ElementStringResult: - html_block += str(element) + "<br/>" + html_block += str(element) elif type(element) == etree._ElementUnicodeResult: - html_block += str(element) + "<br/>" + html_block += str(element) else: - html_block += etree.tostring(element, pretty_print=True).decode('utf-8') + "<br/>" + html_block += etree.tostring(element, pretty_print=True).decode('utf-8') return html_block diff --git a/changedetectionio/importer.py b/changedetectionio/importer.py index 4d3e2b2b..3668b356 100644 --- a/changedetectionio/importer.py +++ b/changedetectionio/importer.py @@ -103,12 +103,12 @@ class import_distill_io_json(Importer): pass except IndexError: pass - + extras['include_filters'] = [] try: - extras['css_filter'] = d_config['selections'][0]['frames'][0]['includes'][0]['expr'] if d_config['selections'][0]['frames'][0]['includes'][0]['type'] == 'xpath': - extras['css_filter'] = 'xpath:' + extras['css_filter'] - + extras['include_filters'].append('xpath:' + d_config['selections'][0]['frames'][0]['includes'][0]['expr']) + else: + extras['include_filters'].append(d_config['selections'][0]['frames'][0]['includes'][0]['expr']) except KeyError: pass except IndexError: diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index d09f144a..573ef47c 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -36,7 +36,7 @@ class model(dict): 'notification_body': None, 'notification_format': default_notification_format_for_watch, 'notification_muted': False, - 'css_filter': '', + 'include_filters': [], 'last_error': False, 'extract_text': [], # Extract text by regex after filters 'subtractive_selectors': [], diff --git a/changedetectionio/static/js/visual-selector.js b/changedetectionio/static/js/visual-selector.js index b73dc526..d4a488d9 100644 --- a/changedetectionio/static/js/visual-selector.js +++ b/changedetectionio/static/js/visual-selector.js @@ -50,7 +50,7 @@ $(document).ready(function() { state_clicked=false; ctx.clearRect(0, 0, c.width, c.height); xctx.clearRect(0, 0, c.width, c.height); - $("#css_filter").val(''); + $("#include_filters").val(''); }); @@ -68,7 +68,7 @@ $(document).ready(function() { xctx = c.getContext("2d"); // redline highlight context ctx = c.getContext("2d"); - current_default_xpath =$("#css_filter").val(); + current_default_xpath =$("#include_filters").val(); fetch_data(); $('#selector-canvas').off("mousemove mousedown"); // screenshot_url defined in the edit.html template @@ -205,9 +205,9 @@ $(document).ready(function() { var sel = selector_data['size_pos'][current_selected_i]; if (sel[0] == '/') { // @todo - not sure just checking / is right - $("#css_filter").val('xpath:'+sel.xpath); + $("#include_filters").val('xpath:'+sel.xpath); } else { - $("#css_filter").val(sel.xpath); + $("#include_filters").val(sel.xpath); } xctx.fillStyle = 'rgba(205,205,205,0.95)'; xctx.strokeStyle = 'rgba(225,0,0,0.9)'; diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 985a2e93..04d4fb39 
100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -82,8 +82,13 @@ class ChangeDetectionStore: except (FileNotFoundError, json.decoder.JSONDecodeError): if include_default_watches: print("Creating JSON store at", self.datastore_path) - self.add_watch(url='https://news.ycombinator.com/', tag='Tech news') - self.add_watch(url='https://changedetection.io/CHANGELOG.txt', tag='changedetection.io') + self.add_watch(url='https://news.ycombinator.com/', + tag='Tech news', + extras={'fetch_backend': 'html_requests'}) + + self.add_watch(url='https://changedetection.io/CHANGELOG.txt', + tag='changedetection.io', + extras={'fetch_backend': 'html_requests'}) self.__data['version_tag'] = version_tag @@ -267,7 +272,7 @@ class ChangeDetectionStore: extras = {} # should always be str if tag is None or not tag: - tag='' + tag = '' # Incase these are copied across, assume it's a reference and deepcopy() apply_extras = deepcopy(extras) @@ -282,17 +287,31 @@ class ChangeDetectionStore: res = r.json() # List of permissible attributes we accept from the wild internet - for k in ['url', 'tag', - 'paused', 'title', - 'previous_md5', 'headers', - 'body', 'method', - 'ignore_text', 'css_filter', - 'subtractive_selectors', 'trigger_text', - 'extract_title_as_title', 'extract_text', - 'text_should_not_be_present', - 'webdriver_js_execute_code']: + for k in [ + 'body', + 'css_filter', + 'extract_text', + 'extract_title_as_title', + 'headers', + 'ignore_text', + 'include_filters', + 'method', + 'paused', + 'previous_md5', + 'subtractive_selectors', + 'tag', + 'text_should_not_be_present', + 'title', + 'trigger_text', + 'webdriver_js_execute_code', + 'url', + ]: if res.get(k): - apply_extras[k] = res[k] + if k != 'css_filter': + apply_extras[k] = res[k] + else: + # We renamed the field and made it a list + apply_extras['include_filters'] = [res['css_filter']] except Exception as e: logging.error("Error fetching metadata for shared watch link", url, str(e)) @@ -315,12 +334,13 @@ class ChangeDetectionStore: del apply_extras[k] new_watch.update(apply_extras) - self.__data['watching'][new_uuid]=new_watch + self.__data['watching'][new_uuid] = new_watch self.__data['watching'][new_uuid].ensure_data_dir_exists() if write_to_disk_now: self.sync_to_json() + return new_uuid def visualselector_data_is_ready(self, watch_uuid): @@ -584,3 +604,14 @@ class ChangeDetectionStore: for v in ['User-Agent', 'Accept', 'Accept-Encoding', 'Accept-Language']: if self.data['settings']['headers'].get(v): del self.data['settings']['headers'][v] + + # Convert filters to a list of filters css_filter -> include_filters + def update_8(self): + for uuid, watch in self.data['watching'].items(): + try: + existing_filter = watch.get('css_filter', '') + if existing_filter: + watch['include_filters'] = [existing_filter] + except: + continue + return \ No newline at end of file diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html index 66286314..e7cd5b06 100644 --- a/changedetectionio/templates/edit.html +++ b/changedetectionio/templates/edit.html @@ -174,15 +174,17 @@ User-Agent: wonderbra 1.0") }} </div> </fieldset> <div class="pure-control-group"> - {% set field = render_field(form.css_filter, - placeholder=".class-name or #some-id, or other CSS selector rule.", + {% set field = render_field(form.include_filters, + rows=5, + placeholder="#example +xpath://body/div/span[contains(@class, 'example-class')]", class="m-d") %} {{ field }} {% if '/text()' in field %} <span 
class="pure-form-message-inline"><strong>Note!: //text() function does not work where the <element> contains <![CDATA[]]></strong></span><br/> {% endif %} - <span class="pure-form-message-inline"> + <span class="pure-form-message-inline">One rule per line, <i>any</i> rules that matches will be used.<br/> <ul> <li>CSS - Limit text to this CSS rule, only text matching this CSS rule is included.</li> <li>JSON - Limit text to this JSON rule, using either <a href="https://pypi.org/project/jsonpath-ng/" target="new">JSONPath</a> or <a href="https://stedolan.github.io/jq/" target="new">jq</a> (if installed). diff --git a/changedetectionio/tests/proxy_list/test_multiple_proxy.py b/changedetectionio/tests/proxy_list/test_multiple_proxy.py index fcd286eb..b329836e 100644 --- a/changedetectionio/tests/proxy_list/test_multiple_proxy.py +++ b/changedetectionio/tests/proxy_list/test_multiple_proxy.py @@ -24,7 +24,7 @@ def test_preferred_proxy(client, live_server): res = client.post( url_for("edit_page", uuid="first"), data={ - "css_filter": "", + "include_filters": "", "fetch_backend": "html_requests", "headers": "", "proxy": "proxy-two", diff --git a/changedetectionio/tests/test_auth.py b/changedetectionio/tests/test_auth.py index f8d1437e..ab2fadf7 100644 --- a/changedetectionio/tests/test_auth.py +++ b/changedetectionio/tests/test_auth.py @@ -23,7 +23,7 @@ def test_basic_auth(client, live_server): # Check form validation res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": "", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": "", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"Updated watch." in res.data diff --git a/changedetectionio/tests/test_css_selector.py b/changedetectionio/tests/test_css_selector.py index ab234ddb..5bac5e4b 100644 --- a/changedetectionio/tests/test_css_selector.py +++ b/changedetectionio/tests/test_css_selector.py @@ -46,22 +46,23 @@ def set_modified_response(): # Test that the CSS extraction works how we expect, important here is the right placing of new lines \n's -def test_css_filter_output(): - from changedetectionio import fetch_site_status +def test_include_filters_output(): from inscriptis import get_text # Check text with sub-parts renders correctly content = """<html> <body><div id="thingthing" > Some really <b>bold</b> text </div> </body> </html>""" - html_blob = css_filter(css_filter="#thingthing", html_content=content) + html_blob = include_filters(include_filters="#thingthing", html_content=content) text = get_text(html_blob) assert text == " Some really bold text" content = """<html> <body> <p>foo bar blah</p> - <div class="parts">Block A</div> <div class="parts">Block B</div></body> + <DIV class="parts">Block A</DiV> <div class="parts">Block B</DIV></body> </html> """ - html_blob = css_filter(css_filter=".parts", html_content=content) + + # in xPath this would be //*[@class='parts'] + html_blob = include_filters(include_filters=".parts", html_content=content) text = get_text(html_blob) # Divs are converted to 4 whitespaces by inscriptis @@ -69,10 +70,10 @@ def test_css_filter_output(): # Tests the whole stack works with the CSS Filter -def test_check_markup_css_filter_restriction(client, live_server): +def test_check_markup_include_filters_restriction(client, live_server): sleep_time_for_fetch_thread = 3 - css_filter = "#sametext" + include_filters = "#sametext" set_original_response() @@ -98,7 +99,7 @@ def 
test_check_markup_css_filter_restriction(client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": include_filters, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -107,7 +108,7 @@ def test_check_markup_css_filter_restriction(client, live_server): res = client.get( url_for("edit_page", uuid="first"), ) - assert bytes(css_filter.encode('utf-8')) in res.data + assert bytes(include_filters.encode('utf-8')) in res.data # Trigger a check client.get(url_for("form_watch_checknow"), follow_redirects=True) @@ -126,3 +127,58 @@ def test_check_markup_css_filter_restriction(client, live_server): # Because it should be looking at only that 'sametext' id res = client.get(url_for("index")) assert b'unviewed' in res.data + + +# Tests the whole stack works with the CSS Filter +def test_check_multiple_filters(client, live_server): + sleep_time_for_fetch_thread = 3 + + include_filters = "#blob-a\r\nxpath://*[contains(@id,'blob-b')]" + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write("""<html><body> + <div id="blob-a">Blob A</div> + <div id="blob-b">Blob B</div> + <div id="blob-c">Blob C</div> + </body> + </html> + """) + + # Give the endpoint time to spin up + time.sleep(1) + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + time.sleep(1) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"include_filters": include_filters, + "url": test_url, + "tag": "", + "headers": "", + 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." 
in res.data + + # Give the thread time to pick it up + time.sleep(sleep_time_for_fetch_thread) + + res = client.get( + url_for("preview_page", uuid="first"), + follow_redirects=True + ) + + # Only the two blobs should be here + assert b"Blob A" in res.data # CSS was ok + assert b"Blob B" in res.data # xPath was ok + assert b"Blob C" not in res.data # Should not be included diff --git a/changedetectionio/tests/test_extract_regex.py b/changedetectionio/tests/test_extract_regex.py index aad29d51..2bfad8b9 100644 --- a/changedetectionio/tests/test_extract_regex.py +++ b/changedetectionio/tests/test_extract_regex.py @@ -88,7 +88,7 @@ def test_check_filter_multiline(client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": '', + data={"include_filters": '', 'extract_text': '/something.+?6 billion.+?lines/si', "url": test_url, "tag": "", @@ -116,7 +116,7 @@ def test_check_filter_multiline(client, live_server): def test_check_filter_and_regex_extract(client, live_server): sleep_time_for_fetch_thread = 3 - css_filter = ".changetext" + include_filters = ".changetext" set_original_response() @@ -143,7 +143,7 @@ def test_check_filter_and_regex_extract(client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": css_filter, + data={"include_filters": include_filters, 'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i', "url": test_url, "tag": "", diff --git a/changedetectionio/tests/test_filter_exist_changes.py b/changedetectionio/tests/test_filter_exist_changes.py index 261686ed..10addf0a 100644 --- a/changedetectionio/tests/test_filter_exist_changes.py +++ b/changedetectionio/tests/test_filter_exist_changes.py @@ -92,7 +92,7 @@ def test_filter_doesnt_exist_then_exists_should_get_notification(client, live_se "tag": "my tag", "title": "my title", "headers": "", - "css_filter": '.ticket-available', + "include_filters": '.ticket-available', "fetch_backend": "html_requests"}) res = client.post( diff --git a/changedetectionio/tests/test_filter_failure_notification.py b/changedetectionio/tests/test_filter_failure_notification.py index 812f288b..7606945f 100644 --- a/changedetectionio/tests/test_filter_failure_notification.py +++ b/changedetectionio/tests/test_filter_failure_notification.py @@ -76,7 +76,7 @@ def run_filter_test(client, content_filter): "title": "my title", "headers": "", "filter_failure_notification_send": 'y', - "css_filter": content_filter, + "include_filters": content_filter, "fetch_backend": "html_requests"}) res = client.post( @@ -95,7 +95,7 @@ def run_filter_test(client, content_filter): time.sleep(3) # We should see something in the frontend - assert b'Warning, filter' in res.data + assert b'Warning, no filters were found' in res.data # Now it should exist and contain our "filter not found" alert assert os.path.isfile("test-datastore/notification.txt") @@ -131,7 +131,7 @@ def run_filter_test(client, content_filter): def test_setup(live_server): live_server_setup(live_server) -def test_check_css_filter_failure_notification(client, live_server): +def test_check_include_filters_failure_notification(client, live_server): set_original_response() time.sleep(1) run_filter_test(client, '#nope-doesnt-exist') diff --git a/changedetectionio/tests/test_jsonpath_jq_selector.py b/changedetectionio/tests/test_jsonpath_jq_selector.py index f6da84db..7dc75208 100644 --- 
a/changedetectionio/tests/test_jsonpath_jq_selector.py +++ b/changedetectionio/tests/test_jsonpath_jq_selector.py @@ -132,7 +132,7 @@ def set_original_response(): return None -def set_response_with_html(): +def set_json_response_with_html(): test_return_data = """ { "test": [ @@ -176,7 +176,7 @@ def set_modified_response(): def test_check_json_without_filter(client, live_server): # Request a JSON document from a application/json source containing HTML # and be sure it doesn't get chewed up by instriptis - set_response_with_html() + set_json_response_with_html() # Give the endpoint time to spin up time.sleep(1) @@ -189,9 +189,6 @@ def test_check_json_without_filter(client, live_server): follow_redirects=True ) - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(3) @@ -200,6 +197,7 @@ def test_check_json_without_filter(client, live_server): follow_redirects=True ) + # Should still see '"html": "<b>"' assert b'"<b>' in res.data assert res.data.count(b'{\n') >= 2 @@ -221,9 +219,6 @@ def check_json_filter(json_filter, client, live_server): ) assert b"1 Imported" in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(3) @@ -231,7 +226,7 @@ def check_json_filter(json_filter, client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": json_filter, + data={"include_filters": json_filter, "url": test_url, "tag": "", "headers": "", @@ -247,9 +242,6 @@ def check_json_filter(json_filter, client, live_server): ) assert bytes(escape(json_filter).encode('utf-8')) in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(3) # Make a change @@ -301,7 +293,7 @@ def check_json_filter_bool_val(json_filter, client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": json_filter, + data={"include_filters": json_filter, "url": test_url, "tag": "", "headers": "", @@ -311,11 +303,6 @@ def check_json_filter_bool_val(json_filter, client, live_server): ) assert b"Updated watch." 
in res.data - time.sleep(3) - - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(3) # Make a change @@ -360,9 +347,6 @@ def check_json_ext_filter(json_filter, client, live_server): ) assert b"1 Imported" in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(3) @@ -370,7 +354,7 @@ def check_json_ext_filter(json_filter, client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": json_filter, + data={"include_filters": json_filter, "url": test_url, "tag": "", "headers": "", @@ -386,9 +370,6 @@ def check_json_ext_filter(json_filter, client, live_server): ) assert bytes(escape(json_filter).encode('utf-8')) in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(3) # Make a change diff --git a/changedetectionio/tests/test_share_watch.py b/changedetectionio/tests/test_share_watch.py index 620bda03..e328bf81 100644 --- a/changedetectionio/tests/test_share_watch.py +++ b/changedetectionio/tests/test_share_watch.py @@ -14,7 +14,7 @@ def test_share_watch(client, live_server): live_server_setup(live_server) test_url = url_for('test_endpoint', _external=True) - css_filter = ".nice-filter" + include_filters = ".nice-filter" # Add our URL to the import page res = client.post( @@ -29,7 +29,7 @@ def test_share_watch(client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": css_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": include_filters, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"Updated watch." 
in res.data @@ -37,7 +37,7 @@ def test_share_watch(client, live_server): res = client.get( url_for("edit_page", uuid="first"), ) - assert bytes(css_filter.encode('utf-8')) in res.data + assert bytes(include_filters.encode('utf-8')) in res.data # click share the link res = client.get( @@ -73,4 +73,8 @@ def test_share_watch(client, live_server): res = client.get( url_for("edit_page", uuid="first"), ) - assert bytes(css_filter.encode('utf-8')) in res.data + assert bytes(include_filters.encode('utf-8')) in res.data + + # Check it saved the URL + res = client.get(url_for("index")) + assert bytes(test_url.encode('utf-8')) in res.data diff --git a/changedetectionio/tests/test_source.py b/changedetectionio/tests/test_source.py index a980fab9..4956f007 100644 --- a/changedetectionio/tests/test_source.py +++ b/changedetectionio/tests/test_source.py @@ -57,10 +57,9 @@ def test_check_basic_change_detection_functionality_source(client, live_server): - +# `subtractive_selectors` should still work in `source:` type requests def test_check_ignore_elements(client, live_server): set_original_response() - time.sleep(2) test_url = 'source:'+url_for('test_endpoint', _external=True) # Add our URL to the import page @@ -77,9 +76,9 @@ def test_check_ignore_elements(client, live_server): ##################### # We want <span> and <p> ONLY, but ignore span with .foobar-detection - res = client.post( + client.post( url_for("edit_page", uuid="first"), - data={"css_filter": 'span,p', "url": test_url, "tag": "", "subtractive_selectors": ".foobar-detection", 'fetch_backend': "html_requests"}, + data={"include_filters": 'span,p', "url": test_url, "tag": "", "subtractive_selectors": ".foobar-detection", 'fetch_backend': "html_requests"}, follow_redirects=True ) @@ -89,7 +88,6 @@ def test_check_ignore_elements(client, live_server): url_for("preview_page", uuid="first"), follow_redirects=True ) - assert b'foobar-detection' not in res.data assert b'<br' not in res.data assert b'<p' in res.data \ No newline at end of file diff --git a/changedetectionio/tests/test_trigger_regex_with_filter.py b/changedetectionio/tests/test_trigger_regex_with_filter.py index 1f95046a..34b0dc64 100644 --- a/changedetectionio/tests/test_trigger_regex_with_filter.py +++ b/changedetectionio/tests/test_trigger_regex_with_filter.py @@ -49,7 +49,7 @@ def test_trigger_regex_functionality_with_filter(client, live_server): url_for("edit_page", uuid="first"), data={"trigger_text": "/cool.stuff/", "url": test_url, - "css_filter": '#in-here', + "include_filters": '#in-here', "fetch_backend": "html_requests"}, follow_redirects=True ) diff --git a/changedetectionio/tests/test_watch_fields_storage.py b/changedetectionio/tests/test_watch_fields_storage.py index bd9c9e65..5db29a72 100644 --- a/changedetectionio/tests/test_watch_fields_storage.py +++ b/changedetectionio/tests/test_watch_fields_storage.py @@ -22,7 +22,7 @@ def test_check_watch_field_storage(client, live_server): url_for("edit_page", uuid="first"), data={ "notification_urls": "json://127.0.0.1:30000\r\njson://128.0.0.1\r\n", "time_between_check-minutes": 126, - "css_filter" : ".fooclass", + "include_filters" : ".fooclass", "title" : "My title", "ignore_text" : "ignore this", "url": test_url, diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py index 4e417a74..bbccc729 100644 --- a/changedetectionio/tests/test_xpath_selector.py +++ b/changedetectionio/tests/test_xpath_selector.py @@ -89,7 +89,7 @@ def test_check_xpath_filter_utf8(client, 
live_server): time.sleep(1) res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -143,7 +143,7 @@ def test_check_xpath_text_function_utf8(client, live_server): time.sleep(1) res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -182,9 +182,6 @@ def test_check_markup_xpath_filter_restriction(client, live_server): ) assert b"1 Imported" in res.data - # Trigger a check - client.get(url_for("form_watch_checknow"), follow_redirects=True) - # Give the thread time to pick it up time.sleep(sleep_time_for_fetch_thread) @@ -192,7 +189,7 @@ def test_check_markup_xpath_filter_restriction(client, live_server): # Add our URL to the import page res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": xpath_filter, "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -230,10 +227,11 @@ def test_xpath_validation(client, live_server): follow_redirects=True ) assert b"1 Imported" in res.data + time.sleep(2) res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": "/something horrible", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) assert b"is not a valid XPath expression" in res.data @@ -242,7 +240,7 @@ def test_xpath_validation(client, live_server): # actually only really used by the distll.io importer, but could be handy too -def test_check_with_prefix_css_filter(client, live_server): +def test_check_with_prefix_include_filters(client, live_server): res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) assert b'Deleted' in res.data @@ -263,7 +261,7 @@ def test_check_with_prefix_css_filter(client, live_server): res = client.post( url_for("edit_page", uuid="first"), - data={"css_filter": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, + data={"include_filters": "xpath://*[contains(@class, 'sametext')]", "url": test_url, "tag": "", "headers": "", 'fetch_backend': "html_requests"}, follow_redirects=True ) diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 8f1c0f28..4a131a90 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -4,7 +4,7 @@ import queue import time from changedetectionio import content_fetcher -from changedetectionio.html_tools import FilterNotFoundInResponse +from changedetectionio.fetch_site_status import FilterNotFoundInResponse # A single update worker # @@ -91,8 +91,8 @@ class update_worker(threading.Thread): return n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page', - 
'notification_body': "Your configured CSS/xPath filter of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format( - watch['css_filter'], + 'notification_body': "Your configured CSS/xPath filters of '{}' for {{watch_url}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format( + ", ".join(watch['include_filters']), threshold), 'notification_format': 'text'} @@ -189,7 +189,7 @@ class update_worker(threading.Thread): if not self.datastore.data['watching'].get(uuid): continue - err_text = "Warning, filter '{}' not found".format(str(e)) + err_text = "Warning, no filters were found, no change detection ran." self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text, # So that we get a trigger when the content is added again 'previous_md5': ''})