From e30b17b8bc47a4713b508399506b723973e5fada Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Tue, 26 Sep 2023 13:59:59 +0200 Subject: [PATCH] UI + Fetching - Be more helpful when a filter contains no text, suggest ways to deal with images in filters (#1819) --- changedetectionio/content_fetcher.py | 4 +- .../processors/text_json_diff.py | 7 +- changedetectionio/tests/test_css_selector.py | 76 ++++++++++++++++++- changedetectionio/update_worker.py | 17 ++++- 4 files changed, 99 insertions(+), 5 deletions(-) diff --git a/changedetectionio/content_fetcher.py b/changedetectionio/content_fetcher.py index d7a4f835..dab956a5 100644 --- a/changedetectionio/content_fetcher.py +++ b/changedetectionio/content_fetcher.py @@ -77,11 +77,13 @@ class ScreenshotUnavailable(Exception): class ReplyWithContentButNoText(Exception): - def __init__(self, status_code, url, screenshot=None): + def __init__(self, status_code, url, screenshot=None, has_filters=False, html_content=''): # Set this so we can use it in other parts of the app self.status_code = status_code self.url = url self.screenshot = screenshot + self.has_filters = has_filters + self.html_content = html_content return diff --git a/changedetectionio/processors/text_json_diff.py b/changedetectionio/processors/text_json_diff.py index fb810f0c..5e69a591 100644 --- a/changedetectionio/processors/text_json_diff.py +++ b/changedetectionio/processors/text_json_diff.py @@ -314,7 +314,12 @@ class perform_site_check(difference_detection_processor): # Treat pages with no renderable text content as a change? No by default empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0: - raise content_fetcher.ReplyWithContentButNoText(url=url, status_code=fetcher.get_last_status_code(), screenshot=screenshot) + raise content_fetcher.ReplyWithContentButNoText(url=url, + status_code=fetcher.get_last_status_code(), + screenshot=screenshot, + has_filters=has_filter_rule, + html_content=html_content + ) # We rely on the actual text in the html output.. many sites have random script vars etc, # in the future we'll implement other mechanisms. diff --git a/changedetectionio/tests/test_css_selector.py b/changedetectionio/tests/test_css_selector.py index 0dfe2af7..dcc10331 100644 --- a/changedetectionio/tests/test_css_selector.py +++ b/changedetectionio/tests/test_css_selector.py @@ -2,7 +2,7 @@ import time from flask import url_for -from . util import live_server_setup +from .util import live_server_setup, wait_for_all_checks from ..html_tools import * @@ -176,3 +176,77 @@ def test_check_multiple_filters(client, live_server): assert b"Blob A" in res.data # CSS was ok assert b"Blob B" in res.data # xPath was ok assert b"Blob C" not in res.data # Should not be included + +# The filter exists, but did not contain anything useful +# Mainly used when the filter contains just an IMG, this can happen when someone selects an image in the visual-selector +# Tests fetcher can throw a "ReplyWithContentButNoText" exception after applying filter and extracting text +def test_filter_is_empty_help_suggestion(client, live_server): + #live_server_setup(live_server) + + include_filters = "#blob-a" + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(""" +
+ +
+ + + """) + + + # Add our URL to the import page + test_url = url_for('test_endpoint', _external=True) + res = client.post( + url_for("import_page"), + data={"urls": test_url}, + follow_redirects=True + ) + assert b"1 Imported" in res.data + wait_for_all_checks(client) + + # Goto the edit page, add our ignore text + # Add our URL to the import page + res = client.post( + url_for("edit_page", uuid="first"), + data={"include_filters": include_filters, + "url": test_url, + "tags": "", + "headers": "", + 'fetch_backend': "html_requests"}, + follow_redirects=True + ) + assert b"Updated watch." in res.data + + wait_for_all_checks(client) + + + res = client.get( + url_for("index"), + follow_redirects=True + ) + + assert b'empty result or contain only an image' in res.data + + + ### Just an empty selector, no image + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(""" +
+ +
+ + + """) + + res = client.get(url_for("form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + + res = client.get( + url_for("index"), + follow_redirects=True + ) + + assert b'empty result or contain only an image' not in res.data + assert b'but contained no usable text' in res.data diff --git a/changedetectionio/update_worker.py b/changedetectionio/update_worker.py index 63a0aab4..de0a669d 100644 --- a/changedetectionio/update_worker.py +++ b/changedetectionio/update_worker.py @@ -3,7 +3,7 @@ import threading import queue import time -from changedetectionio import content_fetcher +from changedetectionio import content_fetcher, html_tools from .processors.text_json_diff import FilterNotFoundInResponse from .processors.restock_diff import UnableToExtractRestockData @@ -251,7 +251,20 @@ class update_worker(threading.Thread): # Totally fine, it's by choice - just continue on, nothing more to care about # Page had elements/content but no renderable text # Backend (not filters) gave zero output - self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)}) + extra_help = "" + if e.has_filters: + # Maybe it contains an image? offer a more helpful link + has_img = html_tools.include_filters(include_filters='img', + html_content=e.html_content) + if has_img: + extra_help = ", it's possible that the filters you have give an empty result or contain only an image more help here." + else: + extra_help = ", it's possible that the filters were found, but contained no usable text." + + self.datastore.update_watch(uuid=uuid, update_obj={ + 'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}" + }) + if e.screenshot: self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot) process_changedetection_results = False