diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 7f678657..1dc84698 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -118,16 +118,22 @@ class perform_site_check():
             if is_html:
                 # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                 html_content = fetcher.content
-                if has_filter_rule:
-                    # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                    if css_filter_rule[0] == '/':
-                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
-                    else:
-                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
-
-                # get_text() via inscriptis
-                stripped_text_from_html = get_text(html_content)
+                # Content-Type may carry parameters ("text/plain; charset=utf-8") and is case-insensitive, so match the media type rather than the whole header
+                if 'text/plain' not in fetcher.headers.get('Content-Type', '').lower():
+
+                    if has_filter_rule:
+                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
+                        if css_filter_rule[0] == '/':
+                            html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                        else:
+                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+
+                    # get_text() via inscriptis
+                    stripped_text_from_html = get_text(html_content)
+                else:
+                    # Don't run get_text or xpath/css filters on plaintext
+                    stripped_text_from_html = html_content
 
             # Re #340 - return the content before the 'ignore text' was applied
             text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')