From 39b7be9e7af8d9237d589da0a4d512b0c95edbd6 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 27 Jan 2022 23:16:50 +0100 Subject: [PATCH] plaintext mime type fix - Don't attempt to extract HTML content from plaintext, this will remove lines and break changedetection (#391) --- changedetectionio/fetch_site_status.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py index 7f678657..1dc84698 100644 --- a/changedetectionio/fetch_site_status.py +++ b/changedetectionio/fetch_site_status.py @@ -118,16 +118,21 @@ class perform_site_check(): if is_html: # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text html_content = fetcher.content - if has_filter_rule: - # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." - if css_filter_rule[0] == '/': - html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content) - else: - # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text - html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) - - # get_text() via inscriptis - stripped_text_from_html = get_text(html_content) + if not fetcher.headers.get('Content-Type', '') == 'text/plain': + + if has_filter_rule: + # For HTML/XML we offer xpath as an option, just start a regular xPath "/.." + if css_filter_rule[0] == '/': + html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content) + else: + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content) + + # get_text() via inscriptis + stripped_text_from_html = get_text(html_content) + else: + # Don't run get_text or xpath/css filters on plaintext + stripped_text_from_html = html_content # Re #340 - return the content before the 'ignore text' was applied text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')