From 39b7be9e7af8d9237d589da0a4d512b0c95edbd6 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Thu, 27 Jan 2022 23:16:50 +0100
Subject: [PATCH] plaintext mime type fix - Don't attempt to extract HTML
 content from plaintext, this will remove lines and break changedetection
 (#391)

---
 changedetectionio/fetch_site_status.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/changedetectionio/fetch_site_status.py b/changedetectionio/fetch_site_status.py
index 7f678657..1dc84698 100644
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -118,16 +118,21 @@ class perform_site_check():
             if is_html:
                 # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                 html_content = fetcher.content
-                if has_filter_rule:
-                    # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                    if css_filter_rule[0] == '/':
-                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
-                    else:
-                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
-
-                # get_text() via inscriptis
-                stripped_text_from_html = get_text(html_content)
+                if not fetcher.headers.get('Content-Type', '') == 'text/plain':
+
+                    if has_filter_rule:
+                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
+                        if css_filter_rule[0] == '/':
+                            html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                        else:
+                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+
+                    # get_text() via inscriptis
+                    stripped_text_from_html = get_text(html_content)
+                else:
+                    # Plaintext response: skip the xpath/css filters and get_text() - HTML extraction would remove lines from it
+                    stripped_text_from_html = html_content
 
             # Re #340 - return the content before the 'ignore text' was applied
             text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')