plaintext mime type fix - Don't attempt to extract HTML content from plaintext; doing so removes lines and breaks changedetection (#391)

3 years ago · 39b7be9e7a
parent 6611823962
commit 39b7be9e7a
1 changed files with 15 additions and 10 deletions
--- a/changedetectionio/fetch_site_status.py
+++ b/changedetectionio/fetch_site_status.py
@@ -118,16 +118,21 @@ class perform_site_check():
            if is_html:
                # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
                html_content = fetcher.content
-                if has_filter_rule:
-                    # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
-                    if css_filter_rule[0] == '/':
-                        html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
-                    else:
-                        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
-                        html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
-
-                # get_text() via inscriptis
-                stripped_text_from_html = get_text(html_content)
+                if not fetcher.headers.get('Content-Type', '') == 'text/plain':
+
+                    if has_filter_rule:
+                        # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
+                        if css_filter_rule[0] == '/':
+                            html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
+                        else:
+                            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+                            html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
+
+                    # get_text() via inscriptis
+                    stripped_text_from_html = get_text(html_content)
+                else:
+                    # Don't run get_text or xpath/css filters on plaintext
+                    stripped_text_from_html = html_content

            # Re #340 - return the content before the 'ignore text' was applied
            text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')