plaintext mime type fix - Don't attempt to extract HTML content from plaintext, as this will remove lines and break changedetection (#391)

pull/392/head^2
dgtlmoon 3 years ago committed by GitHub
parent 6611823962
commit 39b7be9e7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -118,6 +118,8 @@ class perform_site_check():
if is_html: if is_html:
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
html_content = fetcher.content html_content = fetcher.content
if not fetcher.headers.get('Content-Type', '') == 'text/plain':
if has_filter_rule: if has_filter_rule:
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.." # For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
if css_filter_rule[0] == '/': if css_filter_rule[0] == '/':
@@ -128,6 +130,9 @@ class perform_site_check():
# get_text() via inscriptis # get_text() via inscriptis
stripped_text_from_html = get_text(html_content) stripped_text_from_html = get_text(html_content)
else:
# Don't run get_text or xpath/css filters on plaintext
stripped_text_from_html = html_content
# Re #340 - return the content before the 'ignore text' was applied # Re #340 - return the content before the 'ignore text' was applied
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8') text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')

Loading…
Cancel
Save