|
|
|
@ -118,16 +118,21 @@ class perform_site_check():
|
|
|
|
|
if is_html:
|
|
|
|
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
|
|
|
html_content = fetcher.content
|
|
|
|
|
if has_filter_rule:
|
|
|
|
|
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
|
|
|
|
if css_filter_rule[0] == '/':
|
|
|
|
|
html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
|
|
|
|
|
else:
|
|
|
|
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
|
|
|
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
|
|
|
|
|
|
|
|
|
# get_text() via inscriptis
|
|
|
|
|
stripped_text_from_html = get_text(html_content)
|
|
|
|
|
if not fetcher.headers.get('Content-Type', '') == 'text/plain':
|
|
|
|
|
|
|
|
|
|
if has_filter_rule:
|
|
|
|
|
# For HTML/XML we offer xpath as an option, just start a regular xPath "/.."
|
|
|
|
|
if css_filter_rule[0] == '/':
|
|
|
|
|
html_content = html_tools.xpath_filter(xpath_filter=css_filter_rule, html_content=fetcher.content)
|
|
|
|
|
else:
|
|
|
|
|
# CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
|
|
|
|
|
html_content = html_tools.css_filter(css_filter=css_filter_rule, html_content=fetcher.content)
|
|
|
|
|
|
|
|
|
|
# get_text() via inscriptis
|
|
|
|
|
stripped_text_from_html = get_text(html_content)
|
|
|
|
|
else:
|
|
|
|
|
# Don't run get_text or xpath/css filters on plaintext
|
|
|
|
|
stripped_text_from_html = html_content
|
|
|
|
|
|
|
|
|
|
# Re #340 - return the content before the 'ignore text' was applied
|
|
|
|
|
text_content_before_ignored_filter = stripped_text_from_html.encode('utf-8')
|
|
|
|
|