<title> hack

closes #1874
pull/1879/head
dgtlmoon 1 year ago
parent 1c0fe4c23e
commit d2c09cfc7d

@ -98,10 +98,7 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
elif type(element) == etree._ElementUnicodeResult: elif type(element) == etree._ElementUnicodeResult:
html_block += str(element) html_block += str(element)
else: else:
if not is_rss: html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
else:
html_block += f"<div>{element.text}</div>\n"
return html_block return html_block
@ -274,7 +271,7 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>' pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
def repl(m): def repl(m):
text = m.group(1) text = m.group(1)
return xml_escape(html_to_text(html_content=text)) return xml_escape(html_to_text(html_content=text)).strip()
return re.sub(pattern, repl, html_content) return re.sub(pattern, repl, html_content)
@ -295,7 +292,8 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
# extracting this content # extracting this content
if render_anchor_tag_content: if render_anchor_tag_content:
parser_config = ParserConfig( parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]}, display_links=True annotation_rules={"a": ["hyperlink"]},
display_links=True
) )
# otherwise set config to None/default # otherwise set config to None/default
else: else:
@ -303,13 +301,12 @@ def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=Fals
# RSS Mode - Inscriptis will treat `title` as something else. # RSS Mode - Inscriptis will treat `title` as something else.
# Make it as a regular block display element (//item/title) # Make it as a regular block display element (//item/title)
# This is a bit of a hack - the real way is to use XSLT to convert it to HTML #1874
if is_rss: if is_rss:
css = CSS_PROFILES['strict'].copy() html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
css['title'] = HtmlElement(display=Display.block) html_content = re.sub(r'</title>', r'</h1>', html_content)
text_content = get_text(html_content, ParserConfig(css=css))
else: text_content = get_text(html_content, config=parser_config)
# get text and annotations via inscriptis
text_content = get_text(html_content, config=parser_config)
return text_content return text_content

@ -274,7 +274,7 @@ class perform_site_check(difference_detection_processor):
html_tools.html_to_text( html_tools.html_to_text(
html_content=html_content, html_content=html_content,
render_anchor_tag_content=do_anchor, render_anchor_tag_content=do_anchor,
is_rss=is_rss is_rss=is_rss # #1874 activate the <title workaround hack
) )
# Re #340 - return the content before the 'ignore text' was applied # Re #340 - return the content before the 'ignore text' was applied

@ -118,7 +118,7 @@ def test_basic_cdata_rss_markup(client, live_server):
res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
def test_rss_xpath_filtering(client, live_server): def test_rss_xpath_filtering(client, live_server):
# live_server_setup(live_server) #live_server_setup(live_server)
set_original_cdata_xml() set_original_cdata_xml()
@ -154,6 +154,9 @@ def test_rss_xpath_filtering(client, live_server):
) )
assert b'CDATA' not in res.data assert b'CDATA' not in res.data
assert b'<![' not in res.data assert b'<![' not in res.data
# #1874 All but the first <title was getting selected
# Convert any HTML with just a top level <title> to <h1> to be sure the title renders
assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath assert b'Hackers can access your computer' in res.data # Should ONLY be selected by the xpath
assert b'Some other title' in res.data # Should ONLY be selected by the xpath assert b'Some other title' in res.data # Should ONLY be selected by the xpath
assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath assert b'The days of Terminator' not in res.data # Should NOT be selected by the xpath

Loading…
Cancel
Save