@ -19,7 +19,7 @@ use tokio::{
net ::lookup_host ,
} ;
use html5gum ::{ Emitter , EndTag, HtmlString, InfallibleTokenizer , Readable , StartTag , StringReader , Tokenizer } ;
use html5gum ::{ Emitter , HtmlString, InfallibleTokenizer , Readable , StringReader , Tokenizer } ;
use crate ::{
error ::Error ,
@ -46,10 +46,15 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
// Generate the cookie store
let cookie_store = Arc ::new ( Jar ::default ( ) ) ;
let icon_download_timeout = Duration ::from_secs ( CONFIG . icon_download_timeout ( ) ) ;
let pool_idle_timeout = Duration ::from_secs ( 10 ) ;
// Reuse the client between requests
let client = get_reqwest_client_builder ( )
. cookie_provider ( Arc ::clone ( & cookie_store ) )
. timeout ( Duration ::from_secs ( CONFIG . icon_download_timeout ( ) ) )
. timeout ( icon_download_timeout )
. pool_max_idle_per_host ( 5 ) // Configure the Hyper Pool to only have max 5 idle connections
. pool_idle_timeout ( pool_idle_timeout ) // Configure the Hyper Pool to timeout after 10 seconds
. trust_dns ( true )
. default_headers ( default_headers . clone ( ) ) ;
match client . build ( ) {
@ -58,9 +63,11 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
error ! ( "Possible trust-dns error, trying with trust-dns disabled: '{e}'" ) ;
get_reqwest_client_builder ( )
. cookie_provider ( cookie_store )
. timeout ( Duration ::from_secs ( CONFIG . icon_download_timeout ( ) ) )
. default_headers ( default_headers )
. timeout ( icon_download_timeout )
. pool_max_idle_per_host ( 5 ) // Configure the Hyper Pool to only have max 5 idle connections
. pool_idle_timeout ( pool_idle_timeout ) // Configure the Hyper Pool to timeout after 10 seconds
. trust_dns ( false )
. default_headers ( default_headers )
. build ( )
. expect ( "Failed to build client" )
}
@ -258,7 +265,7 @@ mod tests {
}
}
#[ derive( Debug, Clone)]
#[ derive( Clone)]
enum DomainBlacklistReason {
Regex ,
IP ,
@ -415,38 +422,34 @@ fn get_favicons_node(
const TAG_LINK : & [ u8 ] = b" link " ;
const TAG_BASE : & [ u8 ] = b" base " ;
const TAG_HEAD : & [ u8 ] = b" head " ;
const ATTR_REL : & [ u8 ] = b" rel " ;
const ATTR_HREF : & [ u8 ] = b" href " ;
const ATTR_SIZES : & [ u8 ] = b" sizes " ;
let mut base_url = url . clone ( ) ;
let mut icon_tags : Vec < Start Tag> = Vec ::new ( ) ;
let mut icon_tags : Vec < Tag> = Vec ::new ( ) ;
for token in dom {
match token {
FaviconToken ::StartTag ( tag ) = > {
if * tag . name = = TAG_LINK
& & tag . attributes . contains_key ( ATTR_REL )
& & tag . attributes . contains_key ( ATTR_HREF )
{
let rel_value = std ::str ::from_utf8 ( tag . attributes . get ( ATTR_REL ) . unwrap ( ) )
. unwrap_or_default ( )
. to_ascii_lowercase ( ) ;
if rel_value . contains ( "icon" ) & & ! rel_value . contains ( "mask-icon" ) {
icon_tags . push ( tag ) ;
}
} else if * tag . name = = TAG_BASE & & tag . attributes . contains_key ( ATTR_HREF ) {
let href = std ::str ::from_utf8 ( tag . attributes . get ( ATTR_HREF ) . unwrap ( ) ) . unwrap_or_default ( ) ;
let tag_name : & [ u8 ] = & token . tag . name ;
match tag_name {
TAG_LINK = > {
icon_tags . push ( token . tag ) ;
}
TAG_BASE = > {
base_url = if let Some ( href ) = token . tag . attributes . get ( ATTR_HREF ) {
let href = std ::str ::from_utf8 ( href ) . unwrap_or_default ( ) ;
debug ! ( "Found base href: {href}" ) ;
base_url = match base_url . join ( href ) {
match base_url . join ( href ) {
Ok ( inner_url ) = > inner_url ,
_ = > url . clone ( ) ,
} ;
_ = > continue ,
}
} else {
continue ;
} ;
}
FaviconToken ::EndTag ( tag ) = > {
if * tag . name = = TAG_HEAD {
TAG_HEAD if token . closing = > {
break ;
}
_ = > {
continue ;
}
}
}
@ -820,43 +823,64 @@ impl reqwest::cookie::CookieStore for Jar {
}
/// Custom FaviconEmitter for the html5gum parser.
/// The FaviconEmitter is using an almost 1:1 copy of the DefaultEmitter with some small changes .
/// The FaviconEmitter is using an optimized version of the DefaultEmitter .
/// This prevents emitting tags like comments, doctype and also strings between the tags.
/// But it will also only emit the tags we need and only if they have the correct attributes
/// Therefor parsing the HTML content is faster.
use std ::collections ::{ BTreeSet , VecDeque } ;
use std ::collections ::BTreeMap ;
#[ derive(Debug) ]
enum FaviconToken {
StartTag ( StartTag ) ,
EndTag ( EndTag ) ,
#[ derive(Default) ]
pub struct Tag {
/// The tag's name, such as `"link"` or `"base"`.
pub name : HtmlString ,
/// A mapping for any HTML attributes this start tag may have.
///
/// Duplicate attributes are ignored after the first one as per WHATWG spec.
pub attributes : BTreeMap < HtmlString , HtmlString > ,
}
struct FaviconToken {
tag : Tag ,
closing : bool ,
}
#[ derive(Default, Debug) ]
#[ derive(Default )]
struct FaviconEmitter {
current_token : Option < FaviconToken > ,
last_start_tag : HtmlString ,
current_attribute : Option < ( HtmlString , HtmlString ) > ,
seen_attributes : BTreeSet < HtmlString > ,
emitted_tokens : VecDeque < FaviconToken > ,
emit_token : bool ,
}
impl FaviconEmitter {
fn emit_token ( & mut self , token : FaviconToken ) {
self . emitted_tokens . push_front ( token ) ;
}
fn flush_current_attribute ( & mut self , emit_current_tag : bool ) {
const ATTR_HREF : & [ u8 ] = b" href " ;
const ATTR_REL : & [ u8 ] = b" rel " ;
const TAG_LINK : & [ u8 ] = b" link " ;
const TAG_BASE : & [ u8 ] = b" base " ;
const TAG_HEAD : & [ u8 ] = b" head " ;
if let Some ( ref mut token ) = self . current_token {
let tag_name : & [ u8 ] = & token . tag . name ;
fn flush_current_attribute ( & mut self ) {
if let Some ( ( k , v ) ) = self . current_attribute . take ( ) {
match self . current_token {
Some ( FaviconToken ::StartTag ( ref mut tag ) ) = > {
tag . attributes . entry ( k ) . and_modify ( | _ | { } ) . or_insert ( v ) ;
if self . current_attribute . is_some ( ) & & ( tag_name = = TAG_BASE | | tag_name = = TAG_LINK ) {
let ( k , v ) = self . current_attribute . take ( ) . unwrap ( ) ;
token . tag . attributes . entry ( k ) . and_modify ( | _ | { } ) . or_insert ( v ) ;
}
Some ( FaviconToken ::EndTag ( _ ) ) = > {
self . seen_attributes . insert ( k ) ;
let tag_attr = & token . tag . attributes ;
match tag_name {
TAG_HEAD if token . closing = > self . emit_token = true ,
TAG_BASE if tag_attr . contains_key ( ATTR_HREF ) = > self . emit_token = true ,
TAG_LINK if emit_current_tag & & tag_attr . contains_key ( ATTR_REL ) & & tag_attr . contains_key ( ATTR_HREF ) = > {
let rel_value =
std ::str ::from_utf8 ( token . tag . attributes . get ( ATTR_REL ) . unwrap ( ) ) . unwrap_or_default ( ) ;
if rel_value . contains ( "icon" ) & & ! rel_value . contains ( "mask-icon" ) {
self . emit_token = true
}
_ = > {
debug_assert! ( false ) ;
}
_ = > ( ) ,
}
}
}
@ -871,87 +895,71 @@ impl Emitter for FaviconEmitter {
}
fn pop_token ( & mut self ) -> Option < Self ::Token > {
self . emitted_tokens . pop_back ( )
if self . emit_token {
self . emit_token = false ;
return self . current_token . take ( ) ;
}
None
}
fn init_start_tag ( & mut self ) {
self . current_token = Some ( FaviconToken ::StartTag ( StartTag ::default ( ) ) ) ;
self . current_token = Some ( FaviconToken {
tag : Tag ::default ( ) ,
closing : false ,
} ) ;
}
fn init_end_tag ( & mut self ) {
self . current_token = Some ( FaviconToken ::EndTag ( EndTag ::default ( ) ) ) ;
self . seen_attributes . clear ( ) ;
self . current_token = Some ( FaviconToken {
tag : Tag ::default ( ) ,
closing : true ,
} ) ;
}
fn emit_current_tag ( & mut self ) -> Option < html5gum ::State > {
self . flush_current_attribute ( ) ;
let mut token = self . current_token . take ( ) . unwrap ( ) ;
let mut emit = false ;
match token {
FaviconToken ::EndTag ( ref mut tag ) = > {
// Always clean seen attributes
self . seen_attributes . clear ( ) ;
self . set_last_start_tag ( None ) ;
// Only trigger an emit for the </head> tag.
// This is matched, and will break the for-loop.
if * tag . name = = b" head " {
emit = true ;
}
}
FaviconToken ::StartTag ( ref mut tag ) = > {
// Only trriger an emit for <link> and <base> tags.
// These are the only tags we want to parse.
if * tag . name = = b" link " | | * tag . name = = b" base " {
self . set_last_start_tag ( Some ( & tag . name ) ) ;
emit = true ;
} else {
self . set_last_start_tag ( None ) ;
}
}
}
// Only emit the tags we want to parse.
if emit {
self . emit_token ( token ) ;
self . flush_current_attribute ( true ) ;
self . last_start_tag . clear ( ) ;
if self . current_token . is_some ( ) & & ! self . current_token . as_ref ( ) . unwrap ( ) . closing {
self . last_start_tag . extend ( & * self . current_token . as_ref ( ) . unwrap ( ) . tag . name ) ;
}
None
html5gum ::naive_next_state ( & self . last_start_tag )
}
fn push_tag_name ( & mut self , s : & [ u8 ] ) {
match self . current_token {
Some (
FaviconToken ::StartTag ( StartTag {
ref mut name ,
..
} )
| FaviconToken ::EndTag ( EndTag {
ref mut name ,
..
} ) ,
) = > {
name . extend ( s ) ;
}
_ = > debug_assert! ( false ) ,
if let Some ( ref mut token ) = self . current_token {
token . tag . name . extend ( s ) ;
}
}
fn init_attribute ( & mut self ) {
self . flush_current_attribute ( ) ;
self . current_attribute = Some ( Default ::default ( ) ) ;
self . flush_current_attribute ( false ) ;
self . current_attribute = match & self . current_token {
Some ( token ) = > {
let tag_name : & [ u8 ] = & token . tag . name ;
match tag_name {
b" link " | b" head " | b" base " = > Some ( Default ::default ( ) ) ,
_ = > None ,
}
}
_ = > None ,
} ;
}
fn push_attribute_name ( & mut self , s : & [ u8 ] ) {
self . current_attribute . as_mut ( ) . unwrap ( ) . 0. extend ( s ) ;
if let Some ( attr ) = & mut self . current_attribute {
attr . 0. extend ( s )
}
}
fn push_attribute_value ( & mut self , s : & [ u8 ] ) {
self . current_attribute . as_mut ( ) . unwrap ( ) . 1. extend ( s ) ;
if let Some ( attr ) = & mut self . current_attribute {
attr . 1. extend ( s )
}
}
fn current_is_appropriate_end_tag_token ( & mut self ) -> bool {
match self . current_token {
Some ( FaviconToken::EndTag ( ref tag ) ) = > ! self . last_start_tag . is_empty ( ) & & self . last_start_tag = = tag . name ,
match & self . current_token {
Some ( token) if token . closing = > ! self . last_start_tag . is_empty ( ) & & self . last_start_tag = = token . tag . name ,
_ = > false ,
}
}