Optimized Favicon downloading

Some optimizations with regard to downloading favicons.

I also encountered some issues when accessing some sites, where the
connection got dropped or closed early. This seems to be a reqwest/hyper
issue, see https://github.com/hyperium/hyper/issues/2136. This is now
also fixed.

General:

- Decreased struct size
- Decreased memory allocations
- Optimized tokenizer a bit more to only emit tags when all attributes are there and are valid.

reqwest/hyper connection issue:
The following changes helped solve the connection issues with some sites.
The end result is that some icons can now be downloaded reliably instead of only sometimes.

- Enabled some extra reqwest features, `deflate` and `native-tls-alpn`
  (These do not bring in any extra crates, since other crates already enabled them, but they were not active for Vaultwarden itself)
- Configured reqwest to keep a maximum number of idle pooled connections per host
- Configured reqwest to time out idle connections after 10 seconds
pull/3751/head
BlackDex 1 year ago
parent d1af468700
commit 6cdcb3b297
No known key found for this signature in database
GPG Key ID: 58C80A2AA6C765E1

@ -124,7 +124,7 @@ email_address = "0.2.4"
handlebars = { version = "4.3.7", features = ["dir_source"] } handlebars = { version = "4.3.7", features = ["dir_source"] }
# HTTP client (Used for favicons, version check, DUO and HIBP API) # HTTP client (Used for favicons, version check, DUO and HIBP API)
reqwest = { version = "0.11.18", features = ["stream", "json", "gzip", "brotli", "socks", "cookies", "trust-dns"] } reqwest = { version = "0.11.18", features = ["stream", "json", "deflate", "gzip", "brotli", "socks", "cookies", "trust-dns", "native-tls-alpn"] }
# Favicon extraction libraries # Favicon extraction libraries
html5gum = "0.5.7" html5gum = "0.5.7"

@ -19,7 +19,7 @@ use tokio::{
net::lookup_host, net::lookup_host,
}; };
use html5gum::{Emitter, EndTag, HtmlString, InfallibleTokenizer, Readable, StartTag, StringReader, Tokenizer}; use html5gum::{Emitter, HtmlString, InfallibleTokenizer, Readable, StringReader, Tokenizer};
use crate::{ use crate::{
error::Error, error::Error,
@ -46,10 +46,15 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
// Generate the cookie store // Generate the cookie store
let cookie_store = Arc::new(Jar::default()); let cookie_store = Arc::new(Jar::default());
let icon_download_timeout = Duration::from_secs(CONFIG.icon_download_timeout());
let pool_idle_timeout = Duration::from_secs(10);
// Reuse the client between requests // Reuse the client between requests
let client = get_reqwest_client_builder() let client = get_reqwest_client_builder()
.cookie_provider(Arc::clone(&cookie_store)) .cookie_provider(Arc::clone(&cookie_store))
.timeout(Duration::from_secs(CONFIG.icon_download_timeout())) .timeout(icon_download_timeout)
.pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections
.pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds
.trust_dns(true)
.default_headers(default_headers.clone()); .default_headers(default_headers.clone());
match client.build() { match client.build() {
@ -58,9 +63,11 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
error!("Possible trust-dns error, trying with trust-dns disabled: '{e}'"); error!("Possible trust-dns error, trying with trust-dns disabled: '{e}'");
get_reqwest_client_builder() get_reqwest_client_builder()
.cookie_provider(cookie_store) .cookie_provider(cookie_store)
.timeout(Duration::from_secs(CONFIG.icon_download_timeout())) .timeout(icon_download_timeout)
.default_headers(default_headers) .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections
.pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds
.trust_dns(false) .trust_dns(false)
.default_headers(default_headers)
.build() .build()
.expect("Failed to build client") .expect("Failed to build client")
} }
@ -258,7 +265,7 @@ mod tests {
} }
} }
#[derive(Debug, Clone)] #[derive(Clone)]
enum DomainBlacklistReason { enum DomainBlacklistReason {
Regex, Regex,
IP, IP,
@ -415,38 +422,34 @@ fn get_favicons_node(
const TAG_LINK: &[u8] = b"link"; const TAG_LINK: &[u8] = b"link";
const TAG_BASE: &[u8] = b"base"; const TAG_BASE: &[u8] = b"base";
const TAG_HEAD: &[u8] = b"head"; const TAG_HEAD: &[u8] = b"head";
const ATTR_REL: &[u8] = b"rel";
const ATTR_HREF: &[u8] = b"href"; const ATTR_HREF: &[u8] = b"href";
const ATTR_SIZES: &[u8] = b"sizes"; const ATTR_SIZES: &[u8] = b"sizes";
let mut base_url = url.clone(); let mut base_url = url.clone();
let mut icon_tags: Vec<StartTag> = Vec::new(); let mut icon_tags: Vec<Tag> = Vec::new();
for token in dom { for token in dom {
match token { let tag_name: &[u8] = &token.tag.name;
FaviconToken::StartTag(tag) => { match tag_name {
if *tag.name == TAG_LINK TAG_LINK => {
&& tag.attributes.contains_key(ATTR_REL) icon_tags.push(token.tag);
&& tag.attributes.contains_key(ATTR_HREF) }
{ TAG_BASE => {
let rel_value = std::str::from_utf8(tag.attributes.get(ATTR_REL).unwrap()) base_url = if let Some(href) = token.tag.attributes.get(ATTR_HREF) {
.unwrap_or_default() let href = std::str::from_utf8(href).unwrap_or_default();
.to_ascii_lowercase();
if rel_value.contains("icon") && !rel_value.contains("mask-icon") {
icon_tags.push(tag);
}
} else if *tag.name == TAG_BASE && tag.attributes.contains_key(ATTR_HREF) {
let href = std::str::from_utf8(tag.attributes.get(ATTR_HREF).unwrap()).unwrap_or_default();
debug!("Found base href: {href}"); debug!("Found base href: {href}");
base_url = match base_url.join(href) { match base_url.join(href) {
Ok(inner_url) => inner_url, Ok(inner_url) => inner_url,
_ => url.clone(), _ => continue,
}; }
} } else {
continue;
};
} }
FaviconToken::EndTag(tag) => { TAG_HEAD if token.closing => {
if *tag.name == TAG_HEAD { break;
break; }
} _ => {
continue;
} }
} }
} }
@ -820,43 +823,64 @@ impl reqwest::cookie::CookieStore for Jar {
} }
/// Custom FaviconEmitter for the html5gum parser. /// Custom FaviconEmitter for the html5gum parser.
/// The FaviconEmitter is using an almost 1:1 copy of the DefaultEmitter with some small changes. /// The FaviconEmitter is using an optimized version of the DefaultEmitter.
/// This prevents emitting tags like comments, doctype and also strings between the tags. /// This prevents emitting tags like comments, doctype and also strings between the tags.
/// But it will also only emit the tags we need and only if they have the correct attributes
/// Therefor parsing the HTML content is faster. /// Therefor parsing the HTML content is faster.
use std::collections::{BTreeSet, VecDeque}; use std::collections::BTreeMap;
#[derive(Default)]
pub struct Tag {
/// The tag's name, such as `"link"` or `"base"`.
pub name: HtmlString,
#[derive(Debug)] /// A mapping for any HTML attributes this start tag may have.
enum FaviconToken { ///
StartTag(StartTag), /// Duplicate attributes are ignored after the first one as per WHATWG spec.
EndTag(EndTag), pub attributes: BTreeMap<HtmlString, HtmlString>,
} }
#[derive(Default, Debug)] struct FaviconToken {
tag: Tag,
closing: bool,
}
#[derive(Default)]
struct FaviconEmitter { struct FaviconEmitter {
current_token: Option<FaviconToken>, current_token: Option<FaviconToken>,
last_start_tag: HtmlString, last_start_tag: HtmlString,
current_attribute: Option<(HtmlString, HtmlString)>, current_attribute: Option<(HtmlString, HtmlString)>,
seen_attributes: BTreeSet<HtmlString>, emit_token: bool,
emitted_tokens: VecDeque<FaviconToken>,
} }
impl FaviconEmitter { impl FaviconEmitter {
fn emit_token(&mut self, token: FaviconToken) { fn flush_current_attribute(&mut self, emit_current_tag: bool) {
self.emitted_tokens.push_front(token); const ATTR_HREF: &[u8] = b"href";
} const ATTR_REL: &[u8] = b"rel";
const TAG_LINK: &[u8] = b"link";
const TAG_BASE: &[u8] = b"base";
const TAG_HEAD: &[u8] = b"head";
if let Some(ref mut token) = self.current_token {
let tag_name: &[u8] = &token.tag.name;
if self.current_attribute.is_some() && (tag_name == TAG_BASE || tag_name == TAG_LINK) {
let (k, v) = self.current_attribute.take().unwrap();
token.tag.attributes.entry(k).and_modify(|_| {}).or_insert(v);
}
fn flush_current_attribute(&mut self) { let tag_attr = &token.tag.attributes;
if let Some((k, v)) = self.current_attribute.take() { match tag_name {
match self.current_token { TAG_HEAD if token.closing => self.emit_token = true,
Some(FaviconToken::StartTag(ref mut tag)) => { TAG_BASE if tag_attr.contains_key(ATTR_HREF) => self.emit_token = true,
tag.attributes.entry(k).and_modify(|_| {}).or_insert(v); TAG_LINK if emit_current_tag && tag_attr.contains_key(ATTR_REL) && tag_attr.contains_key(ATTR_HREF) => {
} let rel_value =
Some(FaviconToken::EndTag(_)) => { std::str::from_utf8(token.tag.attributes.get(ATTR_REL).unwrap()).unwrap_or_default();
self.seen_attributes.insert(k); if rel_value.contains("icon") && !rel_value.contains("mask-icon") {
} self.emit_token = true
_ => { }
debug_assert!(false);
} }
_ => (),
} }
} }
} }
@ -871,87 +895,71 @@ impl Emitter for FaviconEmitter {
} }
fn pop_token(&mut self) -> Option<Self::Token> { fn pop_token(&mut self) -> Option<Self::Token> {
self.emitted_tokens.pop_back() if self.emit_token {
self.emit_token = false;
return self.current_token.take();
}
None
} }
fn init_start_tag(&mut self) { fn init_start_tag(&mut self) {
self.current_token = Some(FaviconToken::StartTag(StartTag::default())); self.current_token = Some(FaviconToken {
tag: Tag::default(),
closing: false,
});
} }
fn init_end_tag(&mut self) { fn init_end_tag(&mut self) {
self.current_token = Some(FaviconToken::EndTag(EndTag::default())); self.current_token = Some(FaviconToken {
self.seen_attributes.clear(); tag: Tag::default(),
closing: true,
});
} }
fn emit_current_tag(&mut self) -> Option<html5gum::State> { fn emit_current_tag(&mut self) -> Option<html5gum::State> {
self.flush_current_attribute(); self.flush_current_attribute(true);
let mut token = self.current_token.take().unwrap(); self.last_start_tag.clear();
let mut emit = false; if self.current_token.is_some() && !self.current_token.as_ref().unwrap().closing {
match token { self.last_start_tag.extend(&*self.current_token.as_ref().unwrap().tag.name);
FaviconToken::EndTag(ref mut tag) => {
// Always clean seen attributes
self.seen_attributes.clear();
self.set_last_start_tag(None);
// Only trigger an emit for the </head> tag.
// This is matched, and will break the for-loop.
if *tag.name == b"head" {
emit = true;
}
}
FaviconToken::StartTag(ref mut tag) => {
// Only trriger an emit for <link> and <base> tags.
// These are the only tags we want to parse.
if *tag.name == b"link" || *tag.name == b"base" {
self.set_last_start_tag(Some(&tag.name));
emit = true;
} else {
self.set_last_start_tag(None);
}
}
}
// Only emit the tags we want to parse.
if emit {
self.emit_token(token);
} }
None html5gum::naive_next_state(&self.last_start_tag)
} }
fn push_tag_name(&mut self, s: &[u8]) { fn push_tag_name(&mut self, s: &[u8]) {
match self.current_token { if let Some(ref mut token) = self.current_token {
Some( token.tag.name.extend(s);
FaviconToken::StartTag(StartTag {
ref mut name,
..
})
| FaviconToken::EndTag(EndTag {
ref mut name,
..
}),
) => {
name.extend(s);
}
_ => debug_assert!(false),
} }
} }
fn init_attribute(&mut self) { fn init_attribute(&mut self) {
self.flush_current_attribute(); self.flush_current_attribute(false);
self.current_attribute = Some(Default::default()); self.current_attribute = match &self.current_token {
Some(token) => {
let tag_name: &[u8] = &token.tag.name;
match tag_name {
b"link" | b"head" | b"base" => Some(Default::default()),
_ => None,
}
}
_ => None,
};
} }
fn push_attribute_name(&mut self, s: &[u8]) { fn push_attribute_name(&mut self, s: &[u8]) {
self.current_attribute.as_mut().unwrap().0.extend(s); if let Some(attr) = &mut self.current_attribute {
attr.0.extend(s)
}
} }
fn push_attribute_value(&mut self, s: &[u8]) { fn push_attribute_value(&mut self, s: &[u8]) {
self.current_attribute.as_mut().unwrap().1.extend(s); if let Some(attr) = &mut self.current_attribute {
attr.1.extend(s)
}
} }
fn current_is_appropriate_end_tag_token(&mut self) -> bool { fn current_is_appropriate_end_tag_token(&mut self) -> bool {
match self.current_token { match &self.current_token {
Some(FaviconToken::EndTag(ref tag)) => !self.last_start_tag.is_empty() && self.last_start_tag == tag.name, Some(token) if token.closing => !self.last_start_tag.is_empty() && self.last_start_tag == token.tag.name,
_ => false, _ => false,
} }
} }

Loading…
Cancel
Save