From 3dbdfe1f5488e5a05c38e3c74d0ab1cf7f1a9c67 Mon Sep 17 00:00:00 2001 From: Robin Freyler Date: Wed, 1 May 2024 11:14:30 +0200 Subject: [PATCH] Fix `BufferBackend` soundness issue and add `StringInterner::resolve_unchecked` (#68) * fix BufferBackend::resolve unsoundness Unfortunately this fix vastly regresses the performance of the method. Benchmarks show -73% throughput which is massive ... Still the BufferBackend is a viable choice for memory constrained environments. * add StringInterner::resolve_unchecked method We added this because it make a huge difference for the BufferBackend to have this available. --- benches/bench.rs | 35 ++++++++++++++++++++++++++++++++++- src/backend/buffer.rs | 18 ++++++++++-------- src/interner.rs | 13 ++++++++++++- tests/tests.rs | 20 ++++++++++++++++++++ 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/benches/bench.rs b/benches/bench.rs index 6fe6854..b00a1c0 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -21,7 +21,11 @@ use criterion::{ }; use string_interner::backend::Backend; -criterion_group!(bench_resolve, bench_resolve_already_filled); +criterion_group!( + bench_resolve, + bench_resolve_already_filled, + bench_resolve_unchecked_already_filled +); criterion_group!(bench_get, bench_get_already_filled); criterion_group!(bench_iter, bench_iter_already_filled); criterion_group!( @@ -184,6 +188,35 @@ fn bench_resolve_already_filled(c: &mut Criterion) { bench_for_backend::(&mut g); } +fn bench_resolve_unchecked_already_filled(c: &mut Criterion) { + let mut g = c.benchmark_group("resolve_unchecked/already-filled"); + g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); + fn bench_for_backend(g: &mut BenchmarkGroup) { + g.bench_with_input( + BB::NAME, + &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), + |bencher, &(len_words, word_len)| { + let words = generate_test_strings(len_words, word_len); + bencher.iter_batched_ref( + || BB::setup_filled_with_ids(&words), + |(interner, word_ids)| { + for &word_id in &*word_ids { + black_box( + // SAFETY: We provide only valid symbols to the tested interners. + unsafe { interner.resolve_unchecked(word_id) }, + ); + } + }, + BatchSize::SmallInput, + ) + }, + ); + } + bench_for_backend::(&mut g); + bench_for_backend::(&mut g); + bench_for_backend::(&mut g); +} + fn bench_get_already_filled(c: &mut Criterion) { let mut g = c.benchmark_group("get/already-filled"); g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); diff --git a/src/backend/buffer.rs b/src/backend/buffer.rs index 0da9c24..03f6539 100644 --- a/src/backend/buffer.rs +++ b/src/backend/buffer.rs @@ -88,15 +88,12 @@ where /// /// Returns the string from the given index if any as well /// as the index of the next string in the buffer. - fn resolve_index_to_str(&self, index: usize) -> Option<(&str, usize)> { + fn resolve_index_to_str(&self, index: usize) -> Option<(&[u8], usize)> { let bytes = self.buffer.get(index..)?; let (str_len, str_len_bytes) = decode_var_usize(bytes)?; let index_str = index + str_len_bytes; let str_bytes = self.buffer.get(index_str..index_str + str_len)?; - // SAFETY: It is guaranteed by the backend that only valid strings - // are stored in this portion of the buffer. - let string = unsafe { str::from_utf8_unchecked(str_bytes) }; - Some((string, index_str + str_len)) + Some((str_bytes, index_str + str_len)) } /// Resolves the string for the given symbol. @@ -180,8 +177,10 @@ where #[inline] fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { - self.resolve_index_to_str(symbol.to_usize()) - .map(|(string, _next_str_index)| string) + match self.resolve_index_to_str(symbol.to_usize()) { + None => None, + Some((bytes, _)) => str::from_utf8(bytes).ok(), + } } fn shrink_to_fit(&mut self) { @@ -481,7 +480,10 @@ where fn next(&mut self) -> Option { self.backend .resolve_index_to_str(self.next) - .and_then(|(string, next)| { + .and_then(|(bytes, next)| { + // SAFETY: Within the iterator all indices given to `resolv_index_to_str` + // are properly pointing to the start of each interned string. + let string = unsafe { str::from_utf8_unchecked(bytes) }; let symbol = S::try_from_usize(self.next)?; self.next = next; self.remaining -= 1; diff --git a/src/interner.rs b/src/interner.rs index 441a378..db01de1 100644 --- a/src/interner.rs +++ b/src/interner.rs @@ -264,12 +264,23 @@ where self.backend.shrink_to_fit() } - /// Returns the string for the given symbol if any. + /// Returns the string for the given `symbol`` if any. #[inline] pub fn resolve(&self, symbol: ::Symbol) -> Option<&str> { self.backend.resolve(symbol) } + /// Returns the string for the given `symbol` without performing any checks. + /// + /// # Safety + /// + /// It is the caller's responsibility to provide this method with `symbol`s + /// that are valid for the [`StringInterner`]. + #[inline] + pub unsafe fn resolve_unchecked(&self, symbol: ::Symbol) -> &str { + unsafe { self.backend.resolve_unchecked(symbol) } + } + /// Returns an iterator that yields all interned strings and their symbols. #[inline] pub fn iter(&self) -> ::Iter<'_> { diff --git a/tests/tests.rs b/tests/tests.rs index cb09c11..1f4a55c 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -298,6 +298,26 @@ macro_rules! gen_tests_for_backend { assert_eq!(interner.resolve(dd), None); } + #[test] + fn resolve_unchecked_works() { + let mut interner = StringInterner::new(); + // Insert 3 unique strings: + let aa = interner.get_or_intern("aa"); + let bb = interner.get_or_intern("bb"); + let cc = interner.get_or_intern("cc"); + assert_eq!(interner.len(), 3); + // Resolve valid symbols: + assert_eq!(unsafe { interner.resolve_unchecked(aa) }, "aa"); + assert_eq!(unsafe { interner.resolve_unchecked(bb) }, "bb"); + assert_eq!(unsafe { interner.resolve_unchecked(cc) }, "cc"); + assert_eq!(interner.len(), 3); + // Resolve invalid symbols: + let dd = expect_valid_symbol(1000); + assert_ne!(aa, dd); + assert_ne!(bb, dd); + assert_ne!(cc, dd); + } + #[test] fn get_works() { let mut interner = StringInterner::new();