From 1a77cf34de309a40ad52cecd353eea60b00e8d91 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Thu, 7 Jul 2022 11:32:58 +1200
Subject: [PATCH] fix: hash fragments on full contents to avoid stale results

---
 pagefind/src/fossick/mod.rs   |  3 --
 pagefind/src/fragments/mod.rs |  1 -
 pagefind/src/index/mod.rs     | 57 +++++++++++++++++++++++++----------
 pagefind/src/output/mod.rs    |  8 +----
 4 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index b0dc06c2..3b35cc59 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -9,7 +9,6 @@ use tokio::io::{AsyncReadExt, BufReader};
 use tokio::time::{sleep, Duration};

 use crate::fragments::{PageFragment, PageFragmentData};
-use crate::utils::full_hash;
 use crate::SearchOptions;
 use parser::DomParser;

@@ -111,13 +110,11 @@ impl Fossicker {
         let word_data = self.retrieve_words_from_digest();

         let data = self.data.as_ref().unwrap();
-        let hash = full_hash(data.digest.as_bytes());

         Ok(FossickedData {
             file_path: self.file_path.clone(),
             has_custom_body: data.has_custom_body,
             fragment: PageFragment {
-                hash,
                 page_number: 0,
                 data: PageFragmentData {
                     url: build_url(&self.file_path, options),
diff --git a/pagefind/src/fragments/mod.rs b/pagefind/src/fragments/mod.rs
index 8a5446b4..23ba0723 100644
--- a/pagefind/src/fragments/mod.rs
+++ b/pagefind/src/fragments/mod.rs
@@ -13,7 +13,6 @@ pub struct PageFragmentData {

 #[derive(Debug)]
 pub struct PageFragment {
-    pub hash: String,
     pub page_number: usize,
     pub data: PageFragmentData,
 }
diff --git a/pagefind/src/index/mod.rs b/pagefind/src/index/mod.rs
index ac9ab0e2..ffd586b0 100644
--- a/pagefind/src/index/mod.rs
+++ b/pagefind/src/index/mod.rs
@@ -1,8 +1,7 @@
 use hashbrown::HashMap;

 use crate::{
-    fossick::FossickedData, fragments::PageFragment, index::index_metadata::MetaFilter,
-    utils::full_hash, SearchOptions,
+    fossick::FossickedData, index::index_metadata::MetaFilter, utils::full_hash, SearchOptions,
 };
 use index_filter::{FilterIndex, PackedValue};
 use index_metadata::{MetaChunk, MetaIndex, MetaPage};
@@ -16,7 +15,15 @@ pub struct PagefindIndexes {
     pub word_indexes: HashMap<String, Vec<u8>>,
     pub filter_indexes: HashMap<String, Vec<u8>>,
     pub meta_index: Vec<u8>,
-    pub fragments: Vec<(String, PageFragment)>,
+    pub fragments: Vec<(String, String)>,
+}
+
+#[derive(Clone)]
+struct IntermediaryPageData {
+    full_hash: String,
+    encoded_data: String,
+    word_count: usize,
+    page_number: usize,
 }

 pub async fn build_indexes<I>(pages: I, options: &SearchOptions) -> PagefindIndexes
 where
@@ -32,8 +39,8 @@
     let mut word_map: HashMap<String, PackedWord> = HashMap::new();
     let mut filter_map: HashMap<String, HashMap<String, Vec<usize>>> = HashMap::new();
-    let mut fragment_hashes: HashMap<String, PageFragment> = HashMap::new();
-    let mut fragments: Vec<(String, PageFragment)> = Vec::new();
+    let mut fragment_hashes: HashMap<String, IntermediaryPageData> = HashMap::new();
+    let mut fragments: Vec<(usize, (String, IntermediaryPageData))> = Vec::new();

     for (page_number, mut page) in pages.enumerate() {
         page.fragment.page_number = page_number;
@@ -76,31 +83,46 @@
             }
         }

-        let mut short_hash = &page.fragment.hash[0..=6];
+        let encoded_data = serde_json::to_string(&page.fragment.data).unwrap();
+        let encoded_page = IntermediaryPageData {
+            full_hash: full_hash(encoded_data.as_bytes()),
+            word_count: page.fragment.data.word_count,
+            page_number: page.fragment.page_number,
+            encoded_data,
+        };
+
+        let mut short_hash = &encoded_page.full_hash[0..=6];
         // If we hit a collision, extend one until we stop colliding
         // TODO: There are some collision issues here.
         // If two builds match a collision in different orders the hashes will swap,
         // which could return incorrect data due to files being cached.
-        while let Some(collision) = fragment_hashes.remove(short_hash) {
-            if collision.hash == page.fragment.hash {
+        while let Some(collision) = fragment_hashes.get(short_hash) {
+            if collision.full_hash == encoded_page.full_hash {
                 // These pages are identical. Add both under the same hash.
-                fragments.push((collision.hash.clone(), collision));
+                fragments.push((
+                    collision.word_count,
+                    (collision.full_hash.clone(), collision.clone()),
+                ));
             } else {
                 let new_length = short_hash.len();
-                short_hash = &page.fragment.hash[0..=new_length];
+                short_hash = &encoded_page.full_hash[0..=new_length];
             }
         }
-        fragment_hashes.insert(short_hash.to_string(), page.fragment);
+        fragment_hashes.insert(short_hash.to_string(), encoded_page);
     }

-    fragments.extend(fragment_hashes.into_iter());
-    fragments.sort_by_cached_key(|(_, fragment)| fragment.page_number);
+    fragments.extend(
+        fragment_hashes
+            .into_iter()
+            .map(|(hash, frag)| (frag.word_count, (hash, frag))),
+    );
+    fragments.sort_by_cached_key(|(_, (_, fragment))| fragment.page_number);

     meta.pages
-        .extend(fragments.iter().map(|(hash, fragment)| MetaPage {
+        .extend(fragments.iter().map(|(word_count, (hash, _))| MetaPage {
             hash: hash.clone(),
-            word_count: fragment.data.word_count as u32,
+            word_count: *word_count as u32,
         }));

     // TODO: Change filter indexes to BTree to give them a stable hash.
@@ -182,7 +204,10 @@ where
         word_indexes,
         filter_indexes,
         meta_index,
-        fragments,
+        fragments: fragments
+            .into_iter()
+            .map(|(_, (hash, frag))| (hash, frag.encoded_data))
+            .collect(),
     }
 }

diff --git a/pagefind/src/output/mod.rs b/pagefind/src/output/mod.rs
index 28d22c23..f1e0fe0a 100644
--- a/pagefind/src/output/mod.rs
+++ b/pagefind/src/output/mod.rs
@@ -39,12 +39,6 @@ impl PagefindIndexes {
     pub async fn write_files(self, options: &SearchOptions) {
         let outdir = options.source.join(&options.bundle_dir);

-        let fragment_data: Vec<_> = self
-            .fragments
-            .iter()
-            .map(|(hash, fragment)| (hash, serde_json::to_string(&fragment.data).unwrap()))
-            .collect();
-
         let js = minify(&format!("{}\n{}\n{}", WEB_JS, GUNZIP_JS, SEARCH_JS));

         let mut files = vec![
@@ -80,7 +74,7 @@ impl PagefindIndexes {
             ),
         ];

-        files.extend(fragment_data.iter().map(|(hash, fragment)| {
+        files.extend(self.fragments.iter().map(|(hash, fragment)| {
             write(
                 outdir.join(format!("fragment/{}.pf_fragment", hash)),
                 vec![fragment.as_bytes()],
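
For reference, here is a minimal standalone sketch of the short-hash strategy this patch adopts: the public fragment ID is a prefix of a hash over the fragment's full serialized contents, the prefix starts at seven characters (the patch's `[0..=6]`) and is extended only when it collides with a *different* fragment, and byte-identical fragments end up sharing one short hash. Everything in the sketch is illustrative rather than Pagefind's actual code: `full_hash` is a placeholder for the crate's `full_hash` utility (not shown in this patch), `std::collections::HashMap` stands in for the hashbrown map, and the `assign_short_hashes`/`claimed` names plus the omitted word-count and page-number bookkeeping are simplifications.

```rust
use std::collections::HashMap;

// Placeholder for pagefind's `full_hash` utility, which is not shown in this
// patch. Any stable hex digest of the input bytes works for the illustration;
// DefaultHasher is NOT stable across Rust releases, so it is demo-only.
fn full_hash(bytes: &[u8]) -> String {
    use std::hash::{Hash, Hasher};
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    bytes.hash(&mut hasher);
    format!("{:016x}{:016x}", hasher.finish(), bytes.len() as u64)
}

/// Map each encoded fragment to a short hash derived from its full contents.
/// Prefixes start at seven characters and grow by one character whenever they
/// collide with a *different* fragment; identical fragments share a short hash.
fn assign_short_hashes(encoded_fragments: &[String]) -> Vec<(String, String)> {
    // short hash -> full hash of the fragment that claimed it
    let mut claimed: HashMap<String, String> = HashMap::new();
    let mut out = Vec::new();

    for encoded in encoded_fragments {
        let full = full_hash(encoded.as_bytes());
        let mut len = 7;
        loop {
            let candidate = full[..len].to_string();
            match claimed.get(&candidate).cloned() {
                // Same full hash: identical contents, reuse the short hash.
                Some(existing) if existing == full => break,
                // Different contents collide on this prefix: extend and retry.
                Some(_) => len += 1,
                None => {
                    claimed.insert(candidate, full.clone());
                    break;
                }
            }
        }
        out.push((full[..len].to_string(), encoded.clone()));
    }
    out
}

fn main() {
    let fragments = vec![
        r#"{"url":"/a/","content":"hello"}"#.to_string(),
        r#"{"url":"/a/","content":"hello"}"#.to_string(), // identical page
        r#"{"url":"/b/","content":"world"}"#.to_string(),
    ];
    for (short, encoded) in assign_short_hashes(&fragments) {
        println!("fragment/{short}.pf_fragment <- {encoded}");
    }
}
```

Because the short hash is now derived from the complete serialized `PageFragmentData` rather than the raw text digest, any change to a fragment's contents yields a new `fragment/{hash}.pf_fragment` path, which is what keeps cached fragment files from returning stale results; the TODO retained in the patch still notes that collisions resolved in a different order across builds can swap which page owns an extended hash.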