Skip to content

Commit

Permalink
Added exact string matching
Browse files Browse the repository at this point in the history
  • Loading branch information
bglw committed Jun 30, 2022
1 parent dd78b88 commit fdc5bd2
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 83 deletions.
48 changes: 9 additions & 39 deletions pagefind/features/exact_phrase.feature
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
@skip
Feature: Exact Phrase Matching
Background:
Given I have the environment variables:
Expand All @@ -20,24 +19,26 @@ Feature: Exact Phrase Matching
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/_pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let results = await pagefind.search(`"about cats"`);
let search = await pagefind.search(`"about cats"`);
document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
let data = await results[0].data();
document.querySelector('[data-result]').innerText = data.url;
document.querySelector('[data-count]').innerText = `${search.results.length} result(s)`;
let data = await search.results[0]?.data();
document.querySelector('[data-result]').innerText = data?.url;
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "1 result(s)"
Then The selector "[data-result]" should contain "/cat/"

@skip
Scenario: Exact matches will be discouraged across element boundaries
Given I have a "public/catone/index.html" file with the body:
"""
Expand All @@ -57,44 +58,13 @@ Feature: Exact Phrase Matching
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let results = await pagefind.search(`"post about"`);
let search = await pagefind.search(`"post about"`);
document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
let data = await results[0].data();
document.querySelector('[data-count]').innerText = `${search.results.length} result(s)`;
let data = await search.results[0].data();
document.querySelector('[data-result]').innerText = data.url;
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "1 result(s)"
Then The selector "[data-result]" should contain "/cattwo/"

Scenario: Exact matches will match across stop words
Given I have a "public/cat/index.html" file with the body:
"""
<h1>Happy post about the cats</h1>
"""
# This file will _also_ match, due to our stop word culling
Given I have a "public/dog/index.html" file with the body:
"""
<h1>A post not about happy cats</h1>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let search = await pagefind.search(`"about the cats"`);
document.querySelector('[data-count]').innerText = `${search.results.length} result(s)`;
let data = await Promise.all(search.results.map(result => result.data()));
document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', ');
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "2 result(s)"
Then The selector "[data-result]" should contain "/cat/, /dog/"

5 changes: 3 additions & 2 deletions pagefind/src/output/stubs/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ class Pagefind {
let start = Date.now();
let ptr = await this.getPtr();
// Strip special characters to match the indexing operation
term = term.toLowerCase().replace(/[^\w\s]/g, "").trim();
let exact_search = /^\s*".+"\s*$/.test(term);
term = term.toLowerCase().trim().replace(/[^\w\s]/g, "").trim();

let filter_list = [];
for (let [filter, values] of Object.entries(options.filters)) {
Expand All @@ -198,7 +199,7 @@ class Pagefind {
// pointer may have updated from the loadChunk calls
ptr = await this.getPtr();
let searchStart = Date.now();
let result = this.backend.search(ptr, term, filter_list);
let result = this.backend.search(ptr, term, filter_list, exact_search);
let [results, filters] = result.split(/:(.*)$/);
let filterObj = this.parseFilters(filters);
results = results.length ? results.split(" ") : [];
Expand Down
17 changes: 6 additions & 11 deletions pagefind_web/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ pub fn filters(ptr: *mut SearchIndex) -> String {
}

#[wasm_bindgen]
pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String {
pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str, exact: bool) -> String {
let search_index = unsafe { Box::from_raw(ptr) };

if let Some(generator_version) = search_index.generator_version.as_ref() {
Expand All @@ -212,7 +212,11 @@ pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String {
}

let filter_set = search_index.filter(filter);
let results = search_index.search_term(query, filter_set);
let results = if exact {
search_index.exact_term(query, filter_set)
} else {
search_index.search_term(query, filter_set)
};

let filter_string =
search_index.get_filters(Some(results.iter().map(|r| r.page_index).collect()));
Expand Down Expand Up @@ -243,12 +247,3 @@ pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String {

format!("{}:{}", result_string, filter_string)
}

#[cfg(test)]
mod tests {
    /// Placeholder test confirming the test harness itself compiles and runs.
    #[test]
    fn it_works() {
        assert_eq!(2 + 2, 4);
    }
}
156 changes: 125 additions & 31 deletions pagefind_web/src/search.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::{cmp::Ordering, collections::HashMap};
use std::{borrow::Cow, cmp::Ordering};

use crate::{util::*, PageWord};
use bit_set::BitSet;
use pagefind_stem::{Algorithm, Stemmer}; // TODO: too big, Stemming should be performed on the JS side
use pagefind_stem::{Algorithm, Stemmer};

use crate::SearchIndex;

Expand All @@ -14,23 +14,100 @@ pub struct PageSearchResult {
}

impl SearchIndex {
pub fn search_term(&self, term: &str, filter_results: Option<BitSet>) -> Vec<PageSearchResult> {
let terms = term.split(' ');
// TODO: i18n
// TODO: Stemming should be performed on the JS side of the boundary
// As the snowball implementation there seems a lot smaller and just as fast.
let en_stemmer = Stemmer::create(Algorithm::English);
    /// Search for pages containing the exact phrase `term`: every stemmed
    /// word of the phrase must appear on the page, contiguously and in order.
    ///
    /// If any word of the phrase is missing from the index entirely, no page
    /// can match, so an empty result set is returned immediately.
    /// `filter_results`, when present, further restricts candidates to the
    /// given set of page indexes.
    pub fn exact_term(
        &self,
        term: &str,
        filter_results: Option<BitSet>,
    ) -> Vec<PageSearchResult> {
        debug!({
            format! {"Searching {:?}", term}
        });

        // `maps` holds one BitSet of candidate pages per phrase word;
        // `words` accumulates the flat (page, locations) postings for every
        // phrase word, in phrase order.
        let mut maps = Vec::new();
        let mut words = Vec::new();
        for term in stems_from_term(term) {
            if let Some(word_index) = self.words.get(term.as_ref()) {
                words.extend(word_index);
                let mut set = BitSet::new();
                for page in word_index {
                    set.insert(page.page as usize);
                }
                maps.push(set);
            } else {
                // If we can't find this word, there are obviously no exact matches
                return vec![];
            }
        }

        // The filter set participates in the same intersection as the words.
        if let Some(filter) = filter_results {
            maps.push(filter);
        }

        // Pages containing every word of the phrase (and passing the filter).
        let results = match intersect_maps(maps) {
            Some(map) => map,
            None => return vec![],
        };

        let mut pages: Vec<PageSearchResult> = vec![];

        for page_index in results.iter() {
            // Pull this page's location list for each phrase word, in phrase
            // order (postings were extended term-by-term above).
            // NOTE(review): assumes at most one posting per (word, page) pair
            // so `word_locations` lines up 1:1 with the phrase words — TODO
            // confirm against the index format.
            let word_locations: Vec<Vec<u32>> = words
                .iter()
                .filter_map(|p| {
                    if p.page as usize == page_index {
                        Some(p.locs.clone())
                    } else {
                        None
                    }
                })
                .collect();
            debug!({
                format! {"Word locations {:?}", word_locations}
            });

            if word_locations.len() > 1 {
                // Try every start position of the first word; each subsequent
                // word must sit at the next consecutive location.
                'indexes: for pos in &word_locations[0] {
                    let mut i = *pos;
                    for subsequent in &word_locations[1..] {
                        i += 1;
                        // Test each subsequent word map to try and find a contiguous block
                        if !subsequent.contains(&i) {
                            continue 'indexes;
                        }
                    }
                    // Contiguous run found — record the full location span.
                    let page = &self.pages[page_index];
                    let search_result = PageSearchResult {
                        page: page.hash.clone(),
                        page_index,
                        word_frequency: 1.0,
                        word_locations: (*pos..=i).collect(),
                    };
                    pages.push(search_result);
                }
            } else {
                // Single-word phrase: every occurrence is an exact match.
                let page = &self.pages[page_index];
                let search_result = PageSearchResult {
                    page: page.hash.clone(),
                    page_index,
                    word_frequency: 1.0,
                    word_locations: word_locations[0].clone(),
                };
                pages.push(search_result);
            }
        }

        pages
    }

pub fn search_term(&self, term: &str, filter_results: Option<BitSet>) -> Vec<PageSearchResult> {
debug!({
format! {"Searching {:?}", term}
});

let mut maps = Vec::new();
let mut unique_maps = Vec::new();
let mut words = Vec::new();
for term in terms {
let term = en_stemmer.stem(term).into_owned(); // TODO: Remove this once JS stems

for term in stems_from_term(term) {
let mut word_maps = Vec::new();
for (word, word_index) in self.find_word_extensions(&term) {
words.extend(word_index);
Expand All @@ -41,32 +118,20 @@ impl SearchIndex {
unique_maps.push((word.len() - term.len() + 1, set.clone()));
word_maps.push(set);
}
let mut word_maps = word_maps.drain(..);
if let Some(mut base) = word_maps.next() {
for map in word_maps {
base.union_with(&map);
}
maps.push(base)
if let Some(result) = union_maps(word_maps) {
maps.push(result);
}
}

let mut maps = maps.drain(..);
let mut results = if let Some(map) = maps.next() {
map
} else {
return vec![];
// let _ = Box::into_raw(search_index);
// return "".into();
};

for map in maps {
results.intersect_with(&map);
}

if let Some(filter) = filter_results {
results.intersect_with(&filter);
maps.push(filter);
}

let results = match intersect_maps(maps) {
Some(map) => map,
None => return vec![],
};

let mut pages: Vec<PageSearchResult> = vec![];

for page_index in results.iter() {
Expand Down Expand Up @@ -139,3 +204,32 @@ impl SearchIndex {
extensions
}
}

/// Stem each whitespace-separated word of `term` with the English Snowball
/// stemmer, returning the stems in phrase order.
///
/// Uses `split_whitespace` rather than `split(' ')` so that repeated,
/// leading, or trailing spaces never yield empty "words" — an empty string
/// can never be found in the word index, which would otherwise make an
/// exact-phrase search spuriously return no results.
fn stems_from_term(term: &str) -> Vec<Cow<str>> {
    // TODO: i18n — stemming is hard-coded to English.
    let en_stemmer = Stemmer::create(Algorithm::English);
    term.split_whitespace()
        .map(|word| en_stemmer.stem(word))
        .collect()
}

/// Intersect all of the given BitSets into a single set.
///
/// Returns `None` when `maps` is empty — with no sets there is no universe
/// to intersect from, and callers treat this as "no results".
fn intersect_maps(maps: Vec<BitSet>) -> Option<BitSet> {
    // `into_iter` consumes the Vec directly; the previous `drain(..)` needed
    // a `mut` binding for the same effect (clippy: iter_with_drain).
    let mut maps = maps.into_iter();
    let mut base = maps.next()?;
    for map in maps {
        base.intersect_with(&map);
    }
    Some(base)
}

/// Union all of the given BitSets into a single set.
///
/// Returns `None` when `maps` is empty, so callers can distinguish
/// "no sets at all" from an empty union.
fn union_maps(maps: Vec<BitSet>) -> Option<BitSet> {
    // `into_iter` consumes the Vec directly; the previous `drain(..)` needed
    // a `mut` binding for the same effect (clippy: iter_with_drain).
    let mut maps = maps.into_iter();
    let mut base = maps.next()?;
    for map in maps {
        base.union_with(&map);
    }
    Some(base)
}

0 comments on commit fdc5bd2

Please sign in to comment.