Skip to content

Commit

Permalink
Added exact string matching
Browse files Browse the repository at this point in the history
  • Loading branch information
bglw committed Jun 30, 2022
1 parent dd78b88 commit fdc5bd2
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 83 deletions.
48 changes: 9 additions & 39 deletions pagefind/features/exact_phrase.feature
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
@skip
Feature: Exact Phrase Matching
Background:
Given I have the environment variables:
Expand All @@ -20,24 +19,26 @@ Feature: Exact Phrase Matching
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/_pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let results = await pagefind.search(`"about cats"`);
let search = await pagefind.search(`"about cats"`);
document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
let data = await results[0].data();
document.querySelector('[data-result]').innerText = data.url;
document.querySelector('[data-count]').innerText = `${search.results.length} result(s)`;
let data = await search.results[0]?.data();
document.querySelector('[data-result]').innerText = data?.url;
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "1 result(s)"
Then The selector "[data-result]" should contain "/cat/"

@skip
Scenario: Exact matches will be discouraged across element boundaries
Given I have a "public/catone/index.html" file with the body:
"""
Expand All @@ -57,44 +58,13 @@ Feature: Exact Phrase Matching
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let results = await pagefind.search(`"post about"`);
let search = await pagefind.search(`"post about"`);
document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
let data = await results[0].data();
document.querySelector('[data-count]').innerText = `${search.results.length} result(s)`;
let data = await search.results[0].data();
document.querySelector('[data-result]').innerText = data.url;
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "1 result(s)"
Then The selector "[data-result]" should contain "/cattwo/"

Scenario: Exact matches will match across stop words
Given I have a "public/cat/index.html" file with the body:
"""
<h1>Happy post about the cats</h1>
"""
# This file will _also_ match, due to our stop word culling
Given I have a "public/dog/index.html" file with the body:
"""
<h1>A post not about happy cats</h1>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let search = await pagefind.search(`"about the cats"`);
document.querySelector('[data-count]').innerText = `${search.results.length} result(s)`;
let data = await Promise.all(search.results.map(result => result.data()));
document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', ');
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "2 result(s)"
Then The selector "[data-result]" should contain "/cat/, /dog/"

5 changes: 3 additions & 2 deletions pagefind/src/output/stubs/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ class Pagefind {
let start = Date.now();
let ptr = await this.getPtr();
// Strip special characters to match the indexing operation
term = term.toLowerCase().replace(/[^\w\s]/g, "").trim();
let exact_search = /^\s*".+"\s*$/.test(term);
term = term.toLowerCase().trim().replace(/[^\w\s]/g, "").trim();

let filter_list = [];
for (let [filter, values] of Object.entries(options.filters)) {
Expand All @@ -198,7 +199,7 @@ class Pagefind {
// pointer may have updated from the loadChunk calls
ptr = await this.getPtr();
let searchStart = Date.now();
let result = this.backend.search(ptr, term, filter_list);
let result = this.backend.search(ptr, term, filter_list, exact_search);
let [results, filters] = result.split(/:(.*)$/);
let filterObj = this.parseFilters(filters);
results = results.length ? results.split(" ") : [];
Expand Down
17 changes: 6 additions & 11 deletions pagefind_web/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ pub fn filters(ptr: *mut SearchIndex) -> String {
}

#[wasm_bindgen]
pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String {
pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str, exact: bool) -> String {
let search_index = unsafe { Box::from_raw(ptr) };

if let Some(generator_version) = search_index.generator_version.as_ref() {
Expand All @@ -212,7 +212,11 @@ pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String {
}

let filter_set = search_index.filter(filter);
let results = search_index.search_term(query, filter_set);
let results = if exact {
search_index.exact_term(query, filter_set)
} else {
search_index.search_term(query, filter_set)
};

let filter_string =
search_index.get_filters(Some(results.iter().map(|r| r.page_index).collect()));
Expand Down Expand Up @@ -243,12 +247,3 @@ pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String {

format!("{}:{}", result_string, filter_string)
}

#[cfg(test)]
mod tests {
    /// Placeholder test confirming the test harness itself compiles and runs.
    #[test]
    fn it_works() {
        assert_eq!(2 + 2, 4);
    }
}
156 changes: 125 additions & 31 deletions pagefind_web/src/search.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::{cmp::Ordering, collections::HashMap};
use std::{borrow::Cow, cmp::Ordering};

use crate::{util::*, PageWord};
use bit_set::BitSet;
use pagefind_stem::{Algorithm, Stemmer}; // TODO: too big, Stemming should be performed on the JS side
use pagefind_stem::{Algorithm, Stemmer};

use crate::SearchIndex;

Expand All @@ -14,23 +14,100 @@ pub struct PageSearchResult {
}

impl SearchIndex {
pub fn search_term(&self, term: &str, filter_results: Option<BitSet>) -> Vec<PageSearchResult> {
let terms = term.split(' ');
// TODO: i18n
// TODO: Stemming should be performed on the JS side of the boundary
// As the snowball implementation there seems a lot smaller and just as fast.
let en_stemmer = Stemmer::create(Algorithm::English);
    /// Search for pages containing the exact phrase `term`: every stemmed
    /// word of the phrase must appear on the page, contiguously and in order.
    ///
    /// If any word of the phrase is missing from the index entirely, no page
    /// can match, so an empty result set is returned immediately.
    /// `filter_results`, when present, further restricts candidates to the
    /// given set of page indexes.
    pub fn exact_term(
        &self,
        term: &str,
        filter_results: Option<BitSet>,
    ) -> Vec<PageSearchResult> {
        debug!({
            format! {"Searching {:?}", term}
        });

        // `maps` holds one BitSet of candidate pages per phrase word;
        // `words` accumulates the flat (page, locations) postings for every
        // phrase word, in phrase order.
        let mut maps = Vec::new();
        let mut words = Vec::new();
        for term in stems_from_term(term) {
            if let Some(word_index) = self.words.get(term.as_ref()) {
                words.extend(word_index);
                let mut set = BitSet::new();
                for page in word_index {
                    set.insert(page.page as usize);
                }
                maps.push(set);
            } else {
                // If we can't find this word, there are obviously no exact matches
                return vec![];
            }
        }

        // The filter set participates in the same intersection as the words.
        if let Some(filter) = filter_results {
            maps.push(filter);
        }

        // Pages containing every word of the phrase (and passing the filter).
        let results = match intersect_maps(maps) {
            Some(map) => map,
            None => return vec![],
        };

        let mut pages: Vec<PageSearchResult> = vec![];

        for page_index in results.iter() {
            // Pull this page's location list for each phrase word, in phrase
            // order (postings were extended term-by-term above).
            // NOTE(review): assumes at most one posting per (word, page) pair
            // so `word_locations` lines up 1:1 with the phrase words — TODO
            // confirm against the index format.
            let word_locations: Vec<Vec<u32>> = words
                .iter()
                .filter_map(|p| {
                    if p.page as usize == page_index {
                        Some(p.locs.clone())
                    } else {
                        None
                    }
                })
                .collect();
            debug!({
                format! {"Word locations {:?}", word_locations}
            });

            if word_locations.len() > 1 {
                // Try every start position of the first word; each subsequent
                // word must sit at the next consecutive location.
                'indexes: for pos in &word_locations[0] {
                    let mut i = *pos;
                    for subsequent in &word_locations[1..] {
                        i += 1;
                        // Test each subsequent word map to try and find a contiguous block
                        if !subsequent.contains(&i) {
                            continue 'indexes;
                        }
                    }
                    // Contiguous run found — record the full location span.
                    let page = &self.pages[page_index];
                    let search_result = PageSearchResult {
                        page: page.hash.clone(),
                        page_index,
                        word_frequency: 1.0,
                        word_locations: (*pos..=i).collect(),
                    };
                    pages.push(search_result);
                }
            } else {
                // Single-word phrase: every occurrence is an exact match.
                let page = &self.pages[page_index];
                let search_result = PageSearchResult {
                    page: page.hash.clone(),
                    page_index,
                    word_frequency: 1.0,
                    word_locations: word_locations[0].clone(),
                };
                pages.push(search_result);
            }
        }

        pages
    }

pub fn search_term(&self, term: &str, filter_results: Option<BitSet>) -> Vec<PageSearchResult> {
debug!({
format! {"Searching {:?}", term}
});

let mut maps = Vec::new();
let mut unique_maps = Vec::new();
let mut words = Vec::new();
for term in terms {
let term = en_stemmer.stem(term).into_owned(); // TODO: Remove this once JS stems

for term in stems_from_term(term) {
let mut word_maps = Vec::new();
for (word, word_index) in self.find_word_extensions(&term) {
words.extend(word_index);
Expand All @@ -41,32 +118,20 @@ impl SearchIndex {
unique_maps.push((word.len() - term.len() + 1, set.clone()));
word_maps.push(set);
}
let mut word_maps = word_maps.drain(..);
if let Some(mut base) = word_maps.next() {
for map in word_maps {
base.union_with(&map);
}
maps.push(base)
if let Some(result) = union_maps(word_maps) {
maps.push(result);
}
}

let mut maps = maps.drain(..);
let mut results = if let Some(map) = maps.next() {
map
} else {
return vec![];
// let _ = Box::into_raw(search_index);
// return "".into();
};

for map in maps {
results.intersect_with(&map);
}

if let Some(filter) = filter_results {
results.intersect_with(&filter);
maps.push(filter);
}

let results = match intersect_maps(maps) {
Some(map) => map,
None => return vec![],
};

let mut pages: Vec<PageSearchResult> = vec![];

for page_index in results.iter() {
Expand Down Expand Up @@ -139,3 +204,32 @@ impl SearchIndex {
extensions
}
}

/// Stem each whitespace-separated word of `term` with the English Snowball
/// stemmer, returning the stems in phrase order.
///
/// Uses `split_whitespace` rather than `split(' ')` so that repeated,
/// leading, or trailing spaces never yield empty "words" — an empty string
/// can never be found in the word index, which would otherwise make an
/// exact-phrase search spuriously return no results.
fn stems_from_term(term: &str) -> Vec<Cow<str>> {
    // TODO: i18n — stemming is hard-coded to English.
    let en_stemmer = Stemmer::create(Algorithm::English);
    term.split_whitespace()
        .map(|word| en_stemmer.stem(word))
        .collect()
}

/// Intersect all of the given BitSets into a single set.
///
/// Returns `None` when `maps` is empty — with no sets there is no universe
/// to intersect from, and callers treat this as "no results".
fn intersect_maps(maps: Vec<BitSet>) -> Option<BitSet> {
    // `into_iter` consumes the Vec directly; the previous `drain(..)` needed
    // a `mut` binding for the same effect (clippy: iter_with_drain).
    let mut maps = maps.into_iter();
    let mut base = maps.next()?;
    for map in maps {
        base.intersect_with(&map);
    }
    Some(base)
}

/// Union all of the given BitSets into a single set.
///
/// Returns `None` when `maps` is empty, so callers can distinguish
/// "no sets at all" from an empty union.
fn union_maps(maps: Vec<BitSet>) -> Option<BitSet> {
    // `into_iter` consumes the Vec directly; the previous `drain(..)` needed
    // a `mut` binding for the same effect (clippy: iter_with_drain).
    let mut maps = maps.into_iter();
    let mut base = maps.next()?;
    for map in maps {
        base.union_with(&map);
    }
    Some(base)
}

0 comments on commit fdc5bd2

Please sign in to comment.