From b48ea1c14f3e69ab7cf9a625cfe860038d712bbb Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Sun, 24 Dec 2023 23:48:16 +0100 Subject: [PATCH] Add an option to turn off density-weighting When searching, Pagefind applies a heuristic that often works quite well to boost pages with a higher density, i.e. a higher number of hits divided by the number of words on the page. This is called "density weighting". In some instances, it is desirable, though, to just use the number of hits directly, without dividing by the number of words on the page. Let's support this via the search option `use_weighting`, which defaults to `true` to maintain the current behavior. Signed-off-by: Johannes Schindelin --- docs/content/docs/api.md | 12 ++++++++++++ pagefind/features/weighting.feature | 28 +++++++++++++++++++++++++++ pagefind_web/src/lib.rs | 4 ++-- pagefind_web/src/search.rs | 3 ++- pagefind_web_js/lib/coupled_search.ts | 3 ++- 5 files changed, 46 insertions(+), 4 deletions(-) diff --git a/docs/content/docs/api.md b/docs/content/docs/api.md index 8436df0e..5eba92f7 100644 --- a/docs/content/docs/api.md +++ b/docs/content/docs/api.md @@ -239,6 +239,18 @@ const search = await pagefind.search("static", { See [Sorting using the Pagefind JavaScript API](/docs/js-api-sorting/) for more details and functionality. +## Turning off density-weighting + +By default, the results' weights are "density-weighted", i.e. the weights are calculated by counting the number of matches within a page divided by the page's word count. This density-weighting can be turned off: + +{{< diffcode >}} +```js +const search = await pagefind.search("term", { ++ use_weighting: false +}); +``` +{{< /diffcode >}} + ## Re-initializing the search API In some cases you might need to re-initialize Pagefind. For example, if you dynamically change the language of the page without reloading, Pagefind will need to be re-initialized to reflect this langauge change. 
diff --git a/pagefind/features/weighting.feature b/pagefind/features/weighting.feature index 4de98933..5afde92c 100644 --- a/pagefind/features/weighting.feature +++ b/pagefind/features/weighting.feature @@ -224,3 +224,31 @@ Feature: Word Weighting Then There should be no logs # Treat the bal value here as a snapshot — update the expected value as needed Then The selector "p" should contain "weight:1/bal:82.28572/loc:4" + + Scenario: Density weighting can be turned off + Given I have a "public/single-word.html" file with the body: + """ +

word

+ """ + Given I have a "public/three-words.html" file with the body: + """ +

I have a word and a word and another word

+ """ + When I run my program + Then I should see "Running Pagefind" in stdout + When I serve the "public" directory + When I load "/" + When I evaluate: + """ + async function() { + let pagefind = await import("/pagefind/pagefind.js"); + + let search = await pagefind.search(`word`); + let search2 = await pagefind.search(`word`, { use_weighting: false }); + let counts = [search, search2].map(s => s.results.map(r => r.words.length)); + document.querySelector('p').innerText = JSON.stringify(counts); + } + """ + Then There should be no logs + # With density weighting, single-word should be the first hit, otherwise three-words + Then The selector "p" should contain "[[1,3],[3,1]]" diff --git a/pagefind_web/src/lib.rs b/pagefind_web/src/lib.rs index f4412e70..97cc919a 100644 --- a/pagefind_web/src/lib.rs +++ b/pagefind_web/src/lib.rs @@ -210,7 +210,7 @@ pub fn filters(ptr: *mut SearchIndex) -> String { } #[wasm_bindgen] -pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str, sort: &str, exact: bool) -> String { +pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str, sort: &str, exact: bool, weighting: bool) -> String { let search_index = unsafe { Box::from_raw(ptr) }; if let Some(generator_version) = search_index.generator_version.as_ref() { @@ -225,7 +225,7 @@ pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str, sort: &str, exac let (unfiltered_results, mut results) = if exact { search_index.exact_term(query, filter_set) } else { - search_index.search_term(query, filter_set) + search_index.search_term(query, filter_set, weighting) }; let unfiltered_total = unfiltered_results.len(); debug!({ format!("Raw total of {} results", unfiltered_total) }); diff --git a/pagefind_web/src/search.rs b/pagefind_web/src/search.rs index 824e7975..aded607c 100644 --- a/pagefind_web/src/search.rs +++ b/pagefind_web/src/search.rs @@ -175,6 +175,7 @@ impl SearchIndex { &self, term: &str, filter_results: Option, + weighting: bool, ) -> (Vec, Vec) { debug!({ 
format! {"Searching {:?}", term} @@ -318,7 +319,7 @@ impl SearchIndex { .map(|BalancedWordScore { balanced_score, .. }| balanced_score) .sum::() / 24.0) - / page.word_count as f32; + / (if weighting { page.word_count as f32 } else { 1.0 }); let search_result = PageSearchResult { page: page.hash.clone(), diff --git a/pagefind_web_js/lib/coupled_search.ts b/pagefind_web_js/lib/coupled_search.ts index d340971b..55cdea2a 100644 --- a/pagefind_web_js/lib/coupled_search.ts +++ b/pagefind_web_js/lib/coupled_search.ts @@ -390,6 +390,7 @@ class PagefindInstance { verbose: false, filters: {}, sort: {}, + use_weighting: true, ...options, }; const log = (str: string) => { if (options.verbose) console.log(str) }; @@ -443,7 +444,7 @@ class PagefindInstance { // pointer may have updated from the loadChunk calls ptr = await this.getPtr(); let searchStart = Date.now(); - let result = this.backend.search(ptr, term, filter_list, sort_list, exact_search) as string; + let result = this.backend.search(ptr, term, filter_list, sort_list, exact_search, options.use_weighting) as string; log(`Got the raw search result: ${result}`); let [unfilteredResultCount, all_results, filters, totalFilters] = result.split(/:([^:]*):(.*)__PF_UNFILTERED_DELIM__(.*)$/); let filterObj = this.parseFilters(filters);