Merge pull request #25 from tmpfs/issue-24

Clamp document_frequency in bm25 score before_each().
quantleaf · Jul 3, 2024 · 3cb82d9
2 parents 6d891c4 + 9b41d5e
commit 3cb82d9
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 6 deletions.
diff --git a/src/index.rs b/src/index.rs
@@ -2,7 +2,6 @@ use std::{
     borrow::Cow,
     fmt::{Debug, Formatter},
     hash::Hash,
-    usize,
 };
 
 use crate::{FieldAccessor, Tokenizer};

diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs
@@ -38,6 +38,8 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
         document_frequency: usize,
         documents: &HashMap<T, DocumentDetails<T>>,
     ) -> Option<BM25TermCalculations> {
+        let frequency = std::cmp::min(documents.len(), document_frequency);
+        let diff = documents.len() - frequency;
         Some(BM25TermCalculations {
             expansion_boost: {
                 if term_expansion.query_term_expanded == term_expansion.query_term {
@@ -51,11 +53,7 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
                     )
                 }
             },
-            idf: f64::ln(
-                1_f64
-                    + ((documents.len() - document_frequency) as f64 + 0.5)
-                        / (document_frequency as f64 + 0.5),
-            ),
+            idf: f64::ln(1_f64 + (diff as f64 + 0.5) / (frequency as f64 + 0.5)),
         })
     }
 

diff --git a/tests/document_frequency.rs b/tests/document_frequency.rs
@@ -0,0 +1,32 @@
+use probly_search::{score::bm25, Index};
+use std::borrow::Cow;
+
+#[test]
+fn should_not_panic_when_document_frequency_gt_documents_len() { // regression test: no asserts, it passes if nothing panics
+    struct Doc {
+        id: usize, // passed to `add_document` as the document key
+        content: String, // the only text handed to the index (see `content_extract`)
+    }
+
+    // Whitespace tokenizer: splits on single ' ' characters only, so punctuation stays attached (e.g. "the,").
+    fn tokenizer(s: &str) -> Vec<Cow<str>> {
+        s.split(' ').map(Cow::from).collect::<Vec<_>>()
+    }
+
+    fn content_extract(d: &Doc) -> Vec<&str> { // field accessor: exposes `content` as a single-element list
+        vec![&d.content]
+    }
+
+    let mut index = Index::<usize>::new(1); // NOTE(review): `1` is presumably the number of indexed fields; confirm
+    let doc = Doc {
+        id: 0,
+        content: "this is text with lots of the, the, the, the".to_owned(), // yields tokens "the," (x3) and "the" (x1)
+    };
+    index.add_document(&[content_extract], tokenizer, doc.id, &doc); // the index holds exactly one document
+    index.query( // result is discarded on purpose; only the absence of a panic is being checked
+        &"What did the author do growing up?", // shares the token "the" with the indexed document
+        &mut bm25::new(),
+        tokenizer,
+        &[1.], // NOTE(review): presumably one boost weight per field; confirm
+    );
+}