Skip to content

Commit

Permalink
Merge pull request #25 from tmpfs/issue-24
Browse files Browse the repository at this point in the history
Clamp document_frequency in bm25 score before_each().
  • Loading branch information
marcus-pousette committed Jul 3, 2024
2 parents 6d891c4 + 9b41d5e commit 3cb82d9
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 6 deletions.
1 change: 0 additions & 1 deletion src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use std::{
borrow::Cow,
fmt::{Debug, Formatter},
hash::Hash,
usize,
};

use crate::{FieldAccessor, Tokenizer};
Expand Down
8 changes: 3 additions & 5 deletions src/score/default/bm25.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
document_frequency: usize,
documents: &HashMap<T, DocumentDetails<T>>,
) -> Option<BM25TermCalculations> {
let frequency = std::cmp::min(documents.len(), document_frequency);
let diff = documents.len() - frequency;
Some(BM25TermCalculations {
expansion_boost: {
if term_expansion.query_term_expanded == term_expansion.query_term {
Expand All @@ -51,11 +53,7 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
)
}
},
idf: f64::ln(
1_f64
+ ((documents.len() - document_frequency) as f64 + 0.5)
/ (document_frequency as f64 + 0.5),
),
idf: f64::ln(1_f64 + (diff as f64 + 0.5) / (frequency as f64 + 0.5)),
})
}

Expand Down
32 changes: 32 additions & 0 deletions tests/document_frequency.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use probly_search::{score::bm25, Index};
use std::borrow::Cow;

/// Regression test: indexing a single document and then querying it must
/// complete without panicking.
///
/// The document repeats a term several times while the index holds only one
/// document; per the test name, this is the situation in which the document
/// frequency can exceed the number of indexed documents.
#[test]
fn should_not_panic_when_document_frequency_gt_documents_len() {
    /// Minimal document type: a key plus one text field.
    struct Doc {
        id: usize,
        content: String,
    }

    /// Splits the input on single space characters, borrowing each piece.
    fn tokenizer(s: &str) -> Vec<Cow<str>> {
        let mut tokens = Vec::new();
        for piece in s.split(' ') {
            tokens.push(Cow::from(piece));
        }
        tokens
    }

    /// Field accessor exposing the document's only text field.
    fn content_extract(d: &Doc) -> Vec<&str> {
        vec![d.content.as_str()]
    }

    let only_doc = Doc {
        id: 0,
        content: String::from("this is text with lots of the, the, the, the"),
    };

    // One indexed field, one document.
    let mut single_field_index = Index::<usize>::new(1);
    single_field_index.add_document(&[content_extract], tokenizer, only_doc.id, &only_doc);

    // The result is intentionally discarded: the test only checks that the
    // scoring path runs to completion.
    let mut scorer = bm25::new();
    single_field_index.query(
        &"What did the author do growing up?",
        &mut scorer,
        tokenizer,
        &[1.],
    );
}

0 comments on commit 3cb82d9

Please sign in to comment.