From a686cdf94af314652daef5235a36c676b526bea0 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 31 Dec 2022 10:48:42 +0800 Subject: [PATCH 1/5] Support fields that may be collections. --- benches/test_benchmark.rs | 8 ++++---- src/index.rs | 24 ++++++++++++------------ src/lib.rs | 10 +++++----- src/score/default/zero_to_one.rs | 16 ++++++++-------- tests/integrations_tests.rs | 8 ++++---- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs index ddf2e4d..9946496 100644 --- a/benches/test_benchmark.rs +++ b/benches/test_benchmark.rs @@ -30,8 +30,8 @@ pub fn test_speed(c: &mut Criterion) { } s } - fn title_extract_x(d: &DocX) -> Option<&str> { - Some(d.title.as_str()) + fn title_extract_x(d: &DocX) -> Vec<&str> { + vec![d.title.as_str()] } c.bench_function("add_100k_docs", |b| { @@ -43,14 +43,14 @@ pub fn test_speed(c: &mut Criterion) { new_rand.push_str(&generate_string(0, 4)); random_strings.push(new_rand); } - let extractor = [title_extract_x as fn(&_) -> Option<&str>]; + let extractor = [title_extract_x as fn(&DocX) -> Vec<&str>]; b.iter(|| add_all_documents(&mut index, &extractor, &random_strings)); }); } fn add_all_documents( index: &mut Index, - extractor: &[fn(&DocX) -> Option<&str>], + extractor: &[fn(&DocX) -> Vec<&str>], random_strings: &[String], ) { for (i, s) in random_strings.iter().enumerate() { diff --git a/src/index.rs b/src/index.rs index c235097..cea8ea0 100644 --- a/src/index.rs +++ b/src/index.rs @@ -89,12 +89,12 @@ impl Index { let mut all_terms: Vec> = Vec::new(); for i in 0..fields.len() { - if let Some(field_value) = field_accessors[i](doc) { - let fields_len = fields.len(); - let mut field_details = fields.get_mut(i).unwrap(); - + let field_values = field_accessors[i](doc); + let fields_len = fields.len(); + let mut field_details = fields.get_mut(i).unwrap(); + for field_value in field_values { // tokenize text - let terms = tokenizer(field_value); + let terms = tokenizer(&field_value); // filter and count terms, ignore empty strings let mut filtered_terms_count = 0; @@ -486,8 +486,8 @@ mod tests { text: String, } - fn field_accessor(doc: &Doc) -> Option<&str> { - Some(doc.text.as_str()) + fn field_accessor(doc: &Doc) -> Vec<&str> { + vec![doc.text.as_str()] } mod add { @@ -497,7 +497,7 @@ mod tests { #[test] fn it_should_add_one_document_with_three_terms<'idn>() { let field_accessors: Vec> = - vec![field_accessor as fn(doc: &Doc) -> Option<&str>]; + vec![field_accessor]; let mut index = Index::::new(1); let doc = Doc { @@ -549,7 +549,7 @@ mod tests { #[test] fn it_should_add_shared_terms() { let field_accessors: Vec> = - vec![field_accessor as fn(doc: &Doc) -> Option<&str>]; + vec![field_accessor]; let mut index = Index::::new(1); let doc_1 = Doc { @@ -609,7 +609,7 @@ mod tests { #[test] fn it_should_ignore_empty_tokens() { let field_accessors: Vec> = - vec![field_accessor as fn(doc: &Doc) -> Option<&str>]; + vec![field_accessor]; let mut index = Index::::new(1); let doc_1 = Doc { @@ -742,7 +742,7 @@ mod tests { #[test] fn it_should_count_nodes() { let field_accessors: Vec> = - vec![field_accessor as fn(doc: &Doc) -> Option<&str>]; + vec![field_accessor as fn(doc: &Doc) -> Vec<&str>]; let mut index = Index::::new(1); let doc = Doc { @@ -762,7 +762,7 @@ mod tests { #[test] fn it_should_count_nodes_2() { let field_accessors: Vec> = - vec![field_accessor as fn(doc: &Doc) -> Option<&str>]; + vec![field_accessor]; let mut index = Index::::new(1); diff --git a/src/lib.rs b/src/lib.rs index f0db0d3..14f26ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ pub use index::*; pub use query::QueryResult; /// Function that extracts a field value from a document. -pub type FieldAccessor = fn(&D) -> Option<&str>; +pub type FieldAccessor = fn(&D) -> Vec<&str>; /// Function used to tokenize a field. pub type Tokenizer = fn(&str) -> Vec>; @@ -31,12 +31,12 @@ pub mod test_util { pub text: String, } - pub fn title_extract(d: &Doc) -> Option<&str> { - Some(d.title.as_str()) + pub fn title_extract(d: &Doc) -> Vec<&str> { + vec![d.title.as_str()] } - pub fn text_extract(d: &Doc) -> Option<&str> { - Some(d.text.as_str()) + pub fn text_extract(d: &Doc) -> Vec<&str> { + vec![d.text.as_str()] } pub fn tokenizer(s: &str) -> Vec> { diff --git a/src/score/default/zero_to_one.rs b/src/score/default/zero_to_one.rs index 4a70020..1234bac 100644 --- a/src/score/default/zero_to_one.rs +++ b/src/score/default/zero_to_one.rs @@ -315,11 +315,11 @@ mod tests { title: String, description: String, } - fn title_extract(doc: &DocTitleDescription) -> Option<&str> { - Some(doc.title.as_str()) + fn title_extract(doc: &DocTitleDescription) -> Vec<&str> { + vec![doc.title.as_str()] } - fn description_extract(doc: &DocTitleDescription) -> Option<&str> { - Some(doc.description.as_str()) + fn description_extract(doc: &DocTitleDescription) -> Vec<&str> { + vec![doc.description.as_str()] } for (i, (title, description)) in titles.iter().zip(descriptions.iter()).enumerate() { @@ -364,11 +364,11 @@ mod tests { title: String, description: String, } - fn title_extract(doc: &DocTitleDescription) -> Option<&str> { - Some(doc.title.as_str()) + fn title_extract(doc: &DocTitleDescription) -> Vec<&str> { + vec![doc.title.as_str()] } - fn description_extract(doc: &DocTitleDescription) -> Option<&str> { - Some(doc.description.as_str()) + fn description_extract(doc: &DocTitleDescription) -> Vec<&str> { + vec![doc.description.as_str()] } for (i, (title, description)) in titles.iter().zip(descriptions.iter()).enumerate() { diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs index 7068fd5..debefd7 100644 --- a/tests/integrations_tests.rs +++ b/tests/integrations_tests.rs @@ -16,12 +16,12 @@ fn tokenizer(s: &str) -> Vec> { s.split(' ').map(Cow::from).collect::>() } -fn title_extract(d: &Doc) -> Option<&str> { - Some(d.title.as_str()) +fn title_extract(d: &Doc) -> Vec<&str> { + vec![d.title.as_str()] } -fn description_extract(d: &Doc) -> Option<&str> { - Some(d.description.as_str()) +fn description_extract(d: &Doc) -> Vec<&str> { + vec![d.description.as_str()] } #[test] From e3aa9076ee6168e8f355631fb45e4f044f859d77 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 31 Dec 2022 10:56:20 +0800 Subject: [PATCH 2/5] Run fmt and fix clippy warnings. --- src/index.rs | 16 ++++++---------- src/score/default/bm25.rs | 2 +- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/index.rs b/src/index.rs index cea8ea0..996c104 100644 --- a/src/index.rs +++ b/src/index.rs @@ -94,7 +94,7 @@ impl Index { let mut field_details = fields.get_mut(i).unwrap(); for field_value in field_values { // tokenize text - let terms = tokenizer(&field_value); + let terms = tokenizer(field_value); // filter and count terms, ignore empty strings let mut filtered_terms_count = 0; @@ -170,7 +170,7 @@ impl Index { let doc_details_option = self.docs.get(&key); let mut remove_key = false; if let Some(doc_details) = doc_details_option { - removed.insert((&key).to_owned()); + removed.insert(key); let details = doc_details; remove_key = true; let new_len = (self.docs.len() - 1) as f64; @@ -496,8 +496,7 @@ mod tests { #[test] fn it_should_add_one_document_with_three_terms<'idn>() { - let field_accessors: Vec> = - vec![field_accessor]; + let field_accessors: Vec> = vec![field_accessor]; let mut index = Index::::new(1); let doc = Doc { @@ -548,8 +547,7 @@ mod tests { #[test] fn it_should_add_shared_terms() { - let field_accessors: Vec> = - vec![field_accessor]; + let field_accessors: Vec> = vec![field_accessor]; let mut index = Index::::new(1); let doc_1 = Doc { @@ -608,8 +606,7 @@ mod tests { #[test] fn it_should_ignore_empty_tokens() { - let field_accessors: Vec> = - vec![field_accessor]; + let field_accessors: Vec> = vec![field_accessor]; let mut index = Index::::new(1); let doc_1 = Doc { @@ -761,8 +758,7 @@ mod tests { #[test] fn it_should_count_nodes_2() { - let field_accessors: Vec> = - vec![field_accessor]; + let field_accessors: Vec> = vec![field_accessor]; let mut index = Index::::new(1); diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs index 4882af3..cd1261e 100644 --- a/src/score/default/bm25.rs +++ b/src/score/default/bm25.rs @@ -71,7 +71,7 @@ impl ScoreCalculator for BM25 { let pre_calculations = &before_output.unwrap(); // it will exist as we need BM25 parameters let mut score: f64 = 0_f64; for x in 0..document_details.field_length.len() { - let mut tf = (&document_pointer.term_frequency[x]).to_owned() as f64; + let mut tf = document_pointer.term_frequency[x] as f64; if tf > 0_f64 { // calculating BM25 tf let field_length = &document_details.field_length[x]; From cec462850bb41648e7bd9cc7d655bcf0512caee5 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 31 Dec 2022 10:58:44 +0800 Subject: [PATCH 3/5] Fix CI clippy warning. --- src/score/default/bm25.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs index cd1261e..b032a4b 100644 --- a/src/score/default/bm25.rs +++ b/src/score/default/bm25.rs @@ -81,7 +81,7 @@ impl ScoreCalculator for BM25 { / (self.bm25k1 * ((1_f64 - self.bm25b) + self.bm25b - * (field_length.to_owned() as f64 / avg_field_length as f64)) + * (field_length.to_owned() as f64 / avg_field_length)) + tf); score += tf * pre_calculations.idf From 2703c24eb4041fe98acf547e8e91e6dea6d8119b Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 31 Dec 2022 10:59:44 +0800 Subject: [PATCH 4/5] Run cargo fmt. --- src/score/default/bm25.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs index b032a4b..80ed794 100644 --- a/src/score/default/bm25.rs +++ b/src/score/default/bm25.rs @@ -80,8 +80,7 @@ impl ScoreCalculator for BM25 { tf = ((self.bm25k1 + 1_f64) * tf) / (self.bm25k1 * ((1_f64 - self.bm25b) - + self.bm25b - * (field_length.to_owned() as f64 / avg_field_length)) + + self.bm25b * (field_length.to_owned() as f64 / avg_field_length)) + tf); score += tf * pre_calculations.idf From ce881dba3e8a1a1fac94d9cefe9dc36c319bcf62 Mon Sep 17 00:00:00 2001 From: muji Date: Wed, 4 Jan 2023 11:43:22 +0800 Subject: [PATCH 5/5] Update README. --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c395815..8e536f6 100644 --- a/README.md +++ b/README.md @@ -55,13 +55,13 @@ fn tokenizer(s: &str) -> Vec> { // We have to provide extraction functions for the fields we want to index // Title -fn title_extract(d: &Doc) -> Option<&str> { - Some(d.title.as_str()) +fn title_extract(d: &Doc) -> Vec<&str> { + vec![d.title.as_str()] } // Description -fn description_extract(d: &Doc) -> Option<&str> { - Some(d.description.as_str()) +fn description_extract(d: &Doc) -> Vec<&str> { + vec![d.description.as_str()] } // Create index with 2 fields