From 75f125862e9f5b5937b04f2c7931633a8e0e8237 Mon Sep 17 00:00:00 2001 From: Adrien Ball Date: Tue, 10 Sep 2019 12:06:06 +0200 Subject: [PATCH] Optimize memory footprint of resources (#151) * Load hierarchical word clusters more efficiently * Remove hierarchical aspect of word clusters * Use hashing trick to replace string by i32 in Gazetteer and Stemmer implementations * Make word clusterer implementation compatible with non-u16 clusters * Fix docstring * Add small improvements * Update Changelog --- CHANGELOG.md | 4 + src/intent_classifier/featurizer.rs | 15 ++- src/intent_parser/lookup_intent_parser.rs | 33 +++---- src/resources/gazetteer.rs | 17 ++-- src/resources/stemmer.rs | 19 ++-- src/resources/word_clusterer.rs | 113 +++++++++++++++------- src/slot_filler/features.rs | 10 +- 7 files changed, 134 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6b4fee0..18c00319 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ # Changelog All notable changes to this project will be documented in this file. +## [Unreleased] - 2019-09-10 +### Fixed +- Optimize memory footprint of resources [#151](https://github.com/snipsco/snips-nlu-rs/pull/151) + ## [0.65.2] - 2019-09-06 ### Fixed - Freeze chrono to 0.4.8 to fix issue with rustling-ontology [#149](https://github.com/snipsco/snips-nlu-rs/pull/149) diff --git a/src/intent_classifier/featurizer.rs b/src/intent_classifier/featurizer.rs index a22379e7..36cb84f9 100644 --- a/src/intent_classifier/featurizer.rs +++ b/src/intent_classifier/featurizer.rs @@ -394,7 +394,10 @@ fn get_custom_entity_feature_name(entity_name: &str, language: NluUtilsLanguage) format!("entityfeature{}", e) } -fn get_word_clusters(query_tokens: &[String], word_clusterer: Arc) -> Vec { +fn get_word_clusters( + query_tokens: &[String], + word_clusterer: Arc, +) -> Vec { let tokens_ref = query_tokens.iter().map(|t| t.as_ref()).collect_vec(); compute_all_ngrams(tokens_ref.as_ref(), tokens_ref.len()) .into_iter() @@ -777,10 +780,12 @@ mod tests { // Given let language = Language::EN; let query_tokens = tokenize_light("I, love House, muSic", language); - let word_clusterer = HashMapWordClusterer::from_iter(vec![ - ("love".to_string(), "cluster_love".to_string()), - ("house".to_string(), "cluster_house".to_string()), - ]); + let clusters: &[u8] = r#" +love cluster_love +house cluster_house +"# + .as_ref(); + let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap(); // When let augmented_query = get_word_clusters(&query_tokens, Arc::new(word_clusterer)); diff --git a/src/intent_parser/lookup_intent_parser.rs b/src/intent_parser/lookup_intent_parser.rs index bf491f94..32f5e08c 100644 --- a/src/intent_parser/lookup_intent_parser.rs +++ b/src/intent_parser/lookup_intent_parser.rs @@ -1,27 +1,26 @@ -use std::collections::{HashMap, HashSet}; -use std::fs::File; -use std::path::Path; -use std::str::FromStr; -use std::sync::Arc; - -use failure::ResultExt; -use itertools::Itertools; -use log::debug; -use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language}; -use snips_nlu_utils::language::Language as NluUtilsLanguage; -use snips_nlu_utils::string::normalize; -use snips_nlu_utils::string::{hash_str_to_i32, substring_with_char_range, suffix_from_char_index}; -use snips_nlu_utils::token::tokenize_light; - use crate::errors::*; +use crate::intent_parser::InternalParsingResult; use crate::language::FromLanguage; use crate::models::LookupParserModel; use crate::resources::SharedResources; use crate::slot_utils::*; use crate::utils::{deduplicate_overlapping_entities, IntentName, MatchedEntity, SlotName}; +use crate::IntentParser; use crate::{EntityScope, GroupedEntityScope, InputHash, IntentId, SlotId}; - -use super::{IntentParser, InternalParsingResult}; +use failure::ResultExt; +use itertools::Itertools; +use log::debug; +use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language}; +use snips_nlu_utils::language::Language as NluUtilsLanguage; +use snips_nlu_utils::string::{ + hash_str_to_i32, normalize, substring_with_char_range, suffix_from_char_index, +}; +use snips_nlu_utils::token::tokenize_light; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::path::Path; +use std::str::FromStr; +use std::sync::Arc; /// HashMap based Intent Parser. The normalized/canonical form of an utterance /// serves as the key and the value is tuple of (intent_id, [vec_of_slots_ids]) diff --git a/src/resources/gazetteer.rs b/src/resources/gazetteer.rs index e949cba2..361b9e32 100644 --- a/src/resources/gazetteer.rs +++ b/src/resources/gazetteer.rs @@ -1,25 +1,25 @@ +use crate::errors::*; +use snips_nlu_utils::string::hash_str_to_i32; use std::collections::HashSet; use std::io::{BufRead, BufReader, Read}; use std::iter::FromIterator; -use crate::errors::*; - pub trait Gazetteer: Send + Sync { fn contains(&self, value: &str) -> bool; } pub struct HashSetGazetteer { - values: HashSet, + values: HashSet, } impl HashSetGazetteer { pub fn from_reader(reader: R) -> Result { let reader = BufReader::new(reader); - let mut values = HashSet::::new(); + let mut values = HashSet::new(); for line in reader.lines() { let word = line?; if !word.is_empty() { - values.insert(word); + values.insert(hash_str_to_i32(&*word)); } } Ok(Self { values }) @@ -29,14 +29,17 @@ impl HashSetGazetteer { impl FromIterator for HashSetGazetteer { fn from_iter>(iter: T) -> Self { Self { - values: HashSet::from_iter(iter), + values: iter + .into_iter() + .map(|str_value| hash_str_to_i32(&*str_value)) + .collect(), } } } impl Gazetteer for HashSetGazetteer { fn contains(&self, value: &str) -> bool { - self.values.contains(value) + self.values.contains(&hash_str_to_i32(value)) } } diff --git a/src/resources/stemmer.rs b/src/resources/stemmer.rs index f1460872..dbae99ec 100644 --- a/src/resources/stemmer.rs +++ b/src/resources/stemmer.rs @@ -1,22 +1,20 @@ +use crate::errors::*; +use snips_nlu_utils::string::{hash_str_to_i32, normalize}; use std::collections::HashMap; use std::io::Read; use std::iter::FromIterator; -use snips_nlu_utils::string::normalize; - -use crate::errors::*; - pub trait Stemmer: Send + Sync { fn stem(&self, value: &str) -> String; } pub struct HashMapStemmer { - values: HashMap, + values: HashMap, } impl HashMapStemmer { pub fn from_reader(reader: R) -> Result { - let mut values = HashMap::::new(); + let mut values = HashMap::new(); let mut csv_reader = csv::ReaderBuilder::new() .delimiter(b',') .quoting(false) @@ -28,7 +26,7 @@ impl HashMapStemmer { let elements = record?; let stem = &elements[0]; for value in elements.iter().skip(1) { - values.insert(value.to_string(), stem.to_string()); + values.insert(hash_str_to_i32(value), stem.to_string()); } } Ok(Self { values }) @@ -38,7 +36,10 @@ impl HashMapStemmer { impl FromIterator<(String, String)> for HashMapStemmer { fn from_iter>(iter: T) -> Self { Self { - values: HashMap::from_iter(iter), + values: iter + .into_iter() + .map(|(str_key, str_value)| (hash_str_to_i32(&*str_key), str_value)) + .collect(), } } } @@ -46,7 +47,7 @@ impl FromIterator<(String, String)> for HashMapStemmer { impl Stemmer for HashMapStemmer { fn stem(&self, value: &str) -> String { self.values - .get(&*normalize(value)) + .get(&hash_str_to_i32(&*normalize(value))) .map(|v| v.to_string()) .unwrap_or_else(|| value.to_string()) } diff --git a/src/resources/word_clusterer.rs b/src/resources/word_clusterer.rs index e4848c69..3c7ff034 100644 --- a/src/resources/word_clusterer.rs +++ b/src/resources/word_clusterer.rs @@ -1,17 +1,19 @@ +use crate::errors::*; +use itertools::Either; +use snips_nlu_ontology::Language; +use snips_nlu_utils::string::hash_str_to_i32; use std::collections::HashMap; use std::io::Read; -use std::iter::FromIterator; - -use snips_nlu_ontology::Language; - -use crate::errors::*; +use std::str::FromStr; pub trait WordClusterer: Send + Sync { fn get_cluster(&self, word: &str) -> Option; } pub struct HashMapWordClusterer { - values: HashMap, + /// This implementation allows to support both u16 and raw string representations for + /// word clusters + values: Either, HashMap>, } impl HashMapWordClusterer { @@ -21,27 +23,53 @@ impl HashMapWordClusterer { .quoting(false) .has_headers(false) .from_reader(reader); - let mut values = HashMap::::new(); + // This flag is switched to false as soon as a record is found which cannot + // be converted to a u16 + let mut u16_casting_ok = true; + let mut u16_values = HashMap::new(); + let mut str_values = HashMap::new(); for record in csv_reader.records() { let elements = record?; - values.insert(elements[0].to_string(), elements[1].to_string()); - } - - Ok(Self { values }) - } -} - -impl FromIterator<(String, String)> for HashMapWordClusterer { - fn from_iter>(iter: T) -> Self { - Self { - values: HashMap::from_iter(iter), + let hashed_key = hash_str_to_i32(elements[0].as_ref()); + // Casting into u16 is attempted only when all previous clusters were converted + // successfully + if u16_casting_ok { + match u16::from_str(elements[1].as_ref()) { + Ok(u16_value) => { + u16_values.insert(hashed_key, u16_value); + } + Err(_) => { + // A word cluster cannot be converted into a u16, let's move all the + // previously stored clusters into a raw string representation + for (hash, value) in u16_values.iter() { + str_values.insert(*hash, format!("{}", value)); + } + str_values.insert(hashed_key, elements[1].to_string()); + u16_casting_ok = false; + u16_values.clear(); + } + } + } else { + str_values.insert(hashed_key, elements[1].to_string()); + } } + Ok(Self { + values: if u16_casting_ok { + Either::Left(u16_values) + } else { + Either::Right(str_values) + }, + }) } } impl WordClusterer for HashMapWordClusterer { fn get_cluster(&self, word: &str) -> Option { - self.values.get(word).map(|v| v.to_string()) + let hashed_key = hash_str_to_i32(word); + match &self.values { + Either::Left(u16_values) => u16_values.get(&hashed_key).map(|v| format!("{}", v)), + Either::Right(str_values) => str_values.get(&hashed_key).cloned(), + } } } @@ -56,12 +84,12 @@ mod tests { use super::*; #[test] - fn test_hashmap_word_clusterer() { + fn test_hashmap_word_clusterer_with_non_u16_values() { // Given let clusters: &[u8] = r#" -hello 1111111111111 -world 1111110111111 -"yolo 1111100111111 +hello 42 +world 123 +"yolo cluster_which_is_not_u16 "# .as_ref(); @@ -71,18 +99,33 @@ world 1111110111111 // Then assert!(clusterer.is_ok()); let clusterer = clusterer.unwrap(); - assert_eq!( - clusterer.get_cluster("hello"), - Some("1111111111111".to_string()) - ); - assert_eq!( - clusterer.get_cluster("world"), - Some("1111110111111".to_string()) - ); - assert_eq!( - clusterer.get_cluster("\"yolo"), - Some("1111100111111".to_string()) - ); + assert!(clusterer.values.is_right()); + assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string())); + assert_eq!(clusterer.get_cluster("world"), Some("123".to_string())); + assert_eq!(clusterer.get_cluster("\"yolo"), Some("cluster_which_is_not_u16".to_string())); + assert_eq!(clusterer.get_cluster("unknown"), None); + } + + #[test] + fn test_hashmap_word_clusterer_with_u16_values() { + // Given + let clusters: &[u8] = r#" +hello 42 +world 123 +yolo 65500 +"# + .as_ref(); + + // When + let clusterer = HashMapWordClusterer::from_reader(clusters); + + // Then + assert!(clusterer.is_ok()); + let clusterer = clusterer.unwrap(); + assert!(clusterer.values.is_left()); + assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string())); + assert_eq!(clusterer.get_cluster("world"), Some("123".to_string())); + assert_eq!(clusterer.get_cluster("yolo"), Some("65500".to_string())); assert_eq!(clusterer.get_cluster("unknown"), None); } } diff --git a/src/slot_filler/features.rs b/src/slot_filler/features.rs index 9cccd4d5..b7f96578 100644 --- a/src/slot_filler/features.rs +++ b/src/slot_filler/features.rs @@ -834,9 +834,11 @@ mod tests { fn test_word_cluster_feature() { // Given let language = NluUtilsLanguage::EN; - let word_clusterer = HashMapWordClusterer::from_iter( - vec![("bird".to_string(), "010101".to_string())].into_iter(), - ); + let clusters: &[u8] = r#" +bird 42 +"# + .as_ref(); + let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap(); let tokens = tokenize("I love this bird", language); let feature = WordClusterFeature { cluster_name: "test_clusters".to_string(), @@ -849,7 +851,7 @@ mod tests { .collect(); // Then - let expected_results = vec![None, None, None, Some("010101".to_string())]; + let expected_results = vec![None, None, None, Some("42".to_string())]; assert_eq!(expected_results, results); } }