Optimize memory footprint of resources (#151)
* Load hierarchical word clusters more efficiently

* Remove hierarchical aspect of word clusters

* Use a hashing trick to replace strings with i32 in the Gazetteer and Stemmer implementations (a sketch of this idea follows the file summary below)

* Make word clusterer implementation compatible with non-u16 clusters

* Fix docstring

* Add small improvements

* Update Changelog
adrienball authored Sep 10, 2019
1 parent bf16a8a commit 75f1258
Showing 7 changed files with 134 additions and 77 deletions.
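The bullets above summarize the change; the heart of it is the hashing trick. As a rough, self-contained sketch of the idea (not the crate's actual code): store a 32-bit hash of each string instead of the owned string, trading a small collision risk for a much smaller footprint. The real implementation uses hash_str_to_i32 from snips-nlu-utils, whose definition is not part of this diff; the FNV-1a function below is an illustrative stand-in.

use std::collections::HashSet;

// Illustrative stand-in for snips_nlu_utils::string::hash_str_to_i32
// (the real hash is not shown in this diff): FNV-1a, 32-bit,
// reinterpreted as i32.
fn hash_str_to_i32(s: &str) -> i32 {
    let mut h: u32 = 0x811c_9dc5;
    for byte in s.bytes() {
        h ^= u32::from(byte);
        h = h.wrapping_mul(0x0100_0193);
    }
    h as i32
}

fn main() {
    // Instead of keeping each word as a heap-allocated String (24 bytes of
    // struct plus heap data on 64-bit targets), keep a 4-byte hash per word.
    let words = ["paris", "london", "new york"];
    let hashed: HashSet<i32> = words.iter().map(|w| hash_str_to_i32(w)).collect();

    // Lookups hash the query; a collision can produce a false positive,
    // which is the accepted trade-off for the memory savings.
    assert!(hashed.contains(&hash_str_to_i32("paris")));
    assert!(!hashed.contains(&hash_str_to_i32("berlin")));
}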
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## [Unreleased] - 2019-09-10
+### Fixed
+- Optimize memory footprint of resources [#151](https://github.com/snipsco/snips-nlu-rs/pull/151)
+
 ## [0.65.2] - 2019-09-06
 ### Fixed
 - Freeze chrono to 0.4.8 to fix issue with rustling-ontology [#149](https://github.com/snipsco/snips-nlu-rs/pull/149)
15 changes: 10 additions & 5 deletions src/intent_classifier/featurizer.rs
@@ -394,7 +394,10 @@ fn get_custom_entity_feature_name(entity_name: &str, language: NluUtilsLanguage)
     format!("entityfeature{}", e)
 }
 
-fn get_word_clusters(query_tokens: &[String], word_clusterer: Arc<dyn WordClusterer>) -> Vec<String> {
+fn get_word_clusters(
+    query_tokens: &[String],
+    word_clusterer: Arc<dyn WordClusterer>,
+) -> Vec<String> {
     let tokens_ref = query_tokens.iter().map(|t| t.as_ref()).collect_vec();
     compute_all_ngrams(tokens_ref.as_ref(), tokens_ref.len())
         .into_iter()
@@ -777,10 +780,12 @@ mod tests {
         // Given
         let language = Language::EN;
         let query_tokens = tokenize_light("I, love House, muSic", language);
-        let word_clusterer = HashMapWordClusterer::from_iter(vec![
-            ("love".to_string(), "cluster_love".to_string()),
-            ("house".to_string(), "cluster_house".to_string()),
-        ]);
+        let clusters: &[u8] = r#"
+love cluster_love
+house cluster_house
+"#
+        .as_ref();
+        let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap();
 
         // When
         let augmented_query = get_word_clusters(&query_tokens, Arc::new(word_clusterer));
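The function above maps every n-gram of the query to a word cluster when one exists. Here is a rough sketch of that logic under simplified assumptions (all_ngrams below is a stand-in for the crate's compute_all_ngrams, and the filtering step is paraphrased, not copied):

fn all_ngrams(tokens: &[&str], max_len: usize) -> Vec<String> {
    // Every contiguous slice of tokens, joined with spaces.
    let mut ngrams = Vec::new();
    for start in 0..tokens.len() {
        for len in 1..=max_len.min(tokens.len() - start) {
            ngrams.push(tokens[start..start + len].join(" "));
        }
    }
    ngrams
}

fn main() {
    let tokens = ["i", "love", "house", "music"];
    let ngrams = all_ngrams(&tokens, tokens.len());
    // get_word_clusters then keeps only the ngrams for which the clusterer
    // returns Some(cluster), e.g. "house" -> "cluster_house".
    assert!(ngrams.contains(&"love house".to_string()));
}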
33 changes: 16 additions & 17 deletions src/intent_parser/lookup_intent_parser.rs
@@ -1,27 +1,26 @@
-use std::collections::{HashMap, HashSet};
-use std::fs::File;
-use std::path::Path;
-use std::str::FromStr;
-use std::sync::Arc;
-
-use failure::ResultExt;
-use itertools::Itertools;
-use log::debug;
-use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language};
-use snips_nlu_utils::language::Language as NluUtilsLanguage;
-use snips_nlu_utils::string::normalize;
-use snips_nlu_utils::string::{hash_str_to_i32, substring_with_char_range, suffix_from_char_index};
-use snips_nlu_utils::token::tokenize_light;
-
 use crate::errors::*;
-use crate::intent_parser::InternalParsingResult;
 use crate::language::FromLanguage;
 use crate::models::LookupParserModel;
 use crate::resources::SharedResources;
 use crate::slot_utils::*;
 use crate::utils::{deduplicate_overlapping_entities, IntentName, MatchedEntity, SlotName};
-use crate::IntentParser;
 use crate::{EntityScope, GroupedEntityScope, InputHash, IntentId, SlotId};
 
+use super::{IntentParser, InternalParsingResult};
+use failure::ResultExt;
+use itertools::Itertools;
+use log::debug;
+use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language};
+use snips_nlu_utils::language::Language as NluUtilsLanguage;
+use snips_nlu_utils::string::{
+    hash_str_to_i32, normalize, substring_with_char_range, suffix_from_char_index,
+};
+use snips_nlu_utils::token::tokenize_light;
+use std::collections::{HashMap, HashSet};
+use std::fs::File;
+use std::path::Path;
+use std::str::FromStr;
+use std::sync::Arc;
+
 /// HashMap based Intent Parser. The normalized/canonical form of an utterance
 /// serves as the key and the value is a tuple of (intent_id, [vec_of_slots_ids])
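As an aside, here is a minimal sketch of the lookup structure this docstring describes, with hypothetical type aliases standing in for the crate's InputHash, IntentId and SlotId (the real parser hashes normalized utterances with hash_str_to_i32 to produce the keys):

use std::collections::HashMap;

// Hypothetical aliases mirroring the crate's InputHash / IntentId / SlotId.
type InputHash = i32;
type IntentId = usize;
type SlotId = usize;

fn main() {
    let mut map: HashMap<InputHash, (IntentId, Vec<SlotId>)> = HashMap::new();
    // Key: hash of the normalized utterance; value: (intent id, slot ids).
    let key: InputHash = 42; // stand-in for hash_str_to_i32(&normalize("turn on the lights"))
    map.insert(key, (0, vec![1, 3]));
    assert_eq!(map.get(&key), Some(&(0, vec![1, 3])));
}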
17 changes: 10 additions & 7 deletions src/resources/gazetteer.rs
@@ -1,25 +1,25 @@
+use crate::errors::*;
+use snips_nlu_utils::string::hash_str_to_i32;
 use std::collections::HashSet;
 use std::io::{BufRead, BufReader, Read};
 use std::iter::FromIterator;
-
-use crate::errors::*;
 
 pub trait Gazetteer: Send + Sync {
     fn contains(&self, value: &str) -> bool;
 }
 
 pub struct HashSetGazetteer {
-    values: HashSet<String>,
+    values: HashSet<i32>,
 }
 
 impl HashSetGazetteer {
     pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
         let reader = BufReader::new(reader);
-        let mut values = HashSet::<String>::new();
+        let mut values = HashSet::new();
         for line in reader.lines() {
             let word = line?;
             if !word.is_empty() {
-                values.insert(word);
+                values.insert(hash_str_to_i32(&*word));
             }
         }
         Ok(Self { values })
@@ -29,14 +29,17 @@ impl HashSetGazetteer {
 impl FromIterator<String> for HashSetGazetteer {
     fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
         Self {
-            values: HashSet::from_iter(iter),
+            values: iter
+                .into_iter()
+                .map(|str_value| hash_str_to_i32(&*str_value))
+                .collect(),
         }
     }
 }
 
 impl Gazetteer for HashSetGazetteer {
     fn contains(&self, value: &str) -> bool {
-        self.values.contains(value)
+        self.values.contains(&hash_str_to_i32(value))
     }
 }
 
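A short usage sketch of the updated gazetteer, written in the style of the crate's own test modules (crate-internal; the word list is made up):

#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn gazetteer_matches_on_hashed_values() {
        // Given: a newline-separated word list, as loaded from resources
        let reader: &[u8] = "paris\nlondon\n".as_bytes();

        // When
        let gazetteer = HashSetGazetteer::from_reader(reader).unwrap();

        // Then: lookups hash the query and probe the HashSet<i32>
        assert!(gazetteer.contains("paris"));
        assert!(!gazetteer.contains("berlin"));
    }
}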
19 changes: 10 additions & 9 deletions src/resources/stemmer.rs
@@ -1,22 +1,20 @@
+use crate::errors::*;
+use snips_nlu_utils::string::{hash_str_to_i32, normalize};
 use std::collections::HashMap;
 use std::io::Read;
 use std::iter::FromIterator;
-
-use snips_nlu_utils::string::normalize;
-
-use crate::errors::*;
 
 pub trait Stemmer: Send + Sync {
     fn stem(&self, value: &str) -> String;
 }
 
 pub struct HashMapStemmer {
-    values: HashMap<String, String>,
+    values: HashMap<i32, String>,
 }
 
 impl HashMapStemmer {
     pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
-        let mut values = HashMap::<String, String>::new();
+        let mut values = HashMap::new();
         let mut csv_reader = csv::ReaderBuilder::new()
             .delimiter(b',')
             .quoting(false)
@@ -28,7 +26,7 @@ impl HashMapStemmer {
             let elements = record?;
             let stem = &elements[0];
             for value in elements.iter().skip(1) {
-                values.insert(value.to_string(), stem.to_string());
+                values.insert(hash_str_to_i32(value), stem.to_string());
             }
         }
         Ok(Self { values })
@@ -38,15 +36,18 @@ impl HashMapStemmer {
 impl FromIterator<(String, String)> for HashMapStemmer {
     fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
         Self {
-            values: HashMap::from_iter(iter),
+            values: iter
+                .into_iter()
+                .map(|(str_key, str_value)| (hash_str_to_i32(&*str_key), str_value))
+                .collect(),
         }
     }
 }
 
 impl Stemmer for HashMapStemmer {
     fn stem(&self, value: &str) -> String {
         self.values
-            .get(&*normalize(value))
+            .get(&hash_str_to_i32(&*normalize(value)))
             .map(|v| v.to_string())
             .unwrap_or_else(|| value.to_string())
     }
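Similarly, a hedged usage sketch for the stemmer (crate-internal test style; the CSV content is made up — the first field of each record is the stem, the remaining fields are inflected forms):

#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn stemmer_maps_inflections_through_hashed_keys() {
        // Given: one record per stem, formatted as "stem,form1,form2,..."
        let reader: &[u8] = "run,running,runs,ran\n".as_bytes();

        // When
        let stemmer = HashMapStemmer::from_reader(reader).unwrap();

        // Then: known forms resolve to their stem via the i32 key, and
        // unknown words fall back to themselves
        assert_eq!(stemmer.stem("running"), "run".to_string());
        assert_eq!(stemmer.stem("hello"), "hello".to_string());
    }
}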
113 changes: 78 additions & 35 deletions src/resources/word_clusterer.rs
@@ -1,17 +1,19 @@
+use crate::errors::*;
+use itertools::Either;
+use snips_nlu_ontology::Language;
+use snips_nlu_utils::string::hash_str_to_i32;
 use std::collections::HashMap;
 use std::io::Read;
 use std::iter::FromIterator;
-
-use snips_nlu_ontology::Language;
-
-use crate::errors::*;
+use std::str::FromStr;
 
 pub trait WordClusterer: Send + Sync {
     fn get_cluster(&self, word: &str) -> Option<String>;
 }
 
 pub struct HashMapWordClusterer {
-    values: HashMap<String, String>,
+    /// This implementation supports both u16 and raw string representations of
+    /// word clusters
+    values: Either<HashMap<i32, u16>, HashMap<i32, String>>,
 }
-
 impl HashMapWordClusterer {
@@ -21,27 +23,53 @@ impl HashMapWordClusterer {
             .quoting(false)
             .has_headers(false)
             .from_reader(reader);
-        let mut values = HashMap::<String, String>::new();
+        // This flag is switched to false as soon as a record is found which cannot
+        // be converted to a u16
+        let mut u16_casting_ok = true;
+        let mut u16_values = HashMap::new();
+        let mut str_values = HashMap::new();
         for record in csv_reader.records() {
             let elements = record?;
-            values.insert(elements[0].to_string(), elements[1].to_string());
-        }
-
-        Ok(Self { values })
-    }
-}
-
-impl FromIterator<(String, String)> for HashMapWordClusterer {
-    fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
-        Self {
-            values: HashMap::from_iter(iter),
+            let hashed_key = hash_str_to_i32(elements[0].as_ref());
+            // Casting into u16 is attempted only when all previous clusters were converted
+            // successfully
+            if u16_casting_ok {
+                match u16::from_str(elements[1].as_ref()) {
+                    Ok(u16_value) => {
+                        u16_values.insert(hashed_key, u16_value);
+                    }
+                    Err(_) => {
+                        // A word cluster cannot be converted into a u16, let's move all the
+                        // previously stored clusters into a raw string representation
+                        for (hash, value) in u16_values.iter() {
+                            str_values.insert(*hash, format!("{}", value));
+                        }
+                        str_values.insert(hashed_key, elements[1].to_string());
+                        u16_casting_ok = false;
+                        u16_values.clear();
+                    }
+                }
+            } else {
+                str_values.insert(hashed_key, elements[1].to_string());
+            }
         }
+        Ok(Self {
+            values: if u16_casting_ok {
+                Either::Left(u16_values)
+            } else {
+                Either::Right(str_values)
+            },
+        })
     }
 }
 
 impl WordClusterer for HashMapWordClusterer {
     fn get_cluster(&self, word: &str) -> Option<String> {
-        self.values.get(word).map(|v| v.to_string())
+        let hashed_key = hash_str_to_i32(word);
+        match &self.values {
+            Either::Left(u16_values) => u16_values.get(&hashed_key).map(|v| format!("{}", v)),
+            Either::Right(str_values) => str_values.get(&hashed_key).cloned(),
+        }
     }
 }
 
@@ -56,12 +84,12 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_hashmap_word_clusterer() {
+    fn test_hashmap_word_clusterer_with_non_u16_values() {
         // Given
         let clusters: &[u8] = r#"
-hello 1111111111111
-world 1111110111111
-"yolo 1111100111111
+hello 42
+world 123
+"yolo cluster_which_is_not_u16
 "#
         .as_ref();
 
@@ -71,18 +99,33 @@ world 1111110111111
         // Then
         assert!(clusterer.is_ok());
         let clusterer = clusterer.unwrap();
-        assert_eq!(
-            clusterer.get_cluster("hello"),
-            Some("1111111111111".to_string())
-        );
-        assert_eq!(
-            clusterer.get_cluster("world"),
-            Some("1111110111111".to_string())
-        );
-        assert_eq!(
-            clusterer.get_cluster("\"yolo"),
-            Some("1111100111111".to_string())
-        );
+        assert!(clusterer.values.is_right());
+        assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string()));
+        assert_eq!(clusterer.get_cluster("world"), Some("123".to_string()));
+        assert_eq!(clusterer.get_cluster("\"yolo"), Some("cluster_which_is_not_u16".to_string()));
+        assert_eq!(clusterer.get_cluster("unknown"), None);
     }
 
+    #[test]
+    fn test_hashmap_word_clusterer_with_u16_values() {
+        // Given
+        let clusters: &[u8] = r#"
+hello 42
+world 123
+yolo 65500
+"#
+        .as_ref();
+
+        // When
+        let clusterer = HashMapWordClusterer::from_reader(clusters);
+
+        // Then
+        assert!(clusterer.is_ok());
+        let clusterer = clusterer.unwrap();
+        assert!(clusterer.values.is_left());
+        assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string()));
+        assert_eq!(clusterer.get_cluster("world"), Some("123".to_string()));
+        assert_eq!(clusterer.get_cluster("yolo"), Some("65500".to_string()));
+        assert_eq!(clusterer.get_cluster("unknown"), None);
+    }
 }
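For intuition on why the Either-based representation is worth the extra branching, a back-of-the-envelope sketch (exact savings depend on the allocator and HashMap overhead):

use std::mem::size_of;

fn main() {
    // A String value occupies 24 bytes inline (pointer/length/capacity on
    // 64-bit targets) plus a separate heap allocation for its bytes; a u16
    // occupies 2 bytes and needs no heap allocation at all.
    println!("String value: {} bytes + heap data", size_of::<String>());
    println!("u16 value:    {} bytes", size_of::<u16>());
    println!("i32 key:      {} bytes", size_of::<i32>());
}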
10 changes: 6 additions & 4 deletions src/slot_filler/features.rs
@@ -834,9 +834,11 @@ mod tests {
     fn test_word_cluster_feature() {
         // Given
         let language = NluUtilsLanguage::EN;
-        let word_clusterer = HashMapWordClusterer::from_iter(
-            vec![("bird".to_string(), "010101".to_string())].into_iter(),
-        );
+        let clusters: &[u8] = r#"
+bird 42
+"#
+        .as_ref();
+        let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap();
         let tokens = tokenize("I love this bird", language);
         let feature = WordClusterFeature {
             cluster_name: "test_clusters".to_string(),
@@ -849,7 +851,7 @@ mod tests {
             .collect();
 
         // Then
-        let expected_results = vec![None, None, None, Some("010101".to_string())];
+        let expected_results = vec![None, None, None, Some("42".to_string())];
         assert_eq!(expected_results, results);
     }
 }
