From 75f125862e9f5b5937b04f2c7931633a8e0e8237 Mon Sep 17 00:00:00 2001
From: Adrien Ball <adrien.ball@snips.ai>
Date: Tue, 10 Sep 2019 12:06:06 +0200
Subject: [PATCH] Optimize memory footprint of resources (#151)

* Load hierarchical word clusters more efficiently

* Remove hierarchical aspect of word clusters

* Use hashing trick to replace strings with i32 hashes in Gazetteer and Stemmer implementations

* Make word clusterer implementation compatible with non-u16 clusters

* Fix docstring

* Add small improvements

* Update Changelog
---
 CHANGELOG.md                              |   4 +
 src/intent_classifier/featurizer.rs       |  15 ++-
 src/intent_parser/lookup_intent_parser.rs |  33 +++----
 src/resources/gazetteer.rs                |  17 ++--
 src/resources/stemmer.rs                  |  19 ++--
 src/resources/word_clusterer.rs           | 113 +++++++++++++++-------
 src/slot_filler/features.rs               |  10 +-
 7 files changed, 134 insertions(+), 77 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6b4fee0..18c00319 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## [Unreleased] - 2019-09-10
+### Changed
+- Optimize memory footprint of resources [#151](https://github.com/snipsco/snips-nlu-rs/pull/151)
+
 ## [0.65.2] - 2019-09-06
 ### Fixed
 - Freeze chrono to 0.4.8 to fix issue with rustling-ontology [#149](https://github.com/snipsco/snips-nlu-rs/pull/149)
diff --git a/src/intent_classifier/featurizer.rs b/src/intent_classifier/featurizer.rs
index a22379e7..36cb84f9 100644
--- a/src/intent_classifier/featurizer.rs
+++ b/src/intent_classifier/featurizer.rs
@@ -394,7 +394,10 @@ fn get_custom_entity_feature_name(entity_name: &str, language: NluUtilsLanguage)
     format!("entityfeature{}", e)
 }
 
-fn get_word_clusters(query_tokens: &[String], word_clusterer: Arc<dyn WordClusterer>) -> Vec<String> {
+fn get_word_clusters(
+    query_tokens: &[String],
+    word_clusterer: Arc<dyn WordClusterer>,
+) -> Vec<String> {
     let tokens_ref = query_tokens.iter().map(|t| t.as_ref()).collect_vec();
     compute_all_ngrams(tokens_ref.as_ref(), tokens_ref.len())
         .into_iter()
@@ -777,10 +780,12 @@ mod tests {
         // Given
         let language = Language::EN;
         let query_tokens = tokenize_light("I, love House, muSic", language);
-        let word_clusterer = HashMapWordClusterer::from_iter(vec![
-            ("love".to_string(), "cluster_love".to_string()),
-            ("house".to_string(), "cluster_house".to_string()),
-        ]);
+        let clusters: &[u8] = r#"
+love	cluster_love
+house	cluster_house
+"#
+        .as_ref();
+        let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap();
 
         // When
         let augmented_query = get_word_clusters(&query_tokens, Arc::new(word_clusterer));
diff --git a/src/intent_parser/lookup_intent_parser.rs b/src/intent_parser/lookup_intent_parser.rs
index bf491f94..32f5e08c 100644
--- a/src/intent_parser/lookup_intent_parser.rs
+++ b/src/intent_parser/lookup_intent_parser.rs
@@ -1,27 +1,26 @@
-use std::collections::{HashMap, HashSet};
-use std::fs::File;
-use std::path::Path;
-use std::str::FromStr;
-use std::sync::Arc;
-
-use failure::ResultExt;
-use itertools::Itertools;
-use log::debug;
-use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language};
-use snips_nlu_utils::language::Language as NluUtilsLanguage;
-use snips_nlu_utils::string::normalize;
-use snips_nlu_utils::string::{hash_str_to_i32, substring_with_char_range, suffix_from_char_index};
-use snips_nlu_utils::token::tokenize_light;
-
 use crate::errors::*;
+use crate::intent_parser::InternalParsingResult;
 use crate::language::FromLanguage;
 use crate::models::LookupParserModel;
 use crate::resources::SharedResources;
 use crate::slot_utils::*;
 use crate::utils::{deduplicate_overlapping_entities, IntentName, MatchedEntity, SlotName};
+use crate::IntentParser;
 use crate::{EntityScope, GroupedEntityScope, InputHash, IntentId, SlotId};
-
-use super::{IntentParser, InternalParsingResult};
+use failure::ResultExt;
+use itertools::Itertools;
+use log::debug;
+use snips_nlu_ontology::{BuiltinEntityKind, IntentClassifierResult, Language};
+use snips_nlu_utils::language::Language as NluUtilsLanguage;
+use snips_nlu_utils::string::{
+    hash_str_to_i32, normalize, substring_with_char_range, suffix_from_char_index,
+};
+use snips_nlu_utils::token::tokenize_light;
+use std::collections::{HashMap, HashSet};
+use std::fs::File;
+use std::path::Path;
+use std::str::FromStr;
+use std::sync::Arc;
 
 /// HashMap based Intent Parser. The normalized/canonical form of an utterance
 /// serves as the key and the value is tuple of (intent_id, [vec_of_slots_ids])
diff --git a/src/resources/gazetteer.rs b/src/resources/gazetteer.rs
index e949cba2..361b9e32 100644
--- a/src/resources/gazetteer.rs
+++ b/src/resources/gazetteer.rs
@@ -1,25 +1,25 @@
+use crate::errors::*;
+use snips_nlu_utils::string::hash_str_to_i32;
 use std::collections::HashSet;
 use std::io::{BufRead, BufReader, Read};
 use std::iter::FromIterator;
 
-use crate::errors::*;
-
 pub trait Gazetteer: Send + Sync {
     fn contains(&self, value: &str) -> bool;
 }
 
 pub struct HashSetGazetteer {
-    values: HashSet<String>,
+    values: HashSet<i32>,
 }
 
 impl HashSetGazetteer {
     pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
         let reader = BufReader::new(reader);
-        let mut values = HashSet::<String>::new();
+        let mut values = HashSet::new();
         for line in reader.lines() {
             let word = line?;
             if !word.is_empty() {
-                values.insert(word);
+                values.insert(hash_str_to_i32(&*word));
             }
         }
         Ok(Self { values })
@@ -29,14 +29,17 @@ impl HashSetGazetteer {
 impl FromIterator<String> for HashSetGazetteer {
     fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
         Self {
-            values: HashSet::from_iter(iter),
+            values: iter
+                .into_iter()
+                .map(|str_value| hash_str_to_i32(&*str_value))
+                .collect(),
         }
     }
 }
 
 impl Gazetteer for HashSetGazetteer {
     fn contains(&self, value: &str) -> bool {
-        self.values.contains(value)
+        self.values.contains(&hash_str_to_i32(value))
     }
 }
 
diff --git a/src/resources/stemmer.rs b/src/resources/stemmer.rs
index f1460872..dbae99ec 100644
--- a/src/resources/stemmer.rs
+++ b/src/resources/stemmer.rs
@@ -1,22 +1,20 @@
+use crate::errors::*;
+use snips_nlu_utils::string::{hash_str_to_i32, normalize};
 use std::collections::HashMap;
 use std::io::Read;
 use std::iter::FromIterator;
 
-use snips_nlu_utils::string::normalize;
-
-use crate::errors::*;
-
 pub trait Stemmer: Send + Sync {
     fn stem(&self, value: &str) -> String;
 }
 
 pub struct HashMapStemmer {
-    values: HashMap<String, String>,
+    values: HashMap<i32, String>,
 }
 
 impl HashMapStemmer {
     pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
-        let mut values = HashMap::<String, String>::new();
+        let mut values = HashMap::new();
         let mut csv_reader = csv::ReaderBuilder::new()
             .delimiter(b',')
             .quoting(false)
@@ -28,7 +26,7 @@ impl HashMapStemmer {
             let elements = record?;
             let stem = &elements[0];
             for value in elements.iter().skip(1) {
-                values.insert(value.to_string(), stem.to_string());
+                values.insert(hash_str_to_i32(value), stem.to_string());
             }
         }
         Ok(Self { values })
@@ -38,7 +36,10 @@ impl HashMapStemmer {
 impl FromIterator<(String, String)> for HashMapStemmer {
     fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
         Self {
-            values: HashMap::from_iter(iter),
+            values: iter
+                .into_iter()
+                .map(|(str_key, str_value)| (hash_str_to_i32(&*str_key), str_value))
+                .collect(),
         }
     }
 }
@@ -46,7 +47,7 @@ impl FromIterator<(String, String)> for HashMapStemmer {
 impl Stemmer for HashMapStemmer {
     fn stem(&self, value: &str) -> String {
         self.values
-            .get(&*normalize(value))
+            .get(&hash_str_to_i32(&*normalize(value)))
             .map(|v| v.to_string())
             .unwrap_or_else(|| value.to_string())
     }
diff --git a/src/resources/word_clusterer.rs b/src/resources/word_clusterer.rs
index e4848c69..3c7ff034 100644
--- a/src/resources/word_clusterer.rs
+++ b/src/resources/word_clusterer.rs
@@ -1,17 +1,19 @@
+use crate::errors::*;
+use itertools::Either;
+use snips_nlu_ontology::Language;
+use snips_nlu_utils::string::hash_str_to_i32;
 use std::collections::HashMap;
 use std::io::Read;
-use std::iter::FromIterator;
-
-use snips_nlu_ontology::Language;
-
-use crate::errors::*;
+use std::str::FromStr;
 
 pub trait WordClusterer: Send + Sync {
     fn get_cluster(&self, word: &str) -> Option<String>;
 }
 
 pub struct HashMapWordClusterer {
-    values: HashMap<String, String>,
+    /// This implementation supports both u16 and raw string representations for
+    /// word clusters
+    values: Either<HashMap<i32, u16>, HashMap<i32, String>>,
 }
 
 impl HashMapWordClusterer {
@@ -21,27 +23,53 @@ impl HashMapWordClusterer {
             .quoting(false)
             .has_headers(false)
             .from_reader(reader);
-        let mut values = HashMap::<String, String>::new();
+        // This flag is switched to false as soon as a record is found which cannot
+        // be converted to a u16
+        let mut u16_casting_ok = true;
+        let mut u16_values = HashMap::new();
+        let mut str_values = HashMap::new();
         for record in csv_reader.records() {
             let elements = record?;
-            values.insert(elements[0].to_string(), elements[1].to_string());
-        }
-
-        Ok(Self { values })
-    }
-}
-
-impl FromIterator<(String, String)> for HashMapWordClusterer {
-    fn from_iter<T: IntoIterator<Item = (String, String)>>(iter: T) -> Self {
-        Self {
-            values: HashMap::from_iter(iter),
+            let hashed_key = hash_str_to_i32(elements[0].as_ref());
+            // Casting into u16 is attempted only when all previous clusters were converted
+            // successfully
+            if u16_casting_ok {
+                match u16::from_str(elements[1].as_ref()) {
+                    Ok(u16_value) => {
+                        u16_values.insert(hashed_key, u16_value);
+                    }
+                    Err(_) => {
+                        // A word cluster cannot be converted into a u16, so move all the
+                        // previously stored clusters into a raw string representation
+                        for (hash, value) in u16_values.iter() {
+                            str_values.insert(*hash, format!("{}", value));
+                        }
+                        str_values.insert(hashed_key, elements[1].to_string());
+                        u16_casting_ok = false;
+                        u16_values.clear();
+                    }
+                }
+            } else {
+                str_values.insert(hashed_key, elements[1].to_string());
+            }
         }
+        Ok(Self {
+            values: if u16_casting_ok {
+                Either::Left(u16_values)
+            } else {
+                Either::Right(str_values)
+            },
+        })
     }
 }
 
 impl WordClusterer for HashMapWordClusterer {
     fn get_cluster(&self, word: &str) -> Option<String> {
-        self.values.get(word).map(|v| v.to_string())
+        let hashed_key = hash_str_to_i32(word);
+        match &self.values {
+            Either::Left(u16_values) => u16_values.get(&hashed_key).map(|v| format!("{}", v)),
+            Either::Right(str_values) => str_values.get(&hashed_key).cloned(),
+        }
     }
 }
 
@@ -56,12 +84,12 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_hashmap_word_clusterer() {
+    fn test_hashmap_word_clusterer_with_non_u16_values() {
         // Given
         let clusters: &[u8] = r#"
-hello	1111111111111
-world	1111110111111
-"yolo	1111100111111
+hello	42
+world	123
+"yolo	cluster_which_is_not_u16
 "#
         .as_ref();
 
@@ -71,18 +99,33 @@ world	1111110111111
         // Then
         assert!(clusterer.is_ok());
         let clusterer = clusterer.unwrap();
-        assert_eq!(
-            clusterer.get_cluster("hello"),
-            Some("1111111111111".to_string())
-        );
-        assert_eq!(
-            clusterer.get_cluster("world"),
-            Some("1111110111111".to_string())
-        );
-        assert_eq!(
-            clusterer.get_cluster("\"yolo"),
-            Some("1111100111111".to_string())
-        );
+        assert!(clusterer.values.is_right());
+        assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string()));
+        assert_eq!(clusterer.get_cluster("world"), Some("123".to_string()));
+        assert_eq!(clusterer.get_cluster("\"yolo"), Some("cluster_which_is_not_u16".to_string()));
+        assert_eq!(clusterer.get_cluster("unknown"), None);
+    }
+
+    #[test]
+    fn test_hashmap_word_clusterer_with_u16_values() {
+        // Given
+        let clusters: &[u8] = r#"
+hello	42
+world	123
+yolo	65500
+"#
+            .as_ref();
+
+        // When
+        let clusterer = HashMapWordClusterer::from_reader(clusters);
+
+        // Then
+        assert!(clusterer.is_ok());
+        let clusterer = clusterer.unwrap();
+        assert!(clusterer.values.is_left());
+        assert_eq!(clusterer.get_cluster("hello"), Some("42".to_string()));
+        assert_eq!(clusterer.get_cluster("world"), Some("123".to_string()));
+        assert_eq!(clusterer.get_cluster("yolo"), Some("65500".to_string()));
         assert_eq!(clusterer.get_cluster("unknown"), None);
     }
 }
diff --git a/src/slot_filler/features.rs b/src/slot_filler/features.rs
index 9cccd4d5..b7f96578 100644
--- a/src/slot_filler/features.rs
+++ b/src/slot_filler/features.rs
@@ -834,9 +834,11 @@ mod tests {
     fn test_word_cluster_feature() {
         // Given
         let language = NluUtilsLanguage::EN;
-        let word_clusterer = HashMapWordClusterer::from_iter(
-            vec![("bird".to_string(), "010101".to_string())].into_iter(),
-        );
+        let clusters: &[u8] = r#"
+bird	42
+"#
+        .as_ref();
+        let word_clusterer = HashMapWordClusterer::from_reader(clusters).unwrap();
         let tokens = tokenize("I love this bird", language);
         let feature = WordClusterFeature {
             cluster_name: "test_clusters".to_string(),
@@ -849,7 +851,7 @@ mod tests {
             .collect();
 
         // Then
-        let expected_results = vec![None, None, None, Some("010101".to_string())];
+        let expected_results = vec![None, None, None, Some("42".to_string())];
         assert_eq!(expected_results, results);
     }
 }