diff --git a/sudachi/src/config.rs b/sudachi/src/config.rs
index e5e50a14..d67f12e9 100644
--- a/sudachi/src/config.rs
+++ b/sudachi/src/config.rs
@@ -29,6 +29,7 @@ use thiserror::Error;
 
 const DEFAULT_RESOURCE_DIR: &str = "resources";
 const DEFAULT_SETTING_FILE: &str = "sudachi.json";
+const DEFAULT_SETTING_BYTES: &[u8] = include_bytes!("../../resources/sudachi.json");
 const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
 
 /// Sudachi Error
@@ -343,6 +344,13 @@ impl Config {
         Ok(raw_config.build())
     }
 
+    /// Creates a config from the default settings (`sudachi.json`) embedded at compile time
+    pub fn new_embedded() -> Result<Self, ConfigError> {
+        let raw_config = ConfigBuilder::from_bytes(DEFAULT_SETTING_BYTES)?;
+
+        Ok(raw_config.build())
+    }
+
     /// Creates a minimal config with the provided resource directory
     pub fn minimal_at(resource_dir: impl Into<PathBuf>) -> Config {
         let mut cfg = Config::default();
diff --git a/sudachi/src/dic/character_category.rs b/sudachi/src/dic/character_category.rs
index e13fb0cd..47d2bd15 100644
--- a/sudachi/src/dic/character_category.rs
+++ b/sudachi/src/dic/character_category.rs
@@ -85,6 +85,12 @@ impl CharacterCategory {
         Self::from_reader(reader)
     }
 
+    /// Creates a character category from the in-memory bytes of a character definition
+    pub fn from_bytes(bytes: &[u8]) -> SudachiResult<CharacterCategory> {
+        let reader = BufReader::new(bytes);
+        Self::from_reader(reader)
+    }
+
     pub fn from_reader<T: BufRead>(data: T) -> SudachiResult<CharacterCategory> {
         let ranges = Self::read_character_definition(data)?;
         Ok(Self::compile(&ranges))
diff --git a/sudachi/src/dic/dictionary.rs b/sudachi/src/dic/dictionary.rs
index a7b95d87..0092c21f 100644
--- a/sudachi/src/dic/dictionary.rs
+++ b/sudachi/src/dic/dictionary.rs
@@ -75,7 +75,7 @@ impl JapaneseDictionary {
         Self::from_cfg_storage(cfg, sb)
     }
 
-    /// Creats a dictionary from the specified configuration and storage
+    /// Creates a dictionary from the specified configuration and storage
     pub fn from_cfg_storage(
         cfg: &Config,
         storage: SudachiDicData,
@@ -115,6 +115,45 @@ impl JapaneseDictionary {
         Ok(dic)
     }
 
+    /// Creates a dictionary from the specified configuration and storage,
+    /// using the embedded character definition instead of a file
+    pub fn from_cfg_storage_with_embedded_chardef(
+        cfg: &Config,
+        storage: SudachiDicData,
+    ) -> SudachiResult<JapaneseDictionary> {
+        let mut basic_dict = LoadedDictionary::from_system_dictionary_embedded(unsafe {
+            storage.system_static_slice()
+        })?;
+
+        let plugins = {
+            let grammar = &mut basic_dict.grammar;
+            Plugins::load(cfg, grammar)?
+        };
+
+        if plugins.oov.is_empty() {
+            return Err(SudachiError::NoOOVPluginProvided);
+        }
+
+        for p in plugins.connect_cost.plugins() {
+            p.edit(&mut basic_dict.grammar);
+        }
+
+        let mut dic = JapaneseDictionary {
+            storage,
+            plugins,
+            _grammar: basic_dict.grammar,
+            _lexicon: basic_dict.lexicon_set,
+        };
+
+        // this Vec is needed to prevent double borrowing of dic
+        let user_dicts: Vec<_> = dic.storage.user_static_slice();
+        for udic in user_dicts {
+            dic = dic.merge_user_dictionary(udic)?;
+        }
+
+        Ok(dic)
+    }
+
     /// Returns grammar with the correct lifetime
     pub fn grammar<'a>(&'a self) -> &Grammar<'a> {
         &self._grammar
diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs
index 65430939..8de28bc4 100644
--- a/sudachi/src/dic/mod.rs
+++ b/sudachi/src/dic/mod.rs
@@ -42,6 +42,7 @@ pub mod storage;
 pub mod subset;
 pub mod word_id;
 
+const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../resources/char.def");
 const POS_DEPTH: usize = 6;
 
 /// A dictionary consists of one system_dict and zero or more user_dicts
@@ -51,14 +52,13 @@ pub struct LoadedDictionary<'a> {
 }
 
 impl<'a> LoadedDictionary<'a> {
-    /// Creates a system dictionary from bytes, and load a character category from file
-    pub fn from_system_dictionary(
+    /// Creates a system dictionary from bytes and a preloaded character category
+    pub fn from_system_dictionary_and_chardef(
         dictionary_bytes: &'a [u8],
-        character_category_file: &Path,
+        character_category: CharacterCategory,
     ) -> SudachiResult<LoadedDictionary<'a>> {
         let system_dict = DictionaryLoader::read_system_dictionary(dictionary_bytes)?;
 
-        let character_category = CharacterCategory::from_file(character_category_file)?;
         let mut grammar = system_dict
             .grammar
             .ok_or(SudachiError::InvalidDictionaryGrammar)?;
@@ -71,6 +71,23 @@ impl<'a> LoadedDictionary<'a> {
         })
     }
 
+    /// Creates a system dictionary from bytes, and loads a character category from file
+    pub fn from_system_dictionary(
+        dictionary_bytes: &'a [u8],
+        character_category_file: &Path,
+    ) -> SudachiResult<LoadedDictionary<'a>> {
+        let character_category = CharacterCategory::from_file(character_category_file)?;
+        Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
+    }
+
+    /// Creates a system dictionary from bytes, and loads the embedded default character category
+    pub fn from_system_dictionary_embedded(
+        dictionary_bytes: &'a [u8],
+    ) -> SudachiResult<LoadedDictionary<'a>> {
+        let character_category = CharacterCategory::from_bytes(DEFAULT_CHAR_DEF_BYTES)?;
+        Self::from_system_dictionary_and_chardef(dictionary_bytes, character_category)
+    }
+
     #[cfg(test)]
     pub(crate) fn merge_dictionary(
         mut self,
diff --git a/sudachi/src/plugin/input_text/default_input_text/mod.rs b/sudachi/src/plugin/input_text/default_input_text/mod.rs
index 3eba4bb7..7bd53323 100644
--- a/sudachi/src/plugin/input_text/default_input_text/mod.rs
+++ b/sudachi/src/plugin/input_text/default_input_text/mod.rs
@@ -37,6 +37,7 @@ use crate::prelude::*;
 mod tests;
 
 const DEFAULT_REWRITE_DEF_FILE: &str = "rewrite.def";
+const DEFAULT_REWRITE_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/rewrite.def");
 
 /// Provides basic normalization of the input text
 #[derive(Default)]
@@ -262,10 +263,19 @@ impl InputTextPlugin for DefaultInputTextPlugin {
             settings
                 .rewriteDef
                 .unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()),
-        )?;
+        );
 
-        let reader = BufReader::new(fs::File::open(&rewrite_file_path)?);
-        self.read_rewrite_lists(reader)?;
+        match rewrite_file_path {
+            Ok(path) => {
+                let reader = BufReader::new(fs::File::open(path)?);
+                self.read_rewrite_lists(reader)?;
+            }
+            // the path could not be resolved: the error is dropped and the embedded default is used
+            Err(_) => {
+                let reader = BufReader::new(DEFAULT_REWRITE_DEF_BYTES);
+                self.read_rewrite_lists(reader)?;
+            }
+        }
 
         Ok(())
     }
diff --git a/sudachi/src/plugin/oov/mecab_oov/mod.rs b/sudachi/src/plugin/oov/mecab_oov/mod.rs
index db0b6682..8e2f3a8d 100644
--- a/sudachi/src/plugin/oov/mecab_oov/mod.rs
+++ b/sudachi/src/plugin/oov/mecab_oov/mod.rs
@@ -39,7 +39,9 @@ use crate::prelude::*;
 mod test;
 
 const DEFAULT_CHAR_DEF_FILE: &str = "char.def";
+const DEFAULT_CHAR_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/char.def");
 const DEFAULT_UNK_DEF_FILE: &str = "unk.def";
+const DEFAULT_UNK_DEF_BYTES: &[u8] = include_bytes!("../../../../../resources/unk.def");
 
 /// provides MeCab oov nodes
 #[derive(Default)]
@@ -257,17 +259,37 @@ impl OovProviderPlugin for MeCabOovPlugin {
             settings
                 .charDef
                 .unwrap_or_else(|| PathBuf::from(DEFAULT_CHAR_DEF_FILE)),
-        )?;
-        let reader = BufReader::new(fs::File::open(&char_def_path)?);
-        let categories = MeCabOovPlugin::read_character_property(reader)?;
+        );
+
+        let categories = match char_def_path {
+            Ok(path) => {
+                let reader = BufReader::new(fs::File::open(path)?);
+                MeCabOovPlugin::read_character_property(reader)?
+            }
+            // the path could not be resolved: the error is dropped and the embedded default is used
+            Err(_) => {
+                let reader = BufReader::new(DEFAULT_CHAR_DEF_BYTES);
+                MeCabOovPlugin::read_character_property(reader)?
+            }
+        };
 
         let unk_def_path = config.complete_path(
             settings
                 .unkDef
                 .unwrap_or_else(|| PathBuf::from(DEFAULT_UNK_DEF_FILE)),
-        )?;
-        let reader = BufReader::new(fs::File::open(&unk_def_path)?);
-        let oov_list = MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?;
+        );
+
+        let oov_list = match unk_def_path {
+            Ok(path) => {
+                let reader = BufReader::new(fs::File::open(path)?);
+                MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
+            }
+            // the path could not be resolved: the error is dropped and the embedded default is used
+            Err(_) => {
+                let reader = BufReader::new(DEFAULT_UNK_DEF_BYTES);
+                MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?
+            }
+        };
 
         self.categories = categories;
         self.oov_list = oov_list;