Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/parser profile #27

Merged
merged 2 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 37 additions & 38 deletions src/parser/structs/intermediate_token.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
use itertools::Itertools;
use regex::RegexSet;

use crate::parser::utils::LITERAL_IDENTIFIER;
use crate::parser::utils::PATTERN_SET;

#[derive(PartialEq, Debug)]
pub enum IntermediateToken<'a> {
Expand Down Expand Up @@ -103,44 +100,46 @@ impl<'a> IntermediateToken<'a> {
.concat()
}

pub fn longest_token_len() -> usize {
Self::all_token_patterns()
.iter()
.max_by(|a, b| a.chars().count().cmp(&b.chars().count()))
.expect("No patterns defined in the library")
.chars()
.count()
}
// Every token pattern the parser recognizes, ordered from the longest
// pattern to the shortest. The order matters: the matcher takes the
// first pattern that matches (see the `.next()` call on the match
// indices in `try_from`), so longer patterns must come before any
// shorter pattern they could shadow.
// NOTE(review): the "longest first" invariant is checked by the
// `test_ordered_patterns` unit test — when adding a pattern, keep the
// ordering and bump the element count (26) accordingly.
// NOTE(review): `PATTERN_SET` in `parser::utils::regex` is built from
// this array, so match indices returned by the set index directly
// into it — the two must never diverge.
pub const ALL_TOKEN_PATTERNS_FROM_LONGEST: [&'static str; 26] = [
    Self::FALSE_PATTERN_WORD,
    Self::TRUE_PATTERN_WORD,
    Self::AND_PATTERN_WORD,
    Self::NOT_PATTERN_WORD,
    Self::AND_PATTERN_LOGIC,
    Self::OR_PATTERN_LOGIC,
    Self::OR_PATTERN_WORD,
    Self::AND_PATTERN_BIT,
    Self::AND_PATTERN_MATH,
    Self::AND_PATTERN_MATH_2,
    Self::AND_PATTERN_BOOL,
    Self::OR_PATTERN_BIT,
    Self::OR_PATTERN_MATH,
    Self::OR_PATTERN_MATH_2,
    Self::OR_PATTERN_BOOL,
    Self::NOT_PATTERN_TILDE,
    Self::NOT_PATTERN_MARK,
    Self::NOT_PATTERN_MATH,
    Self::FALSE_PATTERN_CHAR,
    Self::FALSE_PATTERN_NUM,
    Self::TRUE_PATTERN_CHAR,
    Self::TRUE_PATTERN_NUM,
    Self::LITERAL_START_PATTERN,
    Self::LITERAL_END_PATTERN,
    Self::PARENTHESIS_START_PATTERN,
    Self::PARENTHESIS_END_PATTERN,
];

fn all_token_patterns_ordered_from_longest() -> Vec<&'a str> {
Self::all_token_patterns()
.into_iter()
.sorted_by(|a, b| b.chars().count().cmp(&a.chars().count()))
.collect()
}
// Character length of the longest token pattern
// (FALSE_PATTERN_WORD == "false", 5 chars). Hard-coded so callers
// (e.g. `tokenize_level`, which uses it to size its lookahead window)
// need no runtime scan over the pattern list; the `test_longest`
// unit test guards this value against drift if patterns change.
pub const LONGEST_TOKEN_LEN: usize = 5;

// TODO make a trait method
pub fn try_from(value: &'a str) -> Option<IntermediateToken> {
let input = Self::all_token_patterns_ordered_from_longest();

// escape the pattern so that e.g. "^" is not treated as regex, but as a literal character for the And operation
let set = RegexSet::new(input.iter().map(|pattern| {
format!(
r"(?i)^{}{}",
regex::escape(pattern),
if LITERAL_IDENTIFIER.is_match(pattern) {
"([^-_a-zA-Z0-9]|$)"
} else {
""
}
)
}))
.unwrap();

let pattern_or_no_match = set
let patterns = Self::ALL_TOKEN_PATTERNS_FROM_LONGEST;

let pattern_or_no_match = PATTERN_SET
.matches(value)
.into_iter()
.map(|index| &input[index])
.map(|index| &patterns[index])
.next();

pattern_or_no_match.map(|value| Self::from(value))
Expand Down Expand Up @@ -196,15 +195,15 @@ mod tests {

#[test]
fn test_longest() {
let actual = IntermediateToken::longest_token_len();
let actual = IntermediateToken::LONGEST_TOKEN_LEN;
let expected = IntermediateToken::FALSE_PATTERN_WORD.len();

assert_eq!(actual, expected);
}

#[test]
fn test_ordered_patterns() {
let tokens = IntermediateToken::all_token_patterns_ordered_from_longest();
let tokens = IntermediateToken::ALL_TOKEN_PATTERNS_FROM_LONGEST;

assert!(tokens
.iter()
Expand Down
6 changes: 3 additions & 3 deletions src/parser/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn tokenize_level(
) -> Result<Vec<FinalToken>, TokenizeError> {
let mut result = vec![];
let mut buffer = String::new();
let take_size = IntermediateToken::longest_token_len() + 1;
let take_size = IntermediateToken::LONGEST_TOKEN_LEN + 1;

// trim whitespace in case of whitespace after opening parenthesis
trim_whitespace_left(input);
Expand Down Expand Up @@ -183,7 +183,7 @@ mod tests {
};
use crate::parser::error::EOL_VICINITY;
use crate::parser::structs::FinalToken::*;
use crate::parser::utils::LITERAL_IDENTIFIER;
use regex::Regex;

use super::*;

Expand Down Expand Up @@ -233,7 +233,7 @@ mod tests {

// test sanity
assert!(!all_tokens().contains(input));
assert!(!LITERAL_IDENTIFIER.is_match(input));
assert!(!Regex::new(r"[-_a-zA-Z0-9]+").unwrap().is_match(input));

let actual = tokenize(input);
let expected_err = UnknownSymbolError {
Expand Down
2 changes: 1 addition & 1 deletion src/parser/utils/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
pub use peek_until::peek_until_n;
pub use pop::pop_n_left;
pub use regex::{LITERAL_IDENTIFIER, SHOULD_END_LITERAL};
pub use regex::{PATTERN_SET, SHOULD_END_LITERAL};
pub use trim_whitespace::trim_whitespace_left;

mod peek_until;
Expand Down
21 changes: 19 additions & 2 deletions src/parser/utils/regex.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
use regex::Regex;
use crate::parser::structs::IntermediateToken;
use regex::{Regex, RegexSet};

lazy_static::lazy_static! {
pub static ref SHOULD_END_LITERAL: Regex = Regex::new(r"[^-_a-zA-Z0-9]").unwrap();
pub static ref LITERAL_IDENTIFIER: Regex = Regex::new(r"[-_a-zA-Z0-9]+").unwrap();
static ref LITERAL_IDENTIFIER: Regex = Regex::new(r"[-_a-zA-Z0-9]+").unwrap();

pub static ref PATTERN_SET: RegexSet = RegexSet::new(IntermediateToken::ALL_TOKEN_PATTERNS_FROM_LONGEST
.iter()
.map(|pattern| {
format!(
r"(?i)^{}{}",
// escape the pattern so that e.g. "^" is not treated as regex, but as a literal character for the And operation
regex::escape(pattern),
if LITERAL_IDENTIFIER.is_match(pattern) {
"([^-_a-zA-Z0-9]|$)"
} else {
""
}
)
}))
.unwrap();
}