Skip to content

Commit

Permalink
[ while #324 ] OCaml lexer: prettier output, _names for BNFC characte…
Browse files Browse the repository at this point in the history
…r classes

Instead of @l@, use name @_letter@ for predefined character class
@letter@ etc.

The included test makes sure the new names cannot clash with user
defined token names.  Previously, user token type L would be
translated to @l@ clashing with the predefined letter character
class.
  • Loading branch information
andreasabel committed Nov 13, 2020
1 parent 7179129 commit 2082e73
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 77 deletions.
164 changes: 88 additions & 76 deletions source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs
Original file line number Diff line number Diff line change
Expand Up @@ -48,51 +48,56 @@ cf2ocamllex _ parserMod cf =
]

header :: String -> CF -> [String]
header parserMod cf = [
"(* This ocamllex file was machine-generated by the BNF converter *)",
"{",
"open " ++ parserMod,
"open Lexing",
"",
hashtables cf,
"",
"let unescapeInitTail (s:string) : string =",
" let rec unesc s = match s with",
" '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs",
" | '\\\\'::'n'::cs -> '\\n' :: unesc cs",
" | '\\\\'::'t'::cs -> '\\t' :: unesc cs",
" | '\\\\'::'r'::cs -> '\\r' :: unesc cs",
-- " | '\\\\'::'f'::cs -> '\\f' :: unesc cs", -- \f not supported by ocaml
" | '\\\"'::[] -> []",
" | c::cs -> c :: unesc cs",
" | _ -> []",
" (* explode/implode from caml FAQ *)",
" in let explode (s : string) : char list =",
" let rec exp i l =",
" if i < 0 then l else exp (i - 1) (s.[i] :: l) in",
" exp (String.length s - 1) []",
" in let implode (l : char list) : string =",
" let res = Buffer.create (List.length l) in",
" List.iter (Buffer.add_char res) l;",
" Buffer.contents res",
" in implode (unesc (List.tl (explode s)))",
"",
"let incr_lineno (lexbuf:Lexing.lexbuf) : unit =",
" let pos = lexbuf.lex_curr_p in",
" lexbuf.lex_curr_p <- { pos with",
" pos_lnum = pos.pos_lnum + 1;",
" pos_bol = pos.pos_cnum;",
" }",
"}"
header parserMod cf = List.intercalate [""] . filter (not . null) $ concat
[ [ [ "(* This ocamllex file was machine-generated by the BNF converter *)"
, ""
, "(* preamble *)"
, "{"
, "open " ++ parserMod
, "open Lexing"
]
]
, hashtables cf
, [ [ "let unescapeInitTail (s:string) : string ="
, " let rec unesc s = match s with"
, " '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs"
, " | '\\\\'::'n'::cs -> '\\n' :: unesc cs"
, " | '\\\\'::'t'::cs -> '\\t' :: unesc cs"
, " | '\\\\'::'r'::cs -> '\\r' :: unesc cs"
-- " | '\\\\'::'f'::cs -> '\\f' :: unesc cs", -- \f not supported by ocaml
, " | '\\\"'::[] -> []"
, " | c::cs -> c :: unesc cs"
, " | _ -> []"
, " (* explode/implode from caml FAQ *)"
, " in let explode (s : string) : char list ="
, " let rec exp i l ="
, " if i < 0 then l else exp (i - 1) (s.[i] :: l) in"
, " exp (String.length s - 1) []"
, " in let implode (l : char list) : string ="
, " let res = Buffer.create (List.length l) in"
, " List.iter (Buffer.add_char res) l;"
, " Buffer.contents res"
, " in implode (unesc (List.tl (explode s)))"
, ""
, "let incr_lineno (lexbuf:Lexing.lexbuf) : unit ="
, " let pos = lexbuf.lex_curr_p in"
, " lexbuf.lex_curr_p <- { pos with"
, " pos_lnum = pos.pos_lnum + 1;"
, " pos_bol = pos.pos_cnum;"
, " }"
, "}"
]
]
]

-- | set up hashtables for reserved symbols and words
hashtables :: CF -> String
hashtables cf = unlines . concat $
-- | Set up hashtables for reserved symbols and words.
hashtables :: CF -> [[String]]
hashtables cf =
[ ht "symbol_table" $ unicodeAndSymbols cf
, ht "resword_table" $ asciiKeywords cf
]
where
ht :: String -> [String] -> [String]
ht table syms = unless (null syms) $
[ unwords [ "let", table, "= Hashtbl.create", show (length syms) ]
, unwords [ "let _ = List.iter (fun (kwd, tok) -> Hashtbl.add", table, "kwd tok)" ]
Expand All @@ -110,27 +115,30 @@ definitions cf = concat $
]

cMacros :: [String]
cMacros = [
"let l = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247'] (* isolatin1 letter FIXME *)",
"let c = ['A'-'Z' '\\192'-'\\221'] # ['\\215'] (* capital isolatin1 letter FIXME *)",
"let s = ['a'-'z' '\\222'-'\\255'] # ['\\247'] (* small isolatin1 letter FIXME *)",
"let d = ['0'-'9'] (* digit *)",
"let i = l | d | ['_' '\\''] (* identifier character *)",
"let u = _ (* universal: any character *)"
cMacros =
[ "(* BNFC character classes *)"
, "let _letter = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247'] (* isolatin1 letter FIXME *)"
, "let _upper = ['A'-'Z' '\\192'-'\\221'] # '\\215' (* capital isolatin1 letter FIXME *)"
, "let _lower = ['a'-'z' '\\222'-'\\255'] # '\\247' (* small isolatin1 letter FIXME *)"
, "let _digit = ['0'-'9'] (* _digit *)"
, "let _idchar = _letter | _digit | ['_' '\\''] (* identifier character *)"
, "let _universal = _ (* universal: any character *)"
, ""
]

rMacros :: CF -> [String]
rMacros cf
| null symbs = []
| otherwise =
[ "let rsyms = (* reserved words consisting of special symbols *)"
, " " ++ unwords (List.intersperse "|" (map mkEsc symbs))
[ "(* reserved words consisting of special symbols *)"
, unwords $ "let rsyms =" : List.intersperse "|" (map mkEsc symbs)
]
where symbs = unicodeAndSymbols cf

-- user macros, derived from the user-defined tokens
uMacros :: CF -> [String]
uMacros cf = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf]
uMacros cf = if null res then [] else "(* user-defined token types *)" : res
where res = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf]

-- | Returns the tuple of @(reg_name, reg_representation, token_name, is_position_token)@.

Expand All @@ -142,23 +150,27 @@ userTokens cf =
]

-- | Make OCamlLex rule
-- >>> mkRule "token" [("REGEX1","ACTION1"),("REGEX2","ACTION2"),("...","...")]
-- >>> mkRule "token" [("REGEX1","ACTION1"),("REGULAREXPRESSION2","ACTION2"),("...","...")]
-- (* lexing rules *)
-- rule token =
-- parse REGEX1 {ACTION1}
-- | REGEX2 {ACTION2}
-- | ... {...}
-- parse REGEX1 { ACTION1 }
-- | REGULAREXPRESSION2
-- { ACTION2 }
-- | ... { ... }
--
-- If no regex are given, we dont create a lexer rule:
-- >>> mkRule "empty" []
-- <BLANKLINE>
mkRule :: Doc -> [(Doc,Doc)] -> Doc
mkRule _ [] = empty
mkRule entrypoint (r1:rn) = vcat
[ "rule" <+> entrypoint <+> "="
, nest 2 $ hang "parse" 4 $ vcat
(nest 2 (mkOne r1):map (("|" <+>) . mkOne) rn) ]
mkRule entrypoint (r:rs) = vcat
[ "(* lexing rules *)"
, "rule" <+> entrypoint <+> "="
, nest 2 $ hang "parse" 4 $ vcat $
nest 2 (mkOne r) : map (("|" <+>) . mkOne) rs
]
where
mkOne (regex, action) = regex <+> braces action
mkOne (regex, action) = regex $$ nest 8 (hsep ["{", action, "}"])

-- | Create regex for single line comments
-- >>> mkRegexSingleLineComment "--"
Expand Down Expand Up @@ -187,26 +199,26 @@ rules cf = mkRule "token" $
++
-- reserved keywords
[ ( "rsyms"
, "let id = lexeme lexbuf in try Hashtbl.find symbol_table id with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ id ^ \" not found in hashtable\")" )
, "let x = lexeme lexbuf in try Hashtbl.find symbol_table x with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ x ^ \" not found in hashtable\")" )
| not (null (cfgSymbols cf))]
++
-- user tokens
[ (text n , tokenAction pos (text t)) | (n,_,t,pos) <- userTokens cf]
++
-- predefined tokens
[ ( "l i*", tokenAction False "Ident" ) ]
[ ( "_letter _idchar*", tokenAction False "Ident" ) ]
++
-- integers
[ ( "d+", "let i = lexeme lexbuf in TOK_Integer (int_of_string i)" )
[ ( "_digit+", "TOK_Integer (int_of_string (lexeme lexbuf))" )
-- doubles
, ( "d+ '.' d+ ('e' ('-')? d+)?"
, "let f = lexeme lexbuf in TOK_Double (float_of_string f)" )
, ( "_digit+ '.' _digit+ ('e' ('-')? _digit+)?"
, "TOK_Double (float_of_string (lexeme lexbuf))" )
-- strings
, ( "'\\\"' ((u # ['\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'"
, "let s = lexeme lexbuf in TOK_String (unescapeInitTail s)" )
, ( "'\\\"' (([^ '\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'"
, "TOK_String (unescapeInitTail (lexeme lexbuf))" )
-- chars
, ( "'\\'' ((u # ['\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''"
, "let s = lexeme lexbuf in TOK_Char s.[1]")
, ( "'\\'' (([^ '\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''"
, "TOK_Char (lexeme lexbuf).[1]")
-- spaces
, ( "[' ' '\\t']", "token lexbuf")
-- new lines
Expand All @@ -217,11 +229,11 @@ rules cf = mkRule "token" $
where
(multilineC, singleLineC) = comments cf
tokenAction pos t = case asciiKeywords cf of
[] -> "let l = lexeme lexbuf in TOK_" <> t <+> arg
_ -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> arg
[] -> "TOK_" <> t <+> arg "(lexeme lexbuf)"
_ -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> arg "l"
where
arg | pos = "((lexeme_start lexbuf, lexeme_end lexbuf), l)"
| otherwise = "l"
arg l | pos = text $ "((lexeme_start lexbuf, lexeme_end lexbuf), " ++ l ++ ")"
| otherwise = text l

-------------------------------------------------------------------
-- Modified from the inlined version of former @RegToAlex@.
Expand Down Expand Up @@ -294,8 +306,8 @@ instance Print Reg where
RAlts str -> prPrec i 1 $ List.intersperse "|" $ map charLiteral str
-- RAlts str -> concat [ ["["], map charLiteral str, ["]"] ]
RSeqs str -> [ show str ]
RDigit -> ["d"]
RLetter -> ["l"]
RUpper -> ["c"]
RLower -> ["s"]
RAny -> ["u"]
RDigit -> ["_digit"]
RLetter -> ["_letter"]
RUpper -> ["_upper"]
RLower -> ["_lower"]
RAny -> ["_universal"]
34 changes: 33 additions & 1 deletion testing/regression-tests/256_Regex/test.cf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
-- Issue #110: backslash in keyword

Start. S ::= Name "/\\" Name "and" Triple;
Start. Main ::= Name "/\\" Name "and" Triple;

-- Issue #319: quotation rule in ANTLR lexer definition

Expand All @@ -10,3 +10,35 @@ token Triple upper letter ["]-\'"] '\'';
-- Issue #276: whitespace characters in character class

token Name (char - [ "(){};.@\"" ] - [ " \n\t" ]) + ;

-- Re issue #322: Try to provoke clashes with BNFC character classes
-- in generated ocamllex definition.

token Letter {"letter1"};
token L {"letter2"};
token Alpha {"letter3"};
token A {"letter4"};

token Upper {"upper1"};
token U {"upper2"};
token Captial {"upper3"};
token C {"upper4"};

token Lower {"lower1"};
-- token L {"lower2"};
token Small {"lower3"};
token S {"lower4"};

token Digit {"digit1"};
token D {"digit2"};
token Num {"digit3"};
token Number {"digit4"};
token Numberic{"digit5"};
token N {"digit6"};

token Int {"int1"};
token I {"int2"};

token Universal {"universal1"};
token Uni {"universal2"};
-- token U {"universal3"};

0 comments on commit 2082e73

Please sign in to comment.