[ while #324 ] OCaml lexer: prettier output, _names for BNFC characte…

…r classes Instead of @l@, use name @_letter@ for predefined character class @letter@ etc. The included test makes sure the new names cannot clash with user defined token names. Previously, user token type L would be translated to @l@ clashing with the predefined letter character class.
BNFC · Nov 13, 2020 · 2082e73 · 2082e73
1 parent 7179129
commit 2082e73
Show file tree

Hide file tree

Showing 2 changed files with 121 additions and 77 deletions.
diff --git a/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs b/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs
@@ -48,51 +48,56 @@ cf2ocamllex _ parserMod cf =
    ]
 
 header :: String -> CF -> [String]
-header parserMod cf = [
-  "(* This ocamllex file was machine-generated by the BNF converter *)",
-  "{",
-  "open " ++ parserMod,
-  "open Lexing",
-  "",
-  hashtables cf,
-  "",
-  "let unescapeInitTail (s:string) : string =",
-  "  let rec unesc s = match s with",
-  "      '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs",
-  "    | '\\\\'::'n'::cs  -> '\\n' :: unesc cs",
-  "    | '\\\\'::'t'::cs  -> '\\t' :: unesc cs",
-  "    | '\\\\'::'r'::cs  -> '\\r' :: unesc cs",
-  -- "    | '\\\\'::'f'::cs  -> '\\f' :: unesc cs",  -- \f not supported by ocaml
-  "    | '\\\"'::[]    -> []",
-  "    | c::cs      -> c :: unesc cs",
-  "    | _         -> []",
-  "  (* explode/implode from caml FAQ *)",
-  "  in let explode (s : string) : char list =",
-  "      let rec exp i l =",
-  "        if i < 0 then l else exp (i - 1) (s.[i] :: l) in",
-  "      exp (String.length s - 1) []",
-  "  in let implode (l : char list) : string =",
-  "      let res = Buffer.create (List.length l) in",
-  "      List.iter (Buffer.add_char res) l;",
-  "      Buffer.contents res",
-  "  in implode (unesc (List.tl (explode s)))",
-  "",
-  "let incr_lineno (lexbuf:Lexing.lexbuf) : unit =",
-  "    let pos = lexbuf.lex_curr_p in",
-  "        lexbuf.lex_curr_p <- { pos with",
-  "            pos_lnum = pos.pos_lnum + 1;",
-  "            pos_bol = pos.pos_cnum;",
-  "        }",
-  "}"
+header parserMod cf = List.intercalate [""] . filter (not . null) $ concat
+  [ [ [ "(* This ocamllex file was machine-generated by the BNF converter *)"
+      , ""
+      , "(* preamble *)"
+      , "{"
+      , "open " ++ parserMod
+      , "open Lexing"
+      ]
+    ]
+  , hashtables cf
+  , [ [ "let unescapeInitTail (s:string) : string ="
+    , "  let rec unesc s = match s with"
+      , "      '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs"
+      , "    | '\\\\'::'n'::cs  -> '\\n' :: unesc cs"
+      , "    | '\\\\'::'t'::cs  -> '\\t' :: unesc cs"
+      , "    | '\\\\'::'r'::cs  -> '\\r' :: unesc cs"
+        -- "    | '\\\\'::'f'::cs  -> '\\f' :: unesc cs",  -- \f not supported by ocaml
+      , "    | '\\\"'::[]    -> []"
+      , "    | c::cs      -> c :: unesc cs"
+      , "    | _         -> []"
+      , "  (* explode/implode from caml FAQ *)"
+      , "  in let explode (s : string) : char list ="
+      , "      let rec exp i l ="
+      , "        if i < 0 then l else exp (i - 1) (s.[i] :: l) in"
+      , "      exp (String.length s - 1) []"
+      , "  in let implode (l : char list) : string ="
+      , "      let res = Buffer.create (List.length l) in"
+      , "      List.iter (Buffer.add_char res) l;"
+      , "      Buffer.contents res"
+      , "  in implode (unesc (List.tl (explode s)))"
+      , ""
+      , "let incr_lineno (lexbuf:Lexing.lexbuf) : unit ="
+      , "    let pos = lexbuf.lex_curr_p in"
+      , "        lexbuf.lex_curr_p <- { pos with"
+      , "            pos_lnum = pos.pos_lnum + 1;"
+      , "            pos_bol = pos.pos_cnum;"
+      , "        }"
+      , "}"
+      ]
+    ]
   ]
 
--- | set up hashtables for reserved symbols and words
-hashtables :: CF -> String
-hashtables cf = unlines . concat $
+-- | Set up hashtables for reserved symbols and words.
+hashtables :: CF -> [[String]]
+hashtables cf =
   [ ht "symbol_table"  $ unicodeAndSymbols cf
   , ht "resword_table" $ asciiKeywords cf
   ]
   where
+  ht :: String -> [String] -> [String]
   ht table syms = unless (null syms) $
     [ unwords [ "let", table, "= Hashtbl.create", show (length syms)                  ]
     , unwords [ "let _ = List.iter (fun (kwd, tok) -> Hashtbl.add", table, "kwd tok)" ]
@@ -110,27 +115,30 @@ definitions cf = concat $
   ]
 
 cMacros :: [String]
-cMacros = [
-  "let l = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247']    (*  isolatin1 letter FIXME *)",
-  "let c = ['A'-'Z' '\\192'-'\\221'] # ['\\215']    (*  capital isolatin1 letter FIXME *)",
-  "let s = ['a'-'z' '\\222'-'\\255'] # ['\\247']    (*  small isolatin1 letter FIXME *)",
-  "let d = ['0'-'9']                             (*  digit *)",
-  "let i = l | d | ['_' '\\'']                    (*  identifier character *)",
-  "let u = _                                     (* universal: any character *)"
+cMacros =
+  [ "(* BNFC character classes *)"
+  , "let _letter = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247']    (*  isolatin1 letter FIXME *)"
+  , "let _upper  = ['A'-'Z' '\\192'-'\\221'] # '\\215'      (*  capital isolatin1 letter FIXME *)"
+  , "let _lower  = ['a'-'z' '\\222'-'\\255'] # '\\247'      (*  small isolatin1 letter FIXME *)"
+  , "let _digit  = ['0'-'9']                             (*  _digit *)"
+  , "let _idchar = _letter | _digit | ['_' '\\'']         (*  identifier character *)"
+  , "let _universal = _                                  (* universal: any character *)"
+  , ""
   ]
 
 rMacros :: CF -> [String]
 rMacros cf
   | null symbs = []
   | otherwise  =
-      [ "let rsyms =    (* reserved words consisting of special symbols *)"
-      , "            " ++ unwords (List.intersperse "|" (map mkEsc symbs))
+      [ "(* reserved words consisting of special symbols *)"
+      , unwords $ "let rsyms =" : List.intersperse "|" (map mkEsc symbs)
       ]
   where symbs = unicodeAndSymbols cf
 
 -- user macros, derived from the user-defined tokens
 uMacros :: CF -> [String]
-uMacros cf = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf]
+uMacros cf = if null res then [] else "(* user-defined token types *)" : res
+  where res = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf]
 
 -- | Returns the tuple of @(reg_name, reg_representation, token_name, is_position_token)@.
 
@@ -142,23 +150,27 @@ userTokens cf =
   ]
 
 -- | Make OCamlLex rule
--- >>> mkRule "token" [("REGEX1","ACTION1"),("REGEX2","ACTION2"),("...","...")]
+-- >>> mkRule "token" [("REGEX1","ACTION1"),("REGULAREXPRESSION2","ACTION2"),("...","...")]
+-- (* lexing rules *)
 -- rule token =
---   parse REGEX1 {ACTION1}
---       | REGEX2 {ACTION2}
---       | ... {...}
+--   parse REGEX1  { ACTION1 }
+--       | REGULAREXPRESSION2
+--                 { ACTION2 }
+--       | ...     { ... }
 --
 -- If no regex are given, we dont create a lexer rule:
 -- >>> mkRule "empty" []
 -- <BLANKLINE>
 mkRule :: Doc -> [(Doc,Doc)] -> Doc
 mkRule _ [] = empty
-mkRule entrypoint (r1:rn) = vcat
-    [ "rule" <+> entrypoint <+> "="
-    , nest 2 $ hang "parse" 4 $ vcat
-        (nest 2 (mkOne r1):map (("|" <+>) . mkOne) rn) ]
+mkRule entrypoint (r:rs) = vcat
+    [ "(* lexing rules *)"
+    , "rule" <+> entrypoint <+> "="
+    , nest 2 $ hang "parse" 4 $ vcat $
+        nest 2 (mkOne r) : map (("|" <+>) . mkOne) rs
+    ]
   where
-    mkOne (regex, action) = regex <+> braces action
+    mkOne (regex, action) = regex $$ nest 8 (hsep ["{", action, "}"])
 
 -- | Create regex for single line comments
 -- >>> mkRegexSingleLineComment "--"
@@ -187,26 +199,26 @@ rules cf = mkRule "token" $
     ++
     -- reserved keywords
     [ ( "rsyms"
-      , "let id = lexeme lexbuf in try Hashtbl.find symbol_table id with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ id ^ \" not found in hashtable\")" )
+      , "let x = lexeme lexbuf in try Hashtbl.find symbol_table x with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ x ^ \" not found in hashtable\")" )
       | not (null (cfgSymbols cf))]
     ++
     -- user tokens
     [ (text n , tokenAction pos (text t)) | (n,_,t,pos) <- userTokens cf]
     ++
     -- predefined tokens
-    [ ( "l i*", tokenAction False "Ident" ) ]
+    [ ( "_letter _idchar*", tokenAction False "Ident" ) ]
     ++
     -- integers
-    [ ( "d+", "let i = lexeme lexbuf in TOK_Integer (int_of_string i)" )
+    [ ( "_digit+", "TOK_Integer (int_of_string (lexeme lexbuf))" )
     -- doubles
-    , ( "d+ '.' d+ ('e' ('-')? d+)?"
-      , "let f = lexeme lexbuf in TOK_Double (float_of_string f)" )
+    , ( "_digit+ '.' _digit+ ('e' ('-')? _digit+)?"
+      , "TOK_Double (float_of_string (lexeme lexbuf))" )
     -- strings
-    , ( "'\\\"' ((u # ['\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'"
-      , "let s = lexeme lexbuf in TOK_String (unescapeInitTail s)" )
+    , ( "'\\\"' (([^ '\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'"
+      , "TOK_String (unescapeInitTail (lexeme lexbuf))" )
     -- chars
-    , ( "'\\'' ((u # ['\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''"
-      , "let s = lexeme lexbuf in TOK_Char s.[1]")
+    , ( "'\\'' (([^ '\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''"
+      , "TOK_Char (lexeme lexbuf).[1]")
     -- spaces
     , ( "[' ' '\\t']", "token lexbuf")
     -- new lines
@@ -217,11 +229,11 @@ rules cf = mkRule "token" $
   where
     (multilineC, singleLineC) = comments cf
     tokenAction pos t = case asciiKeywords cf of
-        [] -> "let l = lexeme lexbuf in TOK_" <> t <+> arg
-        _  -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> arg
+        [] -> "TOK_" <> t <+> arg "(lexeme lexbuf)"
+        _  -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> arg "l"
       where
-      arg | pos       = "((lexeme_start lexbuf, lexeme_end lexbuf), l)"
-          | otherwise = "l"
+      arg l | pos       = text $ "((lexeme_start lexbuf, lexeme_end lexbuf), " ++ l ++ ")"
+            | otherwise = text l
 
 -------------------------------------------------------------------
 -- Modified from the inlined version of former @RegToAlex@.
@@ -294,8 +306,8 @@ instance Print Reg where
    RAlts str       -> prPrec i 1 $ List.intersperse "|" $ map charLiteral str
    -- RAlts str       -> concat [ ["["], map charLiteral str, ["]"] ]
    RSeqs str       -> [ show str ]
-   RDigit          -> ["d"]
-   RLetter         -> ["l"]
-   RUpper          -> ["c"]
-   RLower          -> ["s"]
-   RAny            -> ["u"]
+   RDigit          -> ["_digit"]
+   RLetter         -> ["_letter"]
+   RUpper          -> ["_upper"]
+   RLower          -> ["_lower"]
+   RAny            -> ["_universal"]
diff --git a/testing/regression-tests/256_Regex/test.cf b/testing/regression-tests/256_Regex/test.cf
@@ -1,6 +1,6 @@
 -- Issue #110: backslash in keyword
 
-Start. S ::= Name "/\\" Name "and" Triple;
+Start. Main ::= Name "/\\" Name "and" Triple;
 
 -- Issue #319: quotation rule in ANTLR lexer definition
 
@@ -10,3 +10,35 @@ token Triple upper letter ["]-\'"] '\'';
 -- Issue #276: whitespace characters in character class
 
 token Name (char - [ "(){};.@\"" ] - [ " \n\t" ]) + ;
+
+-- Re issue #322: Try to provoke clashes with BNFC character classes
+-- in generated ocamllex definition.
+
+token Letter  {"letter1"};
+token L       {"letter2"};
+token Alpha   {"letter3"};
+token A       {"letter4"};
+
+token Upper   {"upper1"};
+token U       {"upper2"};
+token Captial {"upper3"};
+token C       {"upper4"};
+
+token Lower   {"lower1"};
+-- token L       {"lower2"};
+token Small   {"lower3"};
+token S       {"lower4"};
+
+token Digit   {"digit1"};
+token D       {"digit2"};
+token Num     {"digit3"};
+token Number  {"digit4"};
+token Numberic{"digit5"};
+token N       {"digit6"};
+
+token Int     {"int1"};
+token I       {"int2"};
+
+token Universal {"universal1"};
+token Uni       {"universal2"};
+-- token U         {"universal3"};