diff --git a/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs b/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs index b489a4e5..10c4f867 100644 --- a/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs +++ b/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs @@ -48,51 +48,56 @@ cf2ocamllex _ parserMod cf = ] header :: String -> CF -> [String] -header parserMod cf = [ - "(* This ocamllex file was machine-generated by the BNF converter *)", - "{", - "open " ++ parserMod, - "open Lexing", - "", - hashtables cf, - "", - "let unescapeInitTail (s:string) : string =", - " let rec unesc s = match s with", - " '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs", - " | '\\\\'::'n'::cs -> '\\n' :: unesc cs", - " | '\\\\'::'t'::cs -> '\\t' :: unesc cs", - " | '\\\\'::'r'::cs -> '\\r' :: unesc cs", - -- " | '\\\\'::'f'::cs -> '\\f' :: unesc cs", -- \f not supported by ocaml - " | '\\\"'::[] -> []", - " | c::cs -> c :: unesc cs", - " | _ -> []", - " (* explode/implode from caml FAQ *)", - " in let explode (s : string) : char list =", - " let rec exp i l =", - " if i < 0 then l else exp (i - 1) (s.[i] :: l) in", - " exp (String.length s - 1) []", - " in let implode (l : char list) : string =", - " let res = Buffer.create (List.length l) in", - " List.iter (Buffer.add_char res) l;", - " Buffer.contents res", - " in implode (unesc (List.tl (explode s)))", - "", - "let incr_lineno (lexbuf:Lexing.lexbuf) : unit =", - " let pos = lexbuf.lex_curr_p in", - " lexbuf.lex_curr_p <- { pos with", - " pos_lnum = pos.pos_lnum + 1;", - " pos_bol = pos.pos_cnum;", - " }", - "}" +header parserMod cf = List.intercalate [""] . filter (not . null) $ concat + [ [ [ "(* This ocamllex file was machine-generated by the BNF converter *)" + , "" + , "(* preamble *)" + , "{" + , "open " ++ parserMod + , "open Lexing" + ] + ] + , hashtables cf + , [ [ "let unescapeInitTail (s:string) : string =" + , " let rec unesc s = match s with" + , " '\\\\'::c::cs when List.mem c ['\\\"'; '\\\\'; '\\\''] -> c :: unesc cs" + , " | '\\\\'::'n'::cs -> '\\n' :: unesc cs" + , " | '\\\\'::'t'::cs -> '\\t' :: unesc cs" + , " | '\\\\'::'r'::cs -> '\\r' :: unesc cs" + -- " | '\\\\'::'f'::cs -> '\\f' :: unesc cs", -- \f not supported by ocaml + , " | '\\\"'::[] -> []" + , " | c::cs -> c :: unesc cs" + , " | _ -> []" + , " (* explode/implode from caml FAQ *)" + , " in let explode (s : string) : char list =" + , " let rec exp i l =" + , " if i < 0 then l else exp (i - 1) (s.[i] :: l) in" + , " exp (String.length s - 1) []" + , " in let implode (l : char list) : string =" + , " let res = Buffer.create (List.length l) in" + , " List.iter (Buffer.add_char res) l;" + , " Buffer.contents res" + , " in implode (unesc (List.tl (explode s)))" + , "" + , "let incr_lineno (lexbuf:Lexing.lexbuf) : unit =" + , " let pos = lexbuf.lex_curr_p in" + , " lexbuf.lex_curr_p <- { pos with" + , " pos_lnum = pos.pos_lnum + 1;" + , " pos_bol = pos.pos_cnum;" + , " }" + , "}" + ] + ] ] --- | set up hashtables for reserved symbols and words -hashtables :: CF -> String -hashtables cf = unlines . concat $ +-- | Set up hashtables for reserved symbols and words. +hashtables :: CF -> [[String]] +hashtables cf = [ ht "symbol_table" $ unicodeAndSymbols cf , ht "resword_table" $ asciiKeywords cf ] where + ht :: String -> [String] -> [String] ht table syms = unless (null syms) $ [ unwords [ "let", table, "= Hashtbl.create", show (length syms) ] , unwords [ "let _ = List.iter (fun (kwd, tok) -> Hashtbl.add", table, "kwd tok)" ] @@ -110,27 +115,30 @@ definitions cf = concat $ ] cMacros :: [String] -cMacros = [ - "let l = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247'] (* isolatin1 letter FIXME *)", - "let c = ['A'-'Z' '\\192'-'\\221'] # ['\\215'] (* capital isolatin1 letter FIXME *)", - "let s = ['a'-'z' '\\222'-'\\255'] # ['\\247'] (* small isolatin1 letter FIXME *)", - "let d = ['0'-'9'] (* digit *)", - "let i = l | d | ['_' '\\''] (* identifier character *)", - "let u = _ (* universal: any character *)" +cMacros = + [ "(* BNFC character classes *)" + , "let _letter = ['a'-'z' 'A'-'Z' '\\192' - '\\255'] # ['\\215' '\\247'] (* isolatin1 letter FIXME *)" + , "let _upper = ['A'-'Z' '\\192'-'\\221'] # '\\215' (* capital isolatin1 letter FIXME *)" + , "let _lower = ['a'-'z' '\\222'-'\\255'] # '\\247' (* small isolatin1 letter FIXME *)" + , "let _digit = ['0'-'9'] (* _digit *)" + , "let _idchar = _letter | _digit | ['_' '\\''] (* identifier character *)" + , "let _universal = _ (* universal: any character *)" + , "" ] rMacros :: CF -> [String] rMacros cf | null symbs = [] | otherwise = - [ "let rsyms = (* reserved words consisting of special symbols *)" - , " " ++ unwords (List.intersperse "|" (map mkEsc symbs)) + [ "(* reserved words consisting of special symbols *)" + , unwords $ "let rsyms =" : List.intersperse "|" (map mkEsc symbs) ] where symbs = unicodeAndSymbols cf -- user macros, derived from the user-defined tokens uMacros :: CF -> [String] -uMacros cf = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf] +uMacros cf = if null res then [] else "(* user-defined token types *)" : res + where res = ["let " ++ name ++ " = " ++ rep | (name, rep, _, _) <- userTokens cf] -- | Returns the tuple of @(reg_name, reg_representation, token_name, is_position_token)@. @@ -142,23 +150,27 @@ userTokens cf = ] -- | Make OCamlLex rule --- >>> mkRule "token" [("REGEX1","ACTION1"),("REGEX2","ACTION2"),("...","...")] +-- >>> mkRule "token" [("REGEX1","ACTION1"),("REGULAREXPRESSION2","ACTION2"),("...","...")] +-- (* lexing rules *) -- rule token = --- parse REGEX1 {ACTION1} --- | REGEX2 {ACTION2} --- | ... {...} +-- parse REGEX1 { ACTION1 } +-- | REGULAREXPRESSION2 +-- { ACTION2 } +-- | ... { ... } -- -- If no regex are given, we dont create a lexer rule: -- >>> mkRule "empty" [] -- mkRule :: Doc -> [(Doc,Doc)] -> Doc mkRule _ [] = empty -mkRule entrypoint (r1:rn) = vcat - [ "rule" <+> entrypoint <+> "=" - , nest 2 $ hang "parse" 4 $ vcat - (nest 2 (mkOne r1):map (("|" <+>) . mkOne) rn) ] +mkRule entrypoint (r:rs) = vcat + [ "(* lexing rules *)" + , "rule" <+> entrypoint <+> "=" + , nest 2 $ hang "parse" 4 $ vcat $ + nest 2 (mkOne r) : map (("|" <+>) . mkOne) rs + ] where - mkOne (regex, action) = regex <+> braces action + mkOne (regex, action) = regex $$ nest 8 (hsep ["{", action, "}"]) -- | Create regex for single line comments -- >>> mkRegexSingleLineComment "--" @@ -187,26 +199,26 @@ rules cf = mkRule "token" $ ++ -- reserved keywords [ ( "rsyms" - , "let id = lexeme lexbuf in try Hashtbl.find symbol_table id with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ id ^ \" not found in hashtable\")" ) + , "let x = lexeme lexbuf in try Hashtbl.find symbol_table x with Not_found -> failwith (\"internal lexer error: reserved symbol \" ^ x ^ \" not found in hashtable\")" ) | not (null (cfgSymbols cf))] ++ -- user tokens [ (text n , tokenAction pos (text t)) | (n,_,t,pos) <- userTokens cf] ++ -- predefined tokens - [ ( "l i*", tokenAction False "Ident" ) ] + [ ( "_letter _idchar*", tokenAction False "Ident" ) ] ++ -- integers - [ ( "d+", "let i = lexeme lexbuf in TOK_Integer (int_of_string i)" ) + [ ( "_digit+", "TOK_Integer (int_of_string (lexeme lexbuf))" ) -- doubles - , ( "d+ '.' d+ ('e' ('-')? d+)?" - , "let f = lexeme lexbuf in TOK_Double (float_of_string f)" ) + , ( "_digit+ '.' _digit+ ('e' ('-')? _digit+)?" + , "TOK_Double (float_of_string (lexeme lexbuf))" ) -- strings - , ( "'\\\"' ((u # ['\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'" - , "let s = lexeme lexbuf in TOK_String (unescapeInitTail s)" ) + , ( "'\\\"' (([^ '\\\"' '\\\\' '\\n']) | ('\\\\' ('\\\"' | '\\\\' | '\\\'' | 'n' | 't' | 'r')))* '\\\"'" + , "TOK_String (unescapeInitTail (lexeme lexbuf))" ) -- chars - , ( "'\\'' ((u # ['\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''" - , "let s = lexeme lexbuf in TOK_Char s.[1]") + , ( "'\\'' (([^ '\\\'' '\\\\']) | ('\\\\' ('\\\\' | '\\\'' | 'n' | 't' | 'r'))) '\\\''" + , "TOK_Char (lexeme lexbuf).[1]") -- spaces , ( "[' ' '\\t']", "token lexbuf") -- new lines @@ -217,11 +229,11 @@ rules cf = mkRule "token" $ where (multilineC, singleLineC) = comments cf tokenAction pos t = case asciiKeywords cf of - [] -> "let l = lexeme lexbuf in TOK_" <> t <+> arg - _ -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> arg + [] -> "TOK_" <> t <+> arg "(lexeme lexbuf)" + _ -> "let l = lexeme lexbuf in try Hashtbl.find resword_table l with Not_found -> TOK_" <> t <+> arg "l" where - arg | pos = "((lexeme_start lexbuf, lexeme_end lexbuf), l)" - | otherwise = "l" + arg l | pos = text $ "((lexeme_start lexbuf, lexeme_end lexbuf), " ++ l ++ ")" + | otherwise = text l ------------------------------------------------------------------- -- Modified from the inlined version of former @RegToAlex@. @@ -294,8 +306,8 @@ instance Print Reg where RAlts str -> prPrec i 1 $ List.intersperse "|" $ map charLiteral str -- RAlts str -> concat [ ["["], map charLiteral str, ["]"] ] RSeqs str -> [ show str ] - RDigit -> ["d"] - RLetter -> ["l"] - RUpper -> ["c"] - RLower -> ["s"] - RAny -> ["u"] + RDigit -> ["_digit"] + RLetter -> ["_letter"] + RUpper -> ["_upper"] + RLower -> ["_lower"] + RAny -> ["_universal"] diff --git a/testing/regression-tests/256_Regex/test.cf b/testing/regression-tests/256_Regex/test.cf index 2d511aa9..7646dc2e 100644 --- a/testing/regression-tests/256_Regex/test.cf +++ b/testing/regression-tests/256_Regex/test.cf @@ -1,6 +1,6 @@ -- Issue #110: backslash in keyword -Start. S ::= Name "/\\" Name "and" Triple; +Start. Main ::= Name "/\\" Name "and" Triple; -- Issue #319: quotation rule in ANTLR lexer definition @@ -10,3 +10,35 @@ token Triple upper letter ["]-\'"] '\''; -- Issue #276: whitespace characters in character class token Name (char - [ "(){};.@\"" ] - [ " \n\t" ]) + ; + +-- Re issue #322: Try to provoke clashes with BNFC character classes +-- in generated ocamllex definition. + +token Letter {"letter1"}; +token L {"letter2"}; +token Alpha {"letter3"}; +token A {"letter4"}; + +token Upper {"upper1"}; +token U {"upper2"}; +token Captial {"upper3"}; +token C {"upper4"}; + +token Lower {"lower1"}; +-- token L {"lower2"}; +token Small {"lower3"}; +token S {"lower4"}; + +token Digit {"digit1"}; +token D {"digit2"}; +token Num {"digit3"}; +token Number {"digit4"}; +token Numberic{"digit5"}; +token N {"digit6"}; + +token Int {"int1"}; +token I {"int2"}; + +token Universal {"universal1"}; +token Uni {"universal2"}; +-- token U {"universal3"};