From 7179129d9931dd9c7e0479dd7b4afb182dc9c500 Mon Sep 17 00:00:00 2001
From: Andreas Abel <andreas.abel@ifi.lmu.de>
Date: Fri, 13 Nov 2020 11:27:03 +0100
Subject: [PATCH] [ #324 ] ocamllex does not accept unicode char literals

It seems that in OCaml a char is 8 bits wide, and Unicode characters are
represented as their UTF-8 encoded strings.  This means we cannot represent
Unicode character sets in the ocamllex lexer definition.  We can use string
literals in some circumstances.

For that reason, RAlts is now translated to a disjunction of char or
string literals (the latter for unicode chars) rather than to a
@[charset]@.
---
 source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs | 40 ++++++++++++-------
 1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs b/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs
index 26af9ea7..b489a4e5 100644
--- a/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs
+++ b/source/src/BNFC/Backend/OCaml/CFtoOCamlLex.hs
@@ -26,6 +26,7 @@ module BNFC.Backend.OCaml.CFtoOCamlLex (cf2ocamllex) where
 
 import Prelude hiding ((<>))
 
+import Data.Char (ord)
 import qualified Data.List as List
 import Text.PrettyPrint hiding (render)
 import qualified Text.PrettyPrint as PP
@@ -169,10 +170,10 @@ mkRegexSingleLineComment s = cstring s <+> "(_ # '\\n')*"
 
 -- | Create regex for multiline comments.
 -- >>> mkRegexMultilineComment "<!--" "-->"
--- "<!--" (u # '-')* '-' ((u # '-')+ '-')* '-' ((u # ['-''>']) (u # '-')* '-' ((u # '-')+ '-')* '-' | '-')* '>'
+-- "<!--" [^ '-']* '-' ([^ '-']+ '-')* '-' ([^ '-' '>'][^ '-']* '-' ([^ '-']+ '-')* '-' | '-')* '>'
 --
 -- >>> mkRegexMultilineComment "\"'" "'\""
--- "\"'" (u # '\'')* '\'' ((u # ['"''\'']) (u # '\'')* '\'' | '\'')* '"'
+-- "\"'" [^ '\'']* '\'' ([^ '"' '\''][^ '\'']* '\'' | '\'')* '"'
 mkRegexMultilineComment :: String -> String -> Doc
 mkRegexMultilineComment b e = text $ printRegOCaml $ mkRegMultilineComment b e
 
@@ -262,9 +263,14 @@ instance Print a => Print [a] where
   prt _ = prtList
 
 instance Print Char where
-  prt _ c = [show c]   -- if isAlphaNum c then [[c]] else ['\\':[c]]
+  prt _ c   = [charLiteral c]
   prtList s = [show s] -- map (concat . prt 0) s
 
+charLiteral :: Char -> String  -- Render a char for ocamllex: a char literal if it fits 8 bits, else a string literal.
+charLiteral c
+  | ord c < 256 = show c           -- OCaml chars are 8-bit, so only code points 0..255 may use a char literal
+  | otherwise   = ['"', c, '"']    -- ocamllex does not accept unicode character literals
+
 prPrec :: Int -> Int -> [String] -> [String]
 prPrec i j = if j<i then parenth else id
 
@@ -275,17 +281,21 @@ instance Print Reg where
   prt i e = case e of
    RSeq reg0 reg   -> prPrec i 2 (concat [prt 2 reg0 , prt 3 reg])
    RAlt reg0 reg   -> prPrec i 1 (concat [prt 1 reg0 , ["|"] , prt 2 reg])
+   RMinus RAny (RChar c) -> ["[^", charLiteral c, "]"]
+   RMinus RAny (RAlts str) -> concat [ ["[^"], map charLiteral str, ["]"] ]
    RMinus reg0 reg -> prPrec i 1 (concat [prt 2 reg0 , ["#"] , prt 2 reg])
-   RStar reg       -> prPrec i 3 (concat [prt 3 reg , ["*"]])
-   RPlus reg       -> prPrec i 3 (concat [prt 3 reg , ["+"]])
-   ROpt reg        -> prPrec i 3 (concat [prt 3 reg , ["?"]])
-   REps            -> prPrec i 3 (["\"\""])  -- special construct for eps in ocamllex?
-   RChar c         -> prPrec i 3 (concat [prt 0 c])
-   RAlts str       -> prPrec i 3 (concat [["["], [concatMap show str], ["]"]])
+   RStar reg       -> concat [prt 3 reg , ["*"]]
+   RPlus reg       -> concat [prt 3 reg , ["+"]]
+   ROpt reg        -> concat [prt 3 reg , ["?"]]
+   REps            -> ["\"\""]
+   RChar c         -> [ charLiteral c ]
+   -- ocamllex accepts unicode characters only in string literals.
+   -- Thus we translate RAlts to a disjunction rather than a character set
+   RAlts str       -> prPrec i 1 $ List.intersperse "|" $ map charLiteral str
+   -- RAlts str       -> concat [ ["["], map charLiteral str, ["]"] ]
    RSeqs str       -> [ show str ]
-   -- RSeqs str       -> prPrec i 2 (concat (map (prt 0) str))
-   RDigit          -> prPrec i 3 (concat [["d"]])
-   RLetter         -> prPrec i 3 (concat [["l"]])
-   RUpper          -> prPrec i 3 (concat [["c"]])
-   RLower          -> prPrec i 3 (concat [["s"]])
-   RAny            -> prPrec i 3 (concat [["u"]])
+   RDigit          -> ["d"]
+   RLetter         -> ["l"]
+   RUpper          -> ["c"]
+   RLower          -> ["s"]
+   RAny            -> ["u"]