diff --git a/CHANGES.md b/CHANGES.md index b77453bced..d9c7cf460a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -53,6 +53,7 @@ New: - Add `string.of_int` and `string.spaces`. - Add `list.assoc.nullable`. - Add `source.cue` (#3620). +- Add `string.chars` (#4111). - Added atomic file write operations. Changed: @@ -64,6 +65,7 @@ Changed: - Changed internal metadata format to be immutable (#3297). - Allow a getter for the offset of `on_offset` and dropped the metadata mechanism for updating it (#3355). +- `string.length` and `string.sub` now default to `utf8` encoding (#4109). - Disable output paging when `TERM` environment variable is not set. - Allow running as `root` user inside `docker` container by default (#3406). - Run `check_next` before playlist's requests resolutions (#3625) diff --git a/doc/content/migrating.md b/doc/content/migrating.md index 445cb42828..e884f591ec 100644 --- a/doc/content/migrating.md +++ b/doc/content/migrating.md @@ -92,6 +92,14 @@ end However, EBU R128 data is now extracted directly from metadata when available. So `replaygain` cannot control the gain type via this parameter anymore. +### String functions + +Some string functions have been updated to account for string encoding. In particular, `string.length` and `string.sub` now assume that their +given string is in `utf8` by default. + +While this is what most users expect, this can lead to backward incompatibilities and new exceptions. You can change back to the old default by +passing `encoding="ascii"` to these functions or using the `settings.string.default_encoding` setting. 
+ ### `check_next` `check_next` in playlist operators is now called _before_ the request is resolved, to make it possible to cut out diff --git a/src/core/builtins/builtins_string_extra.ml b/src/core/builtins/builtins_string_extra.ml index 21a8452ceb..99091b0d1a 100644 --- a/src/core/builtins/builtins_string_extra.ml +++ b/src/core/builtins/builtins_string_extra.ml @@ -20,6 +20,32 @@ *****************************************************************************) +let log = Log.make ["lang"; "string"] + +let conf_string = + Dtools.Conf.void ~p:(Configure.conf#plug "string") "String settings" + +let () = + let conf_default_encoding = + Dtools.Conf.string + ~p:(conf_string#plug "default_encoding") + ~d:"utf8" + "Default encoding for `string.length`, `string.chars` and `string.sub`" + in + conf_default_encoding#on_change (fun v -> + let enc = + match v with + | "ascii" -> `Ascii + | "utf8" -> `Utf8 + | _ -> + log#important + "Invalid value %s for `settings.string.default_encoding`! \ + Should be one of: \"ascii\" or \"utf8\"." 
+ v; + `Utf8 + in + Liquidsoap_lang.Builtins_string.default_encoding := enc) + let string = Liquidsoap_lang.Builtins_string.string let string_annotate = Lang.add_module ~base:string "annotate" diff --git a/src/lang/builtins_regexp.ml b/src/lang/builtins_regexp.ml index f8855d08db..4358aa0da6 100644 --- a/src/lang/builtins_regexp.ml +++ b/src/lang/builtins_regexp.ml @@ -49,7 +49,7 @@ let escape_regex_descr = else Lang_string.utf8_special_char s pos len) ~escape_char:(fun s pos len -> if s.[pos] = '/' && len = 1 then "\\/" - else Lang_string.escape_utf8_char s pos len) + else Lang_string.escape_utf8_char ~strict:false s pos len) ~next:Lang_string.utf8_next in Lang_string.escape_string escape_regex_formatter diff --git a/src/lang/builtins_string.ml b/src/lang/builtins_string.ml index 9177afbb0b..8729bf06c8 100644 --- a/src/lang/builtins_string.ml +++ b/src/lang/builtins_string.ml @@ -52,6 +52,86 @@ let _ = let l = List.map Lang.to_string l in Lang.string (String.concat sep l)) +let split ~encoding s = + let get = + match encoding with + | `Ascii -> + fun pos -> + let buf = Buffer.create 1 in + Buffer.add_char buf (String.get s pos); + (Buffer.contents buf, 1) + | `Utf8 -> + fun pos -> + let d = String.get_utf_8_uchar s pos in + if not (Uchar.utf_decode_is_valid d) then + failwith "Decoding failed!"; + let c = Uchar.utf_decode_uchar d in + let buf = Buffer.create 1 in + Buffer.add_utf_8_uchar buf c; + (Buffer.contents buf, Uchar.utf_decode_length d) + in + let len = String.length s in + let rec f chars pos = + if pos = len then List.rev chars + else ( + let char, len = get pos in + f (char :: chars) (pos + len)) + in + f [] 0 + +let default_encoding = ref `Utf8 + +let encoding_option = + ( "encoding", + Lang.nullable_t Lang.string_t, + Some Lang.null, + Some + "Encoding used to split characters. 
Should be one of: `\"utf8\"` or \ + `\"ascii\"`" ) + +let get_encoding p = + match Lang.to_valued_option Lang.to_string (List.assoc "encoding" p) with + | None -> ("utf8", !default_encoding) + | Some "utf8" -> ("utf8", `Utf8) + | Some "ascii" -> ("ascii", `Ascii) + | _ -> + Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!" + "invalid" + +let _ = + Lang.add_builtin ~base:string "chars" ~category:`String + ~descr:"Split string into characters. Raises `error.invalid` on errors." + [encoding_option; ("", Lang.string_t, None, None)] + (Lang.list_t Lang.string_t) + (fun p -> + let enc, encoding = get_encoding p in + let s = Lang.to_string (List.assoc "" p) in + try Lang.list (List.map Lang.string (split ~encoding s)) + with _ -> + Runtime_error.raise ~pos:(Lang.pos p) + ~message: + (Printf.sprintf "String cannot be split using encoding `\"%s\"`!" + enc) + "invalid") + +let _ = + Lang.add_builtin ~base:string "length" ~category:`String + ~descr: + "Return the string's length using the given encoding. Raises \ + `error.invalid` on errors." + [encoding_option; ("", Lang.string_t, None, None)] + Lang.int_t + (fun p -> + let enc, encoding = get_encoding p in + let s = Lang.to_string (List.assoc "" p) in + try Lang.int (List.length (split ~encoding s)) + with _ -> + Runtime_error.raise ~pos:(Lang.pos p) + ~message: + (Printf.sprintf "String cannot be split using encoding `\"%s\"`!" 
+ enc) + "invalid") + let _ = Lang.add_builtin ~base:string "nth" ~category:`String ~descr: @@ -165,7 +245,7 @@ let string_escape = ("", Lang.string (String.sub s ofs len)); ]) | None, `Ascii -> Lang_string.escape_hex_char - | None, `Utf8 -> Lang_string.escape_utf8_char + | None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false in let next = match encoding with @@ -213,7 +293,8 @@ let _ = match Lang.to_string format with | "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next) | "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next) - | "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next) + | "utf8" -> + (Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next) | _ -> raise (Error.Invalid_value @@ -264,15 +345,6 @@ let _ = let s = Lang.to_string (List.assoc "" p) in Lang.string (Lang_string.unescape_string s)) -let _ = - Lang.add_builtin ~base:string "length" ~category:`String - ~descr:"Get the length of a string." - [("", Lang.string_t, None, None)] - Lang.int_t - (fun p -> - let string = Lang.to_string (List.assoc "" p) in - Lang.int (String.length string)) - let _ = Lang.add_builtin ~base:string "sub" ~category:`String ~descr: @@ -285,6 +357,7 @@ let _ = Some "Return a sub string starting at this position. First position is 0." 
+ ); + encoding_option; ( "length", Lang.int_t, None, @@ -294,9 +367,25 @@ let _ = (fun p -> let start = Lang.to_int (List.assoc "start" p) in let len = Lang.to_int (List.assoc "length" p) in + let _, encoding = get_encoding p in let string = Lang.to_string (List.assoc "" p) in - Lang.string - (try String.sub string start len with Invalid_argument _ -> "")) + let s = + match encoding with + | `Ascii -> ( + try String.sub string start len with Invalid_argument _ -> "") + | `Utf8 -> ( + try + let chars = split ~encoding string in + (* Mirror [String.sub]: negative or out-of-range bounds yield "". *) + if start < 0 || len < 0 || List.length chars < len + start then "" + else + String.concat "" + (List.filteri + (fun pos _ -> start <= pos && pos < start + len) + chars) + with _ -> "") + in + Lang.string s) let _ = Lang.add_builtin ~base:string "index" ~category:`String diff --git a/src/lang/lang_string.ml b/src/lang/lang_string.ml index 296a21a724..6147d5c0c3 100644 --- a/src/lang/lang_string.ml +++ b/src/lang/lang_string.ml @@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len = | '\'', 1 -> "\\'" | _ -> escape_fun s pos len -let escape_utf8_char = +let escape_utf8_char ~strict = let utf8_char_code s pos len = - try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep + try utf8_char_code s pos len + with _ when not strict -> Uchar.to_int Uchar.rep in escape_char ~escape_fun:(fun s pos len -> Printf.sprintf "\\u%04X" (utf8_char_code s pos len)) -let escape_utf8_formatter ?(special_char = utf8_special_char) = - escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next +let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char) + = + escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next let escape_hex_char = escape_char ~escape_fun:(fun s pos len -> @@ -153,15 +155,15 @@ let escape_string escape s = len segments); Bytes.unsafe_to_string b) -let escape_utf8_string ?special_char = - escape_string (escape_utf8_formatter ?special_char) +let escape_utf8_string ?strict ?special_char = + escape_string 
(escape_utf8_formatter ?strict ?special_char) let escape_ascii_string ?special_char = escape_string (escape_ascii_formatter ?special_char) -let quote_utf8_string s = +let quote_utf8_string ?strict s = Printf.sprintf "\"%s\"" - (escape_utf8_string + (escape_utf8_string ?strict ~special_char:(fun s pos len -> if s.[pos] = '\'' && len = 1 then false else utf8_special_char s pos len) @@ -175,7 +177,9 @@ let quote_ascii_string s = else ascii_special_char s pos len) s) -let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s +let quote_string s = + try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s + let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]" let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]" diff --git a/src/lang/lang_string.mli b/src/lang/lang_string.mli index f4bea8db2f..a0d3f16009 100644 --- a/src/lang/lang_string.mli +++ b/src/lang/lang_string.mli @@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int val escape_char : escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string -val escape_utf8_char : string -> int -> int -> string +val escape_utf8_char : strict:bool -> string -> int -> int -> string val escape_utf8_formatter : + ?strict:bool -> ?special_char:(string -> int -> int -> bool) -> string -> [> `Orig of int * int | `Subst of string * int ] list * int @@ -39,12 +40,15 @@ val escape_string : string val escape_utf8_string : - ?special_char:(string -> int -> int -> bool) -> string -> string + ?strict:bool -> + ?special_char:(string -> int -> int -> bool) -> + string -> + string val escape_ascii_string : ?special_char:(string -> int -> int -> bool) -> string -> string -val quote_utf8_string : string -> string +val quote_utf8_string : ?strict:bool -> string -> string val quote_ascii_string : string -> string val quote_string : string -> string val unescape_utf8_pattern : string diff --git a/tests/language/string.liq 
b/tests/language/string.liq index 1e3e0a1421..0e0c55977c 100755 --- a/tests/language/string.liq +++ b/tests/language/string.liq @@ -143,6 +143,37 @@ def f() = "blo#{(1, 2, 3)}", "blo(1, 2, 3)" ) + + s = "王^小東=" + test.equal(string.length(s), 5) + test.equal(string.chars(s), ["王", "^", "小", "東", "="]) + test.equal(string.sub(start=1, length=2, s), "^小") + test.equal(string.length(encoding="ascii", s), 11) + test.equal( + string.chars(encoding="ascii", s), + [ + "\xE7", + "\x8E", + "\x8B", + "^", + "\xE5", + "\xB0", + "\x8F", + "\xE6", + "\x9D", + "\xB1", + "=" + ] + ) + test.equal(string.sub(encoding="ascii", start=1, length=2, s), "\x8E\x8B") + + try + string.chars(encoding="utf16le", s) + test.fail() + catch e : [error.invalid] do + () + end + test.pass() end @@ -171,6 +202,7 @@ def test_escape_html() = test.equal(string.escape.html("\\"), "\\") test.equal(string.escape.html("/"), "/") test.equal(string.escape.html("`"), "`") + test.pass() end