Skip to content

Commit

Permalink
* Add optional encoding for string.length and string.sub, default to
Browse files Browse the repository at this point in the history
  `"utf8"`
* Add `string.chars` with encoding.
* Fix default string escaping to properly fallback to `"ascii"` when
  utf8 escaping failed.

Fixes: #4109
  • Loading branch information
toots committed Aug 25, 2024
1 parent 893690d commit 887793c
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 26 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ New:
- Add `string.of_int` and `string.spaces`.
- Add `list.assoc.nullable`.
- Add `source.cue` (#3620).
- Add `string.chars` (#4111)
- Added atomic file write operations.

Changed:
Expand All @@ -64,6 +65,7 @@ Changed:
- Changed internal metadata format to be immutable (#3297).
- Allow a getter for the offset of `on_offset` and dropped the metadata
mechanism for updating it (#3355).
- `string.length` and `string.sub` now default to `utf8` encoding (#4109)
- Disable output paging when `TERM` environment variable is not set.
- Allow running as `root` user inside `docker` container by default (#3406).
- Run `check_next` before playlist's requests resolutions (#3625)
Expand Down
8 changes: 8 additions & 0 deletions doc/content/migrating.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ end
However, EBU R128 data is now extracted directly from metadata when available.
So `replaygain` cannot control the gain type via this parameter anymore.

### String functions

Some string functions have been updated to account for string encoding. In particular, `string.length` and `string.sub` now assume that their
given string is in `utf8` by default.

While this is what most user expect, this can lead to backward incompatibilities and new exceptions. You can change back to the old default by
passing `encoding="ascii"` to these functions or using the `settings.string.default_encoding` settings.

### `check_next`

`check_next` in playlist operators is now called _before_ the request is resolved, to make it possible to cut out
Expand Down
26 changes: 26 additions & 0 deletions src/core/builtins/builtins_string_extra.ml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@
*****************************************************************************)

let log = Log.make ["lang"; "string"]

let conf_string =
Dtools.Conf.void ~p:(Configure.conf#plug "string") "String settings"

let () =
let conf_default_encoding =
Dtools.Conf.string
~p:(conf_string#plug "default_encoding")
~d:"utf8"
"Default encoding for `string.length`, `string.chars` and `string.sub`"
in
conf_default_encoding#on_change (fun v ->
let enc =
match v with
| "ascii" -> `Ascii
| "utf8" -> `Utf8
| _ ->
log#important
"Invalid value %s for `settings.string.default_encoding`! \
Should be one of: \"ascii\" or \"utf8\"."
v;
`Utf8
in
Liquidsoap_lang.Builtins_string.default_encoding := enc)

let string = Liquidsoap_lang.Builtins_string.string
let string_annotate = Lang.add_module ~base:string "annotate"

Expand Down
2 changes: 1 addition & 1 deletion src/lang/builtins_regexp.ml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ let escape_regex_descr =
else Lang_string.utf8_special_char s pos len)
~escape_char:(fun s pos len ->
if s.[pos] = '/' && len = 1 then "\\/"
else Lang_string.escape_utf8_char s pos len)
else Lang_string.escape_utf8_char ~strict:false s pos len)
~next:Lang_string.utf8_next
in
Lang_string.escape_string escape_regex_formatter
Expand Down
114 changes: 101 additions & 13 deletions src/lang/builtins_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,86 @@ let _ =
let l = List.map Lang.to_string l in
Lang.string (String.concat sep l))

let split ~encoding s =
let get =
match encoding with
| `Ascii ->
fun pos ->
let buf = Buffer.create 1 in
Buffer.add_char buf (String.get s pos);
(Buffer.contents buf, 1)
| `Utf8 ->
fun pos ->
let d = String.get_utf_8_uchar s pos in
if not (Uchar.utf_decode_is_valid d) then
failwith "Decoding failed!";
let c = Uchar.utf_decode_uchar d in
let buf = Buffer.create 1 in
Buffer.add_utf_8_uchar buf c;
(Buffer.contents buf, Uchar.utf_decode_length d)
in
let len = String.length s in
let rec f chars pos =
if pos = len then List.rev chars
else (
let char, len = get pos in
f (char :: chars) (pos + len))
in
f [] 0

let default_encoding = ref `Utf8

let encoding_option =
( "encoding",
Lang.nullable_t Lang.string_t,
Some Lang.null,
Some
"Encoding used to split characters. Should be one of: `\"utf8\"` or \
`\"ascii\"`" )

let get_encoding p =
match Lang.to_valued_option Lang.to_string (List.assoc "encoding" p) with
| None -> ("utf8", !default_encoding)
| Some "utf8" -> ("utf8", `Utf8)
| Some "ascii" -> ("ascii", `Ascii)
| _ ->
Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
"invalid"

let _ =
Lang.add_builtin ~base:string "chars" ~category:`String
~descr:"Split string into characters. Raises `error.invalid` on errors."
[encoding_option; ("", Lang.string_t, None, None)]
(Lang.list_t Lang.string_t)
(fun p ->
let enc, encoding = get_encoding p in
let s = Lang.to_string (List.assoc "" p) in
try Lang.list (List.map Lang.string (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:
"Return the string's length using the given encoding. Raises \
`error.invalid` on errors."
[encoding_option; ("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let enc, encoding = get_encoding p in
let s = Lang.to_string (List.assoc "" p) in
try Lang.int (List.length (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "nth" ~category:`String
~descr:
Expand Down Expand Up @@ -165,7 +245,7 @@ let string_escape =
("", Lang.string (String.sub s ofs len));
])
| None, `Ascii -> Lang_string.escape_hex_char
| None, `Utf8 -> Lang_string.escape_utf8_char
| None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false
in
let next =
match encoding with
Expand Down Expand Up @@ -213,7 +293,8 @@ let _ =
match Lang.to_string format with
| "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next)
| "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next)
| "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next)
| "utf8" ->
(Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next)
| _ ->
raise
(Error.Invalid_value
Expand Down Expand Up @@ -264,15 +345,6 @@ let _ =
let s = Lang.to_string (List.assoc "" p) in
Lang.string (Lang_string.unescape_string s))

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:"Get the length of a string."
[("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let string = Lang.to_string (List.assoc "" p) in
Lang.int (String.length string))

let _ =
Lang.add_builtin ~base:string "sub" ~category:`String
~descr:
Expand All @@ -285,6 +357,7 @@ let _ =
Some
"Return a sub string starting at this position. First position is 0."
);
encoding_option;
( "length",
Lang.int_t,
None,
Expand All @@ -294,9 +367,24 @@ let _ =
(fun p ->
let start = Lang.to_int (List.assoc "start" p) in
let len = Lang.to_int (List.assoc "length" p) in
let _, encoding = get_encoding p in
let string = Lang.to_string (List.assoc "" p) in
Lang.string
(try String.sub string start len with Invalid_argument _ -> ""))
let s =
match encoding with
| `Ascii -> (
try String.sub string start len with Invalid_argument _ -> "")
| `Utf8 -> (
try
let chars = split ~encoding string in
if List.length chars < len + start then ""
else
String.concat ""
(List.filteri
(fun pos _ -> start <= pos && pos < start + len)
chars)
with _ -> "")
in
Lang.string s)

let _ =
Lang.add_builtin ~base:string "index" ~category:`String
Expand Down
22 changes: 13 additions & 9 deletions src/lang/lang_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len =
| '\'', 1 -> "\\'"
| _ -> escape_fun s pos len

let escape_utf8_char =
let escape_utf8_char ~strict =
let utf8_char_code s pos len =
try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep
try utf8_char_code s pos len
with _ when not strict -> Uchar.to_int Uchar.rep
in
escape_char ~escape_fun:(fun s pos len ->
Printf.sprintf "\\u%04X" (utf8_char_code s pos len))

let escape_utf8_formatter ?(special_char = utf8_special_char) =
escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next
let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char)
=
escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next

let escape_hex_char =
escape_char ~escape_fun:(fun s pos len ->
Expand Down Expand Up @@ -153,15 +155,15 @@ let escape_string escape s =
len segments);
Bytes.unsafe_to_string b)

let escape_utf8_string ?special_char =
escape_string (escape_utf8_formatter ?special_char)
let escape_utf8_string ?strict ?special_char =
escape_string (escape_utf8_formatter ?strict ?special_char)

let escape_ascii_string ?special_char =
escape_string (escape_ascii_formatter ?special_char)

let quote_utf8_string s =
let quote_utf8_string ?strict s =
Printf.sprintf "\"%s\""
(escape_utf8_string
(escape_utf8_string ?strict
~special_char:(fun s pos len ->
if s.[pos] = '\'' && len = 1 then false
else utf8_special_char s pos len)
Expand All @@ -175,7 +177,9 @@ let quote_ascii_string s =
else ascii_special_char s pos len)
s)

let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s
let quote_string s =
try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s

let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]"
let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]"
let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]"
Expand Down
10 changes: 7 additions & 3 deletions src/lang/lang_string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int
val escape_char :
escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string

val escape_utf8_char : string -> int -> int -> string
val escape_utf8_char : strict:bool -> string -> int -> int -> string

val escape_utf8_formatter :
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
[> `Orig of int * int | `Subst of string * int ] list * int
Expand All @@ -39,12 +40,15 @@ val escape_string :
string

val escape_utf8_string :
?special_char:(string -> int -> int -> bool) -> string -> string
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
string

val escape_ascii_string :
?special_char:(string -> int -> int -> bool) -> string -> string

val quote_utf8_string : string -> string
val quote_utf8_string : ?strict:bool -> string -> string
val quote_ascii_string : string -> string
val quote_string : string -> string
val unescape_utf8_pattern : string
Expand Down
32 changes: 32 additions & 0 deletions tests/language/string.liq
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,37 @@ def f() =
"blo#{(1, 2, 3)}",
"blo(1, 2, 3)"
)

s = "王^小東="
test.equal(string.length(s), 5)
test.equal(string.chars(s), ["王", "^", "小", "東", "="])
test.equal(string.sub(start=1, length=2, s), "^小")
test.equal(string.length(encoding="ascii", s), 11)
test.equal(
string.chars(encoding="ascii", s),
[
"\xE7",
"\x8E",
"\x8B",
"^",
"\xE5",
"\xB0",
"\x8F",
"\xE6",
"\x9D",
"\xB1",
"="
]
)
test.equal(string.sub(encoding="ascii", start=1, length=2, s), "\x8E\x8B")

try
string.chars(encoding="utf16le", s)
test.fail()
catch e : [error.invalid] do
()
end

test.pass()
end

Expand Down Expand Up @@ -171,6 +202,7 @@ def test_escape_html() =
test.equal(string.escape.html("\\"), "\\")
test.equal(string.escape.html("/"), "/")
test.equal(string.escape.html("`"), "`")

test.pass()
end

Expand Down

0 comments on commit 887793c

Please sign in to comment.