Skip to content

Commit

Permalink
* Add optional encoding for string.length and string.sub, default to
Browse files Browse the repository at this point in the history
  `"utf8"`
* Add `string.chars` with encoding.
* Fix default string escaping to properly fallback to `"ascii"` when
  utf8 escaping failed.

Fixes: #4109
  • Loading branch information
toots committed Aug 25, 2024
1 parent 893690d commit 3b2dd27
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 26 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ New:
- Add `string.of_int` and `string.spaces`.
- Add `list.assoc.nullable`.
- Add `source.cue` (#3620).
- Add `string.chars` (#4111)
- Added atomic file write operations.

Changed:
Expand All @@ -64,6 +65,7 @@ Changed:
- Changed internal metadata format to be immutable (#3297).
- Allow a getter for the offset of `on_offset` and dropped the metadata
mechanism for updating it (#3355).
- `string.length` and `string.sub` now default to `utf8` encoding (#4109)
- Disable output paging when `TERM` environment variable is not set.
- Allow running as `root` user inside `docker` container by default (#3406).
- Run `check_next` before playlist's requests resolutions (#3625)
Expand Down
2 changes: 1 addition & 1 deletion src/lang/builtins_regexp.ml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ let escape_regex_descr =
else Lang_string.utf8_special_char s pos len)
~escape_char:(fun s pos len ->
if s.[pos] = '/' && len = 1 then "\\/"
else Lang_string.escape_utf8_char s pos len)
else Lang_string.escape_utf8_char ~strict:false s pos len)
~next:Lang_string.utf8_next
in
Lang_string.escape_string escape_regex_formatter
Expand Down
156 changes: 143 additions & 13 deletions src/lang/builtins_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,113 @@ let _ =
let l = List.map Lang.to_string l in
Lang.string (String.concat sep l))

type utf_encoding = [ `Utf8 | `Utf16le | `Utf16be ]

let split ~encoding s =
let get =
match encoding with
| `Ascii ->
fun pos ->
let buf = Buffer.create 1 in
Buffer.add_char buf (String.get s pos);
(Buffer.contents buf, 1)
| #utf_encoding as enc ->
let get_char, add_char =
match enc with
| `Utf8 -> (String.get_utf_8_uchar, Buffer.add_utf_8_uchar)
| `Utf16le ->
(String.get_utf_16le_uchar, Buffer.add_utf_16le_uchar)
| `Utf16be ->
(String.get_utf_16be_uchar, Buffer.add_utf_16be_uchar)
in
fun pos ->
let d = get_char s pos in
if not (Uchar.utf_decode_is_valid d) then
failwith "Decoding failed!";
let c = Uchar.utf_decode_uchar d in
let buf = Buffer.create 1 in
add_char buf c;
(Buffer.contents buf, Uchar.utf_decode_length d)
in
let len = String.length s in
let rec f chars pos =
if pos = len then List.rev chars
else (
let char, len = get pos in
f (char :: chars) (pos + len))
in
f [] 0

let _ =
Lang.add_builtin ~base:string "chars" ~category:`String
~descr:"Split string into characters. Raises `error.invalid` on errors."
[
( "encoding",
Lang.string_t,
Some (Lang.string "utf8"),
Some
"Encoding used to split characters. Should be one of: `\"utf8\"`, \
`\"utf16be\"`, `\"utf16le\"` or `\"ascii\"`." );
("", Lang.string_t, None, None);
]
(Lang.list_t Lang.string_t)
(fun p ->
let enc = Lang.to_string (List.assoc "encoding" p) in
let encoding =
match enc with
| "utf8" -> `Utf8
| "utf16le" -> `Utf16le
| "utf16be" -> `Utf16be
| "ascii" -> `Ascii
| _ ->
Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
"invalid"
in
let s = Lang.to_string (List.assoc "" p) in
try Lang.list (List.map Lang.string (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:
"Return the string's length using the given encoding. Raises \
`error.invalid` on errors."
[
( "encoding",
Lang.string_t,
Some (Lang.string "utf8"),
Some
"Encoding used to split characters. Should be one of: `\"utf8\"`, \
`\"utf16be\"`, `\"utf16le\"` or `\"ascii\"`." );
("", Lang.string_t, None, None);
]
Lang.int_t
(fun p ->
let enc = Lang.to_string (List.assoc "encoding" p) in
let encoding =
match enc with
| "utf8" -> `Utf8
| "utf16le" -> `Utf16le
| "utf16be" -> `Utf16be
| "ascii" -> `Ascii
| _ ->
Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
"invalid"
in
let s = Lang.to_string (List.assoc "" p) in
try Lang.int (List.length (split ~encoding s))
with _ ->
Runtime_error.raise ~pos:(Lang.pos p)
~message:
(Printf.sprintf "String cannot be split using encoding `\"%s\"`!"
enc)
"invalid")

let _ =
Lang.add_builtin ~base:string "nth" ~category:`String
~descr:
Expand Down Expand Up @@ -165,7 +272,7 @@ let string_escape =
("", Lang.string (String.sub s ofs len));
])
| None, `Ascii -> Lang_string.escape_hex_char
| None, `Utf8 -> Lang_string.escape_utf8_char
| None, `Utf8 -> Lang_string.escape_utf8_char ~strict:false
in
let next =
match encoding with
Expand Down Expand Up @@ -213,7 +320,8 @@ let _ =
match Lang.to_string format with
| "octal" -> (Lang_string.escape_octal_char, Lang_string.ascii_next)
| "hex" -> (Lang_string.escape_hex_char, Lang_string.ascii_next)
| "utf8" -> (Lang_string.escape_utf8_char, Lang_string.utf8_next)
| "utf8" ->
(Lang_string.escape_utf8_char ~strict:false, Lang_string.utf8_next)
| _ ->
raise
(Error.Invalid_value
Expand Down Expand Up @@ -264,15 +372,6 @@ let _ =
let s = Lang.to_string (List.assoc "" p) in
Lang.string (Lang_string.unescape_string s))

let _ =
Lang.add_builtin ~base:string "length" ~category:`String
~descr:"Get the length of a string."
[("", Lang.string_t, None, None)]
Lang.int_t
(fun p ->
let string = Lang.to_string (List.assoc "" p) in
Lang.int (String.length string))

let _ =
Lang.add_builtin ~base:string "sub" ~category:`String
~descr:
Expand All @@ -285,6 +384,12 @@ let _ =
Some
"Return a sub string starting at this position. First position is 0."
);
( "encoding",
Lang.string_t,
Some (Lang.string "utf8"),
Some
"Encoding used to split characters. Should be one of: `\"utf8\"`, \
`\"utf16be\"`, `\"utf16le\"` or `\"ascii\"`." );
( "length",
Lang.int_t,
None,
Expand All @@ -294,9 +399,34 @@ let _ =
(fun p ->
let start = Lang.to_int (List.assoc "start" p) in
let len = Lang.to_int (List.assoc "length" p) in
let enc = Lang.to_string (List.assoc "encoding" p) in
let encoding =
match enc with
| "utf8" -> `Utf8
| "utf16le" -> `Utf16le
| "utf16be" -> `Utf16be
| "ascii" -> `Ascii
| _ ->
Runtime_error.raise ~pos:(Lang.pos p) ~message:"Invalid encoding!"
"invalid"
in
let string = Lang.to_string (List.assoc "" p) in
Lang.string
(try String.sub string start len with Invalid_argument _ -> ""))
let s =
match encoding with
| `Ascii -> (
try String.sub string start len with Invalid_argument _ -> "")
| #utf_encoding as encoding -> (
try
let chars = split ~encoding string in
if List.length chars < len + start then ""
else
String.concat ""
(List.filteri
(fun pos _ -> start <= pos && pos < start + len)
chars)
with _ -> "")
in
Lang.string s)

let _ =
Lang.add_builtin ~base:string "index" ~category:`String
Expand Down
22 changes: 13 additions & 9 deletions src/lang/lang_string.ml
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,17 @@ let escape_char ~escape_fun s pos len =
| '\'', 1 -> "\\'"
| _ -> escape_fun s pos len

let escape_utf8_char =
let escape_utf8_char ~strict =
let utf8_char_code s pos len =
try utf8_char_code s pos len with _ -> Uchar.to_int Uchar.rep
try utf8_char_code s pos len
with _ when not strict -> Uchar.to_int Uchar.rep
in
escape_char ~escape_fun:(fun s pos len ->
Printf.sprintf "\\u%04X" (utf8_char_code s pos len))

let escape_utf8_formatter ?(special_char = utf8_special_char) =
escape ~special_char ~escape_char:escape_utf8_char ~next:utf8_next
let escape_utf8_formatter ?(strict = false) ?(special_char = utf8_special_char)
=
escape ~special_char ~escape_char:(escape_utf8_char ~strict) ~next:utf8_next

let escape_hex_char =
escape_char ~escape_fun:(fun s pos len ->
Expand Down Expand Up @@ -153,15 +155,15 @@ let escape_string escape s =
len segments);
Bytes.unsafe_to_string b)

let escape_utf8_string ?special_char =
escape_string (escape_utf8_formatter ?special_char)
let escape_utf8_string ?strict ?special_char =
escape_string (escape_utf8_formatter ?strict ?special_char)

let escape_ascii_string ?special_char =
escape_string (escape_ascii_formatter ?special_char)

let quote_utf8_string s =
let quote_utf8_string ?strict s =
Printf.sprintf "\"%s\""
(escape_utf8_string
(escape_utf8_string ?strict
~special_char:(fun s pos len ->
if s.[pos] = '\'' && len = 1 then false
else utf8_special_char s pos len)
Expand All @@ -175,7 +177,9 @@ let quote_ascii_string s =
else ascii_special_char s pos len)
s)

let quote_string s = try quote_utf8_string s with _ -> quote_ascii_string s
let quote_string s =
try quote_utf8_string ~strict:true s with _ -> quote_ascii_string s

let unescape_utf8_pattern = "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]"
let unescape_hex_pattern = "\\\\x[0-9a-fA-F][0-9a-fA-F]"
let unescape_octal_pattern = "\\\\[0-9][0-9][0-9]"
Expand Down
10 changes: 7 additions & 3 deletions src/lang/lang_string.mli
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ val ascii_next : 'a -> int -> int
val escape_char :
escape_fun:(string -> int -> int -> string) -> string -> int -> int -> string

val escape_utf8_char : string -> int -> int -> string
val escape_utf8_char : strict:bool -> string -> int -> int -> string

val escape_utf8_formatter :
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
[> `Orig of int * int | `Subst of string * int ] list * int
Expand All @@ -39,12 +40,15 @@ val escape_string :
string

val escape_utf8_string :
?special_char:(string -> int -> int -> bool) -> string -> string
?strict:bool ->
?special_char:(string -> int -> int -> bool) ->
string ->
string

val escape_ascii_string :
?special_char:(string -> int -> int -> bool) -> string -> string

val quote_utf8_string : string -> string
val quote_utf8_string : ?strict:bool -> string -> string
val quote_ascii_string : string -> string
val quote_string : string -> string
val unescape_utf8_pattern : string
Expand Down
32 changes: 32 additions & 0 deletions tests/language/string.liq
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,37 @@ def f() =
"blo#{(1, 2, 3)}",
"blo(1, 2, 3)"
)

s = "王^小東="
test.equal(string.length(s), 5)
test.equal(string.chars(s), ["王", "^", "小", "東", "="])
test.equal(string.sub(start=1, length=2, s), "^小")
test.equal(string.length(encoding="ascii", s), 11)
test.equal(
string.chars(encoding="ascii", s),
[
"\xE7",
"\x8E",
"\x8B",
"^",
"\xE5",
"\xB0",
"\x8F",
"\xE6",
"\x9D",
"\xB1",
"="
]
)
test.equal(string.sub(encoding="ascii", start=1, length=2, s), "\x8E\x8B")

try
string.chars(encoding="utf16le", s)
test.fail()
catch e : [error.invalid] do
()
end

test.pass()
end

Expand Down Expand Up @@ -171,6 +202,7 @@ def test_escape_html() =
test.equal(string.escape.html("\\"), "\\")
test.equal(string.escape.html("/"), "/")
test.equal(string.escape.html("`"), "`")

test.pass()
end

Expand Down

0 comments on commit 3b2dd27

Please sign in to comment.