Skip to content

Commit

Permalink
highlander: add utf8to16, utf16to8, cwstring; excise UTF16String
Browse files Browse the repository at this point in the history
  • Loading branch information
StefanKarpinski committed Feb 19, 2016
1 parent 11225d1 commit 103db50
Show file tree
Hide file tree
Showing 8 changed files with 325 additions and 52 deletions.
101 changes: 100 additions & 1 deletion base/c.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ containsnul(p::Ptr, len) = C_NULL != ccall(:memchr, Ptr{Cchar}, (Ptr{Cchar}, Cin
function unsafe_convert(::Type{Cstring}, s::ByteString)
p = unsafe_convert(Ptr{Cchar}, s)
if containsnul(p, sizeof(s))
throw(ArgumentError("embedded NUL chars are not allowed in C strings: $(repr(s))"))
throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))"))
end
return Cstring(p)
end
Expand All @@ -94,6 +94,105 @@ convert(::Type{Cstring}, s::Symbol) = Cstring(unsafe_convert(Ptr{Cchar}, s))

# in string.jl: unsafe_convert(::Type{Cwstring}, s::WString)

# Convert `s` into a NUL-terminated vector of UTF-16 code units, suitable for
# passing to Windows wide-character APIs as a `Ptr{UInt16}`.
#
# Throws an `ArgumentError` if the converted string would contain an embedded
# NUL. The check is made on the UTF-16 output rather than on the UTF-8 input
# bytes: `utf8to16` decodes overlong encodings (for example the byte pair
# 0xc0 0x80) to a 0x0000 code unit, which a scan of the input bytes would not
# see. Every 0x00 input byte is also emitted by `utf8to16` as a 0x0000 unit,
# so plain embedded NULs are still rejected.
#
# FIXME: this should be handled by implicit conversion to Cwstring, but good luck with that
@windows_only function cwstring(s::AbstractString)
    units = utf8to16(bytestring(s).data)
    0 in units && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))"))
    return push!(units, 0) # append the terminating NUL
end

# conversions between UTF-8 and UTF-16 for Windows APIs

# Transcode a vector of UTF-8 bytes into a vector of UTF-16 code units.
#
# The conversion is lenient and never throws on malformed input:
#   * a byte that cannot start a multi-byte sequence, or a lead byte that is
#     not followed by a continuation byte, is copied to the output as a code
#     unit with the same numeric value;
#   * a sequence cut short by the end of the input is copied byte by byte;
#   * overlong encodings are decoded to the value they carry.
# Well-formed 4-byte sequences become a surrogate pair.
function utf8to16(src::Vector{UInt8})
    out = UInt16[]
    pos, len = 1, length(src)
    len > 0 || return out
    sizehint!(out, 2len)
    lead = src[1]
    while true
        # 0xc0..0xf4 are the bytes treated as the start of a multi-byte character
        if pos < len && 0xc0 <= lead <= 0xf4
            c1 = src[pos += 1]
            if (c1 & 0xc0) != 0x80 || (lead == 0xf4 && c1 > 0x8f)
                # invalid UTF-8 (non-continuation or too-high code point):
                # emit the lead byte alone and reprocess c1 as a new lead
                push!(out, lead)
                lead = c1
                continue
            elseif lead < 0xe0
                # 2-byte UTF-8; the xor constant cancels the 0xc0/0x80 tag bits
                push!(out, 0x3080 $ (UInt16(lead) << 6) $ c1)
            elseif pos >= len
                # 3/4-byte character, but the input ends here: too short
                push!(out, lead, c1)
                break
            else
                c2 = src[pos += 1]
                if (c2 & 0xc0) != 0x80
                    # invalid UTF-8 (non-continuation)
                    push!(out, lead, c1)
                    lead = c2
                    continue
                elseif lead < 0xf0
                    # 3-byte UTF-8
                    push!(out, 0x2080 $ (UInt16(lead) << 12) $ (UInt16(c1) << 6) $ c2)
                elseif pos >= len
                    # 4-byte character, but the input ends here: too short
                    push!(out, lead, c1, c2)
                    break
                else
                    c3 = src[pos += 1]
                    if (c3 & 0xc0) != 0x80
                        # invalid UTF-8 (non-continuation)
                        push!(out, lead, c1, c2)
                        lead = c3
                        continue
                    elseif lead == 0xf0 && c1 < 0x90
                        # overlong encoding: the value fits in a single code unit
                        push!(out, 0x2080 $ (UInt16(c1) << 12) $ (UInt16(c2) << 6) $ c3)
                    else
                        # 4-byte UTF-8 => high and low surrogate
                        push!(out, 0xe5b8 + (UInt16(lead) << 8) + (UInt16(c1) << 2) + (c2 >> 4),
                                   0xdc80 $ (UInt16(c2 & 0xf) << 6) $ c3)
                    end
                end
            end
        else
            # ASCII or invalid UTF-8 (continuation byte or too-high code point)
            push!(out, lead)
        end
        pos < len || break
        lead = src[pos += 1]
    end
    return out
end

# Transcode a vector of UTF-16 code units into a vector of UTF-8 bytes.
#
# A high surrogate immediately followed by a low surrogate becomes one 4-byte
# sequence. The conversion never throws: any other code unit >= 0x800,
# including a surrogate that is not part of such a pair, is written as a
# 3-byte sequence carrying its own 16-bit value.
function utf16to8(src::Vector{UInt16})
    out = UInt8[]
    pos, len = 1, length(src)
    len > 0 || return out
    sizehint!(out, len)
    unit = src[1]
    while true
        if unit < 0x80 # ASCII
            push!(out, unit % UInt8)
        elseif unit < 0x800 # 2-byte UTF-8
            push!(out, 0xc0 | ((unit >> 6) % UInt8),
                       0x80 | ((unit % UInt8) & 0x3f))
        else
            # a high surrogate with input remaining: consume the next unit
            lookahead = (unit & 0xfc00 == 0xd800) && pos < len
            trail = 0x0000
            if lookahead
                trail = src[pos += 1]
            end
            if lookahead && (trail & 0xfc00) == 0xdc00
                # 2-unit UTF-16 sequence => 4-byte UTF-8
                hi = unit + 0x2840
                push!(out, 0xf0 | ((hi >> 8) % UInt8),
                           0x80 | ((hi % UInt8) >> 2),
                           0xf0 $ ((((hi % UInt8) << 4) & 0x3f) $ (trail >> 6) % UInt8),
                           0x80 | ((trail % UInt8) & 0x3f))
            else
                # 1-unit high UTF-16 or unpaired high surrogate
                # either way, encode as 3-byte UTF-8 code point
                push!(out, 0xe0 | ((unit >> 12) % UInt8),
                           0x80 | (((unit >> 6) % UInt8) & 0x3f),
                           0x80 | ((unit % UInt8) & 0x3f))
                if lookahead
                    # the unit read ahead was not a low surrogate: reprocess it
                    unit = trail
                    continue
                end
            end
        end
        pos < len || break
        unit = src[pos += 1]
    end
    return out
end

# deferring (or un-deferring) ctrl-c handler for external C code that
# is not interrupt safe (see also issue #2622). The sigatomic_begin/end
# functions should always be called in matched pairs, ideally via:
Expand Down
28 changes: 16 additions & 12 deletions base/env.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,33 +26,37 @@ end # @unix_only

const ERROR_ENVVAR_NOT_FOUND = UInt32(203)

_getenvlen(var::AbstractString) = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Cwstring,Ptr{UInt8},UInt32),var,C_NULL,0)
_hasenv(s::AbstractString) = _getenvlen(s)!=0 || Libc.GetLastError()!=ERROR_ENVVAR_NOT_FOUND
_getenvlen(var::Vector{UInt16}) = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Ptr{UInt16},Ptr{UInt16},UInt32),var,C_NULL,0)
_hasenv(s::Vector{UInt16}) = _getenvlen(s) != 0 || Libc.GetLastError() != ERROR_ENVVAR_NOT_FOUND
_hasenv(s::AbstractString) = _hasenv(cwstring(s))

function access_env(onError::Function, str::AbstractString)
var = utf16(str)
var = cwstring(str)
len = _getenvlen(var)
if len == 0
return Libc.GetLastError() != ERROR_ENVVAR_NOT_FOUND ? utf8("") : onError(str)
end
val = zeros(UInt16,len)
ret = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Cwstring,Ptr{UInt16},UInt32),var,val,len)
ret = ccall(:GetEnvironmentVariableW,stdcall,UInt32,(Ptr{UInt16},Ptr{UInt16},UInt32),var,val,len)
if (ret == 0 && len != 1) || ret != len-1 || val[end] != 0
error(string("getenv: ", str, ' ', len, "-1 != ", ret, ": ", Libc.FormatMessage()))
end
return utf8(UTF16String(val))
pop!(val) # NUL
return UTF8String(utf16to8(val))
end

function _setenv(var::AbstractString, val::AbstractString, overwrite::Bool=true)
var = utf16(var)
function _setenv(svar::AbstractString, sval::AbstractString, overwrite::Bool=true)
var = cwstring(svar)
val = cwstring(sval)
if overwrite || !_hasenv(var)
ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Cwstring,Cwstring),var,val)
ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Ptr{UInt16},Ptr{UInt16}),var,val)
systemerror(:setenv, ret == 0)
end
end

function _unsetenv(var::AbstractString)
ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Cwstring,Ptr{UInt16}),var,C_NULL)
function _unsetenv(svar::AbstractString)
var = cwstring(svar)
ret = ccall(:SetEnvironmentVariableW,stdcall,Int32,(Ptr{UInt16},Ptr{UInt16}),var,C_NULL)
systemerror(:setenv, ret == 0)
end

Expand Down Expand Up @@ -105,10 +109,10 @@ end
function next(hash::EnvHash, block::Tuple{Ptr{UInt16},Ptr{UInt16}})
pos = block[1]
blk = block[2]
len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos)+1
len = ccall(:wcslen, UInt, (Ptr{UInt16},), pos)
buf = Array(UInt16, len)
unsafe_copy!(pointer(buf), pos, len)
env = utf8(UTF16String(buf))
env = UTF8String(utf16to8(buf))
m = match(r"^(=?[^=]+)=(.*)$"s, env)
if m === nothing
error("malformed environment entry: $env")
Expand Down
14 changes: 8 additions & 6 deletions base/file.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,19 +218,21 @@ function tempdir()
if lentemppath >= length(temppath) || lentemppath == 0
error("GetTempPath failed: $(Libc.FormatMessage())")
end
resize!(temppath,lentemppath+1)
return utf8(UTF16String(temppath))
resize!(temppath,lentemppath)
return UTF8String(utf16to8(temppath))
end
tempname(uunique::UInt32=UInt32(0)) = tempname(tempdir(), uunique)
const temp_prefix = cwstring("jl_")
function tempname(temppath::AbstractString,uunique::UInt32)
tempp = cwstring(temppath)
tname = Array(UInt16,32767)
uunique = ccall(:GetTempFileNameW,stdcall,UInt32,(Cwstring,Ptr{UInt16},UInt32,Ptr{UInt16}), temppath,utf16("jul"),uunique,tname)
uunique = ccall(:GetTempFileNameW,stdcall,UInt32,(Ptr{UInt16},Ptr{UInt16},UInt32,Ptr{UInt16}), tempp,temp_prefix,uunique,tname)
lentname = findfirst(tname,0)-1
if uunique == 0 || lentname <= 0
error("GetTempFileName failed: $(Libc.FormatMessage())")
end
resize!(tname,lentname+1)
return utf8(UTF16String(tname))
resize!(tname,lentname)
return UTF8String(utf16to8(tname))
end
function mktemp(parent=tempdir())
filename = tempname(parent, UInt32(0))
Expand All @@ -243,7 +245,7 @@ function mktempdir(parent=tempdir())
seed += 1
end
filename = tempname(parent, seed)
ret = ccall(:_wmkdir, Int32, (Ptr{UInt16},), utf16(filename))
ret = ccall(:_wmkdir, Int32, (Ptr{UInt16},), cwstring(filename))
if ret == 0
return filename
end
Expand Down
11 changes: 7 additions & 4 deletions base/filesystem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,13 @@ export File,
S_IRGRP, S_IWGRP, S_IXGRP, S_IRWXG,
S_IROTH, S_IWOTH, S_IXOTH, S_IRWXO

import Base: uvtype, uvhandle, eventloop, fd, position, stat, close,
write, read, unsafe_write, unsafe_read, readavailable, read!,
isopen, show, seek, seekend, skip, eof, nb_available,
check_open, _sizeof_uv_fs, uv_error, UVError
import Base:
UVError, _sizeof_uv_fs, check_open, close, eof, eventloop, fd, isopen,
nb_available, position, read, read!, readavailable, seek, seekend, show,
skip, stat, unsafe_read, unsafe_write, utf16to8, utf8to16, uv_error,
uvhandle, uvtype, write

@windows_only import Base: cwstring

include("path.jl")
include("stat.jl")
Expand Down
10 changes: 6 additions & 4 deletions base/interactiveutil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,13 @@ end
end
systemerror(:OpenClipboard, 0==ccall((:OpenClipboard, "user32"), stdcall, Cint, (Ptr{Void},), C_NULL))
systemerror(:EmptyClipboard, 0==ccall((:EmptyClipboard, "user32"), stdcall, Cint, ()))
x_u16 = utf16(x)
x_u16 = cwstring(x)
# copy data to locked, allocated space
p = ccall((:GlobalAlloc, "kernel32"), stdcall, Ptr{UInt16}, (UInt16, Int32), 2, sizeof(x_u16)+2)
p = ccall((:GlobalAlloc, "kernel32"), stdcall, Ptr{UInt16}, (UInt16, Int32), 2, sizeof(x_u16))
systemerror(:GlobalAlloc, p==C_NULL)
plock = ccall((:GlobalLock, "kernel32"), stdcall, Ptr{UInt16}, (Ptr{UInt16},), p)
systemerror(:GlobalLock, plock==C_NULL)
ccall(:memcpy, Ptr{UInt16}, (Ptr{UInt16},Ptr{UInt16},Int), plock, x_u16, sizeof(x_u16)+2)
ccall(:memcpy, Ptr{UInt16}, (Ptr{UInt16},Ptr{UInt16},Int), plock, x_u16, sizeof(x_u16))
systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{Void},), plock))
pdata = ccall((:SetClipboardData, "user32"), stdcall, Ptr{UInt16}, (UInt32, Ptr{UInt16}), 13, p)
systemerror(:SetClipboardData, pdata!=p)
Expand All @@ -152,7 +152,9 @@ end
systemerror(:CloseClipboard, 0==ccall((:CloseClipboard, "user32"), stdcall, Cint, ()))
plock = ccall((:GlobalLock, "kernel32"), stdcall, Ptr{UInt16}, (Ptr{UInt16},), pdata)
systemerror(:GlobalLock, plock==C_NULL)
s = utf8(utf16(plock))
len = 0
while unsafe_load(plock, len+1) != 0; len += 1; end
s = UTF8String(utf16to8(pointer_to_array(plock, len)))
systemerror(:GlobalUnlock, 0==ccall((:GlobalUnlock, "kernel32"), stdcall, Cint, (Ptr{UInt16},), plock))
return s
end
Expand Down
5 changes: 3 additions & 2 deletions base/libc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ export FILE, TmStruct, strftime, strptime, getpid, gethostname, free, malloc, ca
errno, strerror, flush_cstdio, systemsleep, time
@windows_only export GetLastError, FormatMessage

import Base: utf16to8

include(string(length(Core.ARGS)>=2?Core.ARGS[2]:"","errno_h.jl")) # include($BUILDROOT/base/errno_h.jl)

## RawFD ##
Expand Down Expand Up @@ -258,11 +260,10 @@ function FormatMessage end
C_NULL, e, 0, lpMsgBuf, 0, C_NULL)
p = lpMsgBuf[1]
len == 0 && return utf8("")
len = len + 1
buf = Array(UInt16, len)
unsafe_copy!(pointer(buf), p, len)
ccall(:LocalFree,stdcall,Ptr{Void},(Ptr{Void},),p)
return utf8(UTF16String(buf))
return UTF8String(utf16to8(buf))
end
end

Expand Down
41 changes: 18 additions & 23 deletions base/path.jl
Original file line number Diff line number Diff line change
Expand Up @@ -124,36 +124,31 @@ normpath(a::AbstractString, b::AbstractString...) = normpath(joinpath(a,b...))
abspath(a::AbstractString) = normpath(isabspath(a) ? a : joinpath(pwd(),a))
abspath(a::AbstractString, b::AbstractString...) = abspath(joinpath(a,b...))

@windows_only realpath(path::AbstractString) = realpath(utf16(path))
@windows_only function realpath(path::UTF16String)
p::UInt32 = sizeof(path)>>1
@windows_only function realpath(path::AbstractString)
path = cwstring(path)
buf = zeros(UInt16, length(path))
while true
buf = zeros(UInt16, p + 1)
p = ccall((:GetFullPathNameW, "kernel32"), stdcall,
UInt32, (Cwstring, UInt32, Ptr{UInt16}, Ptr{Void}),
n = ccall((:GetFullPathNameW, "kernel32"), stdcall,
UInt32, (Ptr{UInt16}, UInt32, Ptr{UInt16}, Ptr{Void}),
path, length(buf), buf, C_NULL)
systemerror(:realpath, p == 0)
if (p < length(buf))
resize!(buf, p + 1)
return utf8(UTF16String(buf))
end
systemerror(:realpath, n == 0)
x = n < length(buf) # is the buffer big enough?
resize!(buf, n) # shrink if x, grow if !x
x && return UTF8String(utf16to8(buf))
end
end

@windows_only longpath(path::AbstractString) = longpath(utf16(path))
@windows_only function longpath(path::UTF16String)
p::UInt32 = sizeof(path)>>1
@windows_only function longpath(path::AbstractString)
path = cwstring(path)
buf = zeros(UInt16, length(path))
while true
buf = zeros(UInt16, p + 1)
p = ccall((:GetLongPathNameW, "kernel32"), stdcall, UInt32,
(Cwstring, Ptr{UInt16}, UInt32),
n = ccall((:GetLongPathNameW, "kernel32"), stdcall,
UInt32, (Ptr{UInt16}, Ptr{UInt16}, UInt32),
path, buf, length(buf))
systemerror(:longpath, p == 0)
# Buffer wasn't big enough, in which case `p` is the necessary buffer size
if (p < length(buf))
resize!(buf, p + 1)
return utf8(UTF16String(buf))
end
systemerror(:longpath, n == 0)
x = n < length(buf) # is the buffer big enough?
resize!(buf, n) # shrink if x, grow if !x
x && return UTF8String(utf16to8(buf))
end
end

Expand Down
Loading

0 comments on commit 103db50

Please sign in to comment.