Merge pull request JuliaLang#22 from AlexanderFabisch/one_of_c_encoding
Add one-hot scheme / one-of-C encoding
johnmyleswhite committed Sep 6, 2013
2 parents 5f298c9 + 2a6adf8 commit 79ba6ac
Showing 4 changed files with 75 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -20,6 +20,7 @@ Basic statistics functions for Julia
* `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD is a consistent estimator of the standard deviation for normally distributed data.
* `midrange(a)`: Compute the midpoint of the range of `a` (i.e. `(max(a) + min(a)) / 2`).
* `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
* `indicators(a)`: Encode categories using a one-hot scheme, also known as one-of-C encoding, an indicator matrix, or dummy variables. Optionally, you can provide the list of possible values, e.g. `["A", "B", "C"]` or `[1:3]` (a short usage sketch follows this list).
* `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
* `quantile(a)`: Compute any desired quantile of `a`.
* `quartile(a)`: Compute the quartiles of `a`.
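
A minimal usage sketch of `indicators` (by default the output is a dense `Float64` matrix with one row per category and one column per sample; pass `sparse=true` for a sparse matrix):

    indicators([1, 2, 1, 3, 2])                   # 3x5 matrix; rows correspond to categories 1, 2, 3
    indicators(["A", "B", "A"], ["A", "B", "C"])  # 3x3 matrix; the "C" row is all zeros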
1 change: 1 addition & 0 deletions src/Stats.jl
@@ -45,6 +45,7 @@ module Stats
confint,
ecdf,
findat,
indicators,
inverse_rle,
loglikelihood,
nobs,
54 changes: 54 additions & 0 deletions src/others.jl
@@ -97,6 +97,60 @@ function ecdf{T}(X::AbstractVector{T})
return e
end

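# One-hot / one-of-C encoding of a feature matrix: each row of `input` is a
# feature and each column a sample; every feature contributes one block of
# indicator rows (one row per category) to the output. For example,
# indicators([1 2 3; 1 1 1; 2 1 1]) stacks blocks of 3, 1 and 2 rows and
# equals [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0].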
function indicators{T}(input::AbstractMatrix{T},
categories::Array{Any,1}={};
sparse::Bool=false)
nfeatures, nsamples = size(input)
if length(categories) != 0 && length(categories) != nfeatures
error("You must provide either categories for each feature or no categories")
end
internal_categories = copy(categories)
noutrows = 0
if length(internal_categories) != nfeatures
for i in 1:nfeatures
push!(internal_categories, sort(unique(input[i, :])))
end
end
for i in 1:nfeatures
noutrows += length(internal_categories[i])
end
if sparse
output = spzeros(noutrows, nsamples)
else
output = zeros(noutrows, nsamples)
end
offset = 1
for i in 1:nfeatures
indicators!(output, offset, slice(input, i, :), internal_categories[i])
offset += length(internal_categories[i])
end
return output
end

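# One-hot encoding of a single feature vector: one output row per category
# (by default the sorted unique values of `input`) and one column per sample.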
function indicators{T}(input::AbstractVector{T},
categories::Array{T,1}=sort(unique(input));
sparse::Bool=false)
if sparse
output = spzeros(length(categories), length(input))
else
output = zeros(length(categories), length(input))
end
indicators!(output, 1, input, categories)
return output
end

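# In-place helper: maps each category to a row index and, for each sample i,
# writes a single one at output[offset - 1 + row, i]. For example,
# indicators!(zeros(3, 2), 1, [2, 3], [1, 2, 3]) fills the matrix as
# [0 0; 1 0; 0 1].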
function indicators!{S<:Real,T}(output::AbstractArray{S},
offset::Integer,
input::AbstractVector{T},
categories::Array{T,1}=sort(unique(input)))
indices = (T=>Integer)[categories[i]=>i for i in 1:length(categories)]
const lo = offset-1
for i in 1:length(input)
output[indices[input[i]]+lo, i] = one(S)
end
return
end

abstract StatisticalModel

coef(obj::StatisticalModel) = error("No method defined")
19 changes: 19 additions & 0 deletions test/01.jl
@@ -31,3 +31,22 @@ fnecdf = ecdf(randn(10000000))

fnecdf = ecdf([0.5])
@test fnecdf([zeros(5000), ones(5000)]) == [zeros(5000), ones(5000)]

y = [1, 2, 1, 3, 2]
expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
@test indicators(y) == expected
@test indicators(y, [1:3], sparse=true) == expected
y = [2, 3, 2, 4, 3]
@test indicators(y) == expected
X = [1 2 3; 1 1 1; 2 1 1]
expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
@test indicators(X) == expected
expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
y = ["A", "B", "C", "B", "A"]
expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
X = ["A" "B" "C"; "B" "A" "C"]
cats = ["A", "B", "C", "D"]
expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
@test indicators(X, {cats, cats}, sparse=false) == expected
