Merge pull request JuliaLang#22 from AlexanderFabisch/one_of_c_encoding
Add one-hot scheme / one-of-C encoding
johnmyleswhite committed Sep 6, 2013
2 parents 5f298c9 + 2a6adf8 commit 79ba6ac
Showing 4 changed files with 75 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -20,6 +20,7 @@ Basic statistics functions for Julia
* `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD is a consistent estimator of the standard deviation for normally distributed data.
* `midrange(a)`: Compute the midpoint of the range of `a` (i.e. `(max(a) + min(a)) / 2`).
* `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
* `indicators(a)`: Encode categories using a one-hot scheme, also known as one-of-C encoding, an indicator matrix, or dummy variables. Optionally, you can provide the list of possible values, e.g. `["A", "B", "C"]` or `[1:3]` (a short usage sketch follows this list).
* `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
* `quantile(a)`: Compute any desired quantile of `a`.
* `quartile(a)`: Compute the quartiles of `a`.
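
A minimal usage sketch of `indicators` (by default the output is a dense `Float64` matrix with one row per category and one column per sample; pass `sparse=true` for a sparse matrix):

    indicators([1, 2, 1, 3, 2])                   # 3x5 matrix; rows correspond to categories 1, 2, 3
    indicators(["A", "B", "A"], ["A", "B", "C"])  # 3x3 matrix; the "C" row is all zeros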
1 change: 1 addition & 0 deletions src/Stats.jl
@@ -45,6 +45,7 @@ module Stats
confint,
ecdf,
findat,
indicators,
inverse_rle,
loglikelihood,
nobs,
54 changes: 54 additions & 0 deletions src/others.jl
@@ -97,6 +97,60 @@ function ecdf{T}(X::AbstractVector{T})
return e
end

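# One-hot / one-of-C encoding of a feature matrix: each row of `input` is a
# feature and each column a sample; every feature contributes one block of
# indicator rows (one row per category) to the output. For example,
# indicators([1 2 3; 1 1 1; 2 1 1]) stacks blocks of 3, 1 and 2 rows and
# equals [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0].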
function indicators{T}(input::AbstractMatrix{T},
categories::Array{Any,1}={};
sparse::Bool=false)
nfeatures, nsamples = size(input)
if length(categories) != 0 && length(categories) != nfeatures
error("You must provide either categories for each feature or no categories")
end
internal_categories = copy(categories)
noutrows = 0
if length(internal_categories) != nfeatures
for i in 1:nfeatures
push!(internal_categories, sort(unique(input[i, :])))
end
end
for i in 1:nfeatures
noutrows += length(internal_categories[i])
end
if sparse
output = spzeros(noutrows, nsamples)
else
output = zeros(noutrows, nsamples)
end
offset = 1
for i in 1:nfeatures
indicators!(output, offset, slice(input, i, :), internal_categories[i])
offset += length(internal_categories[i])
end
return output
end

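# One-hot encoding of a single feature vector: one output row per category
# (by default the sorted unique values of `input`) and one column per sample.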
function indicators{T}(input::AbstractVector{T},
categories::Array{T,1}=sort(unique(input));
sparse::Bool=false)
if sparse
output = spzeros(length(categories), length(input))
else
output = zeros(length(categories), length(input))
end
indicators!(output, 1, input, categories)
return output
end

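# In-place helper: maps each category to a row index and, for each sample i,
# writes a single one at output[offset - 1 + row, i]. For example,
# indicators!(zeros(3, 2), 1, [2, 3], [1, 2, 3]) fills the matrix as
# [0 0; 1 0; 0 1].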
function indicators!{S<:Real,T}(output::AbstractArray{S},
offset::Integer,
input::AbstractVector{T},
categories::Array{T,1}=sort(unique(input)))
indices = (T=>Integer)[categories[i]=>i for i in 1:length(categories)]
const lo = offset-1
for i in 1:length(input)
output[indices[input[i]]+lo, i] = one(S)
end
return
end

abstract StatisticalModel

coef(obj::StatisticalModel) = error("No method defined")
19 changes: 19 additions & 0 deletions test/01.jl
@@ -31,3 +31,22 @@ fnecdf = ecdf(randn(10000000))

fnecdf = ecdf([0.5])
@test fnecdf([zeros(5000), ones(5000)]) == [zeros(5000), ones(5000)]

y = [1, 2, 1, 3, 2]
expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
@test indicators(y) == expected
@test indicators(y, [1:3], sparse=true) == expected
y = [2, 3, 2, 4, 3]
@test indicators(y) == expected
X = [1 2 3; 1 1 1; 2 1 1]
expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
@test indicators(X) == expected
expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
y = ["A", "B", "C", "B", "A"]
expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
X = ["A" "B" "C"; "B" "A" "C"]
cats = ["A", "B", "C", "D"]
expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
@test indicators(X, {cats, cats}, sparse=false) == expected
