From aecbaccced88dc7f0fde5cc016fb45c5ee9c693b Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Tue, 3 Sep 2013 22:49:59 +0200
Subject: [PATCH 1/6] Add one-hot scheme aka one-of-C encoding

---
 README.md | 1 +
 src/Stats.jl | 1 +
 src/others.jl | 13 +++++++++++++
 test/01.jl | 4 ++++
 4 files changed, 19 insertions(+)

diff --git a/README.md b/README.md
index 9ebd5d47ff48f..b1398ed072728 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
+* `onehot(a)`: Encode categories using one-hot scheme aka one-of-C encoding. Assumes that categories are encoded as integers in the range [0, c-1], where c is the number of categories (or classes).
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/Stats.jl b/src/Stats.jl
index cc091672dac97..908f633e04e5a 100644
--- a/src/Stats.jl
+++ b/src/Stats.jl
@@ -48,6 +48,7 @@ module Stats
         inverse_rle,
         loglikelihood,
         nobs,
+        onehot,
         predict,
         residuals,
         model_response,
diff --git a/src/others.jl b/src/others.jl
index 8ff5e745f4399..16a09cc660e66 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -97,6 +97,19 @@ function ecdf{T}(X::AbstractVector{T})
     return e
 end
 
+# Encode categories using one-hot scheme aka one-of-C encoding
+# Assumes that categories are encoded as integers in the range [0, c-1],
+# where c is the number of categories (or classes)
+function onehot{T<:Real}(y::AbstractVector{T})
+    const n = length(y)
+    const c = max(y)+1
+    Y = zeros(T, c, n)
+    for i in 1:n
+        Y[y[i]+1, i] = one(T)
+    end
+    return Y
+end
+
 abstract StatisticalModel
 
 coef(obj::StatisticalModel) = error("No method defined")
diff --git a/test/01.jl b/test/01.jl
index 6a4a3f81d1db0..4931cf0dd6831 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -31,3 +31,7 @@ fnecdf = ecdf(randn(10000000))
 fnecdf = ecdf([0.5])
 @test fnecdf([zeros(5000), ones(5000)]) == [zeros(5000), ones(5000)]
+
+y = [0, 1, 0, 2, 1]
+@test onehot(y) == [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
+

From c033cbb0324e1c8f6be2043ebfed109fc73fa4fb Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Wed, 4 Sep 2013 09:13:12 +0200
Subject: [PATCH 2/6] Apply requested changes

* pass range of categories as argument
* output can be sparse
---
 README.md | 2 +-
 src/Stats.jl | 2 +-
 src/others.jl | 16 +++++++++++-----
 test/01.jl | 9 ++++++---
 4 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index b1398ed072728..071d00f913236 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
-* `onehot(a)`: Encode categories using one-hot scheme aka one-of-C encoding. Assumes that categories are encoded as integers in the range [0, c-1], where c is the number of categories (or classes).
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Assumes that categories are encoded as integers in the range [1, c], where c is the number of categories (or classes).
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/Stats.jl b/src/Stats.jl
index 908f633e04e5a..af0d77ed4833f 100644
--- a/src/Stats.jl
+++ b/src/Stats.jl
@@ -45,10 +45,10 @@ module Stats
         confint,
         ecdf,
         findat,
+        indicators,
         inverse_rle,
         loglikelihood,
         nobs,
-        onehot,
         predict,
         residuals,
         model_response,
diff --git a/src/others.jl b/src/others.jl
index 16a09cc660e66..5fab5a9af848e 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -98,14 +98,20 @@ function ecdf{T}(X::AbstractVector{T})
 end
 
 # Encode categories using one-hot scheme aka one-of-C encoding
-# Assumes that categories are encoded as integers in the range [0, c-1],
+# Assumes that categories are encoded as integers in the range [1, c],
 # where c is the number of categories (or classes)
-function onehot{T<:Real}(y::AbstractVector{T})
+function indicators{T<:Real}(y::AbstractVector{T},
+                             categories::Range1{T}=min(y):max(y),
+                             sparse::Bool=false)
     const n = length(y)
-    const c = max(y)+1
-    Y = zeros(T, c, n)
+    const c = length(categories)
+    if sparse
+        Y = spzeros(T, c, n)
+    else
+        Y = zeros(T, c, n)
+    end
     for i in 1:n
-        Y[y[i]+1, i] = one(T)
+        Y[y[i]-categories[1]+1, i] = one(T)
     end
     return Y
 end
diff --git a/test/01.jl b/test/01.jl
index 4931cf0dd6831..9bc45a6c0b23e 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -32,6 +32,9 @@ fnecdf = ecdf(randn(10000000))
 fnecdf = ecdf([0.5])
 @test fnecdf([zeros(5000), ones(5000)]) == [zeros(5000), ones(5000)]
 
-y = [0, 1, 0, 2, 1]
-@test onehot(y) == [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
-
+y = [1, 2, 1, 3, 2]
+expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
+@test indicators(y) == expected
+@test indicators(y, 1:3, true) == expected
+y = [2, 3, 2, 4, 3]
+@test indicators(y) == expected

From 3653b0876709ac13eedcfe021ba53ddd7d8f61c6 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Wed, 4 Sep 2013 22:48:31 +0200
Subject: [PATCH 3/6] Indicator matrix for multiple features

---
 README.md | 2 +-
 src/others.jl | 58 +++++++++++++++++++++++++++++++++++++++------------
 test/01.jl | 5 +++++
 3 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 071d00f913236..c792aa7a966dd 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
-* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Assumes that categories are encoded as integers in the range [1, c], where c is the number of categories (or classes).
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Assumes that categories are encoded as integers in the range [1, c], where c is the number of categories (or classes). Optionally, you can provide a range min:max so that min will be encoded as [1 0 ...] and max as [... 0 1].
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/others.jl b/src/others.jl
index 5fab5a9af848e..ca73e9d9622ff 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -97,23 +97,55 @@ function ecdf{T}(X::AbstractVector{T})
     return e
 end
 
-# Encode categories using one-hot scheme aka one-of-C encoding
-# Assumes that categories are encoded as integers in the range [1, c],
-# where c is the number of categories (or classes)
-function indicators{T<:Real}(y::AbstractVector{T},
-                             categories::Range1{T}=min(y):max(y),
-                             sparse::Bool=false)
-    const n = length(y)
-    const c = length(categories)
+function indicators{T<:Real}(input::AbstractMatrix{T},
+                             categories::Array{Any, 1}={},
+                             sparse::Bool=false)
+    internal_categories = copy(categories)
+    nfeatures, nsamples = size(input)
+    nOutputRows = 0
+    if length(internal_categories) != nfeatures
+        for i in 1:nfeatures
+            xmin, xmax = minmax(input[i, :])
+            push!(internal_categories, xmin:xmax)
+        end
+    end
+    for i in 1:nfeatures
+        nOutputRows += length(internal_categories[i])
+    end
     if sparse
-        Y = spzeros(T, c, n)
+        output = spzeros(T, nOutputRows, nsamples)
     else
-        Y = zeros(T, c, n)
+        output = zeros(T, nOutputRows, nsamples)
     end
-    for i in 1:n
-        Y[y[i]-categories[1]+1, i] = one(T)
+    offset = 1
+    for i in 1:nfeatures
+        indicators!(output, offset, slice(input, i, :), internal_categories[i])
+        offset += length(internal_categories[i])
     end
-    return Y
+    return output
+end
+
+function indicators{T<:Real}(input::AbstractVector{T},
+                             categories::Range1{T}=min(input):max(input),
+                             sparse::Bool=false)
+    if sparse
+        output = spzeros(T, length(categories), length(input))
+    else
+        output = zeros(T, length(categories), length(input))
+    end
+    indicators!(output, 1, input, categories)
+    return output
+end
+
+function indicators!{T<:Real}(output::AbstractArray{T},
+                              offset::Integer,
+                              input::AbstractVector{T},
+                              categories::Range1{T}=min(input):max(input))
+    const lo = offset-categories[1]
+    for i in 1:length(input)
+        output[input[i]+lo, i] = one(T)
+    end
+    return
 end
 
 abstract StatisticalModel
 
 coef(obj::StatisticalModel) = error("No method defined")
diff --git a/test/01.jl b/test/01.jl
index 9bc45a6c0b23e..4a6ac1015f492 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -38,3 +38,8 @@ expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
 @test indicators(y, 1:3, true) == expected
 y = [2, 3, 2, 4, 3]
 @test indicators(y) == expected
+X = [1 2 3; 1 1 1; 2 1 1]
+expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
+@test indicators(X) == expected
+expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
+@test indicators(X, {1:3, 1:3, 1:2}) == expected

From 8cccda135dfc93d05fe6202b29e693f5286486e5 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Thu, 5 Sep 2013 21:20:36 +0200
Subject: [PATCH 4/6] Add `indicators` for generic categories

---
 README.md | 2 +-
 src/others.jl | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 test/01.jl | 7 ++++++
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c792aa7a966dd..93b578f3a95c0 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
-* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Assumes that categories are encoded as integers in the range [1, c], where c is the number of categories (or classes). Optionally, you can provide a range min:max so that min will be encoded as [1 0 ...] and max as [... 0 1].
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide either a range min:max for categories that are encoded as integers (so that min will be encoded as [1 0 ...] and max as [... 0 1]) or a list of possible values, e.g. ["A", "B", "C"].
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/others.jl b/src/others.jl
index ca73e9d9622ff..c1b96b16d1c2a 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -100,8 +100,11 @@ end
 function indicators{T<:Real}(input::AbstractMatrix{T},
                              categories::Array{Any, 1}={},
                              sparse::Bool=false)
-    internal_categories = copy(categories)
     nfeatures, nsamples = size(input)
+    if length(categories) != 0 && length(categories) != nfeatures
+        error("You must provide either categories for each feature or no categories")
+    end
+    internal_categories = copy(categories)
     nOutputRows = 0
     if length(internal_categories) != nfeatures
         for i in 1:nfeatures
@@ -148,6 +151,68 @@ function indicators!{T<:Real}(output::AbstractArray{T},
     return
 end
 
+function indicators{T}(input::AbstractMatrix{T},
+                       categories::Array{Any, 1}={},
+                       sparse::Bool=false)
+    nfeatures, nsamples = size(input)
+    if length(categories) != 0 && length(categories) != nfeatures
+        error("You must provide either categories for each feature or no categories")
+    end
+    internal_categories = copy(categories)
+    nOutputRows = 0
+    if length(internal_categories) != nfeatures
+        for i in 1:nfeatures
+            push!(internal_categories, sort(unique(input[i, :])))
+        end
+    end
+    for i in 1:nfeatures
+        nOutputRows += length(internal_categories[i])
+    end
+    if sparse
+        output = spzeros(nOutputRows, nsamples)
+    else
+        output = zeros(nOutputRows, nsamples)
+    end
+    offset = 1
+    for i in 1:nfeatures
+        indicators!(output, offset, slice(input, i, :), internal_categories[i])
+        offset += length(internal_categories[i])
+    end
+    return output
+end
+
+function indicators{T}(input::AbstractVector{T},
+                       categories::Array{T,1}=sort(unique(input)),
+                       sparse::Bool=false)
+    if sparse
+        output = spzeros(length(categories), length(input))
+    else
+        output = zeros(length(categories), length(input))
+    end
+    indicators!(output, 1, input, categories)
+    return output
+end
+
+function indicators!{S<:Real,T}(output::AbstractArray{S},
+                                offset::Integer,
+                                input::AbstractVector{T},
+                                categories::Array{T,1}=sort(unique(input)))
+    indices = (T=>Integer)[categories[i]=>i for i in 1:length(categories)]
+    const lo = offset-1
+    for i in 1:length(input)
+        output[indices[input[i]]+lo, i] = one(S)
+    end
+    return
+end
+
+function indicators!{S<:Real,T}(output::AbstractArray{S},
+                                offset::Integer,
+                                input::AbstractVector{T},
+                                categories::Range1{T}=sort(unique(input)))
+    dict = (T=>Integer)[c[i]=>i for i in 1:length(categories)]
+    println(dict)
+end
+
 abstract StatisticalModel
 
 coef(obj::StatisticalModel) = error("No method defined")
diff --git a/test/01.jl b/test/01.jl
index 4a6ac1015f492..14871f4fbc9b4 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -43,3 +43,10 @@ expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
 @test indicators(X) == expected
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
 @test indicators(X, {1:3, 1:3, 1:2}) == expected
+y = ["A", "B", "C", "B", "A"]
+expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
+@test indicators(y, ["A", "B", "C", "D"], true) == expected
+X = ["A" "B" "C"; "B" "A" "C"]
+cats = ["A", "B", "C", "D"]
+expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
+@test indicators(X, {cats, cats}, false) == expected

From b6214e3ddb5c2baba66723d3e0f778af22337b74 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Thu, 5 Sep 2013 21:25:49 +0200
Subject: [PATCH 5/6] Remove unused code

---
 src/others.jl | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/others.jl b/src/others.jl
index c1b96b16d1c2a..a3db7f666e861 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -205,14 +205,6 @@ function indicators!{S<:Real,T}(output::AbstractArray{S},
     return
 end
 
-function indicators!{S<:Real,T}(output::AbstractArray{S},
-                                offset::Integer,
-                                input::AbstractVector{T},
-                                categories::Range1{T}=sort(unique(input)))
-    dict = (T=>Integer)[c[i]=>i for i in 1:length(categories)]
-    println(dict)
-end
-
 abstract StatisticalModel
 
 coef(obj::StatisticalModel) = error("No method defined")

From 2a6adf8729ef78a7ed7cb9de7018a5e593043860 Mon Sep 17 00:00:00 2001
From: Alexander Fabisch
Date: Fri, 6 Sep 2013 00:29:01 +0200
Subject: [PATCH 6/6] Clean up `indicators`

* sparse is a keyword argument
* remove special handling for numerical types
---
 README.md | 2 +-
 src/others.jl | 66 +++++----------------------------------------
 test/01.jl | 8 +++----
 3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index 93b578f3a95c0..02dd68eb5cf42 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Basic statistics functions for Julia
 * `mad(a)`: Compute the median absolute deviation of `a` with a correction factor, which ensures that the MAD will be a consistent estimator of the mean for normally distributed data.
 * `midrange(a)`: Compute the mid point of the range of `a` (e.g `(max(a) + min(a) / 2)`).
 * `modes(a)`: Compute all modes of `a`. Be warned that every element of an array with no repeated elements is considered a mode.
-* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide either a range min:max for categories that are encoded as integers (so that min will be encoded as [1 0 ...] and max as [... 0 1]) or a list of possible values, e.g. ["A", "B", "C"].
+* `indicators(a)`: Encode categories using one-hot scheme aka one-of-C encoding, indicator matrix or dummy variables. Optionally, you can provide a list of possible values, e.g. ["A", "B", "C"] or [1:3].
 * `percentile(a)`: Compute the percentiles (0%, 10%, ..., 100%) of `a`.
 * `quantile(a)`: Compute any desired quantile of `a`.
 * `quartile(a): Compute the quartiles of `a`.
diff --git a/src/others.jl b/src/others.jl
index a3db7f666e861..86e2e53907286 100644
--- a/src/others.jl
+++ b/src/others.jl
@@ -97,81 +97,27 @@ function ecdf{T}(X::AbstractVector{T})
     return e
 end
 
-function indicators{T<:Real}(input::AbstractMatrix{T},
-                             categories::Array{Any, 1}={},
-                             sparse::Bool=false)
-    nfeatures, nsamples = size(input)
-    if length(categories) != 0 && length(categories) != nfeatures
-        error("You must provide either categories for each feature or no categories")
-    end
-    internal_categories = copy(categories)
-    nOutputRows = 0
-    if length(internal_categories) != nfeatures
-        for i in 1:nfeatures
-            xmin, xmax = minmax(input[i, :])
-            push!(internal_categories, xmin:xmax)
-        end
-    end
-    for i in 1:nfeatures
-        nOutputRows += length(internal_categories[i])
-    end
-    if sparse
-        output = spzeros(T, nOutputRows, nsamples)
-    else
-        output = zeros(T, nOutputRows, nsamples)
-    end
-    offset = 1
-    for i in 1:nfeatures
-        indicators!(output, offset, slice(input, i, :), internal_categories[i])
-        offset += length(internal_categories[i])
-    end
-    return output
-end
-
-function indicators{T<:Real}(input::AbstractVector{T},
-                             categories::Range1{T}=min(input):max(input),
-                             sparse::Bool=false)
-    if sparse
-        output = spzeros(T, length(categories), length(input))
-    else
-        output = zeros(T, length(categories), length(input))
-    end
-    indicators!(output, 1, input, categories)
-    return output
-end
-
-function indicators!{T<:Real}(output::AbstractArray{T},
-                              offset::Integer,
-                              input::AbstractVector{T},
-                              categories::Range1{T}=min(input):max(input))
-    const lo = offset-categories[1]
-    for i in 1:length(input)
-        output[input[i]+lo, i] = one(T)
-    end
-    return
-end
-
 function indicators{T}(input::AbstractMatrix{T},
-                       categories::Array{Any,1}={},
+                       categories::Array{Any,1}={};
                        sparse::Bool=false)
     nfeatures, nsamples = size(input)
     if length(categories) != 0 && length(categories) != nfeatures
         error("You must provide either categories for each feature or no categories")
     end
     internal_categories = copy(categories)
-    nOutputRows = 0
+    noutrows = 0
     if length(internal_categories) != nfeatures
         for i in 1:nfeatures
             push!(internal_categories, sort(unique(input[i, :])))
         end
     end
     for i in 1:nfeatures
-        nOutputRows += length(internal_categories[i])
+        noutrows += length(internal_categories[i])
     end
     if sparse
-        output = spzeros(nOutputRows, nsamples)
+        output = spzeros(noutrows, nsamples)
     else
-        output = zeros(nOutputRows, nsamples)
+        output = zeros(noutrows, nsamples)
     end
     offset = 1
     for i in 1:nfeatures
@@ -182,7 +128,7 @@ function indicators{T}(input::AbstractMatrix{T},
 end
 
 function indicators{T}(input::AbstractVector{T},
-                       categories::Array{T,1}=sort(unique(input)),
+                       categories::Array{T,1}=sort(unique(input));
                        sparse::Bool=false)
     if sparse
         output = spzeros(length(categories), length(input))
diff --git a/test/01.jl b/test/01.jl
index 14871f4fbc9b4..fdcfab0e0bcef 100644
--- a/test/01.jl
+++ b/test/01.jl
@@ -35,18 +35,18 @@ fnecdf = ecdf([0.5])
 y = [1, 2, 1, 3, 2]
 expected = [1 0 0; 0 1 0; 1 0 0; 0 0 1; 0 1 0]'
 @test indicators(y) == expected
-@test indicators(y, 1:3, true) == expected
+@test indicators(y, [1:3], sparse=true) == expected
 y = [2, 3, 2, 4, 3]
 @test indicators(y) == expected
 X = [1 2 3; 1 1 1; 2 1 1]
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 1 1; 1 0 0]
 @test indicators(X) == expected
 expected = [1 0 0; 0 1 0; 0 0 1; 1 1 1; 0 0 0; 0 0 0; 0 1 1; 1 0 0]
-@test indicators(X, {1:3, 1:3, 1:2}) == expected
+@test indicators(X, {[1:3], [1:3], [1:2]}) == expected
 y = ["A", "B", "C", "B", "A"]
 expected = [1.0 0.0 0.0 0.0; 0.0 1.0 0.0 0.0; 0.0 0.0 1.0 0.0; 0.0 1.0 0.0 0.0; 1.0 0.0 0.0 0.0]'
-@test indicators(y, ["A", "B", "C", "D"], true) == expected
+@test indicators(y, ["A", "B", "C", "D"], sparse=true) == expected
 X = ["A" "B" "C"; "B" "A" "C"]
 cats = ["A", "B", "C", "D"]
 expected = [1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0; 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0; 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0]'
-@test indicators(X, {cats, cats}, false) == expected
+@test indicators(X, {cats, cats}, sparse=false) == expected
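
Taken together, the series leaves `indicators` with the behaviour exercised in `test/01.jl`: an optional positional list of categories per feature and `sparse` as a keyword argument. The sketch below restates those calls as a minimal usage example; it assumes the 2013-era Julia syntax used throughout the patches, and the `using Stats` line, the variable names, and the inline comments are illustrative rather than part of the patch series.

    # Minimal usage sketch of the `indicators` API after patch 6 (illustrative).
    using Stats

    # Integer labels; categories default to sort(unique(input)).
    y = [1, 2, 1, 3, 2]
    indicators(y)                      # 3x5 dense matrix, one row per category
    indicators(y, [1:3], sparse=true)  # same encoding, sparse output

    # String labels with an explicit category list ("D" becomes an all-zero row).
    labels = ["A", "B", "C", "B", "A"]
    indicators(labels, ["A", "B", "C", "D"])

    # One feature per matrix row, one category list per feature.
    X = ["A" "B" "C"; "B" "A" "C"]
    cats = ["A", "B", "C", "D"]
    indicators(X, {cats, cats}, sparse=false)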