Eigen backend optimization for Dense, GRU and LSTM layers #108

53 changes: 34 additions & 19 deletions RTNeural/dense/dense_eigen.h
@@ -19,11 +19,12 @@ class Dense : public Layer<T>
Dense(int in_size, int out_size)
: Layer<T>(in_size, out_size)
{
weights = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>::Zero(out_size, in_size);
bias = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>::Zero(out_size, 1);
weights = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>::Zero(out_size, in_size + 1);

inVec = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>::Zero(in_size, 1);
outVec = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>::Zero(out_size, 1);
inVec = Eigen::Matrix<T, Eigen::Dynamic, 1>::Zero(in_size + 1);
outVec = Eigen::Matrix<T, Eigen::Dynamic, 1>::Zero(out_size);

inVec(in_size, 0) = (T)1;
}

Dense(std::initializer_list<int> sizes)
@@ -49,11 +50,17 @@ class Dense : public Layer<T>
/** Performs forward propagation for this layer. */
inline void forward(const T* input, T* out) noexcept override
{
inVec = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>, RTNeuralEigenAlignment>(
input, Layer<T>::in_size, 1);
outVec.noalias() = weights * inVec + bias;
for (int i = 0; i < Layer<T>::in_size; ++i)
inVec(i, 0) = input[i];

/**
* out = | w b | * | input |
* | 1 |
*/
outVec.noalias() = weights * inVec;

std::copy(outVec.data(), outVec.data() + Layer<T>::out_size, out);
for (int i = 0; i < Layer<T>::out_size; ++i)
out[i] = outVec(i, 0);
}

/**
@@ -89,18 +96,17 @@ class Dense : public Layer<T>
void setBias(const T* b)
{
for(int i = 0; i < Layer<T>::out_size; ++i)
bias(i, 0) = b[i];
weights(i, Layer<T>::in_size) = b[i];
}

/** Returns the weights value at the given indices. */
T getWeight(int i, int k) const noexcept { return weights(i, k); }

/** Returns the bias value at the given index. */
T getBias(int i) const noexcept { return bias(i, 0); }
T getBias(int i) const noexcept { return weights(i, Layer<T>::in_size); }

private:
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> weights;
Eigen::Matrix<T, Eigen::Dynamic, 1> bias;

Eigen::Matrix<T, Eigen::Dynamic, 1> inVec;
Eigen::Matrix<T, Eigen::Dynamic, 1> outVec;
@@ -114,8 +120,9 @@ class Dense : public Layer<T>
template <typename T, int in_sizet, int out_sizet>
class DenseT
{
using vec_type = Eigen::Matrix<T, out_sizet, 1>;
using mat_type = Eigen::Matrix<T, out_sizet, in_sizet>;
using out_vec_type = Eigen::Matrix<T, out_sizet, 1>;
using in_vec_type = Eigen::Matrix<T, in_sizet + 1, 1>;
using mat_type = Eigen::Matrix<T, out_sizet, in_sizet + 1>;

public:
static constexpr auto in_size = in_sizet;
@@ -125,8 +132,9 @@ class DenseT
: outs(outs_internal)
{
weights = mat_type::Zero();
bias = vec_type::Zero();
outs = vec_type::Zero();
ins_internal = in_vec_type::Zero();
ins_internal(in_size, 0) = (T)1;
outs = out_vec_type::Zero();
}

/** Returns the name of this layer. */
@@ -141,7 +149,14 @@ class DenseT
/** Performs forward propagation for this layer. */
inline void forward(const Eigen::Matrix<T, in_size, 1>& ins) noexcept
{
outs.noalias() = weights * ins + bias;
for (int i = 0; i < in_size; ++i)
ins_internal(i, 0) = ins(i, 0);
Owner commented on lines +152 to +153:

It would be great if there were a way to do this kind of "one-multiply" dense layer operation without requiring the copy here... Unfortunately I can't think of an easy way at the moment.

Maybe it would be possible to set up something at a higher level which makes sure that all the layer input/output matrices have enough memory available to be extended to be one row longer? That way we could do something like an Eigen::Map to "grow" the input vector by one row, and just put a 1 in that row, which would avoid the copy. That seems a bit cumbersome though, so it would be cool if there were an easier way.
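
For reference, a minimal standalone sketch of that Eigen::Map idea (not part of this PR; the buffer and variable names are hypothetical) might look something like this:

#include <Eigen/Dense>
#include <iostream>

int main()
{
    constexpr int in_size = 3;
    constexpr int out_size = 2;

    // Shared storage that is one element longer than the layer input;
    // the last entry is pinned to 1 so it can act as the bias multiplier.
    Eigen::Matrix<float, in_size + 1, 1> sharedBuffer;
    sharedBuffer(in_size) = 1.0f;

    // The upstream layer would write its output directly through this map...
    Eigen::Map<Eigen::Matrix<float, in_size, 1>> prevOut(sharedBuffer.data());
    prevOut << 0.5f, -1.0f, 2.0f;

    // ...so the dense layer can multiply by the combined [W | b] matrix
    // with no per-sample copy of the input.
    Eigen::Matrix<float, out_size, in_size + 1> weights;
    weights << 1.0f, 0.0f, 1.0f, 0.1f,
               0.0f, 2.0f, -1.0f, 0.2f;

    Eigen::Matrix<float, out_size, 1> out = weights * sharedBuffer;
    std::cout << out.transpose() << std::endl; // expected: 2.6 -3.8
    return 0;
}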


/**
* out = | w b | * | input |
* | 1 |
*/
outs.noalias() = weights * ins_internal;
}

/**
@@ -177,16 +192,16 @@ class DenseT
void setBias(const T* b)
{
for(int i = 0; i < out_size; ++i)
bias(i, 0) = b[i];
weights(i, in_size) = b[i];
}

Eigen::Map<vec_type, RTNeuralEigenAlignment> outs;
Eigen::Map<out_vec_type, RTNeuralEigenAlignment> outs;

private:
T outs_internal alignas(RTNEURAL_DEFAULT_ALIGNMENT)[out_size];
in_vec_type ins_internal;

mat_type weights;
vec_type bias;
};

} // namespace RTNeural
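
The dense-layer changes above fold the bias into an extra column of the weights matrix so the forward pass becomes a single matrix multiply. A quick standalone sanity check of that identity (my own example, not taken from the PR) confirms that [W | b] * [x; 1] matches W * x + b:

#include <Eigen/Dense>
#include <iostream>

int main()
{
    Eigen::Matrix<float, 2, 3> W;
    W << 1.0f, 2.0f, 3.0f,
         4.0f, 5.0f, 6.0f;
    Eigen::Vector2f b(0.5f, -0.5f);
    Eigen::Vector3f x(1.0f, 0.0f, -1.0f);

    // Combined [W | b] matrix and extended [x; 1] input vector.
    Eigen::Matrix<float, 2, 4> Wb;
    Wb << W, b;
    Eigen::Vector4f x1;
    x1 << x, 1.0f;

    std::cout << (W * x + b).transpose() << "\n"; // -1.5 -2.5
    std::cout << (Wb * x1).transpose() << "\n";   // -1.5 -2.5 (identical)
    return 0;
}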
197 changes: 142 additions & 55 deletions RTNeural/gru/gru_eigen.h
@@ -29,7 +29,8 @@ class GRULayer : public Layer<T>
/** Resets the state of the GRU. */
void reset() override
{
std::fill(ht1.data(), ht1.data() + Layer<T>::out_size, (T)0);
extendedHt1.setZero();
extendedHt1(Layer<T>::out_size) = (T)1;
}

/** Returns the name of this layer. */
@@ -38,19 +39,53 @@ class GRULayer : public Layer<T>
/** Performs forward propagation for this layer. */
inline void forward(const T* input, T* h) noexcept override
{
inVec = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>, RTNeuralEigenAlignment>(
input, Layer<T>::in_size, 1);

zVec.noalias() = wVec_z * inVec + uVec_z * ht1 + bVec_z.col(0) + bVec_z.col(1);
rVec.noalias() = wVec_r * inVec + uVec_r * ht1 + bVec_r.col(0) + bVec_r.col(1);
sigmoid(zVec);
sigmoid(rVec);

cVec.noalias() = wVec_c * inVec + rVec.cwiseProduct(uVec_c * ht1 + bVec_c.col(1)) + bVec_c.col(0);
for (int i = 0; i < Layer<T>::in_size; ++i)
{
extendedInVec(i) = input[i];
}

/**
* | Wz bz[0] | | input | | Wz * input + bz[0] |
* alpha = | Wr br[0] | * | 1 | = | Wr * input + br[0] |
* | Wc bc[0] | | Wc * input + bc[0] |
*
* | Uz bz[1] | | h(t-1) | | Uz * h(t-1) + bz[1] |
* beta = | Ur br[1] | * | 1 | = | Ur * h(t-1) + br[1] |
* | Uc bc[1] | | Uc * h(t-1) + bc[1] |
*/
alphaVec.noalias() = wCombinedWeights * extendedInVec;
betaVec.noalias() = uCombinedWeights * extendedHt1;

/**
* gamma = sigmoid( | z | = sigmoid(alpha[0 : 2*out_sizet] + beta[0 : 2*out_sizet])
* | r | )
*/
gammaVec.noalias() = alphaVec.segment(0, 2 * Layer<T>::out_size) +
betaVec.segment(0, 2 * Layer<T>::out_size);
sigmoid(gammaVec);

/**
* c = tanh( alpha[2*out_sizet : 3*out_sizet] + r.cwiseProduct(beta[2*out_sizet : 3*out_sizet] )
* i.e. c = tanh( Wc * input + bc[0] + r.cwiseProduct(Uc * h(t-1) + bc[1]) )
*/
cVec.noalias() = alphaVec.segment(2 * Layer<T>::out_size, Layer<T>::out_size) +
gammaVec.segment(Layer<T>::out_size, Layer<T>::out_size).cwiseProduct(
betaVec.segment(2 * Layer<T>::out_size, Layer<T>::out_size));
cVec = cVec.array().tanh();

ht1 = (ones - zVec).cwiseProduct(cVec) + zVec.cwiseProduct(ht1);
std::copy(ht1.data(), ht1.data() + Layer<T>::out_size, h);
/**
* h(t-1) = (1 - z).cwiseProduct(c) + z.cwiseProduct(h(t-1))
* = c - z.cwiseProduct(c) + z.cwiseProduct(h(t-1))
* = c + z.cwiseProduct(h(t-1) - c)
*/
extendedHt1.segment(0, Layer<T>::out_size) =
cVec + gammaVec.segment(0, Layer<T>::out_size).cwiseProduct(
extendedHt1.segment(0, Layer<T>::out_size) - cVec);

for (int i = 0; i < Layer<T>::out_size; ++i)
{
h[i] = extendedHt1(i);
}
}

/**
@@ -88,23 +123,29 @@ class GRULayer : public Layer<T>
T getBVal(int i, int k) const noexcept;

private:
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wVec_z;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wVec_r;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wVec_c;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uVec_z;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uVec_r;
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uVec_c;
Eigen::Matrix<T, Eigen::Dynamic, 2> bVec_z;
Eigen::Matrix<T, Eigen::Dynamic, 2> bVec_r;
Eigen::Matrix<T, Eigen::Dynamic, 2> bVec_c;

Eigen::Matrix<T, Eigen::Dynamic, 1> ht1;
Eigen::Matrix<T, Eigen::Dynamic, 1> zVec;
Eigen::Matrix<T, Eigen::Dynamic, 1> rVec;
// Kernels
// | Wz bz0 |
// | Wr br0 |
// | Wc bc0 |
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wCombinedWeights;

// | Uz bz1 |
// | Ur br1 |
// | Uc bc1 |
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uCombinedWeights;

// Input vec
Eigen::Matrix<T, Eigen::Dynamic, 1> extendedInVec;

// h(t-1) vec
Eigen::Matrix<T, Eigen::Dynamic, 1> extendedHt1;

// Scratch memory
Eigen::Matrix<T, Eigen::Dynamic, 1> alphaVec;
Eigen::Matrix<T, Eigen::Dynamic, 1> betaVec;
Eigen::Matrix<T, Eigen::Dynamic, 1> gammaVec;
Eigen::Matrix<T, Eigen::Dynamic, 1> cVec;

Eigen::Matrix<T, Eigen::Dynamic, 1> inVec;
Eigen::Matrix<T, Eigen::Dynamic, 1> ones;
};

//====================================================
@@ -119,12 +160,16 @@ class GRULayer : public Layer<T>
template <typename T, int in_sizet, int out_sizet, SampleRateCorrectionMode sampleRateCorr = SampleRateCorrectionMode::None>
class GRULayerT
{
using b_type = Eigen::Matrix<T, out_sizet, 1>;
using k_type = Eigen::Matrix<T, out_sizet, in_sizet>;
using r_type = Eigen::Matrix<T, out_sizet, out_sizet>;

using in_type = Eigen::Matrix<T, in_sizet, 1>;
using extended_in_type = Eigen::Matrix<T, in_sizet + 1, 1>;
using out_type = Eigen::Matrix<T, out_sizet, 1>;
using extended_out_type = Eigen::Matrix<T, out_sizet + 1, 1>;

using w_k_type = Eigen::Matrix<T, out_sizet * 3, in_sizet + 1>;
using u_k_type = Eigen::Matrix<T, out_sizet * 3, out_sizet + 1>;

using three_out_type = Eigen::Matrix<T, out_sizet * 3, 1>;
using two_out_type = Eigen::Matrix<T, out_sizet * 2, 1>;

public:
static constexpr auto in_size = in_sizet;
@@ -154,12 +199,48 @@ class GRULayerT
/** Performs forward propagation for this layer. */
inline void forward(const in_type& ins) noexcept
{
zVec.noalias() = sigmoid(wVec_z * ins + uVec_z * outs + bVec_z);
rVec.noalias() = sigmoid(wVec_r * ins + uVec_r * outs + bVec_r);

cVec.noalias() = wVec_c * ins + rVec.cwiseProduct(uVec_c * outs + bVec_c1) + bVec_c0;
for (int i = 0; i < in_sizet; ++i)
{
extendedInVec(i) = ins(i);
}

/**
* | Wz bz[0] | | input | | Wz * input + bz[0] |
* alpha = | Wr br[0] | * | 1 | = | Wr * input + br[0] |
* | Wc bc[0] | | Wc * input + bc[0] |
*
* | Uz bz[1] | | h(t-1) | | Uz * h(t-1) + bz[1] |
* beta = | Ur br[1] | * | 1 | = | Ur * h(t-1) + br[1] |
* | Uc bc[1] | | Uc * h(t-1) + bc[1] |
*/
alphaVec.noalias() = wCombinedWeights * extendedInVec;
betaVec.noalias() = uCombinedWeights * extendedHt1;

/**
* gamma = sigmoid( | z | = sigmoid(alpha[0 : 2*out_sizet] + beta[0 : 2*out_sizet])
* | r | )
*/
gammaVec = sigmoid(alphaVec.segment(0, 2 * out_sizet) +
betaVec.segment(0, 2 * out_sizet));

/**
* c = tanh( alpha[2*out_sizet : 3*out_sizet] + r.cwiseProduct(beta[2*out_sizet : 3*out_sizet] )
* i.e. c = tanh( Wc * input + bc[0] + r.cwiseProduct(Uc * h(t-1) + bc[1]) )
*/
cVec.noalias() = alphaVec.segment(2 * out_sizet, out_sizet) +
gammaVec.segment(out_sizet, out_sizet).cwiseProduct(
betaVec.segment(2 * out_sizet, out_sizet));
cVec = cVec.array().tanh();

/**
* h(t-1) = (1 - z).cwiseProduct(c) + z.cwiseProduct(h(t-1))
* = c - z.cwiseProduct(c) + z.cwiseProduct(h(t-1))
* = c + z.cwiseProduct(h(t-1) - c)
*/
extendedHt1.segment(0, out_sizet) =
cVec + gammaVec.segment(0, out_sizet).cwiseProduct(
extendedHt1.segment(0, out_sizet) - cVec);

computeOutput();
}

@@ -193,16 +274,27 @@ class GRULayerT
inline std::enable_if_t<srCorr == SampleRateCorrectionMode::None, void>
computeOutput() noexcept
{
outs = (out_type::Ones() - zVec).cwiseProduct(cVec) + zVec.cwiseProduct(outs);
for (int i = 0; i < out_sizet; ++i)
{
outs(i) = extendedHt1(i);
}
Owner commented on lines +277 to +280:

Similarly, I think there should be a way to avoid this copy... I'll try out a couple of ideas.
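
One possible direction, as a rough sketch of my own rather than something this PR implements: let the public outs map alias the first out_size entries of the extended hidden-state vector, so writing the new state is immediately visible as the layer output with no copy. Whether the mapped segment satisfies the required alignment would still need checking; the names below are hypothetical.

#include <Eigen/Dense>
#include <iostream>

int main()
{
    constexpr int out_size = 4;

    // Extended hidden state [h; 1], as in the PR.
    Eigen::Matrix<float, out_size + 1, 1> extendedHt1;
    extendedHt1.setZero();
    extendedHt1(out_size) = 1.0f;

    // outs aliases the first out_size entries instead of owning storage.
    Eigen::Map<Eigen::Matrix<float, out_size, 1>> outs(extendedHt1.data());

    // Updating the hidden state is now visible through outs with no copy.
    extendedHt1.head(out_size) << 0.1f, 0.2f, 0.3f, 0.4f;
    std::cout << outs.transpose() << std::endl; // 0.1 0.2 0.3 0.4
    return 0;
}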

}

template <SampleRateCorrectionMode srCorr = sampleRateCorr>
inline std::enable_if_t<srCorr != SampleRateCorrectionMode::None, void>
computeOutput() noexcept
{
outs_delayed[delayWriteIdx] = (out_type::Ones() - zVec).cwiseProduct(cVec) + zVec.cwiseProduct(outs);
for (int i = 0; i < out_sizet; ++i)
{
outs_delayed[delayWriteIdx][i] = extendedHt1(i);
}

processDelay(outs_delayed, outs, delayWriteIdx);

for (int i = 0; i < out_sizet; ++i)
{
extendedHt1(i) = outs(i);
}
}

template <typename OutVec, SampleRateCorrectionMode srCorr = sampleRateCorr>
Expand All @@ -225,30 +317,25 @@ class GRULayerT
delayVec[j] = delayVec[j + 1];
}

static inline out_type sigmoid(const out_type& x) noexcept
template <typename Vector>
static inline auto sigmoid(const Vector& x) noexcept
{
return (T)1 / (((T)-1 * x.array()).array().exp() + (T)1);
}

// kernel weights
k_type wVec_z;
k_type wVec_r;
k_type wVec_c;

// recurrent weights
r_type uVec_z;
r_type uVec_r;
r_type uVec_c;

// biases
b_type bVec_z;
b_type bVec_r;
b_type bVec_c0;
b_type bVec_c1;

out_type zVec;
out_type rVec;
w_k_type wCombinedWeights;
u_k_type uCombinedWeights;

// scratch memory
three_out_type alphaVec;
three_out_type betaVec;
two_out_type gammaVec;

// input, output, memory
out_type cVec;
extended_in_type extendedInVec;
extended_out_type extendedHt1;

// needed for delays when doing sample rate correction
std::vector<out_type> outs_delayed;