Eigen backend optimization for Dense, GRU and LSTM layers #108

Merged: jatinchowdhury18 merged 12 commits into jatinchowdhury18:main from IHorvalds:experiment/eigen_backend_optimization, Aug 14, 2023.

Changes from all commits (12 commits):
- 2e09a7b Make Dense Eigen faster (IHorvalds)
- 02b19b6 GRU Eigen templated implementation (IHorvalds)
- ff2248c Fix GRU and make LSTM faster (IHorvalds)
- 487ee81 Add explanatory comments (IHorvalds)
- 4f6d911 Remove auto templated parameters (IHorvalds)
- 11e1ca6 Fix comment format (IHorvalds)
- f1d3568 Merge branch 'jatinchowdhury18:main' into experiment/eigen_backend_op… (IHorvalds)
- 77c1f75 Remove unused type alias (IHorvalds)
- c726e13 Fix GRU and LSTM implementations for sample rate correction modes (IHorvalds)
- 29bd1da One more try for the Dense layer (IHorvalds)
- 58d79bb Revert "One more try for the Dense layer" (IHorvalds)
- 4474a38 Merge branch 'jatinchowdhury18:main' into experiment/eigen_backend_op… (IHorvalds)

The diff excerpt below shows the GRU (Eigen backend) changes.
```diff
@@ -29,7 +29,8 @@ class GRULayer : public Layer<T>
     /** Resets the state of the GRU. */
     void reset() override
     {
-        std::fill(ht1.data(), ht1.data() + Layer<T>::out_size, (T)0);
+        extendedHt1.setZero();
+        extendedHt1(Layer<T>::out_size) = (T)1;
     }

     /** Returns the name of this layer. */
```
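For context on the trailing 1 (an explanatory note, not part of the PR): storing the hidden state as an (out_size + 1)-vector whose last entry is pinned to 1 lets the bias ride along as an extra column of a combined weight matrix, so one matrix-vector product computes both the weights and the bias. A minimal standalone sketch with hypothetical names, checking that [U | b] * [h; 1] = U * h + b:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
    constexpr int out_size = 4;
    using Mat = Eigen::Matrix<float, out_size, out_size>;
    using Vec = Eigen::Matrix<float, out_size, 1>;

    const Mat U = Mat::Random(); // recurrent weights
    const Vec b = Vec::Random(); // bias
    const Vec h = Vec::Random(); // hidden state

    // Combined kernel [U | b], the layout used by uCombinedWeights in the PR
    Eigen::Matrix<float, out_size, out_size + 1> combined;
    combined << U, b;

    // Extended state [h; 1]; reset() zeroes h and pins the last entry to 1
    Eigen::Matrix<float, out_size + 1, 1> extendedH;
    extendedH << h, 1.0f;

    // One multiply now computes U * h + b; the difference should print ~0
    std::cout << (combined * extendedH - (U * h + b)).norm() << '\n';
}
```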
```diff
@@ -38,19 +39,53 @@ class GRULayer : public Layer<T>
     /** Performs forward propagation for this layer. */
     inline void forward(const T* input, T* h) noexcept override
     {
-        inVec = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>, RTNeuralEigenAlignment>(
-            input, Layer<T>::in_size, 1);
-
-        zVec.noalias() = wVec_z * inVec + uVec_z * ht1 + bVec_z.col(0) + bVec_z.col(1);
-        rVec.noalias() = wVec_r * inVec + uVec_r * ht1 + bVec_r.col(0) + bVec_r.col(1);
-        sigmoid(zVec);
-        sigmoid(rVec);
-
-        cVec.noalias() = wVec_c * inVec + rVec.cwiseProduct(uVec_c * ht1 + bVec_c.col(1)) + bVec_c.col(0);
-        cVec = cVec.array().tanh();
-
-        ht1 = (ones - zVec).cwiseProduct(cVec) + zVec.cwiseProduct(ht1);
-        std::copy(ht1.data(), ht1.data() + Layer<T>::out_size, h);
+        for (int i = 0; i < Layer<T>::in_size; ++i)
+        {
+            extendedInVec(i) = input[i];
+        }
+
+        /**
+         *         | Wz bz[0] |   | input |   | Wz * input + bz[0] |
+         * alpha = | Wr br[0] | * |   1   | = | Wr * input + br[0] |
+         *         | Wc bc[0] |               | Wc * input + bc[0] |
+         *
+         *         | Uz bz[1] |   | h(t-1) |   | Uz * h(t-1) + bz[1] |
+         * beta  = | Ur br[1] | * |   1    | = | Ur * h(t-1) + br[1] |
+         *         | Uc bc[1] |                | Uc * h(t-1) + bc[1] |
+         */
+        alphaVec.noalias() = wCombinedWeights * extendedInVec;
+        betaVec.noalias() = uCombinedWeights * extendedHt1;
+
+        /**
+         * gamma = sigmoid( | z | ) = sigmoid(alpha[0 : 2*out_size] + beta[0 : 2*out_size])
+         *                  | r |
+         */
+        gammaVec.noalias() = alphaVec.segment(0, 2 * Layer<T>::out_size) +
+            betaVec.segment(0, 2 * Layer<T>::out_size);
+        sigmoid(gammaVec);
+
+        /**
+         * c = tanh( alpha[2*out_size : 3*out_size] + r.cwiseProduct(beta[2*out_size : 3*out_size]) )
+         * i.e. c = tanh( Wc * input + bc[0] + r.cwiseProduct(Uc * h(t-1) + bc[1]) )
+         */
+        cVec.noalias() = alphaVec.segment(2 * Layer<T>::out_size, Layer<T>::out_size) +
+            gammaVec.segment(Layer<T>::out_size, Layer<T>::out_size).cwiseProduct(
+                betaVec.segment(2 * Layer<T>::out_size, Layer<T>::out_size));
+        cVec = cVec.array().tanh();
+
+        /**
+         * h(t) = (1 - z).cwiseProduct(c) + z.cwiseProduct(h(t-1))
+         *      = c - z.cwiseProduct(c) + z.cwiseProduct(h(t-1))
+         *      = c + z.cwiseProduct(h(t-1) - c)
+         */
+        extendedHt1.segment(0, Layer<T>::out_size) =
+            cVec + gammaVec.segment(0, Layer<T>::out_size).cwiseProduct(
+                extendedHt1.segment(0, Layer<T>::out_size) - cVec);
+
+        for (int i = 0; i < Layer<T>::out_size; ++i)
+        {
+            h[i] = extendedHt1(i);
+        }
     }

     /**
```
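The block comments above describe the key fusion: stacking the z, r, and c kernels and their biases into one (3 * out_size) x (in_size + 1) matrix turns three matrix-vector products plus bias adds into a single product, whose segments are the per-gate pre-activations. A standalone sketch (hypothetical names, not code from the PR) verifying that the segments match the per-gate results:

```cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
    constexpr int in_size = 3;
    constexpr int out_size = 4;
    using Kernel = Eigen::Matrix<float, out_size, in_size>;
    using Vec = Eigen::Matrix<float, out_size, 1>;

    const Kernel Wz = Kernel::Random(), Wr = Kernel::Random(), Wc = Kernel::Random();
    const Vec bz = Vec::Random(), br = Vec::Random(), bc = Vec::Random();
    const Eigen::Matrix<float, in_size, 1> input = Eigen::Matrix<float, in_size, 1>::Random();

    // Stack the gates row-block by row-block: [Wz bz; Wr br; Wc bc]
    // (the layout of wCombinedWeights in the PR)
    Eigen::Matrix<float, 3 * out_size, in_size + 1> wCombined;
    wCombined << Wz, bz, Wr, br, Wc, bc;

    // Extended input [input; 1]
    Eigen::Matrix<float, in_size + 1, 1> extendedIn;
    extendedIn << input, 1.0f;

    // One matrix-vector product instead of three
    const Eigen::Matrix<float, 3 * out_size, 1> alpha = wCombined * extendedIn;

    // Each segment matches the corresponding per-gate product (all ~0)
    std::cout << (alpha.segment(0, out_size) - (Wz * input + bz)).norm() << '\n'
              << (alpha.segment(out_size, out_size) - (Wr * input + br)).norm() << '\n'
              << (alpha.segment(2 * out_size, out_size) - (Wc * input + bc)).norm() << '\n';
}
```

The same trick is applied twice per sample: once for the input kernels (alpha) and once for the recurrent kernels acting on the extended hidden state (beta).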
```diff
@@ -88,23 +123,29 @@ class GRULayer : public Layer<T>
     T getBVal(int i, int k) const noexcept;

 private:
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wVec_z;
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wVec_r;
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wVec_c;
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uVec_z;
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uVec_r;
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uVec_c;
-    Eigen::Matrix<T, Eigen::Dynamic, 2> bVec_z;
-    Eigen::Matrix<T, Eigen::Dynamic, 2> bVec_r;
-    Eigen::Matrix<T, Eigen::Dynamic, 2> bVec_c;
-
-    Eigen::Matrix<T, Eigen::Dynamic, 1> ht1;
-    Eigen::Matrix<T, Eigen::Dynamic, 1> zVec;
-    Eigen::Matrix<T, Eigen::Dynamic, 1> rVec;
-    Eigen::Matrix<T, Eigen::Dynamic, 1> cVec;
-    Eigen::Matrix<T, Eigen::Dynamic, 1> inVec;
-    Eigen::Matrix<T, Eigen::Dynamic, 1> ones;
+    // Kernels
+    // | Wz bz0 |
+    // | Wr br0 |
+    // | Wc bc0 |
+    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> wCombinedWeights;
+
+    // | Uz bz1 |
+    // | Ur br1 |
+    // | Uc bc1 |
+    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> uCombinedWeights;
+
+    // Input vec
+    Eigen::Matrix<T, Eigen::Dynamic, 1> extendedInVec;
+
+    // h(t-1) vec
+    Eigen::Matrix<T, Eigen::Dynamic, 1> extendedHt1;
+
+    // Scratch memory
+    Eigen::Matrix<T, Eigen::Dynamic, 1> alphaVec;
+    Eigen::Matrix<T, Eigen::Dynamic, 1> betaVec;
+    Eigen::Matrix<T, Eigen::Dynamic, 1> gammaVec;
+    Eigen::Matrix<T, Eigen::Dynamic, 1> cVec;
 };

 //====================================================
```
```diff
@@ -119,12 +160,16 @@ class GRULayer : public Layer<T>
 template <typename T, int in_sizet, int out_sizet, SampleRateCorrectionMode sampleRateCorr = SampleRateCorrectionMode::None>
 class GRULayerT
 {
-    using b_type = Eigen::Matrix<T, out_sizet, 1>;
-    using k_type = Eigen::Matrix<T, out_sizet, in_sizet>;
-    using r_type = Eigen::Matrix<T, out_sizet, out_sizet>;
-
     using in_type = Eigen::Matrix<T, in_sizet, 1>;
+    using extended_in_type = Eigen::Matrix<T, in_sizet + 1, 1>;
     using out_type = Eigen::Matrix<T, out_sizet, 1>;
+    using extended_out_type = Eigen::Matrix<T, out_sizet + 1, 1>;
+
+    using w_k_type = Eigen::Matrix<T, out_sizet * 3, in_sizet + 1>;
+    using u_k_type = Eigen::Matrix<T, out_sizet * 3, out_sizet + 1>;
+
+    using three_out_type = Eigen::Matrix<T, out_sizet * 3, 1>;
+    using two_out_type = Eigen::Matrix<T, out_sizet * 2, 1>;

 public:
     static constexpr auto in_size = in_sizet;
```
```diff
@@ -154,12 +199,48 @@ class GRULayerT
     /** Performs forward propagation for this layer. */
     inline void forward(const in_type& ins) noexcept
     {
-        zVec.noalias() = sigmoid(wVec_z * ins + uVec_z * outs + bVec_z);
-        rVec.noalias() = sigmoid(wVec_r * ins + uVec_r * outs + bVec_r);
-
-        cVec.noalias() = wVec_c * ins + rVec.cwiseProduct(uVec_c * outs + bVec_c1) + bVec_c0;
-        cVec = cVec.array().tanh();
+        for (int i = 0; i < in_sizet; ++i)
+        {
+            extendedInVec(i) = ins(i);
+        }
+
+        /**
+         *         | Wz bz[0] |   | input |   | Wz * input + bz[0] |
+         * alpha = | Wr br[0] | * |   1   | = | Wr * input + br[0] |
+         *         | Wc bc[0] |               | Wc * input + bc[0] |
+         *
+         *         | Uz bz[1] |   | h(t-1) |   | Uz * h(t-1) + bz[1] |
+         * beta  = | Ur br[1] | * |   1    | = | Ur * h(t-1) + br[1] |
+         *         | Uc bc[1] |                | Uc * h(t-1) + bc[1] |
+         */
+        alphaVec.noalias() = wCombinedWeights * extendedInVec;
+        betaVec.noalias() = uCombinedWeights * extendedHt1;
+
+        /**
+         * gamma = sigmoid( | z | ) = sigmoid(alpha[0 : 2*out_sizet] + beta[0 : 2*out_sizet])
+         *                  | r |
+         */
+        gammaVec = sigmoid(alphaVec.segment(0, 2 * out_sizet) +
+            betaVec.segment(0, 2 * out_sizet));
+
+        /**
+         * c = tanh( alpha[2*out_sizet : 3*out_sizet] + r.cwiseProduct(beta[2*out_sizet : 3*out_sizet]) )
+         * i.e. c = tanh( Wc * input + bc[0] + r.cwiseProduct(Uc * h(t-1) + bc[1]) )
+         */
+        cVec.noalias() = alphaVec.segment(2 * out_sizet, out_sizet) +
+            gammaVec.segment(out_sizet, out_sizet).cwiseProduct(
+                betaVec.segment(2 * out_sizet, out_sizet));
+        cVec = cVec.array().tanh();
+
+        /**
+         * h(t) = (1 - z).cwiseProduct(c) + z.cwiseProduct(h(t-1))
+         *      = c - z.cwiseProduct(c) + z.cwiseProduct(h(t-1))
+         *      = c + z.cwiseProduct(h(t-1) - c)
+         */
+        extendedHt1.segment(0, out_sizet) =
+            cVec + gammaVec.segment(0, out_sizet).cwiseProduct(
+                extendedHt1.segment(0, out_sizet) - cVec);

         computeOutput();
     }
```
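The three-line derivation in the comment rewrites the state update as h(t) = c + z.cwiseProduct(h(t-1) - c), which drops both the Ones vector and one elementwise multiply per sample compared to the original form. A quick standalone check of that identity on arbitrary values (hypothetical example, not PR code):

```cpp
#include <Eigen/Dense>
#include <iostream>

int main()
{
    using Vec = Eigen::Vector4f;
    // Arbitrary values; the identity holds for any z, c, h, not just gate outputs
    const Vec z = Vec::Random(), c = Vec::Random(), h = Vec::Random();

    // (1 - z) . c + z . h  ==  c + z . (h - c)
    const Vec lhs = (Vec::Ones() - z).cwiseProduct(c) + z.cwiseProduct(h);
    const Vec rhs = c + z.cwiseProduct(h - c);

    std::cout << (lhs - rhs).norm() << '\n'; // ~0, exact up to rounding
}
```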
```diff
@@ -193,16 +274,27 @@ class GRULayerT
     inline std::enable_if_t<srCorr == SampleRateCorrectionMode::None, void>
     computeOutput() noexcept
     {
-        outs = (out_type::Ones() - zVec).cwiseProduct(cVec) + zVec.cwiseProduct(outs);
+        for (int i = 0; i < out_sizet; ++i)
+        {
+            outs(i) = extendedHt1(i);
+        }
     }

     template <SampleRateCorrectionMode srCorr = sampleRateCorr>
     inline std::enable_if_t<srCorr != SampleRateCorrectionMode::None, void>
     computeOutput() noexcept
     {
-        outs_delayed[delayWriteIdx] = (out_type::Ones() - zVec).cwiseProduct(cVec) + zVec.cwiseProduct(outs);
+        for (int i = 0; i < out_sizet; ++i)
+        {
+            outs_delayed[delayWriteIdx][i] = extendedHt1(i);
+        }

         processDelay(outs_delayed, outs, delayWriteIdx);
+
+        for (int i = 0; i < out_sizet; ++i)
+        {
+            extendedHt1(i) = outs(i);
+        }
     }

     template <typename OutVec, SampleRateCorrectionMode srCorr = sampleRateCorr>
```

Review comment on lines +277 to +280 (the copy from extendedHt1 into outs): Similarly, I think there should be a way to avoid this copy... I'll try out a couple of ideas.
```diff
@@ -225,30 +317,25 @@ class GRULayerT
             delayVec[j] = delayVec[j + 1];
     }

-    static inline out_type sigmoid(const out_type& x) noexcept
+    template <typename Vector>
+    static inline auto sigmoid(const Vector& x) noexcept
     {
         return (T)1 / (((T)-1 * x.array()).array().exp() + (T)1);
     }

-    // kernel weights
-    k_type wVec_z;
-    k_type wVec_r;
-    k_type wVec_c;
-
-    // recurrent weights
-    r_type uVec_z;
-    r_type uVec_r;
-    r_type uVec_c;
-
-    // biases
-    b_type bVec_z;
-    b_type bVec_r;
-    b_type bVec_c0;
-    b_type bVec_c1;
-
-    out_type zVec;
-    out_type rVec;
-    out_type cVec;
+    w_k_type wCombinedWeights;
+    u_k_type uCombinedWeights;
+
+    // scratch memory
+    three_out_type alphaVec;
+    three_out_type betaVec;
+    two_out_type gammaVec;
+
+    // input, output, memory
+    out_type cVec;
+    extended_in_type extendedInVec;
+    extended_out_type extendedHt1;

     // needed for delays when doing sample rate correction
     std::vector<out_type> outs_delayed;
```
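Templating sigmoid over the vector type is what lets forward() call it directly on segment sums, so the expression feeds the exp evaluation lazily instead of materializing a fixed-size temporary first. A small standalone sketch of the same pattern (my own illustration, not PR code; note the explicit .matrix() when assigning the array expression to a Matrix):

```cpp
#include <Eigen/Dense>
#include <iostream>

// Same shape as the PR's helper: the deduced return type is a lazy Eigen
// array expression, evaluated only when assigned.
template <typename Vector>
static inline auto sigmoid(const Vector& x) noexcept
{
    return 1.0f / ((-x.array()).exp() + 1.0f);
}

int main()
{
    const Eigen::Matrix<float, 6, 1> v = Eigen::Matrix<float, 6, 1>::Random();

    // Works on a whole vector...
    const Eigen::Matrix<float, 6, 1> a = sigmoid(v).matrix();

    // ...and on a segment expression, with no copy of the segment
    const Eigen::Matrix<float, 3, 1> b = sigmoid(v.segment(0, 3)).matrix();

    std::cout << a.transpose() << '\n' << b.transpose() << '\n';
}
```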
Review comment: It would be great if there were a way to do this kind of "one-multiply" dense layer operation without requiring the copy here... Unfortunately I can't think of an easy way at the moment.

Maybe it would be possible to set up something at a higher level which makes sure that all the layer input/output matrices have enough memory available to be extended to be one row longer? That way we could do something like an Eigen::Map to "grow" the input vector by one row, and just put a 1 in that row, which would avoid the copy. That seems a bit cumbersome though, so it would be cool if there were an easier way.