support layerNorm layer (intel-analytics#1454)
1. Add LayerNorm layer
2. Replace the layerNorm function with the LayerNorm layer in BERT and TransformerLayer
dding3 committed Jul 10, 2019
1 parent 66e2797 commit 7a64fd6
Showing 4 changed files with 164 additions and 31 deletions.
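
In short: call sites that previously created weight/bias Parameters by hand and passed them to the free TransformerLayer.layerNorm function now call the new Keras-style LayerNorm layer, which owns its own weight and bias. A minimal before/after sketch of a call site, using the names from the BERT.scala hunk below:

// Before: explicit Parameters plus the free layerNorm function
val w = Parameter[T](Shape(1, hiddenSize),
  initWeight = Tensor.ones[T](hiddenSize).view(1, hiddenSize))
val b = Parameter[T](Shape(1, hiddenSize),
  initWeight = Tensor[T](hiddenSize).view(1, hiddenSize))
val afterNorm = TransformerLayer.layerNorm(embeddings, 1e-12, weight = w, bias = b)

// After: one layer that encapsulates the weight, the bias and the normalization math
val afterNorm = LayerNorm[T](nOutput = hiddenSize, eps = 1e-12).from(embeddings)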
9 changes: 2 additions & 7 deletions layers/BERT.scala
@@ -26,11 +26,10 @@ import com.intel.analytics.bigdl.utils.serializer._
import com.intel.analytics.bigdl.utils.serializer.converters.DataConverter
import com.intel.analytics.bigdl.utils.{MultiShape, Shape}
import com.intel.analytics.zoo.pipeline.api.Net
-import com.intel.analytics.zoo.pipeline.api.autograd.{AutoGrad, Parameter, Variable}
+import com.intel.analytics.zoo.pipeline.api.autograd.{AutoGrad, Variable}
import com.intel.analytics.zoo.pipeline.api.keras.layers.utils.{GraphRef, KerasUtils}
import com.intel.analytics.zoo.pipeline.api.keras.models.Model
import com.intel.analytics.zoo.pipeline.api.keras.models.Model.{apply => _, _}

import org.apache.log4j.Logger

import scala.collection.mutable.ArrayBuffer
@@ -155,11 +154,7 @@ object BERT extends KerasLayerSerializable {
initWeights = initTokenEmbeddingW).from(tokenTypeInput)

val embeddings = wordEmbeddings + positionEmbeddings + tokenTypeEmbeddings
-val w = Parameter[T](Shape(1, hiddenSize),
-initWeight = Tensor.ones[T](hiddenSize).view(1, hiddenSize))
-val b = Parameter[T](Shape(1, hiddenSize),
-initWeight = Tensor[T](hiddenSize).view(1, hiddenSize))
-val afterNorm = TransformerLayer.layerNorm(embeddings, 1e-12, weight = w, bias = b)
+val afterNorm = LayerNorm[T](nOutput = hiddenSize, eps = 1e-12).from(embeddings)
val h = Dropout(hiddenPDrop).from(afterNorm)

val embeddingLayer = Model(Array(wordInput, tokenTypeInput, positionInput), h)
95 changes: 95 additions & 0 deletions layers/InternalLayerNorm.scala
@@ -0,0 +1,95 @@
/*
* Copyright 2018 Analytics Zoo Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.analytics.zoo.pipeline.api.keras.layers.internal

import com.intel.analytics.bigdl.nn.{Mean, Sum}
import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, TensorModule}
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric

import scala.reflect.ClassTag

private[zoo] class InternalLayerNorm[T: ClassTag](val nOutput: Int = 768, val eps: Double = 1e-5)
(implicit ev: TensorNumeric[T]) extends TensorModule[T]{
val weight = Tensor.ones[T](nOutput).view(1, nOutput)
val bias = Tensor[T](nOutput).view(1, nOutput)

var gradWeight: Tensor[T] = Tensor[T]()
var gradBias: Tensor[T] = Tensor[T]()

var y: Tensor[T] = null
var divInput1: Tensor[T] = null
var divInput2: Tensor[T] = null
var sqrtInput: Tensor[T] = null

override def updateOutput(input: Tensor[T]): Tensor[T] = {
val dim = input.dim()
val u = input.sum(dim).div(ev.fromType(input.size(dim)))
divInput1 = input.clone().sub(u) // x - u
val square = divInput1.clone().square()
val s = square.sum(square.dim()).div(ev.fromType(square.size(square.dim())))
sqrtInput = s.add(ev.fromType(eps))
divInput2 = sqrtInput.clone().sqrt()
y = divInput1.clone.div(divInput2)
output = y.clone().cmul(weight).add(bias)
output
}

override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
val divGradInput1 = gradOutput.clone().cmul(weight).div(divInput2)
// below code is equal to
// val divGradInput2 = (divGradInput1.clone().div(divInput2))
// .mul(ev.fromType(-1)).cmul(divInput1)
// val squareGadO = divGradInput2.sum(divGradInput2.dim())
// val sqrtGradI = divInput2.div(sqrtInput).mul(ev.fromType(0.5)).cmul(squareGadO)
// val sumGradI = sqrtGradI.div(ev.fromType(divInput1.size(divInput1.dim())))
// .expand(divInput1.size())
// val squareGradI = divInput1.mul(ev.fromType(2)).cmul(sumGradI)
val divGradInput2 = (divGradInput1.clone().div(divInput2)).cmul(divInput1)
val squareGadO = divGradInput2.sum(divGradInput2.dim())
val sqrtGradI = divInput2.div(sqrtInput).cmul(squareGadO)
val sumGradI = sqrtGradI.div(ev.fromType(-1 * divInput1.size(divInput1.dim())))
.expand(divInput1.size())
val squareGradI = divInput1.cmul(sumGradI)

val addGradO = divGradInput1.add(squareGradI)
val addGradI = addGradO.sum(addGradO.dim())
val negativeGradO = addGradI.sum(addGradI.dim())
// val negativeGradI = negativeGradO.mul(ev.fromType(-1))
val sum2GradI = negativeGradO.div(ev.fromType(-1 * input.size(input.dim())))

gradInput = sum2GradI.add(addGradO)
gradInput
}

override def accGradParameters(input: Tensor[T], gradOutput: Tensor[T]): Unit = {
var i = 1
gradWeight = y.clone().cmul(gradOutput)
gradBias = gradOutput
while (i < gradOutput.dim()) {
gradBias = gradBias.sum(i)
gradWeight = gradWeight.sum(i)
i += 1
}
gradBias.resize(bias.size())
gradWeight.resize(weight.size())
}

override def parameters(): (Array[Tensor[T]], Array[Tensor[T]]) = {
(Array(this.weight, this.bias), Array(this.gradWeight, this.gradBias))
}
}
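
A minimal forward/backward smoke test for the new module (a sketch, assuming BigDL's Tensor API; note that InternalLayerNorm is private[zoo], so code like this has to live under the com.intel.analytics.zoo package, e.g. in a unit test):

import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.zoo.pipeline.api.keras.layers.internal.InternalLayerNorm

// Normalize the last dimension of a 2 x 4 input.
val ln = new InternalLayerNorm[Float](nOutput = 4, eps = 1e-12)
val x = Tensor[Float](2, 4).rand()

// With the default weight (ones) and bias (zeros), each row of the output
// has roughly zero mean and unit variance.
val out = ln.forward(x)

// Backward exercises updateGradInput and accGradParameters.
val gradOut = Tensor[Float](2, 4).fill(1.0f)
val gradIn = ln.backward(x, gradOut)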
64 changes: 64 additions & 0 deletions layers/LayerNorm.scala
@@ -0,0 +1,64 @@
/*
* Copyright 2018 Analytics Zoo Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.analytics.zoo.pipeline.api.keras.layers

import com.intel.analytics.bigdl.nn.abstractnn.AbstractModule
import com.intel.analytics.bigdl.nn.keras.KerasLayer
import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.utils.Shape
import com.intel.analytics.zoo.pipeline.api.Net
import com.intel.analytics.zoo.pipeline.api.keras.layers.internal.InternalLayerNorm
import com.intel.analytics.zoo.pipeline.api.keras.layers.utils.KerasUtils

import scala.reflect.ClassTag

/**
* Normalization layer used in Bert.
* u = AutoGrad.mean(x, sizes.size - 1, true)
* t = x - u
* s = AutoGrad.mean(AutoGrad.square(x - u), sizes.size -1, true)
* y = (x - u) / AutoGrad.sqrt(s + e)
* y * weight + bias
*
* @param nOutput The size of output dimension.
* @param eps Optional. Small value to avoid divide zero.
*
* @tparam T Numeric type. Only support float/double now
*/
class LayerNorm[T: ClassTag](val nOutput: Int = 768, val eps: Double = 1e-5,
val inputShape: Shape = null)(implicit ev: TensorNumeric[T])
extends KerasLayer[Tensor[T], Tensor[T], T](KerasUtils.addBatch(inputShape)) with Net{

override def doBuild(inputShape: Shape): AbstractModule[Tensor[T], Tensor[T], T] = {
val layer = new InternalLayerNorm[T](nOutput, eps)
layer.asInstanceOf[AbstractModule[Tensor[T], Tensor[T], T]]
}

override def computeOutputShape(inputShape: Shape): Shape = {
val input = inputShape.toSingle().toArray
Shape(input.slice(0, input.length -1) ++ Array(nOutput))
}
}

object LayerNorm {
def apply[@specialized(Float, Double) T: ClassTag](nOutput: Int = 768,
eps: Double = 1e-5,
inputShape: Shape = null)(implicit ev: TensorNumeric[T]): LayerNorm[T] = {
new LayerNorm[T](nOutput, eps, inputShape)
}
}
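
A small usage sketch for the Keras-style wrapper with the functional API used elsewhere in this commit (the shape is illustrative; Variable and Model come from the zoo autograd/keras packages, following the pattern in the BERT.scala hunk above):

import com.intel.analytics.bigdl.utils.Shape
import com.intel.analytics.zoo.pipeline.api.autograd.Variable
import com.intel.analytics.zoo.pipeline.api.keras.layers.LayerNorm
import com.intel.analytics.zoo.pipeline.api.keras.models.Model

// seqLen = 128, hiddenSize = 768; the batch dimension is implicit.
val x = Variable[Float](inputShape = Shape(128, 768))
val normed = LayerNorm[Float](nOutput = 768, eps = 1e-12).from(x)
val model = Model[Float](Array(x), normed)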
27 changes: 3 additions & 24 deletions layers/TransformerLayer.scala
@@ -24,7 +24,7 @@ import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.utils.{MultiShape, Shape}
import com.intel.analytics.zoo.pipeline.api.Net
-import com.intel.analytics.zoo.pipeline.api.autograd.{AutoGrad, Constant, Parameter, Variable}
+import com.intel.analytics.zoo.pipeline.api.autograd.{AutoGrad, Constant, Variable}
import com.intel.analytics.zoo.pipeline.api.keras.layers.utils.KerasUtils
import com.intel.analytics.zoo.pipeline.api.keras.models.{Model, Sequential}

@@ -113,21 +113,10 @@ private[layers] class TransformerLayer[T: ClassTag](

def block(x: Variable[T], hiddenSize: Int, attention_mask: Variable[T] = null,
eplision: Double = 1e-5): Variable[T] = {
-// g, b for layerNorm
-val g = Parameter[T](Shape(1, hiddenSize),
-initWeight = Tensor.ones[T](hiddenSize).view(1, hiddenSize))
-val b = Parameter[T](Shape(1, hiddenSize),
-initWeight = Tensor[T](hiddenSize).view(1, hiddenSize))
-
-// g, b for layerNorm
-val g2 = Parameter[T](Shape(1, hiddenSize),
-initWeight = Tensor.ones[T](hiddenSize).view(1, hiddenSize))
-val b2 = Parameter[T](Shape(1, hiddenSize),
-initWeight = Tensor[T](hiddenSize).view(1, hiddenSize))
val a = multiHeadSelfAttention(x, hiddenSize, attention_mask)
-val n = TransformerLayer.layerNorm(x + a, eplision, weight = g, bias = b)
+val n = LayerNorm[T](hiddenSize, eplision).from(x + a)
val m = mlp(n, hiddenSize)
-val h = TransformerLayer.layerNorm(n + m, eplision, weight = g2, bias = b2)
+val h = LayerNorm[T](hiddenSize, eplision).from(n + m)
h
}
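
Assembled from the added lines above, the simplified block now reads as follows (the per-block Parameter pairs g/b and g2/b2 are gone, since each LayerNorm owns its own weight and bias):

def block(x: Variable[T], hiddenSize: Int, attention_mask: Variable[T] = null,
    eplision: Double = 1e-5): Variable[T] = {
  val a = multiHeadSelfAttention(x, hiddenSize, attention_mask)
  val n = LayerNorm[T](hiddenSize, eplision).from(x + a)   // residual + norm after attention
  val m = mlp(n, hiddenSize)
  val h = LayerNorm[T](hiddenSize, eplision).from(n + m)   // residual + norm after the MLP
  h
}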

@@ -277,14 +266,4 @@ object TransformerLayer {
new TransformerLayer[T](nBlock, residPdrop, attnPdrop, nHead,
initializerRange, bidirectional, outputAllBlock, embeddingLayer = embeddingLayer)
}

-def layerNorm[@specialized(Float, Double) T: ClassTag](x: Variable[T],
-e: Double = 1e-5, weight: Parameter[T], bias: Parameter[T])
-(implicit ev: TensorNumeric[T]): Variable[T] = {
-val sizes = x.getOutputShape().toSingle().toArray
-val u = AutoGrad.mean(x, sizes.size - 1, true)
-val s = AutoGrad.mean(AutoGrad.square(x - u), sizes.size - 1, true)
-val y = (x - u) / AutoGrad.sqrt(s + e)
-y * weight + bias
-}
}
