🚀 test(examples): integrate Gemma2-2B (#132)
jean-francoisreboud committed Sep 1, 2024
1 parent 54b4a30 commit 838e922
Showing 18 changed files with 1,475 additions and 96 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.

## [unreleased]

🚀 **examples:** integrate Gemma2-2B ([#132](https://github.com/owkin/GrAIdient/pull/132))\
**layer_seq:** LLM sliding window ([#131](https://github.com/owkin/GrAIdient/pull/131))\
🚀 **examples:** 3 LLMs examples ([#130](https://github.com/owkin/GrAIdient/pull/130))\
📚 **docs:** LLM doc & split tests ([#129](https://github.com/owkin/GrAIdient/pull/129))\
16 changes: 12 additions & 4 deletions Docs/Examples/LLM.md
@@ -16,17 +16,24 @@ pip install -e .
```

Then:
- Download weights from
[MistralAI](https://docs.mistral.ai/getting-started/open_weight_models/)
(mistral-7B-Instruct-v0.3)
and / or
[Llama](https://llama.meta.com/llama-downloads/)
(llama-2-7b-chat or Meta-Llama-3-8B-Instruct)
and / or Gemma2 from [HuggingFace](https://huggingface.co/google/gemma-2-2b-it)
(Gemma-2-2b-it).
- Update `_modelPathMistral`, `_modelPathLlama2`, `_modelPathLlama3`,
`_modelPathGemma2` in the
[LLMExample](../../Tests/GrAIExamples/LLMExample.swift) file with the
previously downloaded weights (see the sketch after this list).
- Optionally update `_prompt`.
- Rename `_testGenerateMistral`, `_testGenerateLlama2`, `_testGenerateLlama3`
and `_testGenerateGemma2`
into
`testGenerateMistral`, `testGenerateLlama2`, `testGenerateLlama3` and
`testGenerateGemma2`.
- Run the tests.
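
As a reference, here is a hypothetical sketch of what these edits to
`LLMExample.swift` may look like — the constant and test names come from the
steps above, while the path and prompt values are placeholders:

```swift
// Hypothetical sketch only: point the path constant at the downloaded weights
// and drop the leading underscore of the test so that it runs.
let _modelPathGemma2 = "/path/to/gemma-2-2b-it/"  // placeholder path
let _prompt = "How do you do?"                    // optional: customize the prompt

func testGenerateGemma2() throws
{
    // The actual test (see LLMExample.swift) loads the weights found at
    // `_modelPathGemma2`, builds the Gemma2-2B model and generates text
    // from `_prompt`.
}
```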

It is finally possible to clean the environment 🌍
@@ -41,6 +48,7 @@ conda env remove --name graiexamples
1. Generate text from a prompt with Mistral 7B Instruct model.
1. Generate text from a prompt with Llama 2 7B Chat model.
1. Generate text from a prompt with Llama 3 8B Instruct model.
1. Generate text from a prompt with Gemma 2 2B Instruct model.

## Further tests

39 changes: 31 additions & 8 deletions Sources/GrAIdient/Core/Function/Normalization.swift
@@ -61,14 +61,20 @@ class Normalization
/// - Parameters:
/// - outs: The data to normalize.
/// - Ɣ: The weights to scale the normalization result.
/// - addUnitOffset: Whether to add unit offset or not.
/// - Returns: The data normalized.
///
static func forwardΣGC(outs: [Double],
Ɣ: [Double],
addUnitOffset: Bool) -> [Double]
{
let σ2 = vDSP.meanSquare(outs)
let xHat = vDSP.divide(outs, sqrt(σ2 + Ɛ))
var outsNew = vDSP.multiply(Ɣ, xHat)
if addUnitOffset
{
outsNew = vDSP.add(xHat, outsNew)
}
return outsNew
}
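
The only functional change in these forward functions is the optional unit
offset: when `addUnitOffset` is true, the scale applied to the normalized data
becomes `(1 + Ɣ)` instead of `Ɣ`, which is the RMSNorm variant used by Gemma2.
A minimal, self-contained sketch of the same computation (assuming a small `Ɛ`
constant, as in this class):

```swift
import Accelerate
import Foundation

/// Minimal RMSNorm sketch mirroring `forwardΣGC`, with the optional unit offset.
/// `Ɛ` is assumed to be the same small stabilizing constant used by `Normalization`.
func rmsNorm(_ outs: [Double], Ɣ: [Double], addUnitOffset: Bool, Ɛ: Double = 1e-6) -> [Double]
{
    let σ2 = vDSP.meanSquare(outs)               // mean of squared inputs
    let xHat = vDSP.divide(outs, sqrt(σ2 + Ɛ))   // x / rms(x)
    var out = vDSP.multiply(Ɣ, xHat)             // Ɣ ⊙ xHat
    if addUnitOffset
    {
        out = vDSP.add(xHat, out)                // (1 + Ɣ) ⊙ xHat
    }
    return out
}

// With Ɣ == 0 and the unit offset enabled, the output is xHat itself.
let y = rmsNorm([1.0, 2.0, 3.0], Ɣ: [0.0, 0.0, 0.0], addUnitOffset: true)
```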

@@ -142,18 +148,24 @@ class Normalization
/// - Parameters:
/// - outs: The data to normalize.
/// - Ɣ: The weights to scale the normalization result.
/// - addUnitOffset: Whether to add unit offset or not.
/// - Returns: (The data normalized,
/// The data normalized without taking into account the bias and the weight,
/// The deviation of the data).
///
static func forwardΣ(outs: [Double],
Ɣ: [Double],
addUnitOffset: Bool) -> (outsNew: [Double],
xHat: [Double],
σ2: Double)
{
let σ2 = vDSP.meanSquare(outs)
let xHat = vDSP.divide(outs, sqrt(σ2 + Ɛ))
var outsNew = vDSP.multiply(Ɣ, xHat)
if addUnitOffset
{
outsNew = vDSP.add(xHat, outsNew)
}

return (outsNew: outsNew,
xHat: xHat,
@@ -263,17 +275,28 @@ class Normalization
/// - xHat: The data normalized without taking into account the bias and the weight.
/// - σ2: The deviation of the data.
/// - Ɣ: The weights that scaled the normalization result.
/// - addUnitOffset: Whether to add unit offset or not.
/// - Returns: The gradient taking into account the normalization.
///
static func backwardΣ(delta: [Double],
xHat: [Double],
σ2: Double,
Ɣ: [Double],
addUnitOffset: Bool) -> [Double]
{
let nbElems = delta.count
let factor = 1.0 / (Double(nbElems) * sqrt(σ2 + Ɛ))

let Ɣdelta: [Double]
if addUnitOffset
{
Ɣdelta = vDSP.multiply(vDSP.add(1, Ɣ), delta)
}
else
{
Ɣdelta = vDSP.multiply(Ɣ, delta)
}

let sum2 = vDSP.sum(vDSP.multiply(Ɣdelta, xHat))

let tmp1 = vDSP.add(
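
The backward pass changes in the same single place: when the unit offset is
enabled, the incoming gradient is scaled by `(1 + Ɣ)` instead of `Ɣ` before the
usual RMSNorm correction term. The hunk above is truncated, so the following
finite-difference check is only a sketch that reconstructs the remaining
combination step from the math; `Ɛ`, the test values and the tolerance are
assumptions:

```swift
import Accelerate
import Foundation

let Ɛ = 1e-6
let x: [Double] = [0.5, -1.0, 2.0]
let Ɣ: [Double] = [0.1, 0.2, 0.3]

// Forward pass with the unit offset, as in the sketch after forwardΣGC.
func forward(_ x: [Double]) -> [Double]
{
    let σ2 = vDSP.meanSquare(x)
    let xHat = vDSP.divide(x, sqrt(σ2 + Ɛ))
    return vDSP.add(xHat, vDSP.multiply(Ɣ, xHat))   // (1 + Ɣ) ⊙ xHat
}

// Analytic gradient of sum(forward(x)) w.r.t. x, following backwardΣ with
// addUnitOffset == true and an upstream delta of 1. The final combination,
// factor * (N * Ɣdelta - xHat * sum2), is reconstructed here from the algebra.
let delta = [Double](repeating: 1.0, count: x.count)
let σ2 = vDSP.meanSquare(x)
let xHat = vDSP.divide(x, sqrt(σ2 + Ɛ))
let factor = 1.0 / (Double(x.count) * sqrt(σ2 + Ɛ))
let Ɣdelta = vDSP.multiply(vDSP.add(1.0, Ɣ), delta)
let sum2 = vDSP.sum(vDSP.multiply(Ɣdelta, xHat))
let grad = vDSP.multiply(
    factor,
    vDSP.add(vDSP.multiply(Double(x.count), Ɣdelta), vDSP.multiply(-sum2, xHat))
)

// Numeric gradient for comparison.
let h = 1e-5
for i in 0..<x.count
{
    var xp = x; xp[i] += h
    var xm = x; xm[i] -= h
    let numeric = (vDSP.sum(forward(xp)) - vDSP.sum(forward(xm))) / (2 * h)
    assert(abs(numeric - grad[i]) < 1e-4)
}
```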
23 changes: 16 additions & 7 deletions Sources/GrAIdient/Core/Layer/LayerNormalization.swift
@@ -2847,7 +2848,8 @@ public class RMSNormalization: LayerWeightsNormalization
outs: layer.getOutsGC(
batch: batch, seq: seq, elem: elem
),
Ɣ: Ɣ,
addUnitOffset: layer.addUnitOffset
)
layer.setOutsGC(
batch: batch, seq: seq, elem: elem, outs: outs
@@ -2894,7 +2895,8 @@ public class RMSNormalization: LayerWeightsNormalization
{
let (outs, xHat, σ2) = Normalization.forwardΣ(
outs: layer.getOuts(batch: batch, seq: seq),
Ɣ: Ɣ,
addUnitOffset: layer.addUnitOffset
)
layer.setOuts(batch: batch, seq: seq, outs: outs)

@@ -2927,7 +2929,8 @@ public class RMSNormalization: LayerWeightsNormalization
delta: delta1,
xHat: _xHat[seq + sequence * batch],
σ2: _σ2[seq + sequence * batch],
Ɣ: Ɣ,
addUnitOffset: layer.addUnitOffset
)
layer.setDelta(batch: batch, seq: seq, delta: delta2)

@@ -3091,6 +3094,7 @@ class RMSNormalizationGPU: LayerWeightsNormalization
let pNbNeurons: [UInt32] = [UInt32(_nbNeurons)]
let pNbBatch: [UInt32] = [UInt32(batchSize)]
let pSequence: [UInt32] = [UInt32(sequence)]
let pAddUnitOffset: [UInt32] = layer.addUnitOffset ? [1] : [0]

if _xHat == nil
{
@@ -3108,8 +3112,9 @@ class RMSNormalizationGPU: LayerWeightsNormalization
command.setBytes(pNbNeurons, atIndex: 2)
command.setBytes(pNbBatch, atIndex: 3)
command.setBytes(pSequence, atIndex: 4)
command.setBytes(pAddUnitOffset, atIndex: 5)
command.setBuffer(layer.outs.metal, atIndex: 6)
command.setBuffer(_xHat.metal, atIndex: 7)

command.dispatchThreads(
width: _nbNeurons,
@@ -3160,6 +3165,7 @@ class RMSNormalizationGPU: LayerWeightsNormalization
let pNbNeurons: [UInt32] = [UInt32(_nbNeurons)]
let pNbBatch: [UInt32] = [UInt32(batchSize)]
let pSequence: [UInt32] = [UInt32(sequence)]
let pAddUnitOffset: [UInt32] = layer.addUnitOffset ? [1] : [0]

let command = MetalKernel.get.createCommand(
"backwardRMSNormSeq", deviceID: _deviceID
@@ -3171,7 +3177,8 @@ class RMSNormalizationGPU: LayerWeightsNormalization
command.setBytes(pNbNeurons, atIndex: 4)
command.setBytes(pNbBatch, atIndex: 5)
command.setBytes(pSequence, atIndex: 6)
command.setBytes(pAddUnitOffset, atIndex: 7)
command.setBuffer(layer.delta.metal, atIndex: 8)

command.dispatchThreads(
width: _nbNeurons,
@@ -3189,6 +3196,7 @@ class RMSNormalizationGPU: LayerWeightsNormalization
let pNbNeurons: [UInt32] = [UInt32(_nbNeurons)]
let pNbBatch: [UInt32] = [UInt32(batchSize)]
let pSequence: [UInt32] = [UInt32(sequence)]
let pAddUnitOffset: [UInt32] = layer.addUnitOffset ? [1] : [0]

if _sum2 == nil
{
@@ -3206,7 +3214,8 @@ class RMSNormalizationGPU: LayerWeightsNormalization
command.setBytes(pNbNeurons, atIndex: 3)
command.setBytes(pNbBatch, atIndex: 4)
command.setBytes(pSequence, atIndex: 5)
command.setBytes(pAddUnitOffset, atIndex: 6)
command.setBuffer(_sum2.metal, atIndex: 7)

command.dispatchThreads(width: sequence, height: batchSize)
command.enqueue()
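
On the GPU path the flag travels to the kernels as an extra `UInt32` constant,
which shifts the buffer indices that follow it by one (forward: `outs` 5→6,
`xHat` 6→7; backward: `delta` 7→8; backward weights: `sum2` 6→7). Below is a
hedged sketch of the same pattern with raw Metal APIs — GrAIdient's
`MetalKernel` command wrapper is used above instead, and the indices are only
those visible in this diff:

```swift
import Metal

/// Sketch (not GrAIdient's wrapper): pass a boolean to a compute kernel as a
/// UInt32 constant, then bind the remaining buffers at the shifted indices.
func encodeForwardRMSNorm(
    encoder: MTLComputeCommandEncoder,
    outs: MTLBuffer,
    xHat: MTLBuffer,
    addUnitOffset: Bool)
{
    var flag: UInt32 = addUnitOffset ? 1 : 0
    encoder.setBytes(&flag, length: MemoryLayout<UInt32>.stride, index: 5)
    encoder.setBuffer(outs, offset: 0, index: 6)
    encoder.setBuffer(xHat, offset: 0, index: 7)
}
```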
4 changes: 1 addition & 3 deletions Sources/GrAIdient/Core/Model/Model.swift
@@ -208,17 +208,15 @@ public class BaseModel: Codable
let newModel = BaseModel(name: name)
var newLayers = [Layer]()

for layer in layers
{
let newLayer = layer.copy(mapping: mapping, inPlace: inPlace)
newLayers.append(newLayer)
mapping[layer.id] = newLayer

if let layerTmp = newLayer as? LayerSeq
{
layerTmp.sequence = sequence
}
}

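
With the `updatedSeq` flag gone, every copied `LayerSeq` now gets its
`sequence` updated, not only the first one encountered.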
18 changes: 16 additions & 2 deletions Sources/GrAIdient/LayerSeq/RMSNormSeq.swift
@@ -13,6 +13,9 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
/// Instance normalization in the GPU execution context.
var _normGPU: RMSNormalizationGPU? = nil

/// Whether to add unit offset or not.
var addUnitOffset: Bool

/// Whether to compute weights' gradients or not.
public var computeDeltaWeights: Bool = true

@@ -84,6 +87,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
private enum Keys: String, CodingKey
{
case norm
case addUnitOffset
}

///
@@ -92,11 +96,16 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
/// - Parameters:
/// - layerPrev: Previous layer that has been queued to the model.
/// - activation: The activation function.
/// - addUnitOffset: Whether to add unit offset or not.
/// - params: Contextual parameters linking to the model.
///
public init(layerPrev: LayerSeq,
activation: String?,
addUnitOffset: Bool,
params: GrAI.Model.Params)
{
self.addUnitOffset = addUnitOffset

super.init(layerPrev: layerPrev,
sequence: layerPrev.sequence,
nbNeurons: layerPrev.nbNeurons,
@@ -117,6 +126,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
public required init(from decoder: Decoder) throws
{
let values = try decoder.container(keyedBy: Keys.self)
addUnitOffset = try values.decode(Bool.self, forKey: .addUnitOffset)
_norm = try values.decodeIfPresent(
LayerWeightsNormalization.self, forKey: .norm
)
@@ -137,6 +147,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
public override func encode(to encoder: Encoder) throws
{
var container = encoder.container(keyedBy: Keys.self)
try container.encode(addUnitOffset, forKey: .addUnitOffset)
if let norm = _normGPU
{
try container.encode(norm, forKey: Keys.norm)
@@ -173,6 +184,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
let layer = RMSNormSeq(
layerPrev: layerPrev,
activation: _activation?.name,
addUnitOffset: addUnitOffset,
params: params
)
if inPlace
@@ -216,6 +228,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
let layer = RMSNormSeq(
layerPrev: layerPrev,
activation: nil,
addUnitOffset: addUnitOffset,
params: params
)
if inPlace
@@ -252,6 +265,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
let layer = RMSNormSeq(
layerPrev: layerPrev,
activation: nil,
addUnitOffset: addUnitOffset,
params: params
)
// only one of them should be cloned
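
For reference, a hypothetical usage sketch of the new initializer;
`previousLayer` and `params` are assumed to come from the surrounding
model-building code, which this commit does not change:

```swift
import GrAIdient

/// Hypothetical sketch: build a Gemma2-style RMSNorm layer with the unit offset.
func makeGemmaStyleNorm(previousLayer: LayerSeq,
                        params: GrAI.Model.Params) -> RMSNormSeq
{
    return RMSNormSeq(
        layerPrev: previousLayer,
        activation: nil,
        addUnitOffset: true,   // Gemma2 variant: scale by (1 + Ɣ)
        params: params
    )
}
```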
37 changes: 34 additions & 3 deletions Sources/GrAIdient/Metal/Kernel/RMSNormSeqFloat.metal
@@ -42,6 +42,7 @@ kernel void forwardRMSNormSeqFloat(
constant uint & nbNeurons,
constant uint & nbBatch,
constant uint & sequence,
constant uint & addUnitOffset,
device float * tmps,
device float * xHat,
uint2 id [[ thread_position_in_grid ]])
@@ -62,8 +63,16 @@ kernel void forwardRMSNormSeqFloat(
float tmp1 = tmps[offset];
float tmp2 = sqrt(σ2[seq + sequence * elem] + Ɛ);
float xhat = tmp1 / tmp2;

xHat[offset] = xhat;
if (addUnitOffset)
{
tmps[offset] = (1 + Ɣ[depth]) * xhat;
}
else
{
tmps[offset] = Ɣ[depth] * xhat;
}
}

kernel void backwardWeights1RMSNormSeqFloat(
@@ -73,6 +82,7 @@ kernel void backwardWeights1RMSNormSeqFloat(
constant uint & nbNeurons,
constant uint & nbBatch,
constant uint & sequence,
constant uint & addUnitOffset,
device float * sum2,
uint2 id [[ thread_position_in_grid ]])
{
@@ -92,7 +102,17 @@ kernel void backwardWeights1RMSNormSeqFloat(

float deltaTmp = delta[offsetTmp];
float xHatTmp = xHat[offsetTmp];

float dxHat;
if (addUnitOffset)
{
dxHat = (1 + Ɣ[depth]) * deltaTmp;
}
else
{
dxHat = Ɣ[depth] * deltaTmp;
}

tmp += dxHat * xHatTmp;
}
sum2[seq + sequence * elem] = tmp;
@@ -147,6 +167,7 @@ kernel void backwardRMSNormSeqFloat(
constant uint & nbNeurons,
constant uint & nbBatch,
constant uint & sequence,
constant uint & addUnitOffset,
device float * delta,
uint2 id [[ thread_position_in_grid ]])
{
@@ -166,7 +187,17 @@ kernel void backwardRMSNormSeqFloat(

float mult =
1.0 / ((float)nbElems * sqrt(σ2[seq + sequence * elem] + Ɛ));

float dxHat;
if (addUnitOffset)
{
dxHat = (1 + Ɣ[depth]) * delta[offset];
}
else
{
dxHat = Ɣ[depth] * delta[offset];
}

float tmp1 = nbElems * dxHat;
float tmp3 = xHat[offset] * sum2[seq + sequence * elem];
