🚀 test(examples): integrate Gemma2-2B (#132)
jean-francoisreboud committed Sep 1, 2024
1 parent 54b4a30 commit 838e922
Showing 18 changed files with 1,475 additions and 96 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@ All notable changes to this project will be documented in this file.

## [unreleased]

🚀 **examples:** integrate Gemma2-2B ([#132](https://github.com/owkin/GrAIdient/pull/132))\
**layer_seq:** LLM sliding window ([#131](https://github.com/owkin/GrAIdient/pull/131))\
🚀 **examples:** 3 LLMs examples ([#130](https://github.com/owkin/GrAIdient/pull/130))\
📚 **docs:** LLM doc & split tests ([#129](https://github.com/owkin/GrAIdient/pull/129))\
16 changes: 12 additions & 4 deletions Docs/Examples/LLM.md
@@ -16,17 +16,24 @@ pip install -e .
```

Then:
- Download weights from
[MistralAI](https://docs.mistral.ai/getting-started/open_weight_models/)
(mistral-7B-Instruct-v0.3)
and / or
[Llama](https://llama.meta.com/llama-downloads/)
(llama-2-7b-chat or Meta-Llama-3-8B-Instruct)
and / or Gemma2 from [HuggingFace](https://huggingface.co/google/gemma-2-2b-it)
(Gemma-2-2b-it).
- Update `_modelPathMistral`, `_modelPathLlama2`, `_modelPathLlama3`,
`_modelPathGemma2` in the
[LLMExample](../../Tests/GrAIExamples/LLMExample.swift) file with the
previously downloaded weights (see the sketch after this list).
- Optionally update `_prompt`.
- Rename `_testGenerateMistral`, `_testGenerateLlama2`, `_testGenerateLlama3`
and `_testGenerateGemma2`
into
`testGenerateMistral`, `testGenerateLlama2`, `testGenerateLlama3` and
`testGenerateGemma2`.
- Run the tests.
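
As a reference, here is a hypothetical sketch of what these edits to
`LLMExample.swift` may look like — the constant and test names come from the
steps above, while the path and prompt values are placeholders:

```swift
// Hypothetical sketch only: point the path constant at the downloaded weights
// and drop the leading underscore of the test so that it runs.
let _modelPathGemma2 = "/path/to/gemma-2-2b-it/"  // placeholder path
let _prompt = "How do you do?"                    // optional: customize the prompt

func testGenerateGemma2() throws
{
    // The actual test (see LLMExample.swift) loads the weights found at
    // `_modelPathGemma2`, builds the Gemma2-2B model and generates text
    // from `_prompt`.
}
```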

It is finally possible to clean the environment 🌍
@@ -41,6 +48,7 @@ conda env remove --name graiexamples
1. Generate text from a prompt with Mistral 7B Instruct model.
1. Generate text from a prompt with Llama 2 7B Chat model.
1. Generate text from a prompt with Llama 3 8B Instruct model.
1. Generate text from a prompt with Gemma 2 2B Instruct model.

## Further tests

39 changes: 31 additions & 8 deletions Sources/GrAIdient/Core/Function/Normalization.swift
@@ -61,14 +61,20 @@ class Normalization
/// - Parameters:
/// - outs: The data to normalize.
/// - Ɣ: The weights to scale the normalization result.
/// - addUnitOffset: Whether to add unit offset or not.
/// - Returns: The data normalized.
///
static func forwardΣGC(outs: [Double],
Ɣ: [Double],
addUnitOffset: Bool) -> [Double]
{
let σ2 = vDSP.meanSquare(outs)
let xHat = vDSP.divide(outs, sqrt(σ2 + Ɛ))
var outsNew = vDSP.multiply(Ɣ, xHat)
if addUnitOffset
{
outsNew = vDSP.add(xHat, outsNew)
}
return outsNew
}
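
The only functional change in these forward functions is the optional unit
offset: when `addUnitOffset` is true, the scale applied to the normalized data
becomes `(1 + Ɣ)` instead of `Ɣ`, which is the RMSNorm variant used by Gemma2.
A minimal, self-contained sketch of the same computation (assuming a small `Ɛ`
constant, as in this class):

```swift
import Accelerate
import Foundation

/// Minimal RMSNorm sketch mirroring `forwardΣGC`, with the optional unit offset.
/// `Ɛ` is assumed to be the same small stabilizing constant used by `Normalization`.
func rmsNorm(_ outs: [Double], Ɣ: [Double], addUnitOffset: Bool, Ɛ: Double = 1e-6) -> [Double]
{
    let σ2 = vDSP.meanSquare(outs)               // mean of squared inputs
    let xHat = vDSP.divide(outs, sqrt(σ2 + Ɛ))   // x / rms(x)
    var out = vDSP.multiply(Ɣ, xHat)             // Ɣ ⊙ xHat
    if addUnitOffset
    {
        out = vDSP.add(xHat, out)                // (1 + Ɣ) ⊙ xHat
    }
    return out
}

// With Ɣ == 0 and the unit offset enabled, the output is xHat itself.
let y = rmsNorm([1.0, 2.0, 3.0], Ɣ: [0.0, 0.0, 0.0], addUnitOffset: true)
```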

@@ -142,18 +148,24 @@ class Normalization
/// - Parameters:
/// - outs: The data to normalize.
/// - Ɣ: The weights to scale the normalization result.
/// - addUnitOffset: Whether to add unit offset or not.
/// - Returns: (The data normalized,
/// The data normalized without taking into account the bias and the weight,
/// The deviation of the data).
///
static func forwardΣ(outs: [Double],
Ɣ: [Double],
addUnitOffset: Bool) -> (outsNew: [Double],
xHat: [Double],
σ2: Double)
{
let σ2 = vDSP.meanSquare(outs)
let xHat = vDSP.divide(outs, sqrt(σ2 + Ɛ))
var outsNew = vDSP.multiply(Ɣ, xHat)
if addUnitOffset
{
outsNew = vDSP.add(xHat, outsNew)
}

return (outsNew: outsNew,
xHat: xHat,
@@ -263,17 +275,28 @@ class Normalization
/// - xHat: The data normalized without taking into account the bias and the weight.
/// - σ2: The deviation of the data.
/// - Ɣ: The weights that scaled the normalization result.
/// - addUnitOffset: Whether to add unit offset or not.
/// - Returns: The gradient taking into account the normalization.
///
static func backwardΣ(delta: [Double],
xHat: [Double],
σ2: Double,
Ɣ: [Double],
addUnitOffset: Bool) -> [Double]
{
let nbElems = delta.count
let factor = 1.0 / (Double(nbElems) * sqrt(σ2 + Ɛ))

let Ɣdelta: [Double]
if addUnitOffset
{
Ɣdelta = vDSP.multiply(vDSP.add(1, Ɣ), delta)
}
else
{
Ɣdelta = vDSP.multiply(Ɣ, delta)
}

let sum2 = vDSP.sum(vDSP.multiply(Ɣdelta, xHat))

let tmp1 = vDSP.add(
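
The backward pass changes in the same single place: when the unit offset is
enabled, the incoming gradient is scaled by `(1 + Ɣ)` instead of `Ɣ` before the
usual RMSNorm correction term. The hunk above is truncated, so the following
finite-difference check is only a sketch that reconstructs the remaining
combination step from the math; `Ɛ`, the test values and the tolerance are
assumptions:

```swift
import Accelerate
import Foundation

let Ɛ = 1e-6
let x: [Double] = [0.5, -1.0, 2.0]
let Ɣ: [Double] = [0.1, 0.2, 0.3]

// Forward pass with the unit offset, as in the sketch after forwardΣGC.
func forward(_ x: [Double]) -> [Double]
{
    let σ2 = vDSP.meanSquare(x)
    let xHat = vDSP.divide(x, sqrt(σ2 + Ɛ))
    return vDSP.add(xHat, vDSP.multiply(Ɣ, xHat))   // (1 + Ɣ) ⊙ xHat
}

// Analytic gradient of sum(forward(x)) w.r.t. x, following backwardΣ with
// addUnitOffset == true and an upstream delta of 1. The final combination,
// factor * (N * Ɣdelta - xHat * sum2), is reconstructed here from the algebra.
let delta = [Double](repeating: 1.0, count: x.count)
let σ2 = vDSP.meanSquare(x)
let xHat = vDSP.divide(x, sqrt(σ2 + Ɛ))
let factor = 1.0 / (Double(x.count) * sqrt(σ2 + Ɛ))
let Ɣdelta = vDSP.multiply(vDSP.add(1.0, Ɣ), delta)
let sum2 = vDSP.sum(vDSP.multiply(Ɣdelta, xHat))
let grad = vDSP.multiply(
    factor,
    vDSP.add(vDSP.multiply(Double(x.count), Ɣdelta), vDSP.multiply(-sum2, xHat))
)

// Numeric gradient for comparison.
let h = 1e-5
for i in 0..<x.count
{
    var xp = x; xp[i] += h
    var xm = x; xm[i] -= h
    let numeric = (vDSP.sum(forward(xp)) - vDSP.sum(forward(xm))) / (2 * h)
    assert(abs(numeric - grad[i]) < 1e-4)
}
```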
23 changes: 16 additions & 7 deletions Sources/GrAIdient/Core/Layer/LayerNormalization.swift
@@ -2847,7 +2848,8 @@ public class RMSNormalization: LayerWeightsNormalization
outs: layer.getOutsGC(
batch: batch, seq: seq, elem: elem
),
Ɣ: Ɣ,
addUnitOffset: layer.addUnitOffset
)
layer.setOutsGC(
batch: batch, seq: seq, elem: elem, outs: outs
@@ -2894,7 +2895,8 @@ public class RMSNormalization: LayerWeightsNormalization
{
let (outs, xHat, σ2) = Normalization.forwardΣ(
outs: layer.getOuts(batch: batch, seq: seq),
Ɣ: Ɣ,
addUnitOffset: layer.addUnitOffset
)
layer.setOuts(batch: batch, seq: seq, outs: outs)

@@ -2927,7 +2929,8 @@ public class RMSNormalization: LayerWeightsNormalization
delta: delta1,
xHat: _xHat[seq + sequence * batch],
σ2: _σ2[seq + sequence * batch],
Ɣ: Ɣ,
addUnitOffset: layer.addUnitOffset
)
layer.setDelta(batch: batch, seq: seq, delta: delta2)

@@ -3091,6 +3094,7 @@ class RMSNormalizationGPU: LayerWeightsNormalization
let pNbNeurons: [UInt32] = [UInt32(_nbNeurons)]
let pNbBatch: [UInt32] = [UInt32(batchSize)]
let pSequence: [UInt32] = [UInt32(sequence)]
let pAddUnitOffset: [UInt32] = layer.addUnitOffset ? [1] : [0]

if _xHat == nil
{
@@ -3108,8 +3112,9 @@ class RMSNormalizationGPU: LayerWeightsNormalization
command.setBytes(pNbNeurons, atIndex: 2)
command.setBytes(pNbBatch, atIndex: 3)
command.setBytes(pSequence, atIndex: 4)
command.setBytes(pAddUnitOffset, atIndex: 5)
command.setBuffer(layer.outs.metal, atIndex: 6)
command.setBuffer(_xHat.metal, atIndex: 7)

command.dispatchThreads(
width: _nbNeurons,
@@ -3160,6 +3165,7 @@ class RMSNormalizationGPU: LayerWeightsNormalization
let pNbNeurons: [UInt32] = [UInt32(_nbNeurons)]
let pNbBatch: [UInt32] = [UInt32(batchSize)]
let pSequence: [UInt32] = [UInt32(sequence)]
let pAddUnitOffset: [UInt32] = layer.addUnitOffset ? [1] : [0]

let command = MetalKernel.get.createCommand(
"backwardRMSNormSeq", deviceID: _deviceID
@@ -3171,7 +3177,8 @@ class RMSNormalizationGPU: LayerWeightsNormalization
command.setBytes(pNbNeurons, atIndex: 4)
command.setBytes(pNbBatch, atIndex: 5)
command.setBytes(pSequence, atIndex: 6)
command.setBytes(pAddUnitOffset, atIndex: 7)
command.setBuffer(layer.delta.metal, atIndex: 8)

command.dispatchThreads(
width: _nbNeurons,
@@ -3189,6 +3196,7 @@ class RMSNormalizationGPU: LayerWeightsNormalization
let pNbNeurons: [UInt32] = [UInt32(_nbNeurons)]
let pNbBatch: [UInt32] = [UInt32(batchSize)]
let pSequence: [UInt32] = [UInt32(sequence)]
let pAddUnitOffset: [UInt32] = layer.addUnitOffset ? [1] : [0]

if _sum2 == nil
{
@@ -3206,7 +3214,8 @@ class RMSNormalizationGPU: LayerWeightsNormalization
command.setBytes(pNbNeurons, atIndex: 3)
command.setBytes(pNbBatch, atIndex: 4)
command.setBytes(pSequence, atIndex: 5)
command.setBytes(pAddUnitOffset, atIndex: 6)
command.setBuffer(_sum2.metal, atIndex: 7)

command.dispatchThreads(width: sequence, height: batchSize)
command.enqueue()
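
On the GPU path the flag travels to the kernels as an extra `UInt32` constant,
which shifts the buffer indices that follow it by one (forward: `outs` 5→6,
`xHat` 6→7; backward: `delta` 7→8; backward weights: `sum2` 6→7). Below is a
hedged sketch of the same pattern with raw Metal APIs — GrAIdient's
`MetalKernel` command wrapper is used above instead, and the indices are only
those visible in this diff:

```swift
import Metal

/// Sketch (not GrAIdient's wrapper): pass a boolean to a compute kernel as a
/// UInt32 constant, then bind the remaining buffers at the shifted indices.
func encodeForwardRMSNorm(
    encoder: MTLComputeCommandEncoder,
    outs: MTLBuffer,
    xHat: MTLBuffer,
    addUnitOffset: Bool)
{
    var flag: UInt32 = addUnitOffset ? 1 : 0
    encoder.setBytes(&flag, length: MemoryLayout<UInt32>.stride, index: 5)
    encoder.setBuffer(outs, offset: 0, index: 6)
    encoder.setBuffer(xHat, offset: 0, index: 7)
}
```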
4 changes: 1 addition & 3 deletions Sources/GrAIdient/Core/Model/Model.swift
@@ -208,17 +208,15 @@ public class BaseModel: Codable
let newModel = BaseModel(name: name)
var newLayers = [Layer]()

for layer in layers
{
let newLayer = layer.copy(mapping: mapping, inPlace: inPlace)
newLayers.append(newLayer)
mapping[layer.id] = newLayer

if let layerTmp = newLayer as? LayerSeq
{
layerTmp.sequence = sequence
}
}

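
With the `updatedSeq` flag gone, every copied `LayerSeq` now gets its
`sequence` updated, not only the first one encountered.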
18 changes: 16 additions & 2 deletions Sources/GrAIdient/LayerSeq/RMSNormSeq.swift
@@ -13,6 +13,9 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
/// Instance normalization in the GPU execution context.
var _normGPU: RMSNormalizationGPU? = nil

/// Whether to add unit offset or not.
var addUnitOffset: Bool

/// Whether to compute weights' gradients or not.
public var computeDeltaWeights: Bool = true

@@ -84,6 +87,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
private enum Keys: String, CodingKey
{
case norm
case addUnitOffset
}

///
@@ -92,11 +96,16 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
/// - Parameters:
/// - layerPrev: Previous layer that has been queued to the model.
/// - activation: The activation function.
/// - addUnitOffset: Whether to add unit offset or not.
/// - params: Contextual parameters linking to the model.
///
public init(layerPrev: LayerSeq,
activation: String?,
addUnitOffset: Bool,
params: GrAI.Model.Params)
{
self.addUnitOffset = addUnitOffset

super.init(layerPrev: layerPrev,
sequence: layerPrev.sequence,
nbNeurons: layerPrev.nbNeurons,
@@ -117,6 +126,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
public required init(from decoder: Decoder) throws
{
let values = try decoder.container(keyedBy: Keys.self)
addUnitOffset = try values.decode(Bool.self, forKey: .addUnitOffset)
_norm = try values.decodeIfPresent(
LayerWeightsNormalization.self, forKey: .norm
)
@@ -137,6 +147,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
public override func encode(to encoder: Encoder) throws
{
var container = encoder.container(keyedBy: Keys.self)
try container.encode(addUnitOffset, forKey: .addUnitOffset)
if let norm = _normGPU
{
try container.encode(norm, forKey: Keys.norm)
@@ -173,6 +184,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
let layer = RMSNormSeq(
layerPrev: layerPrev,
activation: _activation?.name,
addUnitOffset: addUnitOffset,
params: params
)
if inPlace
@@ -216,6 +228,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
let layer = RMSNormSeq(
layerPrev: layerPrev,
activation: nil,
addUnitOffset: addUnitOffset,
params: params
)
if inPlace
@@ -252,6 +265,7 @@ public class RMSNormSeq: ActivationSeq, LayerUpdate, LayerWithActivation
let layer = RMSNormSeq(
layerPrev: layerPrev,
activation: nil,
addUnitOffset: addUnitOffset,
params: params
)
// only one of them should be cloned
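
For reference, a hypothetical usage sketch of the new initializer;
`previousLayer` and `params` are assumed to come from the surrounding
model-building code, which this commit does not change:

```swift
import GrAIdient

/// Hypothetical sketch: build a Gemma2-style RMSNorm layer with the unit offset.
func makeGemmaStyleNorm(previousLayer: LayerSeq,
                        params: GrAI.Model.Params) -> RMSNormSeq
{
    return RMSNormSeq(
        layerPrev: previousLayer,
        activation: nil,
        addUnitOffset: true,   // Gemma2 variant: scale by (1 + Ɣ)
        params: params
    )
}
```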
37 changes: 34 additions & 3 deletions Sources/GrAIdient/Metal/Kernel/RMSNormSeqFloat.metal
@@ -42,6 +42,7 @@ kernel void forwardRMSNormSeqFloat(
constant uint & nbNeurons,
constant uint & nbBatch,
constant uint & sequence,
constant uint & addUnitOffset,
device float * tmps,
device float * xHat,
uint2 id [[ thread_position_in_grid ]])
@@ -62,8 +63,16 @@ kernel void forwardRMSNormSeqFloat(
float tmp1 = tmps[offset];
float tmp2 = sqrt(σ2[seq + sequence * elem] + Ɛ);
float xhat = tmp1 / tmp2;

xHat[offset] = xhat;
if (addUnitOffset)
{
tmps[offset] = (1 + Ɣ[depth]) * xhat;
}
else
{
tmps[offset] = Ɣ[depth] * xhat;
}
}

kernel void backwardWeights1RMSNormSeqFloat(
@@ -73,6 +82,7 @@ kernel void backwardWeights1RMSNormSeqFloat(
constant uint & nbNeurons,
constant uint & nbBatch,
constant uint & sequence,
constant uint & addUnitOffset,
device float * sum2,
uint2 id [[ thread_position_in_grid ]])
{
@@ -92,7 +102,17 @@ kernel void backwardWeights1RMSNormSeqFloat(

float deltaTmp = delta[offsetTmp];
float xHatTmp = xHat[offsetTmp];

float dxHat;
if (addUnitOffset)
{
dxHat = (1 + Ɣ[depth]) * deltaTmp;
}
else
{
dxHat = Ɣ[depth] * deltaTmp;
}

tmp += dxHat * xHatTmp;
}
sum2[seq + sequence * elem] = tmp;
@@ -147,6 +167,7 @@ kernel void backwardRMSNormSeqFloat(
constant uint & nbNeurons,
constant uint & nbBatch,
constant uint & sequence,
constant uint & addUnitOffset,
device float * delta,
uint2 id [[ thread_position_in_grid ]])
{
@@ -166,7 +187,17 @@ kernel void backwardRMSNormSeqFloat(

float mult =
1.0 / ((float)nbElems * sqrt(σ2[seq + sequence * elem] + Ɛ));

float dxHat;
if (addUnitOffset)
{
dxHat = (1 + Ɣ[depth]) * delta[offset];
}
else
{
dxHat = Ɣ[depth] * delta[offset];
}

float tmp1 = nbElems * dxHat;
float tmp3 = xHat[offset] * sum2[seq + sequence * elem];
