From c782cc797eaf9b1f74a7be930c3e857d6dd7fa66 Mon Sep 17 00:00:00 2001
From: Wanchao
Date: Wed, 12 Jun 2024 15:15:32 -0700
Subject: [PATCH] update all toml files to use experimental section (#392)

---
 train_configs/llama2_13b.toml | 4 +++-
 train_configs/llama2_70b.toml | 4 +++-
 train_configs/llama2_7b.toml  | 4 +++-
 train_configs/llama3_70b.toml | 4 +++-
 train_configs/llama3_8b.toml  | 4 +++-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/train_configs/llama2_13b.toml b/train_configs/llama2_13b.toml
index 280ac2ae..f3048ac4 100644
--- a/train_configs/llama2_13b.toml
+++ b/train_configs/llama2_13b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama2_70b.toml b/train_configs/llama2_70b.toml
index 959c270a..97b1bc71 100644
--- a/train_configs/llama2_70b.toml
+++ b/train_configs/llama2_70b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama2_7b.toml b/train_configs/llama2_7b.toml
index f2e66de7..95b4c496 100644
--- a/train_configs/llama2_7b.toml
+++ b/train_configs/llama2_7b.toml
@@ -32,11 +32,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1 # dp-only would be sufficient for 7B
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama3_70b.toml b/train_configs/llama3_70b.toml
index f45632ad..d498e677 100644
--- a/train_configs/llama3_70b.toml
+++ b/train_configs/llama3_70b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"
diff --git a/train_configs/llama3_8b.toml b/train_configs/llama3_8b.toml
index aaba99a2..f194addb 100644
--- a/train_configs/llama3_8b.toml
+++ b/train_configs/llama3_8b.toml
@@ -33,11 +33,13 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-pipeline_parallel_degree = 1
 fp8_linear = ""
 compile = false
 dataset = "c4"
 
+[experimental]
+pipeline_parallel_degree = 1
+
 [checkpoint]
 enable_checkpoint = false
 folder = "checkpoint"