# run_baichuan2_7b_910b.yaml
seed: 0
output_dir: './output' # Customizing this path is currently not supported; do not change this default value.
load_checkpoint: ''
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
only_save_strategy: False
resume_training: False
use_parallel: True
run_mode: 'train'
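
# Note (illustrative, not part of the original config): a config like this is normally passed
# to a MindFormers launch script. The exact entry script and flags for Baichuan2 may differ,
# so check the model's README; a typical invocation looks roughly like:
#   python run_mindformer.py --config run_baichuan2_7b_910b.yaml --run_mode train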

# trainer config
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'baichuan2_7b'

# runner config
runner_config:
  epochs: 4
  batch_size: 2
  sink_mode: True
  sink_size: 2

# optimizer
optimizer:
  type: FP32StateAdamWeightDecay
  beta1: 0.9
  beta2: 0.98
  eps: 1.e-8 # 1e-8
  weight_decay: 1.e-4

# lr schedule
lr_schedule:
  type: CosineWithWarmUpLR
  learning_rate: 2.e-5
  lr_end: 2.e-6
  warmup_ratio: 0.0
  total_steps: -1 # -1 means it will load the total steps of the dataset

# dataset
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: ["input_ids"]  # "input_ids", "labels"; labels are used in instruction finetuning.
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 4
  repeat: 1
  numa_enable: False
  prefetch_size: 1
train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

# if True, evaluate during the training process; if False, do nothing.
# note that the task trainer should support the _evaluate_in_training function.
do_eval: False
eval_step_interval: -1    # num of step intervals between each eval, -1 means no step-end eval.
eval_epoch_interval: 50   # num of epoch intervals between each eval, 1 means eval on every epoch end.
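
# Illustrative only (not part of the original config): to run evaluation during training,
# one would flip the flags above, for example:
#   do_eval: True
#   eval_step_interval: 500
# The interval value 500 is an assumption and should be tuned for the actual workload.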

# eval dataset
eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: False
  input_columns: ["input_ids"]
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: False
  repeat: 1
  numa_enable: False
  prefetch_size: 1
eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset

# default parallel config for device num = 16 on 910A
parallel_config:
  data_parallel: 8
  model_parallel: 1
  pipeline_stage: 1
  micro_batch_num: 8
  vocab_emb_dp: True
  gradient_aggregation_group: 4
# when model_parallel is greater than 1, setting micro_batch_interleave_num=2 may accelerate training.
micro_batch_interleave_num: 1
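
# Illustrative only (not part of the original config): a 2-way model-parallel variant,
# keeping data_parallel * model_parallel * pipeline_stage aligned with the device count
# as is typically required, might look like the commented sketch below. These values are
# assumptions and would need to be validated for this model.
# parallel_config:
#   data_parallel: 4
#   model_parallel: 2
#   pipeline_stage: 1
#   micro_batch_num: 8
# micro_batch_interleave_num: 2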

# recompute config
recompute_config:
  recompute: True
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: True

# callbacks
callbacks:
  - type: MFLossMonitor
  - type: CheckpointMointor
    prefix: "baichuan2"
    save_checkpoint_steps: 200
    integrated_save: False
    async_save: False
  - type: ObsMonitor

# mindspore context init config
context:
  mode: 0 # 0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true"
  max_call_depth: 10000
  max_device_memory: "59GB"
  save_graphs: False
  save_graphs_path: "./graph"
  device_id: 0

# parallel context config
parallel:
  parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel
  gradients_mean: False
  enable_alltoall: False
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: True
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
  parallel_optimizer_config:
    gradient_accumulation_shard: False
    parallel_optimizer_threshold: 64

# model config
model:
  model_config:
    type: LlamaConfig
    batch_size: 1 # added for incremental prediction
    seq_length: 512
    hidden_size: 4096
    num_layers: 32
    num_heads: 32
    vocab_size: 125696
    multiple_of: 256
    rms_norm_eps: 1.0e-6
    bos_token_id: 1
    eos_token_id: 2
    pad_token_id: 0
    ignore_token_id: -100
    user_token_id: 195
    assistant_token_id: 196
    compute_dtype: "float16"
    layernorm_compute_type: "float32"
    softmax_compute_type: "float32"
    rotary_dtype: "float32"
    param_init_type: "float16"
    use_past: False
    compute_in_2d: True
    use_flash_attention: False
    offset: 0
    checkpoint_name_or_path: "path/to/baichuan2-7B-Chat.ckpt"
    repetition_penalty: 1.05
    max_decode_length: 512
    top_k: 5
    top_p: 0.85
    do_sample: True
    max_new_tokens: 64
  arch:
    type: Baichuan7BV2ForCausalLM

processor:
  return_tensors: ms
  tokenizer:
    unk_token: '<unk>'
    bos_token: '<s>'
    eos_token: '</s>'
    pad_token: '<unk>'
    type: Baichuan2Tokenizer
    vocab_file: './tokenizer.model'
  type: LlamaProcessor

# metric
metric:
  type: PerplexityMetric

# wrapper cell config
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 65536
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True

eval_callbacks:
  - type: ObsMonitor
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: False
profile_communication: False
profile_memory: True
layer_scale: False
layer_decay: 0.65
lr_scale_factor: 256

# aicc
remote_save_url: "Please input obs url on AICC platform."