Commit 0211dc1

Authored and committed by Copybara

Copybara import of gpu-recipes:

- 480db0b7f6cb217dfb4f15daa473e375d769ade1 Removed invalid setting.
- 71fb202a48113ec2fbc7a0386e9e8f3aa85a0c7a Merge "Adding bf16 and fp8 recipe for Mixtral-8x7b Nemo o...
- cfb4f4073ff43aa4d962c272e2834b26f60492d6 Merge "Llama-3.1-70B A4 Nemo fp8/bf16 256 gpus" into main
- d1800ca28d565fd377fa2c23d2eb67f9d4ce603c Adding main readme

GitOrigin-RevId: d1800ca28d565fd377fa2c23d2eb67f9d4ce603c
1 parent 2850f84 commit 0211dc1

File tree: 13 files changed (+1893, -3 lines)

README.md

Lines changed: 4 additions & 2 deletions

@@ -39,8 +39,10 @@ Models | GPU Machine Type
 
 Models | GPU Machine Type | Framework | Workload Type | Orchestrator | Link to the recipe
 ------------------ | ---------------------------------------------------------------------------------------------------- | --------- | ------------- | ------------ | ------------------
-**Llama-3.1-405B** | [A4 (NVIDIA B200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-vms) | MaxText | Pre-training | GKE | [Link](./training/a4/llama3-1-405b/maxtext-pretraining-gke/README.md)
-**Llama-3.1-405B** | [A4 (NVIDIA B200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-vms) | NeMo | Pre-training | GKE | [Link](./training/a4/llama3-1-405b/nemo-pretraining-gke/README.md)
+**Llama-3.1-70B** | [A4 (NVIDIA B200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-vms) | NeMo | Pre-training | GKE | [Link](./training/a4/llama3-1-70b/nemo-pretraining-gke/README.md)
+**Llama-3.1-405B** | [A4 (NVIDIA B200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-vms) | MaxText | Pre-training | GKE | [Link](./training/a4/llama3-1-405b/maxtext-pretraining-gke/README.md)
+**Llama-3.1-405B** | [A4 (NVIDIA B200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-vms) | NeMo | Pre-training | GKE | [Link](./training/a4/llama3-1-405b/nemo-pretraining-gke/README.md)
+**Mixtral-8x7B** | [A4 (NVIDIA B200)](https://cloud.google.com/compute/docs/accelerator-optimized-machines#a4-vms) | NeMo | Pre-training | GKE | [Link](./training/a4/mixtral-8x7b/nemo-pretraining-gke/README.md)
 
 ### Inference benchmarks A3 Mega
 

Lines changed: 181 additions & 0 deletions

@@ -0,0 +1,181 @@

run:
  name: llama3_1_70b
  results_dir:
  time_limit: "0-01:30:00"
  dependency: "singleton"
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 15
  max_time: "05:23:30:00"
  log_every_n_steps: 1
  val_check_interval: null
  limit_val_batches: 0.0
  limit_test_batches: 5
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir:
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  create_dllogger_logger: true
  dllogger_logger_kwargs:
    verbose: true
    stdout: true
  create_tensorboard_logger: true
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False
    save_nemo_on_train_end: False
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: true
  log_tflops_per_sec_per_gpu: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
  seconds_to_sleep: 60
model:
  tp_only_amax_red: true
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 4
  virtual_pipeline_model_parallel_size: 20
  context_parallel_size: 1
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  num_query_groups: 8
  init_method_std: 0.008944
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt

  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: true

  ## Transformer Engine
  transformer_engine: true
  fp8: false # enables fp8 in TransformerLayer forward
  fp8_e4m3: true # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: true # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
  ub_tp_comm_overlap: true
  use_flash_attention: true
  overlap_p2p_comm: true
  batch_p2p_comm: false
  gc_interval: 100
  nsys_profile:
    enabled: False
    trace: [nvtx,cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
  optim:
    name: mcore_distributed_optim
    lr: 0.00015
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    grad_sync_dtype: bf16
    sched:
      name: CosineAnnealing
      warmup_steps: 2000
      constant_steps: 11873
      min_lr: 1.0e-05
  data:
    data_impl: mock
    # splits_string: 90,8,2
    splits_string: 99990,8,2
    seq_length: 8192
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: null
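
The config above (the variant with fp8: false) fixes the model and parallelism layout; the short sketch below works out the quantities it implies, assuming the 256-GPU footprint named in the merged change "Llama-3.1-70B A4 Nemo fp8/bf16 256 gpus" (32 machines × trainer.devices = 8). The GPU count and the variable names are illustrative assumptions, not values taken from this file.

# Sketch of the parallelism/batch arithmetic implied by the config above.
# Assumption (not in the file): the job runs on 256 GPUs, per the commit message.
num_gpus = 256

tp = 2      # model.tensor_model_parallel_size
pp = 4      # model.pipeline_model_parallel_size
vp = 20     # model.virtual_pipeline_model_parallel_size
cp = 1      # model.context_parallel_size
num_layers = 80
micro_batch_size = 1
global_batch_size = 2048
seq_length = 8192

model_parallel_size = tp * pp                        # 8, matches the ${multiply:...} expression in exp_manager
data_parallel_size = num_gpus // (tp * pp * cp)      # 256 / 8 = 32 model replicas
layers_per_virtual_stage = num_layers // (pp * vp)   # 80 / (4 * 20) = 1 layer per virtual pipeline chunk
micro_batches_per_step = global_batch_size // (micro_batch_size * data_parallel_size)  # 2048 / 32 = 64
tokens_per_step = global_batch_size * seq_length     # 2048 * 8192 = 16,777,216 tokens per optimizer step

print(model_parallel_size, data_parallel_size, layers_per_virtual_stage, micro_batches_per_step, tokens_per_step)

In other words, each model replica spans 8 GPUs (TP=2 × PP=4), leaving a data-parallel size of 32, and the 2048-sample global batch is reached through 64 micro-batches per step; accumulate_grad_batches stays at 1 because in NeMo's Megatron path the Megatron training loop, not Lightning, iterates over those micro-batches.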
Lines changed: 181 additions & 0 deletions

@@ -0,0 +1,181 @@

run:
  name: llama3_1_70b
  results_dir:
  time_limit: "0-01:30:00"
  dependency: "singleton"
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 15
  max_time: "05:23:30:00"
  log_every_n_steps: 1
  val_check_interval: null
  limit_val_batches: 0.0
  limit_test_batches: 5
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
exp_manager:
  explicit_log_dir:
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  create_dllogger_logger: true
  dllogger_logger_kwargs:
    verbose: true
    stdout: true
  create_tensorboard_logger: true
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False
    save_nemo_on_train_end: False
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: true
  log_tflops_per_sec_per_gpu: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
  seconds_to_sleep: 60
model:
  tp_only_amax_red: true
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 4
  virtual_pipeline_model_parallel_size: 20
  context_parallel_size: 1
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  num_query_groups: 8
  init_method_std: 0.008944
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: true

  ## Transformer Engine
  transformer_engine: true
  fp8: true
  fp8_e4m3: true
  fp8_hybrid: true
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  ub_tp_comm_overlap: true
  use_flash_attention: true
  overlap_p2p_comm: true
  batch_p2p_comm: false
  gc_interval: 100
  nsys_profile:
    enabled: False
    trace: [nvtx,cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
  optim:
    # name: distributed_fused_adam
    name: mcore_distributed_optim
    lr: 0.00015
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    grad_sync_dtype: bf16
    sched:
      name: CosineAnnealing
      warmup_steps: 2000
      constant_steps: 11873
      min_lr: 1.0e-05
  data:
    data_impl: mock
    # splits_string: 90,8,2
    splits_string: 99990,8,2
    seq_length: 8192
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: null
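
This second file is the fp8 variant: it is functionally identical to the one above except that fp8 is enabled (fp8: true), a commented-out optimizer name is kept, and the inline comments are dropped. Per the comments in the bf16 variant, the fp8_* fields correspond to Transformer Engine's delayed-scaling recipe. Below is a minimal, hedged sketch of that correspondence, assuming the transformer_engine package and an fp8-capable GPU are available; it is an illustration, not the code path NeMo actually runs.

# Sketch only: how the fp8_* fields above roughly map onto Transformer Engine's
# delayed-scaling recipe. NeMo wires this up internally; this is illustrative.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

fp8_recipe = DelayedScaling(
    margin=0,                   # fp8_margin
    fp8_format=Format.HYBRID,   # fp8_hybrid: true (Format.E4M3 if only fp8_e4m3 were set)
    amax_history_len=1024,      # fp8_amax_history_len
    amax_compute_algo="max",    # fp8_amax_compute_algo
)

# A single linear layer using the model's hidden/FFN sizes, just to exercise the recipe.
layer = te.Linear(8192, 28672, bias=False, params_dtype=torch.bfloat16).cuda()
x = torch.randn(16, 8192, device="cuda", dtype=torch.bfloat16)

# fp8: true in the config corresponds to running the transformer layers under fp8_autocast.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    y = layer(x)
print(y.shape)  # torch.Size([16, 28672])

With fp8_hybrid: true, Transformer Engine's HYBRID format keeps forward-pass tensors in E4M3 and gradients in E5M2, while fp8_amax_history_len and fp8_amax_compute_algo govern how the per-tensor scaling factors are derived from the recorded amax history.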
