run:
  name: llama3_1_70b
  results_dir:
  time_limit: "0-01:30:00"
  dependency: "singleton"
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 15
  max_time: "05:23:30:00"
  log_every_n_steps: 1
  val_check_interval: null
  limit_val_batches: 0.0
  limit_test_batches: 5
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
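# Benchmark-style run: max_steps: 15 with limit_val_batches: 0.0 and
# val_check_interval: null disables validation entirely; checkpointing is
# likewise turned off in exp_manager below.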
exp_manager:
  explicit_log_dir:
  exp_dir: null
  name: megatron_llama
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: nemo_llama_pretrain
    name: ${run.name}
  create_dllogger_logger: true
  dllogger_logger_kwargs:
    verbose: true
    stdout: true
  create_tensorboard_logger: true
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False
    save_nemo_on_train_end: False
    filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
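    # Resolves to 2 * 4 = 8 with the tensor/pipeline sizes set under model below.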
  log_step_timing: true
  log_tflops_per_sec_per_gpu: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
  seconds_to_sleep: 60
model:
  tp_only_amax_red: true
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 2048
  rampup_batch_size: null
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 4
  virtual_pipeline_model_parallel_size: 20
  context_parallel_size: 1
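  # Parallelism arithmetic (derived from the values above; the cluster size is an
  # assumption, not part of this file):
  #   - one model replica spans TP * PP = 2 * 4 = 8 GPUs
  #   - interleaved schedule: num_layers / (PP * VPP) = 80 / (4 * 20) = 1 layer per virtual stage
  #   - on, e.g., 64 nodes * 8 GPUs = 512 GPUs: DP = 512 / (TP * PP * CP) = 64, so each
  #     step runs global_batch_size / (micro_batch_size * DP) = 2048 / 64 = 32 micro-batches
  #     per pipeline and processes 2048 * 8192 ≈ 16.8M tokens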
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  num_query_groups: 8
  init_method_std: 0.008944
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: null
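  # Grouped-query attention: 64 query heads share num_query_groups = 8 KV heads,
  # i.e. 8 query heads per KV head. kv_channels: null falls back to
  # hidden_size / num_attention_heads = 8192 / 64 = 128 channels per head,
  # matching Llama 3.1 70B.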
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  apply_rope_fusion: true
  attention_type: multihead
  share_embeddings_and_output_weights: false
  scale_positional_embedding: true
  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    delimiter: null # only used for tabular tokenizer
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt

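  # NOTE (assumption, not stated in this file): with data.data_impl: mock the GPT-2
  # BPE tokenizer mainly fixes the vocabulary size; a real Llama 3.1 run would point
  # tokenizer.model at the Llama tokenizer instead.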
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: true

  ## Transformer Engine
  transformer_engine: true
  fp8: false # enables fp8 in TransformerLayer forward
  fp8_e4m3: true # sets fp8_format = recipe.Format.E4M3
  fp8_hybrid: true # sets fp8_format = recipe.Format.HYBRID
  fp8_margin: 0 # scaling margin
  fp8_interval: 1 # scaling update interval
  fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
  fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
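  # With fp8: false the fp8_* knobs above are inert. Note that fp8_e4m3 and
  # fp8_hybrid are both true here; if fp8 were enabled, only one recipe format
  # takes effect, and typically only one flag is set (hybrid = E4M3 forward,
  # E5M2 backward).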
  ub_tp_comm_overlap: true
  use_flash_attention: true
  overlap_p2p_comm: true
  batch_p2p_comm: false
  gc_interval: 100
  nsys_profile:
    enabled: False
    trace: [nvtx, cuda]
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
  optim:
    name: mcore_distributed_optim
    lr: 0.00015
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    bucket_cap_mb: 125
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    grad_sync_dtype: bf16
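    # mcore_distributed_optim shards optimizer state across data-parallel ranks
    # (Megatron-Core distributed optimizer); overlap_grad_sync / overlap_param_sync
    # overlap the gradient reduce-scatter and parameter all-gather with compute.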
    sched:
      name: CosineAnnealing
      warmup_steps: 2000
      constant_steps: 11873
      min_lr: 1.0e-05
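      # With trainer.max_steps: 15 the run ends well inside warmup, so the LR only
      # ramps to roughly (15 / 2000) * 0.00015 ≈ 1.1e-06; the cosine decay down to
      # min_lr would only matter in a full-length run.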
  data:
    data_impl: mock
    # splits_string: 90,8,2
    splits_string: 99990,8,2
    seq_length: 8192
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix: null
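    # data_impl: mock generates synthetic token data, so data_prefix can stay null.
    # For real pretraining, set data_prefix to a list of preprocessed .bin/.idx
    # prefixes (assumption: the standard Megatron data pipeline) and adjust
    # splits_string accordingly.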