diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
index e291ab67a..e003981e5 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
@@ -10,20 +10,20 @@ Model:
   vocab_size: 50304
   hidden_size: 2048
   num_layers: 24
-  num_attention_heads: 16
+  num_attention_heads: 8
   ffn_hidden_size:
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
-  use_recompute: True
+  use_recompute: False
   recompute_granularity:

 Distributed:
-  dp_degree: 8
-  mp_degree: 1
+  dp_degree: 1
+  mp_degree: 8
   pp_degree: 1
   sharding:
     sharding_degree: 1

diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
index 33857b512..1ec8ef19b 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
@@ -9,17 +9,17 @@ Global:

 Model:
   vocab_size: 50304
   hidden_size: 1024
-  num_layers: 24
-  num_attention_heads: 16
-  ffn_hidden_size: 4096
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  num_layers: 1
+  num_attention_heads: 8
+  ffn_hidden_size:
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
   use_recompute: False
   recompute_granularity:
-
+
 Distributed:
   dp_degree: 1
diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
index 742ad8030..9299abce3 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
@@ -16,7 +16,7 @@ Engine:
   eval_iters: 10
   test_iters:
   mix_precision:
-    use_pure_fp16: True
+    use_pure_fp16: False
     scale_loss: 32768.0
     custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
     custom_white_list: ["lookup_table", "lookup_table_v2"]
@@ -42,7 +42,7 @@ Data:
       split: [949, 50, 1]
       max_seq_len: 1024
     sampler:
-      name: DistributedBatchSampler
+      name: GPTBatchSampler
       shuffle: False
       drop_last: True
     loader:
diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py
index 7051c7d06..fac421dba 100644
--- a/ppfleetx/core/engine/eager_engine.py
+++ b/ppfleetx/core/engine/eager_engine.py
@@ -288,44 +288,6 @@ def _train_one_epoch(self,
             train_start = time.time()
             train_losses = []

-            if self._run_mode == 'step' and not skip_first:
-                if step % self._eval_freq == 0:
-                    self._module.model.eval()
-
-                    eval_losses = []
-                    eval_start = time.time()
-
-                    for eval_step, batch in enumerate(valid_data_loader):
-                        loss = self._evaluate_impl(batch)
-                        eval_losses.append(loss)
-
-                        if eval_step >= self._eval_iters - 1:
-                            break
-
-                    paddle.device.cuda.synchronize()
-                    eval_cost = time.time() - eval_start
-                    eval_loss = sum(eval_losses) / len(eval_losses)
-
-                    log_dict = {
-                        'loss': eval_loss.numpy()[0],
-                        'epoch': epoch_index,
-                        'batch': eval_step,
-                        'total_batch': total_eval_batch,
-                        'eval_cost': eval_cost / self._logging_freq,
-                    }
-                    self._module.validation_step_end(log_dict)
-
-                    self._module.model.train()
-
-                if self._save_steps > 0 and step % self._save_steps == 0:
-                    self.save(epoch=epoch_index, step=step)
-            else:
-                skip_first = False
-
-            if self._run_mode == 'step' and step >= self._max_steps:
-                logger.info("The training process is complete.")
-                return
-
             if self.profiler:
                 self.profiler.step()

@@ -351,6 +313,31 @@ def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None):
         if self._load_recovery['rng_state'] != -1:
             paddle.set_cuda_rng_state(self._load_recovery['rng_state'])

+        model = self._module.model
+
+        parameters_list = []
+        for p in model.parameters():
+            # print(p.name)
+            parameters_list.append(p)
+
+        mp_rank = paddle.distributed.get_rank()
+        sta = paddle.load(
+            '/workspace/workspace/FleetX/output/model_state_mp_{:0>2d}.pdopt'.
+            format(mp_rank))
+
+        print("======" * 10)
+        index = 0
+        for k, v in sta.items():
+            # print(k, v)
+            # pass
+            parameters_list[index].name = k
+            index += 1
+        model.set_state_dict(sta)
+        for p in model.parameters():
+            pass
+            # print(p.name, p)
+
+        # paddle.seed(1024)
         for epoch_index in range(start_epoch, epoch):
             self._train_one_epoch(epoch_index, train_data_loader,
                                   valid_data_loader)
@@ -433,6 +420,7 @@ def _optim_update_params(self):
                         p.bw_storage.scale_(1.0 / self._dp_group.nranks)
                         dist.all_reduce(p.bw_storage, group=self._dp_group)

+        # print(">>>", self._optimizer._parameter_list)
         if self._use_pure_fp16:
             self._scaler.step(self._optimizer)
             self._scaler.update()
diff --git a/ppfleetx/data/dataset/gpt_dataset.py b/ppfleetx/data/dataset/gpt_dataset.py
index 02d409fee..1ffd0eb8d 100644
--- a/ppfleetx/data/dataset/gpt_dataset.py
+++ b/ppfleetx/data/dataset/gpt_dataset.py
@@ -102,7 +102,13 @@ def __init__(self,
         self.input_dir = input_dir
         self.max_seq_len = max_seq_len
         self.mode = mode

+        self.name = "gpt_" + mode
+        self.name = self.name.lower()
+
+        if self.name == "gpt_eval":
+            self.name = "gpt_valid"
+
         self.eos_id = tokenizer.eos_token_id
         self.sample_ids = sample_ids
         self.sample_lens = sample_lens
diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
index dcae2cf05..a46ffa202 100644
--- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
+++ b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
@@ -585,7 +585,7 @@ def __init__(self,
                  initializer_range=0.02,
                  sequence_parallel=False):
         super(GPTEmbeddings, self).__init__()
-
+        # print("hidden_dropout_prob: ", hidden_dropout_prob)
         self.sequence_parallel = sequence_parallel
         self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding(
             vocab_size,
@@ -612,6 +612,10 @@ def forward(self, input_ids, position_ids=None):
         embeddings = input_embedings + position_embeddings
         # if sequence parallel is true, change embedding shape [b, s, h] to [s, b, h]
         # set the sequence dim as first, so the split in sequence dim is data-continuous
+        # print("input_ids", input_ids)
+        # print("embeddings", embeddings)
+        # print("embeddings_weight", self.word_embeddings.weight)
+
         if self.sequence_parallel:
             embeddings = paddle.transpose(embeddings, perm=[1, 0, 2])
             embeddings = ScatterOp.apply(embeddings)
@@ -619,6 +623,7 @@ def forward(self, input_ids, position_ids=None):
             embeddings = self.dropout(embeddings)
         else:
             embeddings = self.dropout(embeddings)
+
         return embeddings


@@ -647,7 +652,7 @@ def __init__(self,
         self.vocab_size = vocab_size

         hcg = fleet.get_hybrid_communicate_group()
-        mp_size = hcg.get_model_parallel_world_size()
+        mp_size = hcg.get_model_parallel_world_size()

         if mp_size <= 1:
             sequence_parallel = False
diff --git a/ppfleetx/optims/lr_scheduler.py b/ppfleetx/optims/lr_scheduler.py
index 18b45f6c6..35c545f4c 100644
--- a/ppfleetx/optims/lr_scheduler.py
+++ b/ppfleetx/optims/lr_scheduler.py
@@ -33,6 +33,8 @@ def __init__(self,
         self.warmup_step = warmup_rate * decay_steps
         self.max_lr = max_lr
         self.min_lr = min_lr
+
+        print(self.max_lr, self.min_lr, self.warmup_step)
         super(CosineAnnealingWithWarmupDecay, self).__init__(
             max_lr, last_epoch, verbose)
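
Note on the block added to `EagerEngine.fit()` above: each rank loads its own model-parallel checkpoint shard (`model_state_mp_XX.pdopt`) and renames the live parameters, positionally, to the keys found in that shard so that `set_state_dict()` can match them by name. The sketch below is an illustrative restatement of that pattern, not part of the diff: the `load_mp_sharded_state` helper and its `ckpt_dir` argument are made up for readability (the diff hard-codes a `/workspace/.../model_state_mp_{:0>2d}.pdopt` path), and it assumes the shard's key order lines up with `model.parameters()` order, as the diff implicitly does.

```python
import paddle
import paddle.distributed as dist


def load_mp_sharded_state(model, ckpt_dir):
    """Load this rank's model-parallel checkpoint shard into `model`.

    Mirrors the debug logic added in fit(): live parameters are renamed
    positionally to the checkpoint keys so set_state_dict() matches them.
    """
    # The diff uses the global rank as the shard index; with mp_degree=8 and
    # no dp/pp this coincides with the model-parallel rank.
    mp_rank = dist.get_rank()
    state = paddle.load(
        '{}/model_state_mp_{:0>2d}.pdopt'.format(ckpt_dir, mp_rank))

    params = list(model.parameters())
    # Assumes len(state) == len(params) and that both follow the same order.
    for index, key in enumerate(state.keys()):
        params[index].name = key
    model.set_state_dict(state)
```

Matching by position rather than by name is fragile: it only works when the checkpoint was saved from a model that defines its parameters in exactly the same order as the model being loaded.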