12 changes: 6 additions & 6 deletions ppfleetx/configs/nlp/gpt/pretrain_gpt_1.3B_dp8.yaml
@@ -10,20 +10,20 @@ Model:
   vocab_size: 50304
   hidden_size: 2048
   num_layers: 24
-  num_attention_heads: 16
+  num_attention_heads: 8
   ffn_hidden_size:
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
-  use_recompute: True
+  use_recompute: False
   recompute_granularity:


 Distributed:
-  dp_degree: 8
-  mp_degree: 1
+  dp_degree: 1
+  mp_degree: 8
   pp_degree: 1
   sharding:
     sharding_degree: 1
12 changes: 6 additions & 6 deletions ppfleetx/configs/nlp/gpt/pretrain_gpt_345M_single_card.yaml
@@ -9,17 +9,17 @@ Global:
 Model:
   vocab_size: 50304
   hidden_size: 1024
-  num_layers: 24
-  num_attention_heads: 16
-  ffn_hidden_size: 4096
-  hidden_dropout_prob: 0.1
-  attention_probs_dropout_prob: 0.1
+  num_layers: 1
+  num_attention_heads: 8
+  ffn_hidden_size:
+  hidden_dropout_prob: 0
+  attention_probs_dropout_prob: 0
   max_position_embeddings: 1024
   type_vocab_size: 16
   initializer_range: 0.02
   use_recompute: False
   recompute_granularity:


 Distributed:
   dp_degree: 1
4 changes: 2 additions & 2 deletions ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
@@ -16,7 +16,7 @@ Engine:
   eval_iters: 10
   test_iters:
   mix_precision:
-    use_pure_fp16: True
+    use_pure_fp16: False
     scale_loss: 32768.0
     custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
     custom_white_list: ["lookup_table", "lookup_table_v2"]
@@ -42,7 +42,7 @@ Data:
       split: [949, 50, 1]
       max_seq_len: 1024
     sampler:
-      name: DistributedBatchSampler
+      name: GPTBatchSampler
       shuffle: False
       drop_last: True
     loader:
64 changes: 26 additions & 38 deletions ppfleetx/core/engine/eager_engine.py
@@ -288,44 +288,6 @@ def _train_one_epoch(self,
                 train_start = time.time()
                 train_losses = []

-            if self._run_mode == 'step' and not skip_first:
-                if step % self._eval_freq == 0:
-                    self._module.model.eval()
-
-                    eval_losses = []
-                    eval_start = time.time()
-
-                    for eval_step, batch in enumerate(valid_data_loader):
-                        loss = self._evaluate_impl(batch)
-                        eval_losses.append(loss)
-
-                        if eval_step >= self._eval_iters - 1:
-                            break
-
-                    paddle.device.cuda.synchronize()
-                    eval_cost = time.time() - eval_start
-                    eval_loss = sum(eval_losses) / len(eval_losses)
-
-                    log_dict = {
-                        'loss': eval_loss.numpy()[0],
-                        'epoch': epoch_index,
-                        'batch': eval_step,
-                        'total_batch': total_eval_batch,
-                        'eval_cost': eval_cost / self._logging_freq,
-                    }
-                    self._module.validation_step_end(log_dict)
-
-                    self._module.model.train()
-
-                if self._save_steps > 0 and step % self._save_steps == 0:
-                    self.save(epoch=epoch_index, step=step)
-            else:
-                skip_first = False
-
-            if self._run_mode == 'step' and step >= self._max_steps:
-                logger.info("The training process is complete.")
-                return
-
             if self.profiler:
                 self.profiler.step()

@@ -351,6 +313,31 @@ def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None):
         if self._load_recovery['rng_state'] != -1:
             paddle.set_cuda_rng_state(self._load_recovery['rng_state'])

+        model = self._module.model
+
+        parameters_list = []
+        for p in model.parameters():
+            # print(p.name)
+            parameters_list.append(p)
+
+        mp_rank = paddle.distributed.get_rank()
+        sta = paddle.load(
+            '/workspace/workspace/FleetX/output/model_state_mp_{:0>2d}.pdopt'.
+            format(mp_rank))
+
+        print("======" * 10)
+        index = 0
+        for k, v in sta.items():
+            # print(k, v)
+            # pass
+            parameters_list[index].name = k
+            index += 1
+        model.set_state_dict(sta)
+        for p in model.parameters():
+            pass
+            # print(p.name, p)
+
+        # paddle.seed(1024)
         for epoch_index in range(start_epoch, epoch):
             self._train_one_epoch(epoch_index, train_data_loader,
                                   valid_data_loader)
@@ -433,6 +420,7 @@ def _optim_update_params(self):
                         p.bw_storage.scale_(1.0 / self._dp_group.nranks)
                         dist.all_reduce(p.bw_storage, group=self._dp_group)

+        # print(">>>", self._optimizer._parameter_list)
         if self._use_pure_fp16:
             self._scaler.step(self._optimizer)
             self._scaler.update()
6 changes: 6 additions & 0 deletions ppfleetx/data/dataset/gpt_dataset.py
@@ -102,7 +102,13 @@ def __init__(self,
         self.input_dir = input_dir
         self.max_seq_len = max_seq_len
         self.mode = mode

+        self.name = "gpt_" + mode
+        self.name = self.name.lower()
+
+        if self.name == "gpt_eval":
+            self.name = "gpt_valid"
+
         self.eos_id = tokenizer.eos_token_id
         self.sample_ids = sample_ids
         self.sample_lens = sample_lens
9 changes: 7 additions & 2 deletions ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
@@ -585,7 +585,7 @@ def __init__(self,
                  initializer_range=0.02,
                  sequence_parallel=False):
         super(GPTEmbeddings, self).__init__()
-
+        # print("hidden_dropout_prob: ", hidden_dropout_prob)
         self.sequence_parallel = sequence_parallel
         self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding(
             vocab_size,
@@ -612,13 +612,18 @@ def forward(self, input_ids, position_ids=None):
         embeddings = input_embedings + position_embeddings
         # if sequence parallel is true, change embedding shape [b, s, h] to [s, b, h]
         # set the sequence dim as first, so the split in sequence dim is data-continuous
+        # print("input_ids", input_ids)
+        # print("embeddings", embeddings)
+        # print("embeddings_weight", self.word_embeddings.weight)
+
         if self.sequence_parallel:
             embeddings = paddle.transpose(embeddings, perm=[1, 0, 2])
             embeddings = ScatterOp.apply(embeddings)
             with get_rng_state_tracker().rng_state('local_seed'):
                 embeddings = self.dropout(embeddings)
         else:
             embeddings = self.dropout(embeddings)
+
         return embeddings


@@ -647,7 +652,7 @@ def __init__(self,
         self.vocab_size = vocab_size

         hcg = fleet.get_hybrid_communicate_group()
-        mp_size = hcg.get_model_parallel_world_size()
+        mp_size = hcg.get_model_parallel_world_size()
         if mp_size <= 1:
             sequence_parallel = False

2 changes: 2 additions & 0 deletions ppfleetx/optims/lr_scheduler.py
@@ -33,6 +33,8 @@ def __init__(self,
         self.warmup_step = warmup_rate * decay_steps
         self.max_lr = max_lr
         self.min_lr = min_lr
+
+        print(self.max_lr, self.min_lr, self.warmup_step)
         super(CosineAnnealingWithWarmupDecay, self).__init__(
             max_lr, last_epoch, verbose)
