diff --git a/.gitignore b/.gitignore
index a01a301f..6510a756 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,26 @@
**/playground
**/wandb
+# my custom ignore
+**/swanlog
+**/verl_original
+**/verl_v03
+**/verl_v04
+**/nohupoutput
+**/nohupoutput*
+**/*.nohupoutput
+**/train_3b_drgrpo_tang3.sh
+**/train_3b_drgrpo_song2.sh
+**/train_tiny_zero_a100_drgrpo_tang3.sh
+**/train_tiny_zero_a100_drgrpo_song2.sh
+**/0.7.0
+**/tempCodeRunnerFile.python
+**/*_tang3.sh
+**/*_song2.sh
+**/train_paramtest*
+**/x *
+**/save
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/README.md b/README.md
index 1929999b..0167ca67 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
# TinyZero
+
![image](cover.png)

TinyZero is a reproduction of [DeepSeek R1 Zero](https://github.com/deepseek-ai/DeepSeek-R1) in countdown and multiplication tasks. We built upon [veRL](https://github.com/volcengine/verl).
@@ -11,12 +12,12 @@

Twitter thread: https://x.com/jiayi_pirate/status/1882839370505621655

Full experiment log: https://wandb.ai/jiayipan/TinyZero

-Paper's on it's way!
+> 📢: We release [Adaptive Parallel Reasoning](https://github.com/Parallel-Reasoning/APR), where we explore a new dimension in scaling reasoning models

## Installation

```
-conda create -n zero python=3.9
+conda create -n zero python=3.10.18
# install torch [or you can skip this step and let vllm to install the correct version for you]
pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121
# install vllm
diff --git a/data_preprocess-gsm8k.py b/data_preprocess-gsm8k.py
new file mode 100644
index 00000000..673d9734
--- /dev/null
+++ b/data_preprocess-gsm8k.py
@@ -0,0 +1,101 @@
+# Copyright 2024 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocess the GSM8k dataset to parquet format
+"""
+
+import re
+import os
+import datasets
+
+from verl.utils.hdfs_io import copy, makedirs
+import argparse
+
+
+def extract_solution(solution_str):
+    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
+    assert solution is not None
+    final_solution = solution.group(0)
+    final_solution = final_solution.split('#### ')[1].replace(',', '')
+    return final_solution
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--local_dir', default='~/data/gsm8k')
+    parser.add_argument('--hdfs_dir', default=None)
+
+    args = parser.parse_args()
+
+    num_few_shot = 5
+    data_source = 'openai/gsm8k'
+
+    dataset = datasets.load_dataset(data_source, 'main')
+
+    train_dataset = dataset['train']
+    test_dataset = dataset['test']
+
+    print(f'train_dataset: {type(train_dataset)}, {len(train_dataset)}')
+    print(f'test_dataset: {type(test_dataset)}, {len(test_dataset)}')
+    print(f'train_dataset[0]: {train_dataset[0]}')
+
+    exit(0)
+
+    instruction_following = "Let's think step by step and output the final answer after \"####\"."
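+    # Quick sanity check for extract_solution on a GSM8K-style answer string
+    # (the sentence and the "#### 1,200" suffix below are made-up examples for
+    # illustration, not records from the dataset):
+    #
+    #   >>> extract_solution("She earns 40 * 30 = 1,200 dollars.\n#### 1,200")
+    #   '1200'
+    #
+    # i.e. the regex keeps whatever follows "#### " and strips thousands
+    # separators, so ground-truth answers are stored as plain digit strings.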
+ + # add a row to each data item that represents a unique id + def make_map_fn(split): + + def process_fn(example, idx): + question_raw = example.pop('question') + + question = question_raw + ' ' + instruction_following + + answer_raw = example.pop('answer') + solution = extract_solution(answer_raw) + data = { + "data_source": data_source, + "prompt": [{ + "role": "user", + "content": question, + }], + "ability": "math", + "reward_model": { + "style": "rule", + "ground_truth": solution + }, + "extra_info": { + 'split': split, + 'index': idx, + 'answer': answer_raw, + "question": question_raw, + } + } + return data + + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True) + + local_dir = args.local_dir + hdfs_dir = args.hdfs_dir + + train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet')) + test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet')) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + + copy(src=local_dir, dst=hdfs_dir) diff --git a/requirements.txt b/requirements.txt index 83b00feb..8b80312e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ ray tensordict<0.6 transformers<4.48 vllm<=0.6.3 +swanlab wandb diff --git a/scripts/sft_countdown_2a40.sh b/scripts/sft_countdown_2a40.sh new file mode 100644 index 00000000..e6eff7fa --- /dev/null +++ b/scripts/sft_countdown_2a40.sh @@ -0,0 +1,43 @@ +# Tested with 2 & 4 GPUs + +set -x + +if [ "$#" -lt 2 ]; then + echo "Usage: run_gemma_2b.sh [other_configs...]" + exit 1 +fi + +nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 + +torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ + -m verl.trainer.fsdp_sft_trainer \ + data.train_files=$DATA_DIR/countdown_CoT_train_filtered_token_length_1024.parquet \ + data.val_files=$DATA_DIR/countdown_CoT_test_filtered_token_length_1024.parquet \ + data.prompt_key=extra_info \ + data.response_key=extra_info \ + data.prompt_dict_keys=['question'] \ + data.response_dict_keys=['answer'] \ + data.truncation=left \ + data.max_length=2048 \ + data.train_batch_size=32 \ + data.micro_batch_size=4 \ + \ + model.partial_pretrain=$BASE_MODEL \ + \ + optim.lr=1e-5 \ + optim.weight_decay=0.01 \ + optim.warmup_steps_ratio=0.1 \ + \ + trainer.default_local_dir=$save_path \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.logger=['swanlab'] \ + trainer.total_epochs=1 \ + trainer.default_hdfs_dir=null \ + trainer.val_before_training=True \ + trainer.validate_every_n_steps=10 \ + $@ \ No newline at end of file diff --git a/scripts/sft_countdown_4a40.sh b/scripts/sft_countdown_4a40.sh new file mode 100644 index 00000000..ef83f846 --- /dev/null +++ b/scripts/sft_countdown_4a40.sh @@ -0,0 +1,44 @@ +# Tested with 2 & 4 GPUs + +set -x + +if [ "$#" -lt 2 ]; then + echo "Usage: run_gemma_2b.sh [other_configs...]" + exit 1 +fi + +nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 + +torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ + -m verl.trainer.fsdp_sft_trainer \ + data.train_files=$TRAIN_FILE_PATH \ + data.val_files=$TEST_FILE_PATH \ + data.prompt_key=extra_info \ + data.response_key=extra_info \ + data.prompt_dict_keys=['question'] \ + data.response_dict_keys=['answer'] \ + data.truncation=left \ + data.max_length=2048 \ + data.train_batch_size=32 \ + data.micro_batch_size=4 \ + 
data.truncation=pass \ + \ + model.partial_pretrain=$BASE_MODEL \ + \ + optim.lr=1e-6 \ + optim.weight_decay=0.01 \ + optim.warmup_steps_ratio=0.1 \ + \ + trainer.default_local_dir=$save_path \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.logger=['swanlab'] \ + trainer.total_epochs=1 \ + trainer.default_hdfs_dir=null \ + trainer.val_before_training=True \ + trainer.validate_every_n_steps=10 \ + $@ \ No newline at end of file diff --git a/scripts/sft_gsm8k_2a40.sh b/scripts/sft_gsm8k_2a40.sh new file mode 100644 index 00000000..3b7893c4 --- /dev/null +++ b/scripts/sft_gsm8k_2a40.sh @@ -0,0 +1,36 @@ +# Tested with 2 & 4 GPUs + +set -x + +if [ "$#" -lt 2 ]; then + echo "Usage: run_gemma_2b.sh [other_configs...]" + exit 1 +fi + +nproc_per_node=$1 +save_path=$2 + +# Shift the arguments so $@ refers to the rest +shift 2 + +torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ + -m verl.trainer.fsdp_sft_trainer \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.prompt_key=extra_info \ + data.response_key=extra_info \ + +data.prompt_dict_keys=['question'] \ + +data.response_dict_keys=['answer'] \ + data.train_batch_size=64 \ + data.micro_batch_size=8 \ + model.partial_pretrain=$BASE_MODEL \ + trainer.default_local_dir=$save_path \ + trainer.project_name=$PROJECT_NAME \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.logger=['swanlab'] \ + trainer.total_epochs=1 \ + trainer.total_training_steps=20 \ + trainer.default_hdfs_dir=null \ + trainer.val_before_training=True \ + trainer.validate_every_n_steps=10 \ + $@ \ No newline at end of file diff --git a/scripts/train_tiny_zero_drgrpo_2a40.sh b/scripts/train_tiny_zero_drgrpo_2a40.sh new file mode 100644 index 00000000..c4bb361c --- /dev/null +++ b/scripts/train_tiny_zero_drgrpo_2a40.sh @@ -0,0 +1,43 @@ +# this is a training script for Dr.GRPO algorithm + +/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + algorithm.norm_adv_by_std_in_grpo=False \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.train_batch_size=128 \ + data.val_batch_size=640 \ + data.max_prompt_length=256 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$BASE_MODEL \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum-norm \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.grad_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger=['swanlab'] \ + ++trainer.val_before_train=False \ + trainer.default_hdfs_dir=null \ + 
trainer.n_gpus_per_node=$N_GPUS \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.project_name=TinyZero \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log \ No newline at end of file diff --git a/scripts/train_tiny_zero_drgrpo_2a40_resume.sh b/scripts/train_tiny_zero_drgrpo_2a40_resume.sh new file mode 100644 index 00000000..480c61bf --- /dev/null +++ b/scripts/train_tiny_zero_drgrpo_2a40_resume.sh @@ -0,0 +1,46 @@ +# this is a training script for Dr.GRPO algorithm + +/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ + trainer.tracking.resume=True \ + trainer.tracking.resume_id=$RESUME_ID \ + trainer.tracking.start_step=$RESUME_START_STEP \ + algorithm.adv_estimator=grpo \ + algorithm.norm_adv_by_std_in_grpo=False \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.train_batch_size=128 \ + data.val_batch_size=640 \ + data.max_prompt_length=256 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$BASE_MODEL \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum-norm \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.grad_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger=['swanlab'] \ + ++trainer.val_before_train=False \ + trainer.default_hdfs_dir=null \ + trainer.n_gpus_per_node=$N_GPUS \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.project_name=TinyZero \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log \ No newline at end of file diff --git a/scripts/train_tiny_zero_a100_grpo.sh b/scripts/train_tiny_zero_grpo_2a40.sh similarity index 91% rename from scripts/train_tiny_zero_a100_grpo.sh rename to scripts/train_tiny_zero_grpo_2a40.sh index 3f0703f9..880ed24a 100644 --- a/scripts/train_tiny_zero_a100_grpo.sh +++ b/scripts/train_tiny_zero_grpo_2a40.sh @@ -1,4 +1,6 @@ -/home/weiji/anaconda3/envs/zero/bin/python3 -m verl.trainer.main_ppo \ +#this is training script for grpo algorithm + +/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ algorithm.adv_estimator=grpo \ data.train_files=$DATA_DIR/train.parquet \ data.val_files=$DATA_DIR/test.parquet \ @@ -9,12 +11,12 @@ actor_rollout_ref.model.path=$BASE_MODEL \ actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.ppo_mini_batch_size=64 \ actor_rollout_ref.actor.ppo_micro_batch_size=4 \ 
actor_rollout_ref.actor.use_kl_loss=True \ actor_rollout_ref.actor.kl_loss_coef=0.001 \ actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ @@ -27,7 +29,7 @@ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ - trainer.logger=['wandb'] \ + trainer.logger=['swanlab'] \ +trainer.val_before_train=False \ trainer.default_hdfs_dir=null \ trainer.n_gpus_per_node=$N_GPUS \ diff --git a/scripts/train_tiny_zero_a100_ppo.sh b/scripts/train_tiny_zero_grpo_2a40_resume.sh similarity index 51% rename from scripts/train_tiny_zero_a100_ppo.sh rename to scripts/train_tiny_zero_grpo_2a40_resume.sh index 817f0c48..962c5cb5 100644 --- a/scripts/train_tiny_zero_a100_ppo.sh +++ b/scripts/train_tiny_zero_grpo_2a40_resume.sh @@ -1,4 +1,10 @@ -/home/weiji/anaconda3/envs/zero/bin/python3 -m verl.trainer.main_ppo \ +#this is training script for grpo algorithm + +/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ + trainer.tracking.resume=True \ + trainer.tracking.resume_id=$RESUME_ID \ + trainer.tracking.start_step=$RESUME_START_STEP \ + algorithm.adv_estimator=grpo \ data.train_files=$DATA_DIR/train.parquet \ data.val_files=$DATA_DIR/test.parquet \ data.train_batch_size=128 \ @@ -7,17 +13,26 @@ data.max_response_length=1024 \ actor_rollout_ref.model.path=$BASE_MODEL \ actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.ppo_mini_batch_size=64 \ actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.grad_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \ actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=5 \ actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ - critic.optim.lr=1e-5 \ - critic.model.path=$BASE_MODEL \ - critic.ppo_micro_batch_size=4 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ - trainer.logger=['wandb'] \ + trainer.critic_warmup=0 \ + trainer.logger=['swanlab'] \ +trainer.val_before_train=False \ trainer.default_hdfs_dir=null \ trainer.n_gpus_per_node=$N_GPUS \ diff --git a/scripts/train_tiny_zero_ppo_4a20_1.5b.sh b/scripts/train_tiny_zero_ppo_4a20_1.5b.sh new file mode 100644 index 00000000..f825fdd5 --- /dev/null +++ b/scripts/train_tiny_zero_ppo_4a20_1.5b.sh @@ -0,0 +1,66 @@ +#this is training script for ppo algorithm + +/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.train_batch_size=64 \ + data.val_batch_size=320 \ + data.max_prompt_length=256 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$BASE_MODEL \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + 
actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ + critic.optim.lr=1e-5 \ + critic.model.path=$BASE_MODEL \ + critic.ppo_micro_batch_size=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=['swanlab'] \ + +trainer.val_before_train=False \ + trainer.default_hdfs_dir=null \ + trainer.n_gpus_per_node=$N_GPUS \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.project_name=TinyZero \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log + + +# /home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ +# data.train_files=$DATA_DIR/train.parquet \ +# data.val_files=$DATA_DIR/test.parquet \ +# data.train_batch_size=32 \ +# data.val_batch_size=64 \ +# data.max_prompt_length=256 \ +# data.max_response_length=1024 \ +# actor_rollout_ref.model.path=$BASE_MODEL \ +# actor_rollout_ref.actor.optim.lr=1e-6 \ +# actor_rollout_ref.actor.ppo_mini_batch_size=16 \ +# actor_rollout_ref.actor.ppo_micro_batch_size=4 \ +# actor_rollout_ref.model.use_remove_padding=True \ +# actor_rollout_ref.actor.use_dynamic_bsz=True \ +# actor_rollout_ref.rollout.log_prob_micro_batch_size=2 \ +# actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ +# actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ +# actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ +# critic.optim.lr=1e-5 \ +# critic.model.path=$BASE_MODEL \ +# critic.ppo_micro_batch_size=4 \ +# algorithm.kl_ctrl.kl_coef=0.001 \ +# trainer.logger=['swanlab'] \ +# +trainer.val_before_train=False \ +# trainer.default_hdfs_dir=null \ +# trainer.n_gpus_per_node=$N_GPUS \ +# trainer.nnodes=1 \ +# trainer.save_freq=10 \ +# trainer.test_freq=10 \ +# trainer.project_name=TinyZero \ +# trainer.experiment_name=$EXPERIMENT_NAME \ +# trainer.total_epochs=15 2>&1 | tee verl_demo.log \ No newline at end of file diff --git a/scripts/train_tiny_zero_ppo_a100.sh b/scripts/train_tiny_zero_ppo_a100.sh new file mode 100644 index 00000000..f825fdd5 --- /dev/null +++ b/scripts/train_tiny_zero_ppo_a100.sh @@ -0,0 +1,66 @@ +#this is training script for ppo algorithm + +/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.train_batch_size=64 \ + data.val_batch_size=320 \ + data.max_prompt_length=256 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$BASE_MODEL \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ + critic.optim.lr=1e-5 \ + critic.model.path=$BASE_MODEL \ + critic.ppo_micro_batch_size=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=['swanlab'] \ + +trainer.val_before_train=False \ + 
trainer.default_hdfs_dir=null \ + trainer.n_gpus_per_node=$N_GPUS \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.project_name=TinyZero \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log + + +# /home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ +# data.train_files=$DATA_DIR/train.parquet \ +# data.val_files=$DATA_DIR/test.parquet \ +# data.train_batch_size=32 \ +# data.val_batch_size=64 \ +# data.max_prompt_length=256 \ +# data.max_response_length=1024 \ +# actor_rollout_ref.model.path=$BASE_MODEL \ +# actor_rollout_ref.actor.optim.lr=1e-6 \ +# actor_rollout_ref.actor.ppo_mini_batch_size=16 \ +# actor_rollout_ref.actor.ppo_micro_batch_size=4 \ +# actor_rollout_ref.model.use_remove_padding=True \ +# actor_rollout_ref.actor.use_dynamic_bsz=True \ +# actor_rollout_ref.rollout.log_prob_micro_batch_size=2 \ +# actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ +# actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ +# actor_rollout_ref.ref.log_prob_micro_batch_size=2 \ +# critic.optim.lr=1e-5 \ +# critic.model.path=$BASE_MODEL \ +# critic.ppo_micro_batch_size=4 \ +# algorithm.kl_ctrl.kl_coef=0.001 \ +# trainer.logger=['swanlab'] \ +# +trainer.val_before_train=False \ +# trainer.default_hdfs_dir=null \ +# trainer.n_gpus_per_node=$N_GPUS \ +# trainer.nnodes=1 \ +# trainer.save_freq=10 \ +# trainer.test_freq=10 \ +# trainer.project_name=TinyZero \ +# trainer.experiment_name=$EXPERIMENT_NAME \ +# trainer.total_epochs=15 2>&1 | tee verl_demo.log \ No newline at end of file diff --git a/scripts/train_tiny_zero_h200_ppo.sh b/scripts/train_tiny_zero_ppo_h200.sh similarity index 100% rename from scripts/train_tiny_zero_h200_ppo.sh rename to scripts/train_tiny_zero_ppo_h200.sh diff --git a/sft_countdown-qwen3b.sh b/sft_countdown-qwen3b.sh new file mode 100644 index 00000000..b7daf70b --- /dev/null +++ b/sft_countdown-qwen3b.sh @@ -0,0 +1,27 @@ + +export N_GPUS=4 +# export CUDA_VISIBLE_DEVICES=0,1,2,6 +export CUDA_VISIBLE_DEVICES=2,3,5,6 +export LOCAL_RANK=0 +# ray stop --force && ray start --head --include-dashboard=True --dashboard-port=8263 + +export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" + +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export TRAIN_FILE_PATH=$DATA_DIR/CoT_train_lim2000.parquet +export TEST_FILE_PATH=$DATA_DIR/CoT_test_lim2000.parquet + +export PROJECT_NAME=sft +export EXPERIMENT_NAME=countdown-qwen2.5-3b-coldstart-SFT-lr1e-6 +export VLLM_ATTENTION_BACKEND=XFORMERS + +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +# export RESUME_ID=YOUREXPERIMENT_ID_HERE like lgmfk2t4ro1... 
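+# RESUME_ID above and RESUME_START_STEP below only matter when resuming a run:
+# the *_resume.sh launch scripts in this repo forward them as Hydra overrides,
+# roughly
+#   trainer.tracking.resume=True \
+#   trainer.tracking.resume_id=$RESUME_ID \
+#   trainer.tracking.start_step=$RESUME_START_STEP
+# Whether the SFT entry point also reads trainer.tracking is an assumption, so
+# they stay commented out for a fresh cold-start run.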
+# export RESUME_START_STEP=210 + +mkdir -p ./outputs/$EXPERIMENT_NAME +mkdir -p ./checkpoints/sft/$EXPERIMENT_NAME +# bash ./examples/sft/gsm8k/train_gsm8k-sft-peft.sh 4 ./save/$EXPERIMENT_NAME +echo "Output saved to ./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput"; \ +nohup bash ./scripts/sft_countdown_4a20.sh $N_GPUS ./checkpoints/sft/$EXPERIMENT_NAME &>"./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput" diff --git a/tests/model_download.py b/tests/model_download.py new file mode 100644 index 00000000..3c198931 --- /dev/null +++ b/tests/model_download.py @@ -0,0 +1,10 @@ +from modelscope import snapshot_download + +# 指定缓存目录 +cache_dir = "/NAS/chenfeng/models/Qwen/Qwen2.5-1.5B" + +# 下载模型(示例:下载 damo/nlp_structbert_backbone_base_std) +snapshot_download( + model_id="Qwen/Qwen2.5-1.5B", + cache_dir=cache_dir +) diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 00000000..d7a76523 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,131 @@ +""" +Preprocess dataset for countdown task - given a target number and N numbers, generate equations to reach target +""" + +import re +import os +from datasets import Dataset, load_dataset +from random import randint, seed, choice +from typing import List, Tuple +from tqdm import tqdm +from verl.utils.hdfs_io import copy, makedirs +import argparse + + +def gen_dataset( + num_samples: int, + num_operands: int = 6, + max_target: int = 1000, + min_number: int = 1, + max_number: int = 100, + operations: List[str] = ['+', '-', '*', '/'], + seed_value: int = 42, +) -> List[Tuple]: + """Generate dataset for countdown task. + + Args: + num_samples: Number of samples to generate + num_operands: Number of numbers provided in each sample + max_target: Maximum value for target number + min_number: Minimum value for provided numbers + max_number: Maximum value for provided numbers + operations: List of allowed operations + seed_value: Random seed for reproducibility + + Returns: + List of tuples containing (target, numbers, solution) + """ + seed(seed_value) + samples = [] + + for _ in tqdm(range(num_samples)): + # Generate random target + target = randint(1, max_target) + + # Generate random numbers + numbers = [randint(min_number, max_number) for _ in range(num_operands)] + + + samples.append((target, numbers)) + + return samples + +def make_prefix(dp, template_type): + target = dp['target'] + numbers = dp['nums'] + # NOTE: also need to change reward_score/countdown.py + if template_type == 'base': + """This works for any base model""" + prefix = f"""A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. +User: Using the numbers {numbers}, create an equation that equals {target}. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in tags. And return the final answer in tags, for example (1 + 2) / 3 . +Assistant: Let me solve this step by step. +""" + elif template_type == 'qwen-instruct': ### qwen-instruct model要用不同的prompt模板 + """This works for Qwen Instruct Models""" + prefix = f"""<|im_start|>system\nYou are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.<|im_end|>\n<|im_start|>user\n Using the numbers {numbers}, create an equation that equals {target}. 
You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in tags. And return the final answer in tags, for example (1 + 2) / 3 .<|im_end|>\n<|im_start|>assistant\nLet me solve this step by step.\n""" + return prefix + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + # parser.add_argument('--local_dir', default='~/data/countdown') + # parser.add_argument('--hdfs_dir', default=None) + parser.add_argument('--num_samples', type=int, default=100000) + parser.add_argument('--num_operands', type=int, default=6) + parser.add_argument('--max_target', type=int, default=1000) + parser.add_argument('--min_number', type=int, default=1) + parser.add_argument('--max_number', type=int, default=100) + parser.add_argument('--train_size', type=int, default=327680) + parser.add_argument('--test_size', type=int, default=1024) + parser.add_argument('--template_type', type=str, default='base') + + args = parser.parse_args() + + data_source = 'countdown' + TRAIN_SIZE = args.train_size + TEST_SIZE = args.test_size + + raw_dataset = load_dataset('Jiayi-Pan/Countdown-Tasks-3to4', split='train') + + assert len(raw_dataset) > TRAIN_SIZE + TEST_SIZE + train_dataset = raw_dataset.select(range(TRAIN_SIZE)) + test_dataset = raw_dataset.select(range(TRAIN_SIZE, TRAIN_SIZE + TEST_SIZE)) + + def make_map_fn(split): + def process_fn(example, idx): + question = make_prefix(example, template_type=args.template_type) + solution = { + "target": example['target'], + "numbers": example['nums'] + } + data = { + "data_source": data_source, + "prompt": [{ + "role": "user", + "content": question, + }], + "ability": "math", + "reward_model": { + "style": "rule", + "ground_truth": solution + }, + "extra_info": { + 'split': split, + 'index': idx, + } + } + return data + return process_fn + + train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True) + + local_dir = args.local_dir + hdfs_dir = args.hdfs_dir + + train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet')) + test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet')) + + if hdfs_dir is not None: + makedirs(hdfs_dir) + copy(src=local_dir, dst=hdfs_dir) diff --git a/tests/test_dataset_2.py b/tests/test_dataset_2.py new file mode 100644 index 00000000..841e1c96 --- /dev/null +++ b/tests/test_dataset_2.py @@ -0,0 +1,63 @@ + +# python ./tests/test_dataset_2.py +import numpy as np +import pandas as pd + +train_path = '/NAS/chenfeng/dataset/countdown/train.parquet' +test_path = '/NAS/chenfeng/dataset/countdown/test.parquet' + +def load_data(path): + return pd.read_parquet(path, engine='pyarrow') + + +train_data = load_data(train_path) +print(train_data.head()) + +test_data = load_data(test_path) +print(test_data.head()) + +print(f'------------------------------------------------------') +print(f"Train data shape: {train_data.shape}") +print(f"Test data shape: {test_data.shape}") + +print(f"Train data columns: {train_data.columns.tolist()}") +print(f"Test data columns: {test_data.columns.tolist()}") + +print(f'------------------------------------------------------') + +print(f"Train data sample: {train_data.iloc[0]}") + +print(f'train_data_describe:\n{train_data.describe()}') + +for col in train_data.columns: + try: + print(f'Unique values in "{col}" column ({train_data[col].nunique()}): {train_data[col].unique()}') + except Exception as e: + print(f'Error processing column "{col}": {e}') + 
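+# Expected shape of each row's prompt field, as built by the preprocessing
+# scripts (illustrative, abbreviated repr; after the parquet round-trip the
+# outer container may come back as a numpy array rather than a Python list):
+#
+#   train_data.loc[0, 'prompt']
+#   # -> [{'role': 'user', 'content': 'A conversation between User and Assistant. ...'}]
+#
+# The checks below assume exactly one message whose 'content' string mentions
+# the allowed operators "(+, -, *, /)".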
+print(f'------------------------------------------------------') + +for row in range(5): + print(f'prompt of sample {row}: {train_data.loc[row, "prompt"]}') + +flag = 0 +for row in range(len(train_data)): +# for row in range(5): + prompt_this = train_data.loc[row, 'prompt'] + if len(prompt_this) != 1: + print(f'Error in sample {row}: prompt is not a list or len={len(prompt_this)}, type={type(prompt_this)}') + flag += 1 + continue + prompt_str = prompt_this[0]['content'] + if not isinstance(prompt_str, str): + print(f'Error in sample {row}: prompt is not a string') + flag += 1 + continue + if r'(+, -, *, /)' not in prompt_str: + print(f'Error in sample {row}: {prompt_str}') + flag += 1 + +if flag == 0: + print(f'All prompts contain "(+, -, *, /)"') +else: + print(f'Found {flag} samples without "(+, -, *, /)" in prompt') \ No newline at end of file diff --git a/train_1.5b_ppo.sh b/train_1.5b_ppo.sh index 00e7069b..b4be14e5 100755 --- a/train_1.5b_ppo.sh +++ b/train_1.5b_ppo.sh @@ -1,15 +1,16 @@ -#!/bin/bash -# alias python='/home/weiji/anaconda3/envs/zero/bin/python' -# alias python3='/home/weiji/anaconda3/envs/zero/bin/python3' -# alias pip='/home/weiji/anaconda3/envs/zero/bin/pip' +# This one works for 1.5B PPO on 4A20 GPUs -export N_GPUS=2 -export CUDA_VISIBLE_DEVICES=2,3 +export N_GPUS=4 +export CUDA_VISIBLE_DEVICES=0,1,5,6 ray stop --force && ray start --head --include-dashboard=True -export BASE_MODEL="model/Qwen2.5-1.5B" -export DATA_DIR="data/countdown" -export ROLLOUT_TP_SIZE=2 -export EXPERIMENT_NAME=countdown-qwen2.5-1.5b +export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-1.5B/Qwen/Qwen2.5-1.5B" +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export ROLLOUT_TP_SIZE=4 +export EXPERIMENT_NAME=countdown-qwen2.5-1.5b-ppo export VLLM_ATTENTION_BACKEND=XFORMERS -bash ./scripts/train_tiny_zero_a100_ppo.sh \ No newline at end of file +export SWANLAB_API_KEY=YOUR_API_KEY +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# bash ./scripts/train_tiny_zero_a100_ppo.sh +nohup bash ./scripts/train_tiny_zero_4a20_1.5b_ppo.sh.sh &>"./outputs/ppo_$(date +'%y%m%d-%H%M%S').nohupoutput" diff --git a/train_3b_drgrpo.sh b/train_3b_drgrpo.sh new file mode 100644 index 00000000..be0ac5cd --- /dev/null +++ b/train_3b_drgrpo.sh @@ -0,0 +1,15 @@ +# This one works for 3B DR-GRPO on 2A20 GPUs + +export N_GPUS=2 +export CUDA_VISIBLE_DEVICES=0,1 +# ray stop --force && ray start --head --include-dashboard=True --dashboard-port=8263 +export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export ROLLOUT_TP_SIZE=2 +export EXPERIMENT_NAME=countdown-qwen2.5-3b-dr-grpo +export VLLM_ATTENTION_BACKEND=XFORMERS + +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +# bash ./scripts/train_tiny_zero_a100_drgrpo.sh +nohup bash ./scripts/train_tiny_zero_a100_drgrpo.sh &>./nohupoutput_drgrpo \ No newline at end of file diff --git a/train_3b_grpo.sh b/train_3b_grpo.sh index 93a5d836..585d25d2 100755 --- a/train_3b_grpo.sh +++ b/train_3b_grpo.sh @@ -1,15 +1,15 @@ -#!/bin/bash -# alias python='/home/weiji/anaconda3/envs/zero/bin/python' -# alias python3='/home/weiji/anaconda3/envs/zero/bin/python3' -# alias pip='/home/weiji/anaconda3/envs/zero/bin/pip' +# This one works for 3B GRPO on 2A20 GPUs export N_GPUS=2 -export CUDA_VISIBLE_DEVICES=2,3 -ray stop --force && ray start --head --include-dashboard=True -export BASE_MODEL="model/Qwen2.5-3B" -export DATA_DIR="data/countdown" +export CUDA_VISIBLE_DEVICES=3,4 +# ray stop --force && ray start --head 
--include-dashboard=True --dashboard-port=8263 +export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" +export DATA_DIR="/NAS/chenfeng/dataset/countdown" export ROLLOUT_TP_SIZE=2 -export EXPERIMENT_NAME=countdown-qwen2.5-3b-grpo +export EXPERIMENT_NAME=countdown-qwen2.5-3b-grpo-better-format export VLLM_ATTENTION_BACKEND=XFORMERS -bash ./scripts/train_tiny_zero_a100_grpo.sh \ No newline at end of file +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +# bash ./scripts/train_tiny_zero_a100_grpo.sh +nohup bash ./scripts/train_tiny_zero_a100_grpo.sh &>./nohupoutput_grpo diff --git a/train_3b_ppo.sh b/train_3b_ppo.sh index c4dceeb0..4e195e6a 100755 --- a/train_3b_ppo.sh +++ b/train_3b_ppo.sh @@ -1,15 +1,11 @@ -#!/bin/bash -# alias python='/home/weiji/anaconda3/envs/zero/bin/python' -# alias python3='/home/weiji/anaconda3/envs/zero/bin/python3' -# alias pip='/home/weiji/anaconda3/envs/zero/bin/pip' export N_GPUS=2 -export CUDA_VISIBLE_DEVICES=2,3 -ray stop --force && ray start --head --include-dashboard=True -export BASE_MODEL="model/Qwen2.5-3B" -export DATA_DIR="data/countdown" +export CUDA_VISIBLE_DEVICES=3,4 +# ray stop --force && ray start --head --include-dashboard=True +export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" +export DATA_DIR="/NAS/chenfeng/dataset/countdown" export ROLLOUT_TP_SIZE=2 -export EXPERIMENT_NAME=countdown-qwen2.5-3b +export EXPERIMENT_NAME=countdown-qwen2.5-3b-ppo export VLLM_ATTENTION_BACKEND=XFORMERS bash ./scripts/train_tiny_zero_a100_ppo.sh \ No newline at end of file diff --git a/train_coldstart_countdown_Drgrpo_3b.sh b/train_coldstart_countdown_Drgrpo_3b.sh new file mode 100644 index 00000000..028eb745 --- /dev/null +++ b/train_coldstart_countdown_Drgrpo_3b.sh @@ -0,0 +1,28 @@ +# try 3B-2048 Dr.GRPO on 4A20 GPUs + +export N_GPUS=4 +export CUDA_VISIBLE_DEVICES=2,3,4,6 +ray stop --force && ray start --head --include-dashboard=True --dashboard-port=8263 + +# from scratch +# export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" + +# coldstart +export BASE_MODEL="/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/sft/countdown-qwen2.5-3b-coldstart-SFT/global_step_61" + +# resume +# export BASE_MODEL=/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/TinyZero/countdown-qwen2.5-3b-dr-grpo/actor/global_step_90 + +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export ROLLOUT_TP_SIZE=4 +export EXPERIMENT_NAME=cntdn-qwen2.5-3b--COLDSTART_cntdn_1e-5-DrGRPO +export VLLM_ATTENTION_BACKEND=XFORMERS + +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +# bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh +# nohup bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh &>./nohupoutput +mkdir -p ./outputs/$EXPERIMENT_NAME +echo "\ +# Output saved to ./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput"; \ +nohup bash ./scripts/train_tiny_zero_drgrpo_4a20_tang3.sh &>"./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput" diff --git a/train_coldstart_countdown_dapo_3b.sh b/train_coldstart_countdown_dapo_3b.sh new file mode 100644 index 00000000..d56a4dd1 --- /dev/null +++ b/train_coldstart_countdown_dapo_3b.sh @@ -0,0 +1,116 @@ +# try 3B-2048 DAPO on 4A40 GPUs +# this is a training script for DAPO algorithm & length limit 2048 + +# similarity&differnce from dr-grpo: +# [v]token-mean +# [v]clip-higher +# [v]no-kl-loss +# [v]turn on norm_adv_by_std_in_grpo +# [v]no-kl-in-reward +# [v]overlong-buffer +# [x]dynamic-sampling + + +export N_GPUS=4 +export CUDA_VISIBLE_DEVICES=1,2,3,4 +ray stop --force && ray start --head --include-dashboard=True --dashboard-port=8263 + 
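+# Rough worked example of the overlong-buffer settings configured further down,
+# assuming a DAPO-style soft overlong penalty (the exact formula is an
+# assumption): with max_response_length = 2048 and overlong_buffer_len = 256,
+# responses up to 2048 - 256 = 1792 tokens are not penalized; a 1920-token
+# response sits 128 tokens into the buffer and loses roughly
+# 128 / 256 * penalty_factor = 0.5 from its score, while a response at the
+# 2048 cap loses the full penalty_factor = 1.0.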
+# train from scratch +# export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" + +# # coldstart +# export BASE_MODEL="/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/sft/countdown-qwen2.5-3b-coldstart-SFT/global_step_61" + +# resume +export BASE_MODEL=/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/TinyZero/cntdn-qwen2.5-3b--COLDSTART_cntdn_DAPO/actor/global_step_30 + +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export ROLLOUT_TP_SIZE=4 +export EXPERIMENT_NAME=cntdn-qwen2.5-3b--COLDSTART_cntdn_DAPO +export VLLM_ATTENTION_BACKEND=XFORMERS + +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +export ENABLE_RESUME=True +export RESUME_ID=YOUR_EXPERIMENT_ID +export RESUME_START_STEP=30 + +# params + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +loss_agg_mode="token-mean" + +max_prompt_length=$((256)) +max_response_length=$((1024 * 2)) # 2K +enable_overlong_buffer=True +overlong_buffer_len=256 # $((1024 * 1.5)) # 1.5K # maxlen = 2048 这个是buffer,而不是期望的长度 +overlong_penalty_factor=1.0 + +# bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh +# nohup bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh &>./nohupoutput +mkdir -p ./outputs/$EXPERIMENT_NAME +echo "\ +# Output saved to ./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput"; \ +nohup /home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \ + trainer.tracking.resume=$ENABLE_RESUME \ + trainer.tracking.resume_id=$RESUME_ID \ + trainer.tracking.start_step=$RESUME_START_STEP \ + \ + algorithm.adv_estimator=grpo \ + algorithm.norm_adv_by_std_in_grpo=True \ + algorithm.use_kl_in_reward=$use_kl_in_reward \ + algorithm.kl_ctrl.kl_coef=$kl_coef \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.train_batch_size=128 \ + data.val_batch_size=640 \ + data.max_prompt_length=$max_prompt_length \ + data.max_response_length=$max_response_length \ + actor_rollout_ref.model.path=$BASE_MODEL \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size=4 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \ + actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.loss_agg_mode=$loss_agg_mode \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.grad_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \ + actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \ + actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + trainer.critic_warmup=0 \ + trainer.logger=['swanlab'] \ + ++trainer.val_before_train=False \ + 
trainer.default_hdfs_dir=null \ + trainer.n_gpus_per_node=$N_GPUS \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.project_name=TinyZero \ + trainer.experiment_name=$EXPERIMENT_NAME \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log\ +&>"./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput" + diff --git a/train_drgrpo_3b_resume.sh b/train_drgrpo_3b_resume.sh new file mode 100644 index 00000000..c769fe15 --- /dev/null +++ b/train_drgrpo_3b_resume.sh @@ -0,0 +1,18 @@ + +export N_GPUS=2 +export CUDA_VISIBLE_DEVICES=0,1 +ray stop --force && ray start --head --include-dashboard=True --dashboard-port=8263 +# resume model path +export BASE_MODEL=/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/TinyZero/countdown-qwen2.5-3b-dr-grpo/actor/global_step_90 +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export ROLLOUT_TP_SIZE=2 +export EXPERIMENT_NAME=countdown-qwen2.5-3b-dr-grpo +export VLLM_ATTENTION_BACKEND=XFORMERS + +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +export RESUME_ID=YOUR_EXPERIMENT_ID_HERE like yg7az62y5... +export RESUME_START_STEP=96 + +# nohup bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh &>./nohupoutput +nohup bash ./scripts/train_tiny_zero_drgrpo_2a100_resume_tang3.sh &>"./outputs/drgrpo/$(date +'%y%m%d-%H%M%S').nohupoutput" diff --git a/train_grpo_3b_4a20.sh b/train_grpo_3b_4a20.sh new file mode 100644 index 00000000..727cbb3d --- /dev/null +++ b/train_grpo_3b_4a20.sh @@ -0,0 +1,27 @@ +# This one works for 3B GRPO on 4A20 GPUs + +export N_GPUS=4 +export CUDA_VISIBLE_DEVICES=2,3,4,6 +# ray stop --force && ray start --head --include-dashboard=True --dashboard-port=8263 + +# from scratch +export BASE_MODEL="/NAS/chenfeng/models/Qwen/Qwen2.5-3B" + +# coldstart +# export BASE_MODEL="/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/sft/countdown-qwen2.5-3b-coldstart-SFT/global_step_61" + +# resume +# export BASE_MODEL=/NAS/chenfeng/TinyZero-NOOOM-dr-grpo/checkpoints/TinyZero/countdown-qwen2.5-3b-dr-grpo/actor/global_step_90 + +export DATA_DIR="/NAS/chenfeng/dataset/countdown" +export ROLLOUT_TP_SIZE=4 +export EXPERIMENT_NAME=countdown-qwen2.5-3b--COLDSTART_countdown_1e-5--grpo +export VLLM_ATTENTION_BACKEND=XFORMERS + +export SWANLAB_API_KEY=YOUR_API_KEY_HERE + +# bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh +# nohup bash ./scripts/train_tiny_zero_a100_drgrpo_tang3.sh &>./nohupoutput +mkdir -p ./outputs/$EXPERIMENT_NAME +echo "\n# Output saved to ./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput"; \ +nohup bash ./scripts/train_tiny_zero_grpo_4a20_tang3.sh &>"./outputs/$EXPERIMENT_NAME/$(date +'%y%m%d-%H%M%S').nohupoutput" diff --git a/verl/trainer/config/generation.yaml b/verl/trainer/config/generation.yaml index ed805a8c..77a23ffc 100644 --- a/verl/trainer/config/generation.yaml +++ b/verl/trainer/config/generation.yaml @@ -2,6 +2,11 @@ trainer: nnodes: 1 n_gpus_per_node: 8 + tracking: + resume: False + resume_id: null + start_step: 0 # start step for resuming training, default is 0 + data: path: ~/data/rlhf/math/test.parquet prompt_key: prompt diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 6ae26851..1898a736 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -128,6 +128,7 @@ algorithm: gamma: 1.0 lam: 1.0 adv_estimator: gae + norm_adv_by_std_in_grpo: True # whether to normalize advantage by std in grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed diff --git 
a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index 8429aeea..bf3fb726 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -26,10 +26,13 @@ actor_rollout_ref: ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} grad_clip: 1.0 clip_ratio: 0.2 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 entropy_coeff: 0.001 use_kl_loss: False # True for GRPO kl_loss_coef: 0.001 # for grpo kl_loss_type: low_var_kl # for grpo + loss_agg_mode: token-mean # token-mean for dapo, seq-mean-token-sum-norm for dr-grpo ppo_epochs: 1 shuffle: False ulysses_sequence_parallel_size: 1 # sp size @@ -74,7 +77,7 @@ actor_rollout_ref: load_format: dummy_dtensor tensor_model_parallel_size: 2 max_num_batched_tokens: 8192 - max_num_seqs: 1024 + max_num_seqs: 2048 log_prob_micro_batch_size: 128 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} @@ -134,15 +137,28 @@ reward_model: ulysses_sequence_parallel_size: 1 # sp size use_dynamic_bsz: ${critic.use_dynamic_bsz} forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + overlong_buffer: + enable: False # We try to avoid forgetting to set enable + len: 0 + penalty_factor: 0.0 + log: False + algorithm: gamma: 1.0 lam: 1.0 adv_estimator: gae + use_kl_in_reward: False + norm_adv_by_std_in_grpo: True # whether to normalize advantage by std in grpo kl_penalty: kl # how to estimate kl divergence kl_ctrl: type: fixed kl_coef: 0.001 + filter_groups: # for dapo + enable: False + metric: score # acc # score / seq_reward / seq_final_reward / ... + max_num_gen_batches: 10 # Non-positive values mean no upper limit + trainer: total_epochs: 30 @@ -157,3 +173,9 @@ trainer: critic_warmup: 0 default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + tracking: + resume: False + resume_id: null + start_step: 0 # start step for resuming training, default is 0 + diff --git a/verl/trainer/config/ppo_trainer_paramtest.yaml b/verl/trainer/config/ppo_trainer_paramtest.yaml new file mode 100644 index 00000000..5179dc39 --- /dev/null +++ b/verl/trainer/config/ppo_trainer_paramtest.yaml @@ -0,0 +1,167 @@ +data: + tokenizer: null + train_files: ~/data/rlhf/gsm8k/train.parquet + val_files: ~/data/rlhf/gsm8k/test.parquet + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 512 + train_batch_size: 1024 + val_batch_size: 1312 + return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs + return_raw_chat: False + +actor_rollout_ref: + hybrid_engine: True + model: + path: ~/models/deepseek-llm-7b-chat + external_lib: null + override_config: { } + enable_gradient_checkpointing: False + use_remove_padding: False + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 256 + ppo_micro_batch_size: 64 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.5 + entropy_coeff: 0.001 + use_kl_loss: False # True for GRPO + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + loss_agg_mode: token-mean # token-mean for grpo, seq-mean-token-sum-norm for dr-grpo + ppo_epochs: 1 + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + grad_offload: False + optimizer_offload: False + fsdp_size: -1 + ref: + fsdp_config: + param_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + log_prob_micro_batch_size: 128 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.5 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: 128 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + # for hf rollout + do_sample: True + # number of responses (i.e. num sample times) + n: 1 # > 1 for grpo + +critic: + strategy: fsdp + optim: + lr: 1e-5 + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + model: + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${actor_rollout_ref.model.path} + override_config: { } + external_lib: ${actor_rollout_ref.model.external_lib} + enable_gradient_checkpointing: False + use_remove_padding: False + fsdp_config: + param_offload: False + grad_offload: False + optimizer_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + fsdp_size: -1 + ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} + ppo_micro_batch_size: 64 + forward_micro_batch_size: ${critic.ppo_micro_batch_size} + use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 + forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: 1 # sp size + ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} + shuffle: ${actor_rollout_ref.actor.shuffle} + grad_clip: 1.0 + cliprange_value: 0.5 + +reward_model: + enable: False + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + use_remove_padding: False + fsdp_config: + min_num_params: 0 + param_offload: False + micro_batch_size: 64 + max_length: null + ulysses_sequence_parallel_size: 1 # sp size + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + +algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: gae + norm_adv_by_std_in_grpo: True # 
whether to normalize advantage by std in grpo + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.001 + +trainer: + total_epochs: 30 + total_training_steps: null + project_name: verl_examples + experiment_name: gsm8k + logger: [ 'console', 'wandb' ] + nnodes: 1 + n_gpus_per_node: 8 + save_freq: -1 + test_freq: -1 + critic_warmup: 0 + default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name} + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + tracking: + resume: False + resume_id: null + start_step: 0 # start step for resuming training, default is 0 + diff --git a/verl/trainer/config/sft_trainer.yaml b/verl/trainer/config/sft_trainer.yaml index 1bf7b6ec..70b49bc9 100644 --- a/verl/trainer/config/sft_trainer.yaml +++ b/verl/trainer/config/sft_trainer.yaml @@ -5,8 +5,10 @@ data: val_files: ~/data/gsm8k/test.parquet prompt_key: question response_key: answer - max_length: 1024 - truncation: error + prompt_dict_keys: null + response_dict_keys: null + max_length: 2048 + truncation: pass balance_dp_token: False chat_template: null model: @@ -35,4 +37,6 @@ trainer: total_epochs: 4 logger: ['console'] seed: 1 - + validate_every_n_steps: 10 + val_before_training: True + total_training_steps: null \ No newline at end of file diff --git a/verl/trainer/fsdp_sft_trainer.py b/verl/trainer/fsdp_sft_trainer.py index bb876a73..90bdc8cb 100644 --- a/verl/trainer/fsdp_sft_trainer.py +++ b/verl/trainer/fsdp_sft_trainer.py @@ -1,3 +1,4 @@ +# original file # Copyright 2024 Bytedance Ltd. and/or its affiliates # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -44,6 +45,8 @@ import verl.utils.hdfs_io as hdfs_io from verl.utils.debug import log_gpu_memory_usage +from tqdm import tqdm + logger = logging.getLogger(__file__) logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN')) @@ -201,19 +204,19 @@ def _build_model_optimizer(self): log_gpu_memory_usage('After initialize optimizer', logger=logger) - steps_per_epoch = len(self.train_dataloader) - total_steps = steps_per_epoch * self.config.trainer.total_epochs + self.steps_per_epoch = len(self.train_dataloader) + self.total_steps = self.steps_per_epoch * self.config.trainer.total_epochs if self.device_mesh.get_rank() == 0: print( - f'Number of steps/epoch {steps_per_epoch}, number of epochs {self.config.trainer.total_epochs}, total number of steps {total_steps}' + f'Number of steps/epoch {self.steps_per_epoch}, number of epochs {self.config.trainer.total_epochs}, total number of steps {self.total_steps}' ) - num_warmup_steps = int(total_steps * self.config.optim.warmup_steps_ratio) + num_warmup_steps = int(self.total_steps * self.config.optim.warmup_steps_ratio) self.lr_scheduler = get_cosine_schedule_with_warmup(optimizer=self.optimizer, num_warmup_steps=num_warmup_steps, - num_training_steps=total_steps) + num_training_steps=self.total_steps) def _compute_loss(self, batch): loss_mask = batch.pop('loss_mask')[:, :-1].reshape(-1).cuda() @@ -283,7 +286,7 @@ def training_step(self, batch: TensorDict): step_loss = torch.tensor(step_loss).cuda() torch.distributed.all_reduce(step_loss, op=torch.distributed.ReduceOp.AVG) - return {'train/loss': step_loss.detach().item(), 'train/lr(1e-3)': lr * 1e3} + return {'train/loss': step_loss.detach().item(), 'train/lr': lr} def validation_step(self, batch: TensorDict): self.fsdp_model.eval() @@ -323,29 +326,75 @@ def fit(self): # TODO (zhangchi.usc1992) add back checkpoint manager. Currently, it blocks when uploading to hdfs. 
So very slow. + # compute the total training steps. + # the total training steps in SFT is mainly for early exit + self.total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs + + if self.config.trainer.total_training_steps is not None: + self.total_training_steps = self.config.trainer.total_training_steps + + self.total_samples = len(self.train_dataset) + + print(f'Total training samples: {len(self.train_dataset)}') + print(f'Total training steps: {self.total_training_steps}') + for epoch in range(self.config.trainer.total_epochs): self.train_sampler.set_epoch(epoch=epoch) - for data in self.train_dataloader: + + self.steps_per_epoch = len(self.train_dataloader) + + for data in tqdm(self.train_dataloader, + total=self.steps_per_epoch, + desc=f"Epoch {epoch+1}/{self.config.trainer.total_epochs}"): + # for data in self.train_dataloader: data = TensorDict(data, batch_size=self.config.data.train_batch_size).cuda() metric = self.training_step(data) if rank == 0: tracking.log(data=metric, step=global_step) global_step += 1 - - # validation - val_losses = [] - for data in self.val_dataloader: - data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda() - val_loss = self.validation_step(data) - val_losses.append(val_loss) - if rank == 0: - val_loss = torch.mean(torch.stack(val_losses)) - metric = {'val/loss': val_loss.detach().item()} - tracking.log(data=metric, step=global_step) - torch.distributed.barrier() - - # save checkpoint - self.save_checkpoint(step=global_step) + + # validate + if self.config.trainer.validate_every_n_steps > 0 and (global_step - 1) % self.config.trainer.validate_every_n_steps == 0: + print(f'## valid & save checkpoint at global step {global_step}') + # validation + val_losses = [] + for data in self.val_dataloader: + data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda() + val_loss = self.validation_step(data) + val_losses.append(val_loss) + print(f'## data: {data["input_ids"].shape}') + print(f'## val loss: {val_loss.item()}') + print(f'## val losses: {val_losses}') + if rank == 0: + val_loss = torch.mean(torch.stack(val_losses)) + metric = {'val/loss': val_loss.detach().item()} + tracking.log(data=metric, step=global_step) + torch.distributed.barrier() + + # save checkpoint + self.save_checkpoint(step=global_step) + + if global_step >= self.total_training_steps: + print(f'Reached total training steps {self.total_training_steps}, exiting...') + # validation + val_losses = [] + for data in self.val_dataloader: + data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda() + val_loss = self.validation_step(data) + val_losses.append(val_loss) + print(f'## data: {data["input_ids"].shape}') + print(f'## val loss: {val_loss.item()}') + print(f'## val losses: {val_losses}') + if rank == 0: + val_loss = torch.mean(torch.stack(val_losses)) + metric = {'val/loss': val_loss.detach().item()} + tracking.log(data=metric, step=global_step) + torch.distributed.barrier() + + # save checkpoint + self.save_checkpoint(step=global_step) + + return from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer diff --git a/verl/trainer/fsdp_sft_trainer_better_metrics.py b/verl/trainer/fsdp_sft_trainer_better_metrics.py new file mode 100644 index 00000000..b35a961c --- /dev/null +++ b/verl/trainer/fsdp_sft_trainer_better_metrics.py @@ -0,0 +1,482 @@ +# original file +# Copyright 2024 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A lightweight one-file FSDP SFT Trainer +TODO(zhangchi.usc1992) +- Add calculation of mfu +- Add validation +""" + +import os + +os.environ['NCCL_DEBUG'] = 'WARN' +os.environ['TOKENIZERS_PARALLELISM'] = 'true' + +import logging +import re +import torch +import torch.distributed +from torch import nn, optim +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision, ShardingStrategy, CPUOffload +from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, AutoConfig +from verl.utils.torch_functional import get_cosine_schedule_with_warmup +from tensordict import TensorDict +from torch.utils.data import DataLoader, DistributedSampler + +from verl.utils.fsdp_utils import get_fsdp_wrap_policy, init_fn, get_init_weight_context_manager +from verl.utils.dataset import SFTDataset +from verl.utils.fs import copy_local_path_from_hdfs +from verl.utils.tracking import Tracking + +from torch.distributed.device_mesh import DeviceMesh + +import verl.utils.hdfs_io as hdfs_io +from verl.utils.debug import log_gpu_memory_usage + +from tqdm import tqdm + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN')) + + +def extract_step(path): + match = re.search(r'global_step_(\d+)', path) + if match: + return int(match.group(1)) + return None + + +class FSDPSFTTrainer(object): + + def __init__(self, config, device_mesh: DeviceMesh): + self.config = config + self.device_mesh = device_mesh + # build tokenizer first + local_model_path = copy_local_path_from_hdfs(src=self.config.model.partial_pretrain, verbose=True) + from verl.utils import hf_tokenizer + self.tokenizer = hf_tokenizer(local_model_path, trust_remote_code=self.config.model.trust_remote_code) + if self.config.data.chat_template is not None: + raise ValueError('Apply Chat template from config is not supported yet.') + + # normalize dp size + self._normalize_config_bsz() + + self._build_dataloader() + # build model + self._build_model_optimizer() + + # TODO: add checkpoint manager + if self.device_mesh.get_rank() == 0: + print(self.config) + + def _normalize_config_bsz(self): + dp_size = self.device_mesh.size() + if self.device_mesh.get_rank() == 0: + print(f'Normalize batch size by dp {dp_size}') + + assert self.config.data.train_batch_size % dp_size == 0 + assert self.config.data.micro_batch_size % dp_size == 0 + + self.config.data.train_batch_size //= dp_size + self.config.data.micro_batch_size //= dp_size + + def _build_dataloader(self): + config = self.config + # build dataset + self.train_dataset = SFTDataset(parquet_files=config.data.train_files, + tokenizer=self.tokenizer, + prompt_key=config.data.prompt_key, + prompt_dict_keys=config.data.get('prompt_dict_keys', None), + response_key=config.data.response_key, + response_dict_keys=config.data.get('response_dict_keys', None), + max_length=config.data.max_length, + truncation=config.data.truncation) + self.val_dataset = 
SFTDataset(parquet_files=config.data.val_files, + tokenizer=self.tokenizer, + prompt_key=config.data.prompt_key, + prompt_dict_keys=config.data.get('prompt_dict_keys', None), + response_key=config.data.response_key, + response_dict_keys=config.data.get('response_dict_keys', None), + max_length=config.data.max_length, + truncation=config.data.truncation) + + # build dataloader + rank = self.device_mesh.get_rank() + world_size = self.device_mesh.size() + self.train_sampler = DistributedSampler(self.train_dataset, + shuffle=True, + num_replicas=world_size, + rank=rank, + drop_last=True) + self.train_dataloader = DataLoader(dataset=self.train_dataset, + batch_size=config.data.train_batch_size, + sampler=self.train_sampler, + num_workers=8, + pin_memory=True, + drop_last=True) + + self.val_sampler = DistributedSampler(self.val_dataset, + shuffle=True, + num_replicas=world_size, + rank=rank, + drop_last=True) + self.val_dataloader = DataLoader(dataset=self.val_dataset, + batch_size=config.data.micro_batch_size, + sampler=self.val_sampler, + num_workers=8, + pin_memory=True, + drop_last=True) + + def _build_model_optimizer(self): + # TODO (zhangchi.usc1992): + # 1. support pretrain from random weights + # 2. support init directly from sharded weights + local_model_path = copy_local_path_from_hdfs(src=self.config.model.partial_pretrain, verbose=True) + + if self.config.model.get('external_lib', None) is not None: + # This is used to import external_lib into the huggingface systems + import importlib + importlib.import_module(self.config.model.external_lib) + + log_gpu_memory_usage('Before model allocation', logger=logger) + + trust_remote_code = self.config.model.trust_remote_code + # load config first + config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=trust_remote_code) + + # This may be very large + init_context = get_init_weight_context_manager(use_meta_tensor=not config.tie_word_embeddings) + + with init_context(): + self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(local_model_path, + config=config, + torch_dtype=torch.float32, + attn_implementation='flash_attention_2', + trust_remote_code=trust_remote_code) + + if self.config.model.enable_gradient_checkpointing: + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) + + log_gpu_memory_usage('After model allocation', logger=logger) + + mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + buffer_dtype=torch.float32) + + auto_wrap_policy = get_fsdp_wrap_policy(self.model, config=self.config.model.fsdp_config.wrap_policy) + if self.device_mesh.get_rank() == 0: + print(auto_wrap_policy) + + if not self.config.model.fsdp_config.cpu_offload: + cpu_offload = None + else: + cpu_offload = CPUOffload(offload_params=self.config.model.fsdp_config.offload_params) + + self.fsdp_model = FSDP(module=self.model, + auto_wrap_policy=auto_wrap_policy, + param_init_fn=init_fn, + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + device_mesh=self.device_mesh, + sync_module_states=True, + device_id=torch.cuda.current_device(), + cpu_offload=cpu_offload, + use_orig_params=False) + + log_gpu_memory_usage('After FSDP wrapping', logger=logger) + + self.optimizer = optim.AdamW(self.fsdp_model.parameters(), + lr=self.config.optim.lr, + betas=self.config.optim.betas, + weight_decay=self.config.optim.weight_decay) + + log_gpu_memory_usage('After initialize optimizer', logger=logger) + + self.steps_per_epoch = 
len(self.train_dataloader)
+        self.total_steps = self.steps_per_epoch * self.config.trainer.total_epochs
+
+        if self.device_mesh.get_rank() == 0:
+            print(
+                f'Number of steps/epoch {self.steps_per_epoch}, number of epochs {self.config.trainer.total_epochs}, total number of steps {self.total_steps}'
+            )
+
+        num_warmup_steps = int(self.total_steps * self.config.optim.warmup_steps_ratio)
+
+        self.lr_scheduler = get_cosine_schedule_with_warmup(optimizer=self.optimizer,
+                                                            num_warmup_steps=num_warmup_steps,
+                                                            num_training_steps=self.total_steps)
+
+    def _compute_loss(self, batch):
+        loss_mask = batch.pop('loss_mask')[:, :-1].reshape(-1).cuda()
+        labels = batch['input_ids'][:, 1:].cuda()
+
+        with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+            output = self.fsdp_model(
+                input_ids=batch['input_ids'],
+                attention_mask=batch['attention_mask'],
+                position_ids=batch['position_ids'],
+                use_cache=False
+            )  # prevent the model from thinking it is generating
+
+            logits = output.logits
+
+            # compute the cross-entropy loss
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels.contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss(reduction='none')
+            shift_logits = shift_logits.view(-1, self.model.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+            loss = loss * loss_mask
+
+            valid_token_this_rank = torch.sum(loss_mask)
+
+            if self.config.data.balance_dp_token:
+                torch.distributed.all_reduce(valid_token_this_rank)  # becomes total valid tokens in all ranks
+                dp_size = torch.distributed.get_world_size()
+            else:
+                dp_size = 1
+
+            loss = torch.sum(loss) / valid_token_this_rank * dp_size  # possible bugs here for dp
+            return loss
+
+    # Added: compute a score metric by greedy-decoding the model's response
+    # and comparing it against the reference response.
+    def _compute_metrics(self, batch):
+        with torch.no_grad():
+            input_ids_full = batch['input_ids'].cuda()
+            attention_mask = batch['attention_mask'].cuda()
+            position_ids = batch['position_ids'].cuda()
+
+            # forward pass to obtain logits, then greedy-decode the model's response
+            with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
+                output = self.fsdp_model(input_ids=input_ids_full,
+                                         attention_mask=attention_mask,
+                                         position_ids=position_ids,
+                                         use_cache=False)
+            logits = output.logits
+            generated_ids = torch.argmax(logits, dim=-1)
+
+            # Only keep the response part (assumes the prompt length is known).
+            # Adjust how prompt and response are separated to match your data layout.
+            prompt_lengths = batch.get('prompt_lengths',
+                                       [input_ids_full.shape[1] // 2] * input_ids_full.shape[0])  # example: assume the prompt takes half the sequence
+
+            scores = []
+            for i in range(input_ids_full.shape[0]):
+                # decode the generated response
+                response_tokens = generated_ids[i, prompt_lengths[i]:]
+                response_str = self.tokenizer.decode(
+                    response_tokens,
+                    skip_special_tokens=True
+                )
+
+                # decode the reference (ground-truth) response
+                true_response_tokens = input_ids_full[i, prompt_lengths[i]:]
+                true_response_str = self.tokenizer.decode(
+                    true_response_tokens,
+                    skip_special_tokens=True
+                )
+
+                # compute the score (compute_score must be adapted to accept a reference response)
+                score = self.compute_score(response_str, true_response_str)
+                scores.append(score)
+
+            avg_score = torch.tensor(sum(scores) / len(scores), device=input_ids_full.device)
+            return {
+                "score": avg_score
+            }
+
+    def training_step(self, batch: TensorDict):
+        self.fsdp_model.train()
+
+        log_gpu_memory_usage('Before optimizer zero_grad', logger=logger)
+
+        self.optimizer.zero_grad()
+
+        log_gpu_memory_usage('After optimizer zero_grad', logger=logger)
+
+        micro_batches = batch.split(self.config.data.micro_batch_size)
+        n_micro_batches = len(micro_batches)
+        step_loss = 0
+        for micro_batch in micro_batches:
+            loss = self._compute_loss(batch=micro_batch) / n_micro_batches
+            loss.backward()
+            step_loss += loss.item()
+
+        self.fsdp_model.clip_grad_norm_(max_norm=self.config.optim.clip_grad)
+
+        log_gpu_memory_usage('Before optimizer step', logger=logger)
+
+        self.optimizer.step()
+
+        log_gpu_memory_usage('After optimizer step', logger=logger)
+
+        self.lr_scheduler.step()
+
+        # reduce loss across dp ranks
+        lr = 
self.lr_scheduler.get_last_lr()[0] + + log_gpu_memory_usage('After offload weights', logger=logger) + + step_loss = torch.tensor(step_loss).cuda() + torch.distributed.all_reduce(step_loss, op=torch.distributed.ReduceOp.AVG) + + # compute metrics + metrics = self._compute_metrics(batch) + score = metrics.get('score', torch.tensor(0.0, device=batch['input_ids'].device)) + torch.distributed.all_reduce(score, op=torch.distributed.ReduceOp.AVG) + return { + 'train/loss': step_loss.detach().item(), + 'train/score': score.detach().item(), + 'train/lr': lr + } + + def validation_step(self, batch: TensorDict): + self.fsdp_model.eval() + with torch.no_grad(): + loss = self._compute_loss(batch) + torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.AVG) + + # compute metrics + metrics = self._compute_metrics(batch) + score = metrics.get('score', torch.tensor(0.0, device=batch['input_ids'].device)) + torch.distributed.all_reduce(score, op=torch.distributed.ReduceOp.AVG) + return { + "loss": loss, + "score": score.detach().item() + } + + def save_checkpoint(self, step): + # save checkpoint + from torch.distributed.fsdp import FullStateDictConfig, StateDictType + cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(self.fsdp_model, StateDictType.FULL_STATE_DICT, cfg): + state_dict = self.fsdp_model.state_dict() + + path = os.path.join(self.config.trainer.default_local_dir, f'global_step_{step}') + # save huggingface model + if self.device_mesh.get_rank() == 0: + os.makedirs(path, exist_ok=True) + self.model.save_pretrained(path, state_dict=state_dict) + self.tokenizer.save_pretrained(path) + if self.config.trainer.default_hdfs_dir: + hdfs_io.makedirs(self.config.trainer.default_hdfs_dir, exist_ok=True) + hdfs_io.copy(src=path, dst=self.config.trainer.default_hdfs_dir, dirs_exist_ok=True) + torch.distributed.barrier() + + def fit(self): + rank = self.device_mesh.get_rank() + + # TODO: add a unified tracking + if rank == 0: + tracking = Tracking(project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + default_backend=self.config.trainer.logger) + + global_step = 0 + + # TODO (zhangchi.usc1992) add back checkpoint manager. Currently, it blocks when uploading to hdfs. So very slow. + + # compute the total training steps. 
+ # the total training steps in SFT is mainly for early exit + self.total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs + + if self.config.trainer.total_training_steps is not None: + self.total_training_steps = self.config.trainer.total_training_steps + + self.total_samples = len(self.train_dataset) + + print(f'Total training samples: {len(self.train_dataset)}') + print(f'Total training steps: {self.total_training_steps}') + + for epoch in range(self.config.trainer.total_epochs): + self.train_sampler.set_epoch(epoch=epoch) + + self.steps_per_epoch = len(self.train_dataloader) + + for data in tqdm(self.train_dataloader, + total=self.steps_per_epoch, + desc=f"Epoch {epoch+1}/{self.config.trainer.total_epochs}"): + # for data in self.train_dataloader: + data = TensorDict(data, batch_size=self.config.data.train_batch_size).cuda() + metric = self.training_step(data) + if rank == 0: + tracking.log(data=metric, step=global_step) + global_step += 1 + + # validate + if self.config.trainer.validate_every_n_steps > 0 and (global_step - 1) % self.config.trainer.validate_every_n_steps == 0: + print(f'## valid & save checkpoint at global step {global_step}') + # validation + val_losses = [] + val_scores = [] + for val_data in self.val_dataloader: + val_data = TensorDict(val_data, batch_size=self.config.val_data.micro_batch_size).cuda() + metrics = self.validation_step(val_data) + val_loss = metrics['loss'] + val_score = metrics['score'] + val_losses.append(val_loss) + val_scores.append(val_score) + # print(f'## val_data: {val_data["input_ids"].shape}') + # print(f'## val loss: {val_loss.item()}') + # print(f'## val losses: {val_losses}') + if rank == 0: + val_loss = torch.mean(torch.stack(val_losses)) + val_score = torch.mean(torch.tensor(val_scores, device=val_loss.device)) + print(f'## val loss: {val_loss.item()}, val score: {val_score.item()}') + metric = {'val/loss': val_loss.detach().item(), 'val/score': val_score.detach().item()} + tracking.log(data=metric, step=global_step) + # print(f'## logged validation metrics: {metric}') + # metric = {'val/loss': val_loss.detach().item()} + # tracking.log(data=metric, step=global_step) + torch.distributed.barrier() + + # save checkpoint + self.save_checkpoint(step=global_step) + + if global_step >= self.total_training_steps: + print(f'Reached total training steps {self.total_training_steps}, exiting...') + # validation + val_losses = [] + for data in self.val_dataloader: + data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda() + val_loss = self.validation_step(data) + val_losses.append(val_loss) + print(f'## data: {data["input_ids"].shape}') + print(f'## val loss: {val_loss.item()}') + print(f'## val losses: {val_losses}') + if rank == 0: + val_loss = torch.mean(torch.stack(val_losses)) + metric = {'val/loss': val_loss.detach().item()} + tracking.log(data=metric, step=global_step) + torch.distributed.barrier() + + # save checkpoint + self.save_checkpoint(step=global_step) + + return + + +from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer +import hydra + +from torch.distributed.device_mesh import init_device_mesh + +from verl.utils.distributed import initialize_global_process_group + + +@hydra.main(config_path='config', config_name='sft_trainer', version_base=None) +def main(config): + local_rank, rank, world_size = initialize_global_process_group() + + device_mesh = init_device_mesh(device_type='cuda', mesh_shape=(world_size,), mesh_dim_names=('dp',)) + trainer = FSDPSFTTrainer(config=config, 
device_mesh=device_mesh) + trainer.fit() + + +if __name__ == '__main__': + main() diff --git a/verl/trainer/fsdp_sft_trainer_verl03.py b/verl/trainer/fsdp_sft_trainer_verl03.py new file mode 100644 index 00000000..73956511 --- /dev/null +++ b/verl/trainer/fsdp_sft_trainer_verl03.py @@ -0,0 +1,532 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A lightweight one-file FSDP SFT Trainer +TODO(zhangchi.usc1992) +- Add calculation of mfu +- Add validation +""" + +import os + +os.environ['NCCL_DEBUG'] = 'WARN' +os.environ['TOKENIZERS_PARALLELISM'] = 'true' + +import logging +import re +from contextlib import nullcontext +import torch +import torch.distributed +from torch import nn, optim +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision, ShardingStrategy, CPUOffload +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedModel, AutoConfig +from verl.utils.torch_functional import get_cosine_schedule_with_warmup +from tensordict import TensorDict +from torch.utils.data import DataLoader, DistributedSampler +from flash_attn.bert_padding import pad_input, unpad_input, rearrange, index_first_axis + +from verl.utils.fsdp_utils import get_fsdp_wrap_policy, init_fn, get_init_weight_context_manager +from verl.utils.dataset import SFTDataset +from verl.utils.fs import copy_local_path_from_hdfs +from verl.utils.tracking import Tracking +from verl.utils.ulysses import get_ulysses_sequence_parallel_world_size, set_ulysses_sequence_parallel_group +from torch.distributed.device_mesh import DeviceMesh + +import verl.utils.hdfs_io as hdfs_io +from verl.utils.debug import log_gpu_memory_usage +from peft import LoraConfig, TaskType, get_peft_model + +from verl.workers.sharding_manager import FSDPUlyssesShardingManager +from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad +# from verl import DataProto + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv('VERL_SFT_LOGGING_LEVEL', 'WARN')) + + +def extract_step(path): + match = re.search(r'global_step_(\d+)', path) + if match: + return int(match.group(1)) + return None + + +def convert_to_regular_types(obj): + """Convert Hydra configs and other special types to regular Python types.""" + from omegaconf import ListConfig, DictConfig + if isinstance(obj, (ListConfig, DictConfig)): + return {k: convert_to_regular_types(v) for k, v in obj.items()} if isinstance(obj, DictConfig) else list(obj) + elif isinstance(obj, (list, tuple)): + return [convert_to_regular_types(x) for x in obj] + elif isinstance(obj, dict): + return {k: convert_to_regular_types(v) for k, v in obj.items()} + return obj + + +class FSDPSFTTrainer(object): + + def __init__(self, config, device_mesh: DeviceMesh, ulysses_device_mesh: DeviceMesh): + self.config = config + self.device_mesh = device_mesh + self.ulysses_device_mesh = ulysses_device_mesh + self.sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh) + 
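+        # NOTE: the Ulysses sharding manager provides the context used by
+        # _compute_loss_and_backward below: ranks within the same sequence-parallel
+        # group receive the same batch, inputs are padded and sliced across the group
+        # (ulysses_pad_and_slice_inputs), and per-token losses are gathered back
+        # (gather_outpus_and_unpad) before the loss mask is applied.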
# build tokenizer first + local_model_path = copy_local_path_from_hdfs(src=self.config.model.partial_pretrain, verbose=True) + from verl.utils import hf_tokenizer + self.tokenizer = hf_tokenizer(local_model_path, trust_remote_code=self.config.model.trust_remote_code) + if self.config.data.chat_template is not None: + raise ValueError('Apply Chat template from config is not supported yet.') + + # normalize dp size + self._normalize_config_bsz() + + # Set sequence parallel size + self.config.ulysses_sequence_parallel_size = getattr(self.config, 'ulysses_sequence_parallel_size', 1) + self.use_remove_padding = getattr(self.config, 'use_remove_padding', False) + if self.device_mesh.get_rank() == 0: + print(f'Using sequence parallel size: {self.config.ulysses_sequence_parallel_size}') + print(f'Using remove padding: {self.use_remove_padding}') + + self._build_dataloader() + # build model + self._build_model_optimizer() + + # TODO: add checkpoint manager + if self.device_mesh.get_rank() == 0: + print(self.config) + + def _normalize_config_bsz(self): + dp_size = self.device_mesh.size(0) if not self.ulysses_device_mesh else self.ulysses_device_mesh.size(0) + if self.device_mesh.get_rank() == 0: + print(f'Normalize batch size by dp {dp_size}') + + assert self.config.data.train_batch_size % dp_size == 0, f"Global batch size {self.config.data.train_batch_size} is not divisible by dp size {dp_size}" + + self.config.data.train_batch_size //= dp_size + + assert self.config.data.train_batch_size % self.config.data.micro_batch_size_per_gpu == 0 + + def _build_dataloader(self): + config = self.config + # build dataset + self.train_dataset = SFTDataset(parquet_files=config.data.train_files, + tokenizer=self.tokenizer, + prompt_key=config.data.prompt_key, + prompt_dict_keys=config.data.get('prompt_dict_keys', None), + response_key=config.data.response_key, + response_dict_keys=config.data.get('response_dict_keys', None), + max_length=config.data.max_length, + truncation=config.data.truncation) + self.val_dataset = SFTDataset(parquet_files=config.data.val_files, + tokenizer=self.tokenizer, + prompt_key=config.data.prompt_key, + prompt_dict_keys=config.data.get('prompt_dict_keys', None), + response_key=config.data.response_key, + response_dict_keys=config.data.get('response_dict_keys', None), + max_length=config.data.max_length, + truncation=config.data.truncation) + + # build dataloader + # Use data parallel rank and size instead of global rank and world size + + # If doing SP, we need to use the local rank and size + if self.config.ulysses_sequence_parallel_size > 1: + rank = self.ulysses_device_mesh.get_local_rank('dp') + world_size = self.ulysses_device_mesh.size(0) + if self.ulysses_device_mesh.get_rank() == 0: + print(f'Using SP rank {rank} and size {world_size} for data distribution') + print(f'Each SP rank gets different data, but the same data WITHIN the same rank') + else: + rank = self.device_mesh.get_rank() + world_size = self.device_mesh.size() + if self.device_mesh.get_rank() == 0: + print(f'Using FSDP rank {rank} and size {world_size} for data distribution') + + self.train_sampler = DistributedSampler(self.train_dataset, + shuffle=True, + num_replicas=world_size, + rank=rank, + drop_last=True) + self.train_dataloader = DataLoader(dataset=self.train_dataset, + batch_size=config.data.train_batch_size, + sampler=self.train_sampler, + num_workers=8, + pin_memory=True, + drop_last=True) + + self.val_sampler = DistributedSampler(self.val_dataset, + shuffle=False, + num_replicas=world_size, + rank=rank, 
+ drop_last=True) + self.val_dataloader = DataLoader(dataset=self.val_dataset, + batch_size=config.data.micro_batch_size_per_gpu, + sampler=self.val_sampler, + num_workers=8, + pin_memory=True, + drop_last=True) + + def _build_model_optimizer(self): + # TODO (zhangchi.usc1992): + # 1. support pretrain from random weights + # 2. support init directly from sharded weights + local_model_path = copy_local_path_from_hdfs(src=self.config.model.partial_pretrain, verbose=True) + + if self.config.model.get('external_lib', None) is not None: + # This is used to import external_lib into the huggingface systems + import importlib + importlib.import_module(self.config.model.external_lib) + + log_gpu_memory_usage('Before model allocation', logger=logger) + + trust_remote_code = self.config.model.trust_remote_code + # load config first + config = AutoConfig.from_pretrained(local_model_path, trust_remote_code=trust_remote_code) + if self.config.ulysses_sequence_parallel_size > 1: + assert self.use_remove_padding, "Sequence parallel is only supported when remove_padding is enabled" + + # This may be very large + init_context = get_init_weight_context_manager(use_meta_tensor=not config.tie_word_embeddings, + mesh=self.device_mesh) + + with init_context(): + self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(local_model_path, + config=config, + torch_dtype=torch.float32, + attn_implementation='flash_attention_2', + trust_remote_code=trust_remote_code) + + if self.use_remove_padding or self.config.ulysses_sequence_parallel_size > 1: + from verl.models.transformers.monkey_patch import apply_monkey_patch + apply_monkey_patch(model=self.model, ulysses_sp_size=self.config.ulysses_sequence_parallel_size) + + # Apply Liger kernel if use_liger is enabled + if self.config.model.get('use_liger', False): + from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance + _apply_liger_kernel_to_instance(model=self.model) + + if self.config.model.get('lora_rank', 0) > 0: + self.model.enable_input_require_grads() + # Convert config to regular Python types before creating PEFT model + lora_config = { + 'task_type': TaskType.CAUSAL_LM, + 'r': self.config.model.lora_rank, + 'lora_alpha': self.config.model.lora_alpha, + 'target_modules': convert_to_regular_types(self.config.model.target_modules), + 'bias': "none" + } + self.model = get_peft_model(self.model, LoraConfig(**lora_config)) + + if self.config.model.enable_gradient_checkpointing: + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={'use_reentrant': False}) + + log_gpu_memory_usage('After model allocation', logger=logger) + + mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + buffer_dtype=torch.float32) + + auto_wrap_policy = get_fsdp_wrap_policy(self.model, + config=self.config.model.fsdp_config.wrap_policy, + is_lora=self.config.model.get('lora_rank', 0) > 0) + if self.device_mesh.get_rank() == 0: + print(auto_wrap_policy) + + if not self.config.model.fsdp_config.cpu_offload: + cpu_offload = None + else: + cpu_offload = CPUOffload(offload_params=self.config.model.fsdp_config.offload_params) + + self.fsdp_model = FSDP(module=self.model, + auto_wrap_policy=auto_wrap_policy, + param_init_fn=init_fn, + sharding_strategy=ShardingStrategy.FULL_SHARD, + mixed_precision=mixed_precision, + device_mesh=self.device_mesh, + sync_module_states=True, + device_id=torch.cuda.current_device(), + cpu_offload=cpu_offload, + use_orig_params=False) + + log_gpu_memory_usage('After FSDP 
wrapping', logger=logger) + + self.optimizer = optim.AdamW(self.fsdp_model.parameters(), + lr=self.config.optim.lr, + betas=self.config.optim.betas, + weight_decay=self.config.optim.weight_decay) + + log_gpu_memory_usage('After initialize optimizer', logger=logger) + + self.steps_per_epoch = len(self.train_dataloader) + self.total_steps = self.steps_per_epoch * self.config.trainer.total_epochs + + if self.device_mesh.get_rank() == 0: + print( + f'Number of steps/epoch {self.steps_per_epoch}, number of epochs {self.config.trainer.total_epochs}, total number of steps {self.total_steps}' + ) + + num_warmup_steps = int(self.total_steps * self.config.optim.warmup_steps_ratio) + + self.lr_scheduler = get_cosine_schedule_with_warmup(optimizer=self.optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=self.total_steps) + + def _compute_loss_and_backward(self, batch, do_backward=True): + """Compute loss with optional sequence parallelism and remove padding features""" + use_sp = self.use_remove_padding and self.config.ulysses_sequence_parallel_size > 1 + + # Move inputs to GPU and prepare loss mask + input_ids = batch['input_ids'].cuda() + attention_mask = batch['attention_mask'].cuda() + position_ids = batch['position_ids'].cuda() + loss_mask = batch.pop('loss_mask')[:, :-1].reshape(-1).cuda() + loss_fct = nn.CrossEntropyLoss(reduction='none') + + # Context manager for sequence parallel if needed + context = self.sharding_manager if use_sp else nullcontext() + with context: + with torch.autocast(device_type='cuda', dtype=torch.bfloat16): + if not use_sp: + # Standard forward pass without sequence parallel + labels = input_ids[:, 1:].contiguous() + output = self.fsdp_model(input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False) + logits = output.logits + + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels.contiguous() + # Flatten the tokens + shift_logits = shift_logits.view(-1, self.model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + loss = loss * loss_mask.to(loss.device) + else: + # IMPORTANT: We have a big assumption here, so we can shard the SAME sequence across SP ranks + # i.e., each GPU has <1 sequence, and each SP group has 1 sequence + # 1. All SP ranks will receive the *SAME* batch + # 2. Different SP groups will receive *DIFFERENT* batches + # This is implemented by the DistributedSampler + + batch_size, seqlen = input_ids.shape + # Remove padding + input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), + attention_mask) # input_ids_rmpad (total_nnz, ...) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + # Unpad position_ids to align rotary + position_ids_rmpad = index_first_axis(rearrange(position_ids.unsqueeze(-1), "b s ... 
-> (b s) ..."), + indices).transpose(0, 1) + + # Pad and slice inputs for sequence parallelism + input_ids_rmpad_sliced, position_ids_rmpad_padded, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, position_ids_rmpad, sp_size=get_ulysses_sequence_parallel_world_size()) + # For computing loss + input_ids_rmpad_rolled = torch.roll(input_ids_rmpad, shifts=-1, dims=1) # (1, total_nnz) + input_ids_rmpad_rolled, _, _ = ulysses_pad_and_slice_inputs( + input_ids_rmpad_rolled, None, get_ulysses_sequence_parallel_world_size()) + input_ids_rmpad_rolled = input_ids_rmpad_rolled.squeeze(0) # ((total_nnz / sp) + pad) + + # Forward pass + output = self.fsdp_model( + input_ids=input_ids_rmpad_sliced, + attention_mask=None, # Not needed with flash attention varlen + position_ids=position_ids_rmpad_padded, + use_cache=False) + + # Compute loss locally then aggregate + logits_rmpad = output.logits.squeeze(0) + input_ids_rmpad_rolled = input_ids_rmpad_rolled.to(logits_rmpad.device) + loss = loss_fct(logits_rmpad, input_ids_rmpad_rolled) + # Gather and unpad for sequence parallelism + loss = gather_outpus_and_unpad(loss, gather_dim=0, unpad_dim=0, padding_size=pad_size) + + # This is the loss collected from all ulysses ranks + full_loss = pad_input(hidden_states=loss.unsqueeze(-1), + indices=indices, + batch=batch_size, + seqlen=seqlen) + full_loss = full_loss.squeeze(-1)[:, :-1] # Remove last token's loss + full_loss = full_loss.reshape(-1) + loss_mask = loss_mask.to(full_loss.device) + loss = full_loss * loss_mask + + valid_token_this_rank = torch.sum(loss_mask) + + if self.config.data.balance_dp_token: + torch.distributed.all_reduce(valid_token_this_rank) + dp_size = self.ulysses_device_mesh.size('dp') if use_sp else torch.distributed.get_world_size() + else: + dp_size = 1 + + loss = torch.sum(loss) / (valid_token_this_rank + 1e-8) * dp_size + + if do_backward: + loss.backward() + return loss + + def training_step(self, batch: TensorDict): + self.fsdp_model.train() + + log_gpu_memory_usage('Before optimizer zero_grad', logger=logger) + + self.optimizer.zero_grad() + + log_gpu_memory_usage('After optimizer zero_grad', logger=logger) + + micro_batches = batch.split(self.config.data.micro_batch_size_per_gpu) + n_micro_batches = len(micro_batches) + step_loss = 0 + for micro_batch in micro_batches: + loss = self._compute_loss_and_backward(batch=micro_batch) / n_micro_batches + step_loss += loss.item() + + self.fsdp_model.clip_grad_norm_(max_norm=self.config.optim.clip_grad) + + log_gpu_memory_usage('Before optimizer step', logger=logger) + + self.optimizer.step() + + log_gpu_memory_usage('After optimizer step', logger=logger) + + self.lr_scheduler.step() + + # reduce loss across dp ranks + lr = self.lr_scheduler.get_last_lr()[0] + + log_gpu_memory_usage('After offload weights', logger=logger) + + step_loss = torch.tensor(step_loss).cuda() + torch.distributed.all_reduce(step_loss, op=torch.distributed.ReduceOp.AVG) + return {'train/loss': step_loss.detach().item(), 'train/lr(1e-3)': lr * 1e3} + + def validation_step(self, batch: TensorDict): + self.fsdp_model.eval() + with torch.no_grad(): + loss = self._compute_loss_and_backward(batch, do_backward=False) + torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.AVG) + return loss + + def save_checkpoint(self, step): + # save checkpoint + from torch.distributed.fsdp import FullStateDictConfig, StateDictType + cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(self.fsdp_model, 
StateDictType.FULL_STATE_DICT, cfg): + state_dict = self.fsdp_model.state_dict() + + path = os.path.join(self.config.trainer.default_local_dir, f'global_step_{step}') + # save huggingface model + if self.device_mesh.get_rank() == 0: + os.makedirs(path, exist_ok=True) + self.model.save_pretrained(path, state_dict=state_dict) + self.tokenizer.save_pretrained(path) + if self.config.trainer.default_hdfs_dir: + hdfs_io.makedirs(self.config.trainer.default_hdfs_dir, exist_ok=True) + hdfs_io.copy(src=path, dst=self.config.trainer.default_hdfs_dir, dirs_exist_ok=True) + torch.distributed.barrier() + + def fit(self): + rank = self.device_mesh.get_rank() + + # TODO: add a unified tracking + if rank == 0: + tracking = Tracking(project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + default_backend=self.config.trainer.logger) + + global_step = 0 + # compute the total training steps. + # the total training steps in SFT is mainly for early exit + total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs + + if self.config.trainer.total_training_steps is not None: + total_training_steps = self.config.trainer.total_training_steps + + self.total_training_steps = total_training_steps + print(f'Total training steps: {self.total_training_steps}') + + # TODO (zhangchi.usc1992) add back checkpoint manager. Currently, it blocks when uploading to hdfs. So very slow. + + for epoch in range(self.config.trainer.total_epochs): + self.train_sampler.set_epoch(epoch=epoch) + for data in tqdm(self.train_dataloader, + total=self.steps_per_epoch, + desc=f"Epoch {epoch+1}/{self.config.trainer.total_epochs}"): + global_step += 1 + data = TensorDict(data, batch_size=self.config.data.train_batch_size).cuda() + metric = self.training_step(data) + if rank == 0: + tracking.log(data=metric, step=global_step) + + # for early exit validation + if global_step >= self.total_training_steps: + # Perform final validation + val_losses = [] + for val_data in self.val_dataloader: + val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda() + val_loss = self.validation_step(val_data) + val_losses.append(val_loss) + if rank == 0: + avg_val_loss = torch.mean(torch.stack(val_losses)) + metric = {'val/loss': avg_val_loss.detach().item()} + tracking.log(data=metric, step=global_step) + torch.distributed.barrier() + + # Save final checkpoint + self.save_checkpoint(step=global_step) + return + + # validation + val_losses = [] + for data in self.val_dataloader: + data = TensorDict(data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda() + val_loss = self.validation_step(data) + val_losses.append(val_loss) + if rank == 0: + val_loss = torch.mean(torch.stack(val_losses)) + metric = {'val/loss': val_loss.detach().item()} + tracking.log(data=metric, step=global_step) + torch.distributed.barrier() + + # save checkpoint + self.save_checkpoint(step=global_step) + + +from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer +import hydra + +from torch.distributed.device_mesh import init_device_mesh + +from verl.utils.distributed import initialize_global_process_group + + +@hydra.main(config_path='config', config_name='sft_trainer', version_base=None) +def main(config): + local_rank, rank, world_size = initialize_global_process_group() + + device_mesh = init_device_mesh(device_type='cuda', mesh_shape=(world_size,), mesh_dim_names=('fsdp',)) + dp_size = world_size // config.ulysses_sequence_parallel_size + ulysses_device_mesh = 
init_device_mesh(device_type='cuda', + mesh_shape=(dp_size, config.ulysses_sequence_parallel_size), + mesh_dim_names=('dp', 'sp')) + trainer = FSDPSFTTrainer(config=config, device_mesh=device_mesh, ulysses_device_mesh=ulysses_device_mesh) + trainer.fit() + + +if __name__ == '__main__': + main() diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index 29e5e83d..d64e4c7e 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -17,7 +17,7 @@ from verl import DataProto import torch -from verl.utils.reward_score import gsm8k, math, multiply, countdown +from verl.utils.reward_score import gsm8k, math, multiply, countdown_better_format as countdown from verl.trainer.ppo.ray_trainer import RayPPOTrainer @@ -38,9 +38,17 @@ class RewardManager(): """The reward manager. """ - def __init__(self, tokenizer, num_examine) -> None: + def __init__(self, + tokenizer, + num_examine, + config=None, + max_resp_len=None) -> None: self.tokenizer = tokenizer self.num_examine = num_examine # the number of batches of decoded responses to print to the console + self.config = config + if self.config and self.config.overlong_buffer.enable: + assert max_resp_len is not None, f"max_resp_len must be provided if self.config.overlong_buffer is enabled, but got None" + self.max_resp_len = max_resp_len def __call__(self, data: DataProto): """We will expand this function gradually based on the available datasets""" @@ -75,17 +83,37 @@ def __call__(self, data: DataProto): # select rm_score data_source = data_item.non_tensor_batch['data_source'] + + + # compute score compute_score_fn = _select_rm_score_fn(data_source) - score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth) - reward_tensor[i, valid_response_length - 1] = score + score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth) + # 还有一点不同,他这里用的是concat (prompt + response) 的方式 + # 但是DAPO里面只用response来计算score + + reward = score + + # 更改:for dapo, we add overlong penalty factor + if self.config and self.config.overlong_buffer.enable: + overlong_buffer_len = int(self.config.overlong_buffer.len) + expected_len = self.max_resp_len - overlong_buffer_len + exceed_len = valid_response_length - expected_len + overlong_penalty_factor = self.config.overlong_buffer.penalty_factor + overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0) + reward += overlong_reward + + reward_tensor[i, valid_response_length - 1] = reward if data_source not in already_print_data_sources: already_print_data_sources[data_source] = 0 if already_print_data_sources[data_source] < self.num_examine: already_print_data_sources[data_source] += 1 + print(f'[sequence {i}]') print(sequences_str) + print("[ground_truth]", ground_truth) + print(f"[score]", score) return reward_tensor @@ -98,7 +126,7 @@ def __call__(self, data: DataProto): def main(config): if not ray.is_initialized(): # this is for local ray cluster - ray.init(runtime_env={'env_vars': {'TOKENIZERS_PARALLELISM': 'true', 'NCCL_DEBUG': 'WARN'}}) + ray.init(runtime_env={'env_vars': {'TOKENIZERS_PARALLELISM': 'true', 'NCCL_DEBUG': 'WARN'}}, _temp_dir="/NAS/chenfeng/ray/tmp/") ray.get(main_task.remote(config)) @@ -171,10 +199,19 @@ def main_task(config): role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) mapping[Role.RewardModel] = global_pool_id - reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0) + reward_fn = RewardManager( + tokenizer=tokenizer, + num_examine=0, + config=config.reward_model, + 
max_resp_len=config.data.max_response_length + ) # Note that we always use function-based RM for validation - val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1) + # val_reward_fn就不用限制长度了 + val_reward_fn = RewardManager( + tokenizer=tokenizer, + num_examine=1 + ) resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) diff --git a/verl/trainer/main_ppo_paramtest.py b/verl/trainer/main_ppo_paramtest.py new file mode 100644 index 00000000..7fd2abc8 --- /dev/null +++ b/verl/trainer/main_ppo_paramtest.py @@ -0,0 +1,193 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Note that we don't combine the main with ray_trainer as ray_trainer is used by other main. +""" + +from verl import DataProto +import torch +from verl.utils.reward_score import gsm8k, math, multiply, countdown_better_format as countdown +from verl.trainer.ppo.ray_trainer import RayPPOTrainer + + +def _select_rm_score_fn(data_source): + if data_source == 'openai/gsm8k': + return gsm8k.compute_score + elif data_source == 'lighteval/MATH': + return math.compute_score + elif "multiply" in data_source or "arithmetic" in data_source: + return multiply.compute_score + elif "countdown" in data_source: + return countdown.compute_score + else: + raise NotImplementedError + + +class RewardManager(): + """The reward manager. + """ + + def __init__(self, tokenizer, num_examine) -> None: + self.tokenizer = tokenizer + self.num_examine = num_examine # the number of batches of decoded responses to print to the console + + def __call__(self, data: DataProto): + """We will expand this function gradually based on the available datasets""" + + # If there is rm score, we directly return rm score. 
Otherwise, we compute via rm_score_fn + if 'rm_scores' in data.batch.keys(): + return data.batch['rm_scores'] + + reward_tensor = torch.zeros_like(data.batch['responses'], dtype=torch.float32) + + already_print_data_sources = {} + + for i in range(len(data)): + data_item = data[i] # DataProtoItem + + prompt_ids = data_item.batch['prompts'] + + prompt_length = prompt_ids.shape[-1] + + valid_prompt_length = data_item.batch['attention_mask'][:prompt_length].sum() + valid_prompt_ids = prompt_ids[-valid_prompt_length:] + + response_ids = data_item.batch['responses'] + valid_response_length = data_item.batch['attention_mask'][prompt_length:].sum() + valid_response_ids = response_ids[:valid_response_length] + + # decode + sequences = torch.cat((valid_prompt_ids, valid_response_ids)) + sequences_str = self.tokenizer.decode(sequences) + + ground_truth = data_item.non_tensor_batch['reward_model']['ground_truth'] + + # select rm_score + data_source = data_item.non_tensor_batch['data_source'] + compute_score_fn = _select_rm_score_fn(data_source) + + score = compute_score_fn(solution_str=sequences_str, ground_truth=ground_truth) + reward_tensor[i, valid_response_length - 1] = score + + if data_source not in already_print_data_sources: + already_print_data_sources[data_source] = 0 + + if already_print_data_sources[data_source] < self.num_examine: + already_print_data_sources[data_source] += 1 + print(sequences_str) + + return reward_tensor + + +import ray +import hydra + + +@hydra.main(config_path='config', config_name='ppo_trainer_paramtest', version_base=None) +def main(config): + if not ray.is_initialized(): + # this is for local ray cluster + ray.init(runtime_env={'env_vars': {'TOKENIZERS_PARALLELISM': 'true', 'NCCL_DEBUG': 'WARN'}}, _temp_dir="/NAS/chenfeng/ray/tmp/") + + ray.get(main_task.remote(config)) + + +@ray.remote +def main_task(config): + from verl.utils.fs import copy_local_path_from_hdfs + from transformers import AutoTokenizer + + # print initial config + from pprint import pprint + from omegaconf import OmegaConf + pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values + OmegaConf.resolve(config) + + # download the checkpoint from hdfs + local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_tokenizer + tokenizer = hf_tokenizer(local_path) + + # define worker classes + if config.actor_rollout_ref.actor.strategy == 'fsdp': + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker + from verl.single_controller.ray import RayWorkerGroup + ray_worker_group_cls = RayWorkerGroup + + elif config.actor_rollout_ref.actor.strategy == 'megatron': + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + ray_worker_group_cls = NVMegatronRayWorkerGroup + + else: + raise NotImplementedError + + from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role + + role_worker_mapping = { + Role.ActorRollout: ray.remote(ActorRolloutRefWorker), + Role.Critic: ray.remote(CriticWorker), + Role.RefPolicy: ray.remote(ActorRolloutRefWorker) + } + + global_pool_id = 'global_pool' + resource_pool_spec = { + global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes, + } + mapping = { + Role.ActorRollout: global_pool_id, 
+ Role.Critic: global_pool_id, + Role.RefPolicy: global_pool_id, + } + + # we should adopt a multi-source reward function here + # - for rule-based rm, we directly call a reward score + # - for model-based rm, we call a model + # - for code related prompt, we send to a sandbox if there are test cases + # - finally, we combine all the rewards together + # - The reward type depends on the tag of the data + if config.reward_model.enable: + if config.reward_model.strategy == 'fsdp': + from verl.workers.fsdp_workers import RewardModelWorker + elif config.reward_model.strategy == 'megatron': + from verl.workers.megatron_workers import RewardModelWorker + else: + raise NotImplementedError + role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker) + mapping[Role.RewardModel] = global_pool_id + + reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0) + + # Note that we always use function-based RM for validation + val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1) + + resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) + + trainer = RayPPOTrainer(config=config, + tokenizer=tokenizer, + role_worker_mapping=role_worker_mapping, + resource_pool_manager=resource_pool_manager, + ray_worker_group_cls=ray_worker_group_cls, + reward_fn=reward_fn, + val_reward_fn=val_reward_fn) + trainer.init_workers() + trainer.fit() + + +if __name__ == '__main__': + main() diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py index cf4ce7fe..ebee04e9 100644 --- a/verl/trainer/ppo/core_algos.py +++ b/verl/trainer/ppo/core_algos.py @@ -108,10 +108,13 @@ def compute_gae_advantage_return(token_level_rewards: torch.Tensor, values: torc # NOTE(sgm): this implementation only consider outcome supervision, where the reward is a scalar. + def compute_grpo_outcome_advantage(token_level_rewards: torch.Tensor, eos_mask: torch.Tensor, index: torch.Tensor, - epsilon: float = 1e-6): + epsilon: float = 1e-6, + norm_adv_by_std_in_grpo: bool = True, + ): """ Compute advantage for GRPO, operating only on Outcome reward (with only one scalar reward for each response). @@ -149,7 +152,11 @@ def compute_grpo_outcome_advantage(token_level_rewards: torch.Tensor, else: raise ValueError(f"no score in prompt index: {idx}") for i in range(bsz): - scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon) + ## 更改的地方 + if norm_adv_by_std_in_grpo: + scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + epsilon) + else: + scores[i] = scores[i] - id2mean[index[i]] scores = scores.unsqueeze(-1).tile([1, response_length]) * eos_mask return scores, scores @@ -159,39 +166,122 @@ def compute_rewards(token_level_scores, old_log_prob, ref_log_prob, kl_ratio): kl = old_log_prob - ref_log_prob return token_level_scores - kl * kl_ratio +## 更改的部分 + +def compute_policy_loss( + old_log_prob, + log_prob, + advantages, + response_mask, + cliprange=None, + cliprange_low=None, + cliprange_high=None, + clip_ratio_c=3.0, + loss_agg_mode: str = "token-mean", +): + """ + Compute the clipped policy objective and related metrics for PPO. 
-def compute_policy_loss(old_log_prob, log_prob, advantages, eos_mask, cliprange): - """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122 + Adapted from + https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122 Args: - old_log_prob: `(torch.Tensor)` - shape: (bs, response_length) - log_prob: `(torch.Tensor)` - shape: (bs, response_length) - advantages: `(torch.Tensor)` - shape: (bs, response_length) - eos_mask: `(torch.Tensor)` - shape: (bs, response_length) - cliprange: (float) - The clip range used in PPO. See https://arxiv.org/abs/1707.06347 - - Returns: - pg_loss: `a scalar torch.Tensor` - policy gradient loss computed via PPO - pg_clipfrac: (float) - a float number indicating the fraction of policy gradient loss being clipped - + old_log_prob (torch.Tensor): + Log-probabilities of actions under the old policy, shape (batch_size, response_length). + log_prob (torch.Tensor): + Log-probabilities of actions under the current policy, shape (batch_size, response_length). + advantages (torch.Tensor): + Advantage estimates for each action, shape (batch_size, response_length). + response_mask (torch.Tensor): + Mask indicating which tokens to include in the loss, shape (batch_size, response_length). + cliprange (float, optional): + Clipping parameter ε for standard PPO. See https://arxiv.org/abs/1707.06347. + Defaults to None (must be provided). + cliprange_low (float, optional): + Lower clip range for dual-clip PPO. Defaults to same as `cliprange`. + cliprange_high (float, optional): + Upper clip range for dual-clip PPO. Defaults to same as `cliprange`. + clip_ratio_c (float, optional): + Lower bound of the ratio for dual-clip PPO. See https://arxiv.org/pdf/1912.09729. + Defaults to 3.0. + loss_agg_mode (str, optional): + Aggregation mode for `agg_loss`. Defaults to "token-mean". """ + assert clip_ratio_c > 1.0, ( + "The lower bound of the clip_ratio_c for dual-clip PPO should be greater than 1.0," + + f" but get the value: {clip_ratio_c}." 
+ ) + negative_approx_kl = log_prob - old_log_prob + # Clamp negative_approx_kl for stability + negative_approx_kl = torch.clamp(negative_approx_kl, min=-20.0, max=20.0) ratio = torch.exp(negative_approx_kl) - ppo_kl = verl_F.masked_mean(-negative_approx_kl, eos_mask) - - pg_losses = -advantages * ratio - pg_losses2 = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange) - - pg_loss = verl_F.masked_mean(torch.max(pg_losses, pg_losses2), eos_mask) - pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses2, pg_losses).float(), eos_mask) - return pg_loss, pg_clipfrac, ppo_kl + ppo_kl = verl_F.masked_mean(-negative_approx_kl, response_mask) + + pg_losses1 = -advantages * ratio + if cliprange_low is None: + cliprange_low = cliprange + if cliprange_high is None: + cliprange_high = cliprange + pg_losses2 = -advantages * torch.clamp( + ratio, 1 - cliprange_low, 1 + cliprange_high + ) # - clip(ratio, 1-cliprange, 1+cliprange) * A + clip_pg_losses1 = torch.maximum( + pg_losses1, pg_losses2 + ) # max(-ratio * A, -clip(ratio, 1-cliprange, 1+cliprange) * A) + pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses2, pg_losses1).float(), response_mask) + + pg_losses3 = -advantages * clip_ratio_c + clip_pg_losses2 = torch.min(pg_losses3, clip_pg_losses1) + pg_clipfrac_lower = verl_F.masked_mean( + torch.gt(clip_pg_losses1, pg_losses3) * (advantages < 0).float(), response_mask + ) + + pg_losses = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1) + pg_loss = agg_loss(loss_mat=pg_losses, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) + + return pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower + +# def compute_policy_loss( +# old_log_prob, +# log_prob, +# advantages, +# eos_mask, # response_mask +# cliprange, +# loss_agg_mode: str = "token-mean", +# ): +# """Adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/ppo_trainer.py#L1122 + +# Args: +# old_log_prob: `(torch.Tensor)` +# shape: (bs, response_length) +# log_prob: `(torch.Tensor)` +# shape: (bs, response_length) +# advantages: `(torch.Tensor)` +# shape: (bs, response_length) +# eos_mask: `(torch.Tensor)` +# shape: (bs, response_length) +# cliprange: (float) +# The clip range used in PPO. See https://arxiv.org/abs/1707.06347 + +# Returns: +# pg_loss: `a scalar torch.Tensor` +# policy gradient loss computed via PPO +# pg_clipfrac: (float) +# a float number indicating the fraction of policy gradient loss being clipped + +# """ +# negative_approx_kl = log_prob - old_log_prob +# ratio = torch.exp(negative_approx_kl) +# ppo_kl = verl_F.masked_mean(-negative_approx_kl, eos_mask) + +# pg_losses = -advantages * ratio +# pg_losses2 = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange) + +# pg_loss = agg_loss(loss_mat=pg_losses, loss_mask=eos_mask, loss_agg_mode=loss_agg_mode) +# # pg_loss = verl_F.masked_mean(torch.max(pg_losses, pg_losses2), eos_mask) +# pg_clipfrac = verl_F.masked_mean(torch.gt(pg_losses2, pg_losses).float(), eos_mask) +# return pg_loss, pg_clipfrac, ppo_kl def compute_entropy_loss(logits, eos_mask): @@ -272,3 +362,41 @@ def kl_penalty(logprob: torch.FloatTensor, ref_logprob: torch.FloatTensor, kl_pe raise NotImplementedError raise NotImplementedError + +## 更改的部分(添加) + +def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str): + """ + Aggregate the loss matrix into a scalar. 
+ + Args: + loss_mat: `(torch.Tensor)`: + shape: (bs, response_length) + loss_mask: `(torch.Tensor)`: + shape: (bs, response_length) + loss_agg_mode: (str) choices: + method to aggregate the loss matrix into a scalar. + Returns: + loss: `a scalar torch.Tensor` + aggregated loss + """ + if loss_agg_mode == "token-mean": + loss = verl_F.masked_mean(loss_mat, loss_mask) + elif loss_agg_mode == "seq-mean-token-sum": + seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum + loss = torch.mean(seq_losses) # seq-mean + elif loss_agg_mode == "seq-mean-token-mean": + seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean + loss = torch.mean(seq_losses) # seq-mean + elif loss_agg_mode == "seq-mean-token-sum-norm": + seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) + loss = torch.sum(seq_losses) / loss_mask.shape[-1] # The divisor + # (loss_mask.shape[-1]) should ideally be constant + # throughout training to well-replicate the DrGRPO paper. + # TODO: Perhaps add user-defined normalizer argument to + # agg_loss to ensure divisor stays constant throughout. + else: + raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}") + + return loss + diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index e3ec83a7..7b1ddae7 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -23,6 +23,7 @@ from enum import Enum from pprint import pprint from typing import Type, Dict +from typing import Optional, Type import numpy as np from codetiming import Timer @@ -113,7 +114,16 @@ def apply_kl_penalty(data: DataProto, kl_ctrl: core_algos.AdaptiveKLController, return data, metrics -def compute_advantage(data: DataProto, adv_estimator, gamma=1.0, lam=1.0, num_repeat=1): +## 更改过后的原来的compute_advantage函数 (verl_v02) +def compute_advantage( + data: DataProto, + adv_estimator, + gamma=1.0, + lam=1.0, + num_repeat=1, + norm_adv_by_std_in_grpo=True, + config = None # currently not used, but can be used to pass additional config + ): # prepare response group # TODO: add other ways to estimate advantages if adv_estimator == 'gae': @@ -137,9 +147,12 @@ def compute_advantage(data: DataProto, adv_estimator, gamma=1.0, lam=1.0, num_re response_length = responses.size(-1) attention_mask = data.batch['attention_mask'] response_mask = attention_mask[:, -response_length:] - advantages, returns = core_algos.compute_grpo_outcome_advantage(token_level_rewards=token_level_rewards, - eos_mask=response_mask, - index=index) + advantages, returns = core_algos.compute_grpo_outcome_advantage( + token_level_rewards=token_level_rewards, + eos_mask=response_mask, + index=index, + norm_adv_by_std_in_grpo = norm_adv_by_std_in_grpo + ) data.batch['advantages'] = advantages data.batch['returns'] = returns else: @@ -556,9 +569,11 @@ def fit(self): logger = Tracking(project_name=self.config.trainer.project_name, experiment_name=self.config.trainer.experiment_name, default_backend=self.config.trainer.logger, + resume=self.config.trainer.tracking.get('resume', False), + resume_id=self.config.trainer.tracking.get('resume_id', None), config=OmegaConf.to_container(self.config, resolve=True)) - self.global_steps = 0 + self.global_steps = self.config.trainer.tracking.get('start_step', 0) # perform validation before training # currently, we only support validation using the reward_function. 
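The hunks above thread a `norm_adv_by_std_in_grpo` flag from `config.algorithm` through `compute_advantage` into `compute_grpo_outcome_advantage`. A minimal standalone sketch of what that flag toggles (group-mean baseline with or without per-group std scaling); the function and variable names below are illustrative only, not part of the patch:

```
import torch

def grpo_outcome_advantage(scores: torch.Tensor,
                           index: torch.Tensor,
                           norm_adv_by_std_in_grpo: bool = True,
                           epsilon: float = 1e-6) -> torch.Tensor:
    """scores: (bs,) outcome reward per sampled response.
    index:  (bs,) id of the prompt each response was sampled from."""
    advantages = torch.zeros_like(scores)
    for idx in index.unique():
        group = index == idx
        mean = scores[group].mean()
        if norm_adv_by_std_in_grpo:
            # vanilla GRPO: whiten the rewards within each prompt group
            advantages[group] = (scores[group] - mean) / (scores[group].std() + epsilon)
        else:
            # Dr.GRPO-style: keep the group-mean baseline, drop the std scaling
            advantages[group] = scores[group] - mean
    # in the trainer this per-response scalar is then broadcast over response tokens via the eos mask
    return advantages

# two prompts, two rollouts each
scores = torch.tensor([1.0, 0.0, 0.0, 0.0])
index = torch.tensor([0, 0, 1, 1])
print(grpo_outcome_advantage(scores, index, norm_adv_by_std_in_grpo=False))
# -> tensor([ 0.5000, -0.5000,  0.0000,  0.0000])
```

Together with the `seq-mean-token-sum-norm` mode added to `agg_loss`, this appears to be the Dr.GRPO-style configuration the patch is aiming at.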
@@ -627,22 +642,43 @@ def fit(self): reward_tensor = self.reward_fn(batch) batch.batch['token_level_scores'] = reward_tensor + + if (self.config.algorithm.use_kl_in_reward and self.config.actor_rollout_ref.actor.use_kl_loss): + print(f"NOTICE: You have both enabled in-reward kl and kl loss, default to ban in-reward kl") + # compute rewards. apply_kl_penalty if available - if not self.config.actor_rollout_ref.actor.use_kl_loss: - batch, kl_metrics = apply_kl_penalty(batch, - kl_ctrl=self.kl_ctrl, - kl_penalty=self.config.algorithm.kl_penalty) + if self.config.algorithm.use_kl_in_reward and not self.config.actor_rollout_ref.actor.use_kl_loss: + batch, kl_metrics = apply_kl_penalty( + batch, + kl_ctrl=self.kl_ctrl, + kl_penalty=self.config.algorithm.kl_penalty + ) metrics.update(kl_metrics) else: batch.batch['token_level_rewards'] = batch.batch['token_level_scores'] # compute advantages, executed on the driver process - batch = compute_advantage(batch, - adv_estimator=self.config.algorithm.adv_estimator, - gamma=self.config.algorithm.gamma, - lam=self.config.algorithm.lam, - num_repeat=self.config.actor_rollout_ref.rollout.n) - + # batch = compute_advantage(batch, + # adv_estimator=self.config.algorithm.adv_estimator, + # gamma=self.config.algorithm.gamma, + # lam=self.config.algorithm.lam, + # num_repeat=self.config.actor_rollout_ref.rollout.n, + # ) + + ### 更改的地方 + norm_adv_by_std_in_grpo = self.config.algorithm.get( + "norm_adv_by_std_in_grpo", True + ) # GRPO adv normalization factor + + batch = compute_advantage( + batch, + adv_estimator=self.config.algorithm.adv_estimator, + gamma=self.config.algorithm.gamma, + lam=self.config.algorithm.lam, + num_repeat=self.config.actor_rollout_ref.rollout.n, + norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo, + config=self.config.algorithm, + ) # update critic if self.use_critic: with _timer('update_critic', timing_raw): diff --git a/verl/utils/dataset/sft_dataset.py b/verl/utils/dataset/sft_dataset.py index 9c7a2966..b32ba8a0 100644 --- a/verl/utils/dataset/sft_dataset.py +++ b/verl/utils/dataset/sft_dataset.py @@ -45,7 +45,7 @@ def __init__(self, response_dict_keys=None, max_length=1024, truncation='error'): - assert truncation in ['error', 'left', 'right'] + assert truncation in ['error', 'left', 'right', 'pass'] self.truncation = truncation if not isinstance(parquet_files, List): @@ -84,6 +84,7 @@ def series_to_item(ls): dataframe = pd.read_parquet(parquet_file) dataframes.append(dataframe) self.dataframe = pd.concat(dataframes) + print(f'cols: {self.dataframe.columns}') self.prompts = self.dataframe[self.prompt_key] for key in self.prompt_dict_keys: # type(x): pandas.core.series.Series @@ -94,7 +95,7 @@ def series_to_item(ls): except Exception: print(f'self.prompts={self.prompts}') raise - self.prompts = self.prompts.tolist() + self.prompts = list(self.prompts) self.responses = self.dataframe[self.response_key] for key in self.response_dict_keys: try: @@ -102,7 +103,9 @@ def series_to_item(ls): except Exception: print(f'self.responses={self.responses}') raise - self.responses = self.responses.tolist() + self.responses = list(self.responses) + + print(f'## loaded {len(self.prompts)} prompts and {len(self.responses)} responses from {self.parquet_files}') def __len__(self): return len(self.prompts) @@ -110,6 +113,14 @@ def __len__(self): def __getitem__(self, item): tokenizer = self.tokenizer + if item > len(self.prompts) - 1: + if self.truncation == 'pass': + # this is a very dangerous way to handle the index, which may lead to infinite loop + 
print(f'Warning: item {item} is out of range, so we let it wrap around to {item % len(self.prompts)}')
+                item = item % len(self.prompts)
+            else:
+                raise IndexError(f'item {item} is out of range, dataset length is {len(self.prompts)}')
+
         prompt = self.prompts[item]
         response = self.responses[item]
@@ -152,6 +163,8 @@ def __getitem__(self, item):
             elif self.truncation == 'right':
                 input_ids = input_ids[:self.max_length]
                 attention_mask = attention_mask[:self.max_length]
+            elif self.truncation == 'pass':
+                return self.__getitem__(item + 1)  # skip this item
             elif self.truncation == 'error':
                 raise NotImplementedError(f'{sequence_length=} is larger than {self.max_length=}')
             else:
diff --git a/verl/utils/reward_score/countdown_better_format.py b/verl/utils/reward_score/countdown_better_format.py
new file mode 100644
index 00000000..052ac3de
--- /dev/null
+++ b/verl/utils/reward_score/countdown_better_format.py
@@ -0,0 +1,157 @@
+import re
+import random
+import ast
+import operator
+
+
+def extract_solution(solution_str):
+    """Extract the equation from the solution string."""
+    # Remove everything before the first "Assistant:"
+    if "Assistant:" in solution_str:
+        solution_str = solution_str.split("Assistant:", 1)[1]
+    elif "<|im_start|>assistant" in solution_str:
+        solution_str = solution_str.split("<|im_start|>assistant", 1)[1]
+    else:
+        return None
+    solution_str = solution_str.split('\n')[-1]
+
+    answer_pattern = r'<answer>(.*?)</answer>'
+    match = re.finditer(answer_pattern, solution_str, re.S)
+    matches = list(match)
+    if matches:
+        final_answer = matches[-1].group(1).strip()
+    else:
+        final_answer = None
+    return final_answer
+
+
+def validate_equation(equation_str, available_numbers):
+    """Validate that equation only uses available numbers and each number once."""
+    try:
+        ## if equation_str is a full equation, keep only the left-hand side
+        if '=' in equation_str:
+            equation_str = equation_str.split('=')[0].strip()
+
+        # Extract all numbers from the equation
+        numbers_in_eq = [int(n) for n in re.findall(r'\d+', equation_str)]
+
+        # Check if all numbers in equation are available
+        available_numbers = sorted(available_numbers)
+        numbers_in_eq = sorted(numbers_in_eq)
+
+        # Each number should be used exactly once
+        return numbers_in_eq == available_numbers
+    except:
+        return False
+
+
+def evaluate_equation(equation_str):
+    """Safely evaluate the arithmetic equation using eval() with precautions."""
+    try:
+        ## if equation_str is a full equation, keep only the left-hand side
+        if '=' in equation_str:
+            equation_str = equation_str.split('=')[0].strip()
+
+        # Define a regex pattern that only allows numbers, operators, parentheses, and whitespace
+        allowed_pattern = r'^[\d+\-*/().\s]+$'
+        if not re.match(allowed_pattern, equation_str):
+            raise ValueError("Invalid characters in equation.")
+
+        # Evaluate the equation with restricted globals and locals
+        result = eval(equation_str, {"__builtins__": None}, {})
+        return result
+    except Exception as e:
+        return None
+
+def validate_output_format(solution_str):
+    """Validate the output format of the solution string."""
+    # should contain exactly one <answer> ... </answer> pair and matched <think> ... </think> tags
+
+    if "Assistant:" in solution_str:
+        assistant_answer_str = solution_str.split("Assistant:", 1)[1]
+    elif "<|im_start|>assistant" in solution_str:
+        assistant_answer_str = solution_str.split("<|im_start|>assistant", 1)[1]
+    else:
+        return False
+
+    think_count = assistant_answer_str.count('<think>')
+    end_think_count = assistant_answer_str.count('</think>')
+    answer_count = assistant_answer_str.count('<answer>')
+    end_answer_count = assistant_answer_str.count('</answer>')
+    # print(f'think_count: {think_count}, end_think_count: {end_think_count}, answer_count: {answer_count}, end_answer_count: {end_answer_count}')
+    if think_count != end_think_count:
+        return False
+    if answer_count != end_answer_count:
+        return False
+    if answer_count != 1:
+        return False
+    return True
+
+def compute_score(solution_str, ground_truth, method='strict', format_score=0.1, score=1.):
+    """The scoring function for the countdown task.
+
+    Args:
+        solution_str: the solution text
+        ground_truth: dictionary containing the target number and the available numbers
+        method: the method to extract the solution
+        format_score: the score for correct format but wrong answer
+        score: the score for the correct answer
+
+    With the default arguments the score decomposes as:
+        equation_format_score = 0.05
+        output_format_score = 0.05
+        correct_answer_score = 0.9 (so a correct, well-formatted answer totals 1.0)
+    """
+    target = ground_truth['target']
+    numbers = ground_truth['numbers']
+
+    equation = extract_solution(solution_str=solution_str)
+    do_print = random.randint(1, 64) == 1  # print diagnostics for roughly 1/64 of the samples
+
+    if do_print:
+        print(f"--------------------------------")
+        print(f"Target: {target} | Numbers: {numbers}")
+        print(f"Extracted equation: {equation}")
+        print(f"Solution string: {solution_str}")
+
+    if equation is None:
+        if do_print:
+            print(f"No equation found")
+        return 0
+
+    equation_format_score = format_score * 0.5
+    output_format_score = format_score * 0.5 if validate_output_format(solution_str) else 0
+    final_format_score = equation_format_score + output_format_score
+    if do_print:
+        if output_format_score:
+            print(f"Valid output format")
+        else:
+            print(f"Invalid output format")
+
+    # Validate equation uses correct numbers
+    if not validate_equation(equation, numbers):
+        if do_print:
+            print(f"Invalid equation")
+        return final_format_score
+
+    # Evaluate equation
+    try:
+        result = evaluate_equation(equation)
+        if result is None:
+            if do_print:
+                print(f"Could not evaluate equation")
+            return final_format_score
+
+        if abs(result - target) < 1e-5:  # Account for floating point precision
+            if do_print:
+                print(f"Correct equation: {equation} = {result}")
+            correct_answer_score = score * 0.9
+            return correct_answer_score + final_format_score
+        else:
+            if do_print:
+                print(f"Wrong result: equation = {result}, target = {target}")
+            return final_format_score
+    except:
+        if do_print:
+            print(f"Error evaluating equation")
+        return final_format_score
+
\ No newline at end of file
diff --git a/verl/utils/tracking.py b/verl/utils/tracking.py
index b1fbd6f3..84941c6f 100644
--- a/verl/utils/tracking.py
+++ b/verl/utils/tracking.py
@@ -22,9 +22,15 @@
 class Tracking(object):
-    supported_backend = ['wandb', 'mlflow', 'console']
-
-    def __init__(self, project_name, experiment_name, default_backend: Union[str, List[str]] = 'console', config=None):
+    supported_backend = ['wandb', 'mlflow', 'console', 'swanlab']
+
+    def __init__(self,
+                 project_name,
+                 experiment_name,
+                 default_backend: Union[str, List[str]] = 'console',
+                 resume: bool = False,
+                 resume_id: Union[str, None] = None,
+                 config=None):
         if isinstance(default_backend, str):
             default_backend = [default_backend]
         for backend in default_backend:
@@ -45,6 +51,20 @@ def __init__(self, project_name, experiment_name, default_backend: Union[str, Li
             wandb.init(project=project_name, name=experiment_name, config=config)
             self.logger['wandb'] = wandb
+        if 'swanlab' in default_backend:
+            import swanlab
+            import os
+            SWANLAB_API_KEY = os.environ.get("SWANLAB_API_KEY", None)
+            if SWANLAB_API_KEY:
+                swanlab.login(api_key=SWANLAB_API_KEY)
+            #
更改: 添加tracking resume + if resume and resume_id is not None: + swanlab.init(project=project_name, name=experiment_name, resume=True, id=resume_id, config=config) + else: + swanlab.init(project=project_name, name=experiment_name, config=config) + + self.logger['wandb'] = swanlab + if 'mlflow' in default_backend: import mlflow mlflow.start_run(run_name=experiment_name) @@ -62,6 +82,7 @@ def log(self, data, step, backend=None): logger_instance.log(data=data, step=step) + class _MlflowLoggingAdapter: def log(self, data, step): diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index 5b7fef8a..cb0ebf66 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -30,6 +30,7 @@ from verl.utils.ulysses import ulysses_pad_and_slice_inputs, gather_outpus_and_unpad from verl.utils.seqlen_balancing import rearrange_micro_batches, get_reverse_idx import verl.utils.torch_functional as verl_F +from verl.trainer.ppo.core_algos import agg_loss from flash_attn.bert_padding import pad_input, unpad_input, rearrange, index_first_axis @@ -240,21 +241,48 @@ def update_policy(self, data: DataProto): advantages = data['advantages'] clip_ratio = self.config.clip_ratio + clip_ratio_low = ( + self.config.clip_ratio_low if self.config.clip_ratio_low is not None else clip_ratio + ) + clip_ratio_high = ( + self.config.clip_ratio_high if self.config.clip_ratio_high is not None else clip_ratio + ) entropy_coeff = self.config.entropy_coeff + loss_agg_mode = self.config.loss_agg_mode + # all return: (bsz, response_length) entropy, log_prob = self._forward_micro_batch(micro_batch=data, temperature=temperature) - pg_loss, pg_clipfrac, ppo_kl = core_algos.compute_policy_loss(old_log_prob=old_log_prob, - log_prob=log_prob, - advantages=advantages, - eos_mask=response_mask, - cliprange=clip_ratio) + ## 更改的部分 + pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = core_algos.compute_policy_loss( + old_log_prob=old_log_prob, + log_prob=log_prob, + advantages=advantages, + # eos_mask=response_mask, + response_mask=response_mask, + cliprange=clip_ratio, + cliprange_low=clip_ratio_low, + cliprange_high=clip_ratio_high, + loss_agg_mode=loss_agg_mode, + ) # compute entropy loss from entropy - entropy_loss = verl_F.masked_mean(entropy, response_mask) + + ## 更改的部分 + # entropy_loss = verl_F.masked_mean(entropy, response_mask) + + # # compute policy loss + # policy_loss = pg_loss - entropy_loss * entropy_coeff + + + if entropy_coeff != 0: + entropy_loss = agg_loss(loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) + + # compute policy loss + policy_loss = pg_loss - entropy_loss * entropy_coeff + else: + policy_loss = pg_loss - # compute policy loss - policy_loss = pg_loss - entropy_loss * entropy_coeff if self.config.use_kl_loss: ref_log_prob = data['ref_log_prob'] @@ -262,7 +290,8 @@ def update_policy(self, data: DataProto): kld = core_algos.kl_penalty(logprob=log_prob, ref_logprob=ref_log_prob, kl_penalty=self.config.kl_loss_type) - kl_loss = masked_mean(kld, response_mask) + kl_loss = agg_loss(loss_mat=kld, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) + # kl_loss = masked_mean(kld, response_mask) policy_loss = policy_loss - kl_loss * self.config.kl_loss_coef metrics['actor/kl_loss'] = kl_loss.detach().item() @@ -276,6 +305,7 @@ def update_policy(self, data: DataProto): 'actor/pg_loss': pg_loss.detach().item(), 'actor/pg_clipfrac': pg_clipfrac.detach().item(), 'actor/ppo_kl': ppo_kl.detach().item(), + "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(), 
} append_to_dict(metrics, data) diff --git a/verl/workers/critic/dp_critic.py b/verl/workers/critic/dp_critic.py index 0842ff4a..88000ed7 100644 --- a/verl/workers/critic/dp_critic.py +++ b/verl/workers/critic/dp_critic.py @@ -74,10 +74,23 @@ def _forward_micro_batch(self, micro_batch): sp_size=self.ulysses_sequence_parallel_size) # only pass input_ids and position_ids to enable flash_attn_varlen - output = self.critic_module(input_ids=input_ids_rmpad, - attention_mask=None, - position_ids=position_ids_rmpad, - use_cache=False) # prevent model thinks we are generating + # 更改的部分(try-except) + flag_finished = False + while not flag_finished: + try: + output = self.critic_module(input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids_rmpad, + use_cache=False) # prevent model thinks we are generating + + flag_finished = True + except RuntimeError as e: + if 'CUDA out of memory' in str(e): + torch.cuda.empty_cache() + print("CUDA OOM, retrying...") + else: + raise e + values_rmpad = output.logits values_rmpad = values_rmpad.squeeze(0) # (total_nnz) diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py index 53e84452..0dd51803 100644 --- a/verl/workers/fsdp_workers.py +++ b/verl/workers/fsdp_workers.py @@ -40,8 +40,9 @@ from codetiming import Timer -logger = logging.getLogger(__file__) -logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN')) +# logger = logging.getLogger(__file__) +logger = None +# logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'DEBUG')) class ActorRolloutRefWorker(Worker): @@ -122,7 +123,7 @@ def _build_model_optimizer(self, from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, ShardingStrategy, MixedPrecision from torch import optim - log_gpu_memory_usage('Before init from HF AutoModel', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_model_optimizer()Before init from HF AutoModel', logger=logger) local_path = copy_local_path_from_hdfs(model_path) # note that we have to create model in fp32. 
Otherwise, the optimizer is in bf16, which is incorrect @@ -176,7 +177,7 @@ def _build_model_optimizer(self, if self.rank == 0: print_model_size(actor_module) - log_gpu_memory_usage('After init from HF AutoModel', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_model_optimizer() After init from HF AutoModel', logger=logger) # We wrap FSDP for rollout as well mixed_precision_config = fsdp_config.get('mixed_precision', None) @@ -221,7 +222,7 @@ def _build_model_optimizer(self, device_mesh=self.device_mesh, forward_prefetch=False) - log_gpu_memory_usage('After Actor FSDP init', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_model_optimizer() After Actor FSDP init', logger=logger) # TODO: add more optimizer args into config if self._is_actor: @@ -243,7 +244,7 @@ def _build_model_optimizer(self, actor_optimizer = None actor_lr_scheduler = None - log_gpu_memory_usage('After actor optimizer init', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_model_optimizer() After actor optimizer init', logger=logger) return actor_module_fsdp, actor_optimizer, actor_lr_scheduler, actor_model_config @@ -264,12 +265,12 @@ def _build_rollout(self): elif self.config.rollout.name == 'vllm': from verl.workers.rollout.vllm_rollout import vLLMRollout from verl.workers.sharding_manager import FSDPVLLMShardingManager - log_gpu_memory_usage('Before building vllm rollout', logger=None) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_rollout() Before building vllm rollout', logger=None) rollout = vLLMRollout(actor_module=self.actor_module_fsdp, config=self.config.rollout, tokenizer=self.tokenizer, model_hf_config=self.actor_model_config) - log_gpu_memory_usage('After building vllm rollout', logger=None) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_rollout() After building vllm rollout', logger=None) if torch.distributed.get_world_size() == 1: self.config.rollout.load_format = 'dummy_hf' rollout_sharding_manager = FSDPVLLMShardingManager(module=self.actor_module_fsdp, @@ -277,7 +278,7 @@ def _build_rollout(self): model_config=self.actor_model_config, full_params='hf' in self.config.rollout.load_format, device_mesh=rollout_device_mesh) - log_gpu_memory_usage('After building sharding manager', logger=None) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_rollout() After building sharding manager', logger=None) return rollout, rollout_sharding_manager @@ -315,10 +316,10 @@ def init_model(self): if self._is_offload_param: # param is require during state_dict in sharding manager offload_fsdp_grad(module=self.actor_module_fsdp) - log_gpu_memory_usage('After offload actor grad during init', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker.init_model() After offload actor grad during init', logger=logger) if self._is_offload_optimizer: offload_fsdp_optimizer(optimizer=self.actor_optimizer) - log_gpu_memory_usage('After offload actor optimizer during init', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker.init_model() After offload actor optimizer during init', logger=logger) # load from checkpoint if self._is_actor: OmegaConf.set_struct(self.config.actor, True) @@ -332,13 +333,29 @@ def init_model(self): self.rollout, self.rollout_sharding_manager = self._build_rollout() if self._is_ref: - self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path, - fsdp_config=self.config.ref.fsdp_config, - optim_config=None, - override_model_config=override_model_config, - use_remove_padding=use_remove_padding, 
- trust_remote_code=self.config.model.get( - 'trust_remote_code', False))[0] + + ## 更改添加的部分(try-except) + flag_finished = False + while not flag_finished: + try: + self.ref_module_fsdp = self._build_model_optimizer(model_path=self.config.model.path, + fsdp_config=self.config.ref.fsdp_config, + optim_config=None, + override_model_config=override_model_config, + use_remove_padding=use_remove_padding, + trust_remote_code=self.config.model.get( + 'trust_remote_code', False))[0] + flag_finished = True + except Exception as e: + + if 'CUDA out of memory' in str(e): + torch.cuda.empty_cache() + print("CUDA OOM, retrying...") + else: + raise e + # logger.error(f'Failed to build reference policy: {e}') + # flag_finished = False + if self._is_offload_param: offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad) @@ -347,14 +364,21 @@ def init_model(self): self.config.ref.use_remove_padding = use_remove_padding self.ref_policy = DataParallelPPOActor(config=self.config.ref, actor_module=self.ref_module_fsdp) + if self._is_actor: self.flops_counter = FlopsCounter(self.actor_model_config) - torch.cuda.empty_cache() + # torch.cuda.empty_cache() @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) def update_actor(self, data: DataProto): + ## 尝试的更改,只在 ActorRolloutRefWorker.update_actor() 中添加empty_cache + torch.cuda.empty_cache() + log_gpu_memory_usage('@ActorRolloutRefWorker.update_actor() beginning', logger=logger) + data = data.to('cuda') + + log_gpu_memory_usage('@ActorRolloutRefWorker.update_actor() after data.to(cuda)', logger=logger) assert self._is_actor if self._is_offload_param: @@ -366,7 +390,7 @@ def update_actor(self, data: DataProto): data.batch = data.batch.cuda() - log_gpu_memory_usage('Before update policy', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker.update_actor() After data.batch = data.batch.cuda() Before update policy', logger=logger) with self.ulysses_sharding_manager: data = self.ulysses_sharding_manager.preprocess_data(data=data) @@ -382,7 +406,7 @@ def update_actor(self, data: DataProto): lr = self.actor_lr_scheduler.get_last_lr()[0] metrics['actor/lr'] = lr - log_gpu_memory_usage('After update policy', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker.update_actor() After update policy', logger=logger) # TODO: here, we should return all metrics output = DataProto(meta_info={'metrics': metrics}) @@ -395,10 +419,12 @@ def update_actor(self, data: DataProto): if self._is_offload_optimizer: offload_fsdp_optimizer(optimizer=self.actor_optimizer) torch.cuda.empty_cache() + log_gpu_memory_usage('@ActorRolloutRefWorker.update_actor() After empty cache', logger=logger) return output @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) def generate_sequences(self, prompts: DataProto): + log_gpu_memory_usage('@ActorRolloutRefWorker.generate_sequences() beginning', logger=logger) prompts = prompts.to('cuda') # set to False if it is validation recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True) @@ -413,12 +439,12 @@ def generate_sequences(self, prompts: DataProto): meta_info = {'eos_token_id': self.tokenizer.eos_token_id, 'pad_token_id': self.tokenizer.pad_token_id} prompts.meta_info.update(meta_info) with self.rollout_sharding_manager: - log_gpu_memory_usage('After entering rollout sharding manager', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker.generate_sequences() After entering rollout sharding manager', logger=logger) prompts = self.rollout_sharding_manager.preprocess_data(prompts) output = 
self.rollout.generate_sequences(prompts=prompts) - log_gpu_memory_usage('After rollout generation', logger=logger) + log_gpu_memory_usage('@ActorRolloutRefWorker.generate_sequences() After rollout generation', logger=logger) output = self.rollout_sharding_manager.postprocess_data(output) @@ -441,13 +467,15 @@ def generate_sequences(self, prompts: DataProto): # NOTE(sgm): the grad is already in CPU, only offload param here offload_fsdp_param_and_grad(module=self.actor_module_fsdp, offload_grad=self._is_offload_grad) # clear kv cache - torch.cuda.empty_cache() - log_gpu_memory_usage('After recompute log prob', logger=logger) + ## 尝试的更改,只在 ActorRolloutRefWorker.update_actor() 中添加empty_cache + # torch.cuda.empty_cache() + log_gpu_memory_usage('@ActorRolloutRefWorker.generate_sequences() After recompute log prob', logger=logger) return output @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) def compute_ref_log_prob(self, data: DataProto): assert self._is_ref + log_gpu_memory_usage('@ActorRolloutRefWorker.compute_ref_log_prob() beginning', logger=logger) data = data.to('cuda') @@ -471,12 +499,14 @@ def compute_ref_log_prob(self, data: DataProto): if self._is_offload_param: offload_fsdp_param_and_grad(module=self.ref_module_fsdp, offload_grad=self._is_offload_grad) - torch.cuda.empty_cache() + ## 尝试的更改,只在 ActorRolloutRefWorker.update_actor() 中添加empty_cache + # torch.cuda.empty_cache() return output @register(dispatch_mode=Dispatch.ONE_TO_ALL) def save_checkpoint(self, local_path, hdfs_path=None): assert self._is_actor + log_gpu_memory_usage('@ActorRolloutRefWorker.save_checkpoint() beginning', logger=logger) import torch if self._is_offload_param: load_fsdp_param_and_grad(module=self.actor_module_fsdp, @@ -617,7 +647,7 @@ def _build_critic_model_optimizer(self, config): auto_wrap_policy = get_fsdp_wrap_policy(module=critic_module, config=self.config.model.fsdp_config.wrap_policy) - log_gpu_memory_usage('Before critic FSDP', logger=None) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_critic_model_optimizer() Before critic FSDP', logger=None) critic_module = FSDP(critic_module, param_init_fn=init_fn, @@ -629,7 +659,7 @@ def _build_critic_model_optimizer(self, config): sync_module_states=True, forward_prefetch=False) - log_gpu_memory_usage('After critic FSDP', logger=None) + log_gpu_memory_usage('@ActorRolloutRefWorker._build_critic_model_optimizer() After critic FSDP', logger=None) critic_optimizer = optim.AdamW(critic_module.parameters(), lr=config.optim.lr, @@ -668,7 +698,8 @@ def init_model(self): self.flops_counter = FlopsCounter(self.critic_model_config) - torch.cuda.empty_cache() + ## 尝试的更改,只在 ActorRolloutRefWorker.update_actor() 中添加empty_cache + # torch.cuda.empty_cache() @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) def compute_values(self, data: DataProto): @@ -692,11 +723,16 @@ def compute_values(self, data: DataProto): output = output.to('cpu') if self._is_offload_param: offload_fsdp_param_and_grad(module=self.critic_module, offload_grad=self._is_offload_grad) - torch.cuda.empty_cache() + ## 尝试的更改,只在 ActorRolloutRefWorker.update_actor() 中添加empty_cache + # torch.cuda.empty_cache() return output @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) def update_critic(self, data: DataProto): + log_gpu_memory_usage('@CriticWorker.update_critic() beginning', logger=logger) + ## 尝试的更改,只在 ActorRolloutRefWorker.update_actor() 中添加empty_cache + torch.cuda.empty_cache() + log_gpu_memory_usage('@CriticWorker.update_critic() after empty_cache', logger=logger) data = data.to('cuda') if 
self._is_offload_param: load_fsdp_param_and_grad(module=self.critic_module, @@ -734,6 +770,7 @@ def update_critic(self, data: DataProto): @register(dispatch_mode=Dispatch.ONE_TO_ALL) def save_checkpoint(self, local_path, hdfs_path=None): + log_gpu_memory_usage('@CriticWorker.save_checkpoint() beginning', logger=logger) import torch if self._is_offload_param: load_fsdp_param_and_grad(module=self.critic_module,
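
A self-contained toy sketch of the dual-clip PPO objective that the patched compute_policy_loss in core_algos.py computes. clip_ratio_c = 3.0 is an assumed example value for the lower-bound clip applied to negative-advantage tokens, not necessarily the configured default:

import torch

def masked_mean(values, mask):
    return (values * mask).sum() / mask.sum()

old_log_prob  = torch.tensor([[-1.0, -1.2]])
log_prob      = torch.tensor([[-0.8, -1.9]])
advantages    = torch.tensor([[ 1.0, -1.0]])
response_mask = torch.ones_like(advantages)
cliprange_low, cliprange_high, clip_ratio_c = 0.2, 0.2, 3.0

ratio = torch.exp(torch.clamp(log_prob - old_log_prob, min=-20.0, max=20.0))
pg_losses1 = -advantages * ratio                                               # unclipped term
pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
clip_pg_losses1 = torch.maximum(pg_losses1, pg_losses2)                        # standard PPO clipping
pg_losses3 = -advantages * clip_ratio_c                                        # dual-clip lower bound
clip_pg_losses2 = torch.min(pg_losses3, clip_pg_losses1)
pg_losses = torch.where(advantages < 0, clip_pg_losses2, clip_pg_losses1)      # lower bound only where A < 0
print("pg_loss =", masked_mean(pg_losses, response_mask).item())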
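
An example of the partial-credit scoring in the new countdown_better_format.py (assuming the module is importable at the path the patch adds it under): with the defaults format_score=0.1 and score=1.0, a parsable <answer> earns 0.05, well-formed <think>/<answer> tags earn another 0.05, and a correct equation earns 0.9, for 1.0 in total.

from verl.utils.reward_score.countdown_better_format import compute_score

solution_str = (
    "Assistant: <think>6 - 2 = 4 and 4 * 11 = 44.</think>\n"
    "<answer>(6 - 2) * 11</answer>"
)
ground_truth = {'target': 44, 'numbers': [2, 6, 11]}
print(compute_score(solution_str, ground_truth))  # -> 1.0 (0.05 + 0.05 + 0.9)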
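
The CUDA-OOM handling added in dp_critic.py and fsdp_workers.py retries in an unbounded while-loop after emptying the cache; the same pattern with a bounded number of attempts (a variation for illustration, not what the patch itself does) can be written as:

import torch

def run_with_oom_retry(fn, max_retries=3):
    """Call fn(), emptying the CUDA cache and retrying on 'CUDA out of memory' errors."""
    for attempt in range(max_retries):
        try:
            return fn()
        except RuntimeError as e:
            if 'CUDA out of memory' in str(e) and attempt < max_retries - 1:
                torch.cuda.empty_cache()
                print(f"CUDA OOM, retrying ({attempt + 1}/{max_retries})...")
            else:
                raise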