20 changes: 20 additions & 0 deletions .gitignore
@@ -8,6 +8,26 @@
**/playground
**/wandb

# my custom ignore
**/swanlog
**/verl_original
**/verl_v03
**/verl_v04
**/nohupoutput
**/nohupoutput*
**/*.nohupoutput
**/train_3b_drgrpo_tang3.sh
**/train_3b_drgrpo_song2.sh
**/train_tiny_zero_a100_drgrpo_tang3.sh
**/train_tiny_zero_a100_drgrpo_song2.sh
**/0.7.0
**/tempCodeRunnerFile.python
**/*_tang3.sh
**/*_song2.sh
**/train_paramtest*
**/x *
**/save

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
5 changes: 3 additions & 2 deletions README.md
@@ -1,4 +1,5 @@
# TinyZero

![image](cover.png)

TinyZero is a reproduction of [DeepSeek R1 Zero](https://github.com/deepseek-ai/DeepSeek-R1) in countdown and multiplication tasks. We built upon [veRL](https://github.com/volcengine/verl).
@@ -11,12 +12,12 @@ Twitter thread: https://x.com/jiayi_pirate/status/1882839370505621655

Full experiment log: https://wandb.ai/jiayipan/TinyZero

Paper's on its way!
> 📢: We release [Adaptive Parallel Reasoning](https://github.com/Parallel-Reasoning/APR), where we explore a new dimension in scaling reasoning models

## Installation

```
conda create -n zero python=3.9
conda create -n zero python=3.10.18
# install torch [or you can skip this step and let vllm install the correct version for you]
pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121
# install vllm
101 changes: 101 additions & 0 deletions data_preprocess-gsm8k.py
@@ -0,0 +1,101 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import re
import os
import datasets

from verl.utils.hdfs_io import copy, makedirs
import argparse


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split('#### ')[1].replace(',', '')
    return final_solution
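# Illustrative example (added for clarity, not part of the original script):
# for a GSM8K answer string ending in "#### 1,234", the regex above matches
# "#### 1,234" and extract_solution returns the normalized string "1234".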


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='~/data/gsm8k')
    parser.add_argument('--hdfs_dir', default=None)

    args = parser.parse_args()

    num_few_shot = 5  # unused below; kept from the original preprocessing script
    data_source = 'openai/gsm8k'

    dataset = datasets.load_dataset(data_source, 'main')

    train_dataset = dataset['train']
    test_dataset = dataset['test']

    # Debug inspection of the raw dataset. The unconditional exit(0) that
    # followed these prints stopped the script before any preprocessing ran,
    # so it is disabled here.
    print(f'train_dataset: {type(train_dataset)}, {len(train_dataset)}')
    print(f'test_dataset: {type(test_dataset)}, {len(test_dataset)}')
    print(f'train_dataset[0]: {train_dataset[0]}')
    # exit(0)

    instruction_following = "Let's think step by step and output the final answer after \"####\"."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):

        def process_fn(example, idx):
            question_raw = example.pop('question')

            question = question_raw + ' ' + instruction_following

            answer_raw = example.pop('answer')
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [{
                    "role": "user",
                    "content": question,
                }],
                "ability": "math",
                "reward_model": {
                    "style": "rule",
                    "ground_truth": solution
                },
                "extra_info": {
                    'split': split,
                    'index': idx,
                    'answer': answer_raw,
                    "question": question_raw,
                }
            }
            return data

        return process_fn
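    # Illustrative shape of one mapped row (example values, not taken from a
    # real run):
    #   {"data_source": "openai/gsm8k",
    #    "prompt": [{"role": "user", "content": "<question> Let's think step by step ..."}],
    #    "ability": "math",
    #    "reward_model": {"style": "rule", "ground_truth": "72"},
    #    "extra_info": {"split": "train", "index": 0, "answer": "...", "question": "..."}}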

    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)

    local_dir = args.local_dir
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_dir, dst=hdfs_dir)
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,4 +11,5 @@ ray
tensordict<0.6
transformers<4.48
vllm<=0.6.3
swanlab
wandb
43 changes: 43 additions & 0 deletions scripts/sft_countdown_2a40.sh
@@ -0,0 +1,43 @@
# Tested with 2 & 4 GPUs

set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_gemma_2b.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2
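
# Illustrative invocation (env values are placeholders, not from this repo):
#   DATA_DIR=~/data/countdown BASE_MODEL=Qwen/Qwen2.5-3B \
#   PROJECT_NAME=TinyZero EXPERIMENT_NAME=sft-countdown-2a40 \
#   bash scripts/sft_countdown_2a40.sh 2 ./checkpoints/sft_countdown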

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$DATA_DIR/countdown_CoT_train_filtered_token_length_1024.parquet \
data.val_files=$DATA_DIR/countdown_CoT_test_filtered_token_length_1024.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
data.prompt_dict_keys=['question'] \
data.response_dict_keys=['answer'] \
data.truncation=left \
data.max_length=2048 \
data.train_batch_size=32 \
data.micro_batch_size=4 \
\
model.partial_pretrain=$BASE_MODEL \
\
optim.lr=1e-5 \
optim.weight_decay=0.01 \
optim.warmup_steps_ratio=0.1 \
\
trainer.default_local_dir=$save_path \
trainer.project_name=$PROJECT_NAME \
trainer.experiment_name=$EXPERIMENT_NAME \
trainer.logger=['swanlab'] \
trainer.total_epochs=1 \
trainer.default_hdfs_dir=null \
trainer.val_before_training=True \
trainer.validate_every_n_steps=10 \
$@
44 changes: 44 additions & 0 deletions scripts/sft_countdown_4a40.sh
@@ -0,0 +1,44 @@
# Tested with 2 & 4 GPUs

set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_gemma_2b.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2
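
# Illustrative invocation (env values are placeholders, not from this repo):
#   TRAIN_FILE_PATH=~/data/countdown/train.parquet TEST_FILE_PATH=~/data/countdown/test.parquet \
#   BASE_MODEL=Qwen/Qwen2.5-3B PROJECT_NAME=TinyZero EXPERIMENT_NAME=sft-countdown-4a40 \
#   bash scripts/sft_countdown_4a40.sh 4 ./checkpoints/sft_countdown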

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$TRAIN_FILE_PATH \
data.val_files=$TEST_FILE_PATH \
data.prompt_key=extra_info \
data.response_key=extra_info \
data.prompt_dict_keys=['question'] \
data.response_dict_keys=['answer'] \
data.truncation=left \
data.max_length=2048 \
data.train_batch_size=32 \
data.micro_batch_size=4 \
\
model.partial_pretrain=$BASE_MODEL \
\
optim.lr=1e-6 \
optim.weight_decay=0.01 \
optim.warmup_steps_ratio=0.1 \
\
trainer.default_local_dir=$save_path \
trainer.project_name=$PROJECT_NAME \
trainer.experiment_name=$EXPERIMENT_NAME \
trainer.logger=['swanlab'] \
trainer.total_epochs=1 \
trainer.default_hdfs_dir=null \
trainer.val_before_training=True \
trainer.validate_every_n_steps=10 \
$@
36 changes: 36 additions & 0 deletions scripts/sft_gsm8k_2a40.sh
@@ -0,0 +1,36 @@
# Tested with 2 & 4 GPUs

set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_gemma_2b.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2
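
# Illustrative invocation (env values are placeholders, not from this repo;
# DATA_DIR should point at the parquet files produced by data_preprocess-gsm8k.py):
#   DATA_DIR=~/data/gsm8k BASE_MODEL=Qwen/Qwen2.5-0.5B \
#   PROJECT_NAME=TinyZero EXPERIMENT_NAME=sft-gsm8k-2a40 \
#   bash scripts/sft_gsm8k_2a40.sh 2 ./checkpoints/sft_gsm8k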

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$DATA_DIR/train.parquet \
data.val_files=$DATA_DIR/test.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.train_batch_size=64 \
data.micro_batch_size=8 \
model.partial_pretrain=$BASE_MODEL \
trainer.default_local_dir=$save_path \
trainer.project_name=$PROJECT_NAME \
trainer.experiment_name=$EXPERIMENT_NAME \
trainer.logger=['swanlab'] \
trainer.total_epochs=1 \
trainer.total_training_steps=20 \
trainer.default_hdfs_dir=null \
trainer.val_before_training=True \
trainer.validate_every_n_steps=10 \
$@
43 changes: 43 additions & 0 deletions scripts/train_tiny_zero_drgrpo_2a40.sh
@@ -0,0 +1,43 @@
# Training script for the Dr. GRPO algorithm (GRPO with advantage
# std-normalization disabled and token-sum loss aggregation, per the
# norm_adv_by_std_in_grpo and loss_agg_mode flags below)
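#
# Environment assumed below (illustrative values, not from this repo):
#   export DATA_DIR=~/data/countdown BASE_MODEL=Qwen/Qwen2.5-3B
#   export N_GPUS=2 ROLLOUT_TP_SIZE=2 EXPERIMENT_NAME=countdown-drgrpo-2a40
# NOTE: the interpreter path below is machine-specific; adjust it to your own
# conda environment.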

/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
algorithm.norm_adv_by_std_in_grpo=False \
data.train_files=$DATA_DIR/train.parquet \
data.val_files=$DATA_DIR/test.parquet \
data.train_batch_size=128 \
data.val_batch_size=640 \
data.max_prompt_length=256 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$BASE_MODEL \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size=4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum-norm \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size=2 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['swanlab'] \
++trainer.val_before_train=False \
trainer.default_hdfs_dir=null \
trainer.n_gpus_per_node=$N_GPUS \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.project_name=TinyZero \
trainer.experiment_name=$EXPERIMENT_NAME \
trainer.total_epochs=15 2>&1 | tee verl_demo.log
46 changes: 46 additions & 0 deletions scripts/train_tiny_zero_drgrpo_2a40_resume.sh
@@ -0,0 +1,46 @@
# Training script for resuming a Dr. GRPO run (same configuration as
# train_tiny_zero_drgrpo_2a40.sh, plus tracking-resume options)
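#
# Environment assumed below (illustrative values, not from this repo):
#   export RESUME_ID=<tracking-run-id> RESUME_START_STEP=100
#   plus DATA_DIR, BASE_MODEL, N_GPUS, ROLLOUT_TP_SIZE, and EXPERIMENT_NAME as
#   in train_tiny_zero_drgrpo_2a40.sh.
# NOTE: the trainer.tracking.* resume keys appear to be additions specific to
# this fork rather than standard verl options.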

/home/zhangyi/miniconda3/envs/wcf-zero-py3.10/bin/python3 -m verl.trainer.main_ppo \
trainer.tracking.resume=True \
trainer.tracking.resume_id=$RESUME_ID \
trainer.tracking.start_step=$RESUME_START_STEP \
algorithm.adv_estimator=grpo \
algorithm.norm_adv_by_std_in_grpo=False \
data.train_files=$DATA_DIR/train.parquet \
data.val_files=$DATA_DIR/test.parquet \
data.train_batch_size=128 \
data.val_batch_size=640 \
data.max_prompt_length=256 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$BASE_MODEL \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size=4 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-sum-norm \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP_SIZE \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size=2 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['swanlab'] \
++trainer.val_before_train=False \
trainer.default_hdfs_dir=null \
trainer.n_gpus_per_node=$N_GPUS \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.project_name=TinyZero \
trainer.experiment_name=$EXPERIMENT_NAME \
trainer.total_epochs=15 2>&1 | tee verl_demo.log