From 62b180b2741dbc807198ec4fcd9c72b143824143 Mon Sep 17 00:00:00 2001 From: odashi Date: Thu, 28 Aug 2025 04:20:08 +0900 Subject: [PATCH 1/2] refine scripts --- .../v4-upstream-megatron-abci/README.md | 2 +- .../v4-upstream-training-template/README.md | 14 ++++++------- .../pretrain/qsub_train.sh | 20 ++++++++++--------- .../pretrain/run_train.sh | 19 +++++++++--------- .../task_template/train_iters.txt | 1 - 5 files changed, 28 insertions(+), 28 deletions(-) delete mode 100644 pretrain/scripts/v4-upstream-training-template/task_template/train_iters.txt diff --git a/pretrain/installers/v4-upstream-megatron-abci/README.md b/pretrain/installers/v4-upstream-megatron-abci/README.md index b194ca01..b7dd58e2 100644 --- a/pretrain/installers/v4-upstream-megatron-abci/README.md +++ b/pretrain/installers/v4-upstream-megatron-abci/README.md @@ -5,7 +5,7 @@ ABCI 3.0上で以下のコマンドを実行し、``に環境を構築できる ```bash -cd pretrain/installers/v5-megatron-abci/ +cd pretrain/installers/v4-upstream-megatron-abci/ bash run_setup.sh ``` diff --git a/pretrain/scripts/v4-upstream-training-template/README.md b/pretrain/scripts/v4-upstream-training-template/README.md index 8ea7afd2..2852acc1 100644 --- a/pretrain/scripts/v4-upstream-training-template/README.md +++ b/pretrain/scripts/v4-upstream-training-template/README.md @@ -13,7 +13,7 @@ ABCI 3.0 上で Megatron-LM を利用した LLM-jp v5 用の学習スクリプ ```bash cd $EXP_DIR -git clone git@github.com:llm-jp/scripts.git +git clone https://github.com/llm-jp/scripts.git ``` 次に、 [pretrain/installers/v5-megatron-abci](../../installers/v5-megatron-abci/README.md) を利用し、`$EXP_DIR/env` に環境を構築する。 @@ -47,20 +47,22 @@ cp -r scripts/pretrain/task_template/ $EXP_DIR/tasks/$TASK_NAME ```bash cd $EXP_DIR/scripts/pretrain/$TRAINING_SCRIPT_DIR/ -bash run_train.sh +bash run_train.sh # Example: -bash run_train.sh R0123456789 0123 /path/to/0123_experiment task_name 0123_experiment 32 +bash run_train.sh gcg51557 R0123456789 0123_pretrain /path/to/0123_experiment task_name 0123_experiment 32 720:00:00 ``` CLIからは以下の引数を指定する +- ``: ABCI グループ ID - ``: ABCI の予約キュー ID -- ``: 実験の識別子 (e.g. `0123`) -- ``: 実験ディレクトリのパス (e.g. `/home/ach17726fj/experiments/0123_experiment`) +- ``: ジョブ名 (e.g., `0123_pretrain`) +- ``: 実験ディレクトリのパス (e.g. `/path/to/0123_experiment`) - ``: タスクディレクトリ名 (e.g. `task_name`) - ``: WandB に記録するプロジェクト名 (e.g. `0123_experiment`) - ``: 使用するノード数 (e.g. `32`) +- ` ${LOGFILE} 2> ${ERRFILE} -set -eu -o pipefail - ENV_DIR=${EXPERIMENT_DIR}/env SCRIPT_DIR=${EXPERIMENT_DIR}/scripts @@ -55,21 +55,19 @@ echo "hostname: ${MASTER_ADDR}" NUM_NODES=$(wc -l < ${PBS_NODEFILE}) NUM_GPUS_PER_NODE=8 NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) -echo "nnodes: ${NUM_NODES}; ngpus: ${NUM_GPUS}" echo NUM_NODES=${NUM_NODES} echo NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE} echo NUM_GPUS=${NUM_GPUS} +# For logging +echo "PBS_NODEFILE:" cat ${PBS_NODEFILE} -# Training steps -TRAIN_ITERS=$(cat ${TASK_DIR}/train_iters.txt) - -# Training data: TRAIN_DATA_PATH +# Load training data: TRAIN_DATA_PATH source ${TASK_DIR}/train_data.sh -# Synthesize all model params: ALL_PARAMS -# Requires TRAIN_ITERS and TRAIN_DATA_PATH +# Load model params: ALL_PARAMS +# Requires TRAIN_DATA_PATH source ${TASK_DIR}/params.sh # Add logging params @@ -89,8 +87,10 @@ ALL_PARAMS+=( --save-interval 1000 ) +# For logging echo "ALL_PARAMS: ${ALL_PARAMS[@]}" +echo "Start training..." mpirun \ --display-allocation \ --report-bindings \ @@ -102,3 +102,5 @@ mpirun \ python \ ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ ${ALL_PARAMS[@]} + +echo "Training completed successfully." diff --git a/pretrain/scripts/v4-upstream-training-template/pretrain/run_train.sh b/pretrain/scripts/v4-upstream-training-template/pretrain/run_train.sh index 9fc61ce7..dc118090 100755 --- a/pretrain/scripts/v4-upstream-training-template/pretrain/run_train.sh +++ b/pretrain/scripts/v4-upstream-training-template/pretrain/run_train.sh @@ -2,31 +2,30 @@ set -eu -o pipefail -if [ $# -ne 6 ]; then - >&2 echo "Usage: $0 " - >&2 echo "Example: $0 R0123456789 0123 /path/to/0123_experiment task_name 0123_experiment 32" +if [ $# -ne 8 ]; then + >&2 echo "Usage: $0 " + >&2 echo "Example: $0 gcg51557 R0123456789 0123 /path/to/0123_experiment task_name 0123_experiment 32 720:00:00" exit 1 fi +GROUP_ID=$1; shift RESERVATION_ID=$1; shift -EXPERIMENT_ID=$1; shift +JOB_NAME=$1; shift EXPERIMENT_DIR=$1; shift TASK_NAME=$1; shift WANDB_PROJECT=$1; shift NUM_NODES=$1; shift +WALLTIME=$1; shift # This directory SCRIPT_ROOT=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -WALLTIME=720:00:00 # 30 days -# WALLTIME=01:00:00 # 1 hour - qsub \ - -P gcg51557 \ + -P ${GROUP_ID} \ -q ${RESERVATION_ID} \ - -N ${EXPERIMENT_ID}_pretrain \ + -N ${JOB_NAME} \ -l select=${NUM_NODES},walltime=${WALLTIME} \ - -v RTYPE=rt_HF,EXPERIMENT_DIR=${EXPERIMENT_DIR},TASK_NAME=${TASK_NAME},WANDB_PROJECT=${WANDB_PROJECT} \ + -v RTYPE=rt_HF,USE_SSH=1,EXPERIMENT_DIR=${EXPERIMENT_DIR},TASK_NAME=${TASK_NAME},WANDB_PROJECT=${WANDB_PROJECT} \ -o /dev/null \ -e /dev/null \ -m n \ diff --git a/pretrain/scripts/v4-upstream-training-template/task_template/train_iters.txt b/pretrain/scripts/v4-upstream-training-template/task_template/train_iters.txt deleted file mode 100644 index 5caff40c..00000000 --- a/pretrain/scripts/v4-upstream-training-template/task_template/train_iters.txt +++ /dev/null @@ -1 +0,0 @@ -10000 From d3e65a1f1c5c3b853d6075d74daf099ae4c44f10 Mon Sep 17 00:00:00 2001 From: odashi Date: Thu, 28 Aug 2025 04:32:05 +0900 Subject: [PATCH 2/2] fix --- pretrain/scripts/v4-upstream-training-template/README.md | 2 +- .../v4-upstream-training-template/task_template/params.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pretrain/scripts/v4-upstream-training-template/README.md b/pretrain/scripts/v4-upstream-training-template/README.md index 2852acc1..4943682f 100644 --- a/pretrain/scripts/v4-upstream-training-template/README.md +++ b/pretrain/scripts/v4-upstream-training-template/README.md @@ -62,7 +62,7 @@ CLIからは以下の引数を指定する - ``: タスクディレクトリ名 (e.g. `task_name`) - ``: WandB に記録するプロジェクト名 (e.g. `0123_experiment`) - ``: 使用するノード数 (e.g. `32`) -- ``: ジョブの制限時間 (e.g., `720:00:00`) ### Training Configuration diff --git a/pretrain/scripts/v4-upstream-training-template/task_template/params.sh b/pretrain/scripts/v4-upstream-training-template/task_template/params.sh index 039e3010..ccd1b698 100644 --- a/pretrain/scripts/v4-upstream-training-template/task_template/params.sh +++ b/pretrain/scripts/v4-upstream-training-template/task_template/params.sh @@ -47,9 +47,9 @@ ALL_PARAMS+=( # Scheduler ALL_PARAMS+=( - --train-iters ${TRAIN_ITERS} + --train-iters 100000 --lr-warmup-iters 2000 - --lr-decay-iters ${TRAIN_ITERS} + --lr-decay-iters 100000 --lr-decay-style cosine --eval-interval 999999999 --eval-iters 0