From eda7c86688738bd44d756b1edecffcf88c934458 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Sun, 3 Aug 2025 20:21:04 +0900 Subject: [PATCH 1/9] Add merge-settings --- .../merge/1.3b_3e-5-fix_300b.yaml | 21 +++++++++++++++++++ .../merge/1.3b_3e-5-fix_50b.yaml | 20 ++++++++++++++++++ .../merge/7.7b_3e-5-fix_50b.yaml | 21 +++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_300b.yaml create mode 100644 pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_50b.yaml create mode 100644 pretrain/scripts/v4-midtraining/merge/7.7b_3e-5-fix_50b.yaml diff --git a/pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_300b.yaml b/pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_300b.yaml new file mode 100644 index 00000000..32803861 --- /dev/null +++ b/pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_300b.yaml @@ -0,0 +1,21 @@ +# Merge base configuration: +# model: 1.3B +# lr schedule: 3e-5 fixed +# train data: 300B + +# Output path: +# /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/300B/checkpoints_hf/3e-5-fix/merged/iter_1899920 + + +merge_method: linear +models: + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/300B/checkpoints_hf/3e-5-fix/seed42/iter_1899920 + parameters: + weight: 1.0 + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/300B/checkpoints_hf/3e-5-fix/seed666/iter_1899920 + parameters: + weight: 1.0 + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/300B/checkpoints_hf/3e-5-fix/seed42069/iter_1899920 + parameters: + weight: 1.0 +dtype: bfloat16 diff --git 
a/pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_50b.yaml b/pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_50b.yaml new file mode 100644 index 00000000..7a656104 --- /dev/null +++ b/pretrain/scripts/v4-midtraining/merge/1.3b_3e-5-fix_50b.yaml @@ -0,0 +1,20 @@ +# Merge base configuration: +# model: 1.3B +# lr schedule: 3e-5 fixed +# train data: 50B + +# Output path: +# /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/merged/iter_1866317 + +merge_method: linear +models: + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/seed42/iter_1866317/ + parameters: + weight: 1.0 + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/seed666/iter_1866317/ + parameters: + weight: 1.0 + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/1.3b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/seed42069/iter_1866317/ + parameters: + weight: 1.0 +dtype: bfloat16 diff --git a/pretrain/scripts/v4-midtraining/merge/7.7b_3e-5-fix_50b.yaml b/pretrain/scripts/v4-midtraining/merge/7.7b_3e-5-fix_50b.yaml new file mode 100644 index 00000000..701961d5 --- /dev/null +++ b/pretrain/scripts/v4-midtraining/merge/7.7b_3e-5-fix_50b.yaml @@ -0,0 +1,21 @@ +# Merge base configuration: +# model: 7.7B +# lr schedule: 3e-5 fixed +# train data: 50B + +# Output path: +# /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/7.7b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/merged/iter_1866317 + + +merge_method: linear +models: + - model: 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/7.7b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/seed42/iter_1866317 + parameters: + weight: 1.0 + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/7.7b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/seed666/iter_1866317 + parameters: + weight: 1.0 + - model: /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining/tasks/v4-dolmino-mix-1124/7.7b-llama3-ecjk/50B/checkpoints_hf/3e-5-fix/seed42069/iter_1866317 + parameters: + weight: 1.0 +dtype: bfloat16 From 2115fbea915cfdb481290f79a766d4be3767da86 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Sun, 3 Aug 2025 21:37:26 +0900 Subject: [PATCH 2/9] Add initial midtraining script --- .../README.md | 129 +++++++++++++++ .../convert/convert_latest.sh | 23 +++ .../convert/qsub_convert.sh | 153 +++++++++++++++++ .../memo.md | 18 ++ .../midtrain/common/setup.sh | 30 ++++ .../params/7.7b_v4_3.5t_tokenizer_v3.1.sh | 156 ++++++++++++++++++ .../midtrain/qsub_train.sh | 61 +++++++ .../midtrain/run_train.sh | 22 +++ .../midtrain/run_train_with_deps.sh | 27 +++ .../preprocess/build_train_data.sh | 25 +++ .../preprocess/extract.sh | 97 +++++++++++ .../preprocess/merge_files.sh | 139 ++++++++++++++++ .../preprocess/tokenize.sh | 72 ++++++++ .../tasks/.gitignore | 6 + .../v4-megamath-pro-max/train_data.all.sh | 3 + 15 files changed, 961 insertions(+) create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/qsub_convert.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md create mode 100644 
pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/common/setup.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train_with_deps.sh create mode 100755 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh create mode 100755 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/extract.sh create mode 100755 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/merge_files.sh create mode 100755 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/.gitignore create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md new file mode 100644 index 00000000..bc7e838c --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md @@ -0,0 +1,129 @@ +# LLMjp-v4 Midtraining + +## Overview + +MegaMathPro-Maxを含めた実験 + +### tokenize + +```bash +export EXP_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/" +export EXP_SCRIPT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining" +cd $EXP_DIR + +# 1. Huggingfaceからdolmino-mix-1124をダウンロード +huggingface-cli download allenai/dolmino-mix-1124 --local-dir "$EXP_DIR/dolmino-mix-1124" + +cd $EXP_SCRIPT_DIR +# 2. データセットの展開 (`$EXP_DIR/dolmino-mix-1124-extracted` に展開される) +bash ./preprocess/extract.sh + +# 3. 
データセットファイルのmerge (`$EXP_DIR/dolmino-mix-1124-extracted-merged` に結合ファイルが作成される) +qsub ./preprocess/merge_files.sh + +# (3が完了したら) +# 4. データセットのtokenize (`$EXP_DIR/dolmino-mix-1124-tokenized` にtokenizeされたファイルが作成される) +qsub ./preprocess/tokenize.sh + +# (optional) 中間ファイルの削除 +rm -rf $EXP_DIR/dolmino-mix-1124-extracted $EXP_DIR/dolmino-mix-1124-extracted-merged +``` + +### データセットの作成 + +データセットの作成前に事前にtokenizeが完了している必要がある。 + +```sh +# ./tasks/v4-dolmino-mix-1124/train_data.all.shを作成 +# 自動的にtoken数を計算し、"token数 PATH"をtrain_data.all.shに書き込む +./preprocess/build_train_data.sh + +# ./tasks/v4-dolmino-mix-1124/train_data.all.shから./tasks/v4-dolmino-mix-1124/train_data_50B.shを作成 +# dolminoのmidtrainingと同じ配合の50Bのデータセットサイズになるようにtoken数を更新する +./preprocess/update_train_data_to_50B.sh +# 100B, 300Bも同様 +``` + +## 環境構築 + +ref: [scripts/pretrain/installers/v4-megatron-abci at 0130-instruct-pretrain · llm-jp/scripts](https://github.com/llm-jp/scripts/tree/0130-instruct-pretrain/pretrain/installers/v4-megatron-abci) + +```sh +cd /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/install-scripts/pretrain/installers/v4-megatron-abci +bash run_setup.sh /path/to/target_dir +# ex +# bash run_setup.sh /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/environment +``` + +> [!CAUTION] +> Transformer engineのv1.10以上を使うとエラーが出るため、environment2を今回利用している(Transformer engineのversionを1.9にdowngradeした。) +> ref: https://docs.nvidia.com/nemo-framework/user-guide/24.07/knownissues.html + +> [!CAUTION] +> `environment/src/Megatron-LM/megatron/core/dist_checkpointing/strategies/common.py`の72行目に"weights_only=False"を加えた +> ref: https://github.com/huggingface/accelerate/issues/3539 + + +## job実行 + +```sh +cd /path/to/v4-midtraining + +# example: +# 1.3b-llama3-ecjk +bash midtrain/run_train.sh $(realpath tasks/v4-megamath-pro-max) 7.7b_v4_3.5t_tokenizer_v3.1 80B 16 +``` + +### [Option] 依存関係付きのjob実行 + +qsub の `-W depend=...` の機能を利用して、ジョブ間に依存関係をつけて実行するためのスクリプトを用意している。 +`run_train.sh` ではなく 
`run_train_with_deps.sh` を利用して実行する。 + +```sh +# 最後の引数に `-W depend=` に渡す値を書く +bash midtrain/run_train_with_deps.sh $(realpath tasks/v4-megamath-pro-max) 7.7b_v4_3.5t_tokenizer_v3.1 80B 16 afterok:xxxx.pbs1:yyyy.pbs1 +``` + +依存関係の詳しい記法は ABCI 3.0 上で `man qsub` を参照すること + +## Checkpoint変換 + +> [!CAUTION] +> 下のスクリプトを実行する前に、`scripts/pretrain/scripts/v4-midtraining/midtrain/params`の`--no-load-optim`を外してください。 + +```sh +cd /path/to/v4-midtraining + +bash convert/convert_latest.sh {TASK_DIR} {PARAM_NAME} {DATASET_SIZE} + +# example: +bash convert/convert_latest.sh $(realpath tasks/v4-megamath-pro-max) 7.7b_v4_3.5t_tokenizer_v3.1 80B +``` + +> [!CAUTION] +> `/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/environment2/src/Megatron-LM/tools/checkpoint/loader_mcore.py`の先頭に以下のコードを加えた +> ``` +> import json, os, sys, torch, functools +> torch.load = functools.partial(torch.load, weights_only=False) +> ``` + +## Model soup + +[arcee-ai/mergekit](https://github.com/arcee-ai/mergekit) を利用して、モデルのマージを行う + +モデルマージ用の環境は `$EXP_DIR/venv-mergekit` に用意した + +```sh +source $EXP_DIR/venv-mergekit/bin/activate + +# 初回にmergekitをインストール +pip install mergekit +``` + +`./merge/` 配下にマージの設定ファイルを配置している + +merge実行コマンド + +```sh +mergekit-yaml merge/your_config.yaml model/output/path/ +``` diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh new file mode 100644 index 00000000..ec6a287a --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# LLM-jp v4 model converter (PBS version) +# Usage: +# bash convert_latest.sh \ +# /path/to/task \ ... TASK_DIR: path to the model to save +# v3-13b \ ... 
PARAM_NAME: model config; corresponding file in `params/` should exist + +set -eu -o pipefail + +task_dir=$1; shift +param_name=$1; shift +dataset_size=$1; shift # 80B +iter=$(cat ${task_dir}/${param_name}/${dataset_size}/checkpoints/latest_checkpointed_iteration.txt) # latest saved iteration of this run + +script_root=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer + +qsub \ + -v TASK_DIR=${task_dir},PARAM_NAME=${param_name},DATASET_SIZE=${dataset_size},ITER=${iter},RTYPE=rt_HF \ + -m n \ + -o /dev/null \ + -e /dev/null \ + ${script_root}/convert/qsub_convert.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/qsub_convert.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/qsub_convert.sh new file mode 100644 index 00000000..9b2bdccf --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/qsub_convert.sh @@ -0,0 +1,153 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -N 0193_convert +#PBS -l select=1 +#PBS -o /dev/null +#PBS -e /dev/null +#PBS -m n + +cd $PBS_O_WORKDIR + +JOBID=${PBS_JOBID%%.*} +mkdir -p ${TASK_DIR}/logs +LOGFILE=${TASK_DIR}/logs/convert-$JOBID.out +ERRFILE=${TASK_DIR}/logs/convert-$JOBID.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +# Arguments +EXPERIMENT_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction +SCRIPT_DIR=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain +ENV_DIR=${EXPERIMENT_DIR}/environment3 +echo "EXPERIMENT_DIR=${EXPERIMENT_DIR}" +echo "SCRIPT_DIR=${SCRIPT_DIR}" +echo "TASK_DIR=${TASK_DIR}" +echo "PARAM_NAME=${PARAM_NAME}" +echo "DATASET_SIZE=${DATASET_SIZE}" +echo "ITER=${ITER}" + +# Setup environment +source ${SCRIPT_DIR}/common/setup.sh + +export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | hostname -f) +export MASTER_PORT=$((10000 + RANDOM % 1000)) +echo "hostname: ${MASTER_ADDR}" + +ITER_NAME=iter_$(printf %07d ${ITER}) # iter_0123456 + 
+MEGATRON_PATH=${ENV_DIR}/src/Megatron-LM +TOKENIZER_MODEL_PATH=${ENV_DIR}/src/llm-jp-tokenizer/hf/ver3.1/llm-jp-tokenizer-100k.ver3.1 # TODO +OUTPUT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints_hf/${ITER_NAME} +echo "OUTPUT_DIR=${OUTPUT_DIR}" + +# Setup working directory +TEMP_DIR=$(mktemp -d "${HOME}/converter_${JOBID}_XXXXXX") +echo "TEMP_DIR=${TEMP_DIR}" +function rm_tempdir { + if [ -e ${TEMP_DIR} ]; then + echo "Removing remporary directory: ${TEMP_DIR}" + rm -rf ${TEMP_DIR} + echo "Done removing" + fi +} +trap rm_tempdir EXIT +trap 'trap - EXIT; rm_tempdir; exit 1' INT PIPE TERM + +######## +# Step 1: Convert `torch_dist` format to `torch` +# This process requires to launch the trainer script with the same parallelism configs. +######## +echo "Start converting: torch_dist --> torch" + +# Prepare source model at specific iteration +mkdir ${TEMP_DIR}/torch_dist +echo ${ITER} > ${TEMP_DIR}/torch_dist/latest_checkpointed_iteration.txt +ln -s ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints/${ITER_NAME} ${TEMP_DIR}/torch_dist/${ITER_NAME} + +# Load ALL_PARAMS +source ${SCRIPT_DIR}/params/${PARAM_NAME}.sh +# Remove wandb params +EXCLUDE_KEYS=("--wandb-entity" "--wandb-project" "--wandb-exp-name") +NEW_PARAMS=() +skip_next=0 +for param in "${ALL_PARAMS[@]}"; do + if [[ $skip_next -eq 1 ]]; then + skip_next=0 + continue + fi + for key in "${EXCLUDE_KEYS[@]}"; do + if [[ "$param" == "$key" ]]; then + skip_next=1 + continue 2 + fi + done + NEW_PARAMS+=("$param") +done +ALL_PARAMS=("${NEW_PARAMS[@]}") + +# Add params specific to model conversion +ALL_PARAMS+=( + --load ${TEMP_DIR}/torch_dist + --ckpt-convert-format torch + --ckpt-convert-save ${TEMP_DIR} +) +echo "ALL_PARAMS: ${ALL_PARAMS[@]}" + +NUM_NODES=$(wc -l < $PBS_NODEFILE) +NUM_GPUS_PER_NODE=8 +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) +echo "nnodes: ${NUM_NODES}; ngpus: ${NUM_GPUS}" +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + 
+export NVTE_FUSED_ATTN=0 +# Launch trainer script to convert the checkpoint +mpirun \ + --display-allocation \ + --report-bindings \ + --oversubscribe \ + -np ${NUM_GPUS} \ + --npernode ${NUM_GPUS_PER_NODE} \ + -bind-to none \ + -map-by slot \ + python ${MEGATRON_PATH}/pretrain_gpt.py \ + ${ALL_PARAMS[@]} + +#echo "Files created by the Step 1:" +find ${TEMP_DIR}/torch | sort + +######## +# Step 2: Convert `torch` to `Hugging Face Llama2` +######## + +echo "Start converting: torch --> hf" + +python ${MEGATRON_PATH}/tools/checkpoint/convert.py \ + --model-type GPT \ + --loader mcore \ + --saver llmjp4_hf \ + --load-dir ${TEMP_DIR}/torch \ + --save-dir ${OUTPUT_DIR} \ + --hf-tokenizer-path ${TOKENIZER_MODEL_PATH} \ + --save-dtype bfloat16 \ + --loader-transformer-impl transformer_engine \ + --megatron-path ${MEGATRON_PATH} + +echo "Files created by the Step 2:" +find ${OUTPUT_DIR} | sort + +######## +# Step 3: Replace tokenizer model +######## + +echo "Start replacing tokenizer" + +cp ${TOKENIZER_MODEL_PATH}/* ${OUTPUT_DIR} + +echo "Final model files:" +find ${OUTPUT_DIR} | sort + +echo "Done processing" diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md new file mode 100644 index 00000000..7432cd02 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md @@ -0,0 +1,18 @@ +# 中間学習memo + +- 学習にあたり、READMEに書くほど整理されていないが書き残しておきたいことを書く場所 +- 内容が固まってきたら適宜README.mdに移行する + +## 学習の手順 + +1. latest_checkpointed_iteration.txtの追加 +2. train_data.all.shに学習data pathを記入 + - `ls -al /path/to/data.bin`の値を4で割れば良い +3. data pathの値を合計して、midtrain/paramsに学習iter値を書き込む +4. 
train_iters.txtに全体の学習量を書く + +## Tokenizerのコピー + +- `/groups/gcg51557/experiments/0138_corpus_v4_pretrain/src/llm-jp-tokenizer/hf/ver3.1` +- `/groups/gcg51557/experiments/0138_corpus_v4_pretrain/src/llm-jp-tokenizer/models/ver3.1` +の内容をコピーする diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/common/setup.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/common/setup.sh new file mode 100644 index 00000000..b2c74f37 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/common/setup.sh @@ -0,0 +1,30 @@ +# Script for setup trainer environment. + +source /etc/profile.d/modules.sh +# module load cuda/12.1/12.1.1 +module load cuda/12.4/12.4.1 +module load cudnn/9.5/9.5.1 +module load hpcx/2.20 +# module load nccl/2.23/2.23.4-1 +module load nccl/2.25/2.25.1-1 +# echo $(module list) +loaded=$(module -t list 2>&1) +echo "-----" +echo "Modules: $loaded" +echo "-----" + +ENV_DIR=${EXPERIMENT_DIR}/environment3 + +source ${ENV_DIR}/venv/bin/activate +# source ${ENV_DIR}/scripts/environment.sh # ADD + +## Debug/logging flags +export LOGLEVEL=INFO +# export NCCL_DEBUG=WARN +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh new file mode 100644 index 00000000..79f6828a --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh @@ -0,0 +1,156 @@ +# Pretraining hyperparameters for v4 7.7B. 
+# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.1/llm-jp-tokenizer-100k.ver3.1.model # TODO +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 432,581 +# 80B: ceil( 83,527,699,000/ 8192 / 1024) == 9958 +# 80B sum: 432,581 + 9958 = 442,539 +# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 +# 50B sum: 432,581 + 6,652 = 439,233 +MIDTRAIN_START=432581 +TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) # iterations after MIDTRAIN_START, i.e. total minus the pretraining offset + +# Scheduler +# (--lr equals --min-lr and warmup is 0 iterations) +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + # --lr-decay-iters ${TRAIN_ITERS} + --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 2 + --global-batch-size 
1024 +) + +# Parallelism +ALL_PARAMS+=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 2 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. + --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 80B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + + TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints # already includes PARAM_NAME/DATASET_SIZE +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e ${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt ]; then + # Continue existing training (marker file found directly under the checkpoint directory) + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 1000 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency. 
+export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0193_midtrain + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh new file mode 100644 index 00000000..2032f351 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh @@ -0,0 +1,61 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -N 0193_midtrain-megamath +#PBS -l select=16 +#PBS -l walltime=240:00:00 +#PBS -m n + +cd $PBS_O_WORKDIR + +JOBID=${PBS_JOBID%%.*} +mkdir -p ${TASK_DIR}/logs +LOGFILE=${TASK_DIR}/logs/train-${JOBID}.out +ERRFILE=${TASK_DIR}/logs/train-${JOBID}.err +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +EXPERIMENT_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction +SCRIPT_DIR=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain +ENV_DIR=${EXPERIMENT_DIR}/environment3 + +# Setup environment +source ${SCRIPT_DIR}/common/setup.sh + +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | hostname -f) +export MASTER_PORT=$((10000 + RANDOM % 1000)) +echo "hostname: ${MASTER_ADDR}" + +NUM_NODES=$(wc -l < $PBS_NODEFILE) +NUM_GPUS_PER_NODE=8 +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) +echo "nnodes: ${NUM_NODES}; ngpus: ${NUM_GPUS}" +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +cat $PBS_NODEFILE + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 80B +echo "TRAIN_DATA_PATH: ${TRAIN_DATA_PATH}" + +# Load ALL_PARAMS +source ${SCRIPT_DIR}/params/${PARAM_NAME}.sh +echo "ALL_PARAMS: ${ALL_PARAMS[@]}" + +export NVTE_FUSED_ATTN=0 + +mpirun \ + --display-allocation \ + --report-bindings \ + --oversubscribe \ + -np $NUM_GPUS \ + --npernode 
$NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + ${ALL_PARAMS[@]} diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh new file mode 100644 index 00000000..08caa8ef --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 4 ]; then + >&2 echo "Usage: $0 TASK_DIR PARAM_NAME DATASET_SIZE NUM_NODES" + >&2 echo "Example: $0 v4-high-quality v3-13b 80B 32" + exit 1 +fi + +task_dir=$1; shift +param_name=$1; shift +dataset_size=$1; shift # 80B +num_nodes=$1; shift + +script_root=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer + +qsub -l select=${num_nodes} \ + -v TASK_DIR=${task_dir},PARAM_NAME=${param_name},DATASET_SIZE=${dataset_size},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + ${script_root}/midtrain/qsub_train.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train_with_deps.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train_with_deps.sh new file mode 100644 index 00000000..2cf3a569 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train_with_deps.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +set -eu -o pipefail + +if [ $# -ne 5 ]; then + >&2 echo "Usage: $0 TASK_DIR PARAM_NAME DATASET_SIZE NUM_NODES JOB_DEPENDENCY" + >&2 echo "Example: $0 v4-high-quality v3-13b 80B 32 afterok:xxxx.pbs1" + exit 1 +fi + +task_dir=$1; shift +param_name=$1; shift +dataset_size=$1; shift # 80B +num_nodes=$1; shift + +# qsub -W depend="$job_dependency" ... 
+# See `man qsub` +job_dependency=$1; shift + +script_root=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer + +qsub -l select=${num_nodes} \ + -v TASK_DIR=${task_dir},PARAM_NAME=${param_name},DATASET_SIZE=${dataset_size},RTYPE=rt_HF \ + -o /dev/null -e /dev/null \ + -m n \ + -W depend="$job_dependency" \ + ${script_root}/midtrain/qsub_train.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh new file mode 100755 index 00000000..9aaf99b9 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ROOT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-tokenized" +OUT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-dolmino-mix-1124" +OUT_FILE="${OUT_DIR}/train_data.all.sh" + +mkdir -p "${OUT_DIR}" + +{ + echo "# Auto-generated: $(date '+%F %T')" + echo "export TRAIN_DATA_PATH=(" + + find "${ROOT_DIR}" -type f -name '*_text_document.bin' | sort | while read -r BIN; do + BYTES=$(stat -c%s "${BIN}") + TOKENS=$(( BYTES / 4 )) # token count is taken as file size in bytes divided by 4 + PREFIX="${BIN%.bin}" + printf " %s %s\n" "${TOKENS}" "${PREFIX}" + done + + echo ")" +} > "${OUT_FILE}" + +echo "Generated ${OUT_FILE}" diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/extract.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/extract.sh new file mode 100755 index 00000000..471a8150 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/extract.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +set -eu -o pipefail + +DATA_ROOT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124/data" 
+OUTPUT_ROOT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-extracted" + +mkdir -p "$OUTPUT_ROOT" + +extract_zstd() { + local input_file="$1" + local output_file="${input_file%.zst}" + output_file="${output_file/$DATA_ROOT/$OUTPUT_ROOT}" + mkdir -p "$(dirname "$output_file")" + echo zstd -d "$input_file" -o "$output_file" + zstd -f -d "$input_file" -o "$output_file" +} + +extract_gzip() { + local input_file="$1" + local output_file="${input_file%.gz}" + output_file="${output_file/$DATA_ROOT/$OUTPUT_ROOT}" + mkdir -p "$(dirname "$output_file")" + echo gunzip -c "$input_file" \> "$output_file" + gunzip -c "$input_file" > "$output_file" +} + +copy_only() { + local input_file="$1" + local output_file="${input_file}" + output_file="${output_file/$DATA_ROOT/$OUTPUT_ROOT}" + mkdir -p "$(dirname "$output_file")" + echo cp "$input_file" "$output_file" + cp "$input_file" "$output_file" +} + +# DCLM +for file in $(find "$DATA_ROOT/dclm" -name "*.json.zst" -type f); do + extract_zstd "$file" +done + +# flan +for file in $(find "$DATA_ROOT/flan" -name "*.json.gz" -type f); do + extract_gzip "$file" +done + +# pes2o +for file in $(find "$DATA_ROOT/pes2o" -name "*.json.gz" -type f); do + extract_gzip "$file" +done + +# stackexchange +for file in $(find "$DATA_ROOT/stackexchange" -name "*.json.gz" -type f); do + extract_gzip "$file" +done + +# wiki +for file in $(find "$DATA_ROOT/wiki" -name "*.json.gz" -type f); do + extract_gzip "$file" +done + +# math +## codesearchnet-owmfilter +for file in $(find "$DATA_ROOT/math/codesearchnet-owmfilter" -name "*.jsonl.gz" -type f); do + extract_gzip "$file" +done + +## gsm8k (train only) +for file in $(find $DATA_ROOT/math/gsm8k/**/train -name "*.jsonl.zst" -type f); do + extract_zstd "$file" +done + +## metamath-owmfilter +for file in $(find "$DATA_ROOT/math/metamath-owmfilter" -name "*.jsonl.gz" -type f); do + extract_gzip "$file" +done + +## tulu_math +for file in $(find "$DATA_ROOT/math/tulu_math" -name 
"*.jsonl" -type f); do + copy_only "$file" +done + +## dolmino_math_synth +for file in $(find "$DATA_ROOT/math/dolmino_math_synth" -name "*.jsonl" -type f); do + copy_only "$file" +done + +## mathcoder2-synthmath +for file in $(find "$DATA_ROOT/math/mathcoder2-synthmath" -name "*.jsonl" -type f); do + copy_only "$file" +done + +## tinyGSM-MIND +for file in $(find "$DATA_ROOT/math/tinyGSM-MIND" -name "*.jsonl.gz" -type f); do + extract_gzip "$file" +done + diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/merge_files.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/merge_files.sh new file mode 100755 index 00000000..d56d6c04 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/merge_files.sh @@ -0,0 +1,139 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -N 0156_preprocess_merge_files +#PBS -l select=1 +#PBS -o /dev/null +#PBS -e /dev/null +#PBS -m n +#PBS -v RTYPE=rt_HC + +set -eu -o pipefail +shopt -s globstar +shopt -s nullglob +shopt -s failglob + +EXP_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction" +DATA_ROOT="${EXP_DIR}/dolmino-mix-1124-extracted" +OUTPUT_ROOT="${EXP_DIR}/dolmino-mix-1124-extracted-merged" + +JOBID=${PBS_JOBID:-shell} +JOBID=${JOBID%%.*} +LOG_DIR="${EXP_DIR}/task/logs" +mkdir -p "$LOG_DIR" +exec > "$LOG_DIR/merge_files-$JOBID.log" 2>&1 + +min() { + a="$1" + b="$2" + if [ "$a" -lt "$b" ]; then + echo "$a" + else + echo "$b" + fi +} + +# Workaround for `codesearchnet-owmfilter` and `dolmino_math_synth` +merge_jsonl_nl() { + for f in "$@"; do + cat "$f" + # If the file is not empty and does not have new line at the end, add a new line. $(...) strips a trailing newline, so a non-empty result means the last byte is not a newline. + if [ -s "$f" ] && [ -n "$(tail -c1 "$f")" ]; then printf '\n'; fi + done +} + +# DCLM +## 0000 - 0009, 0010 - 0019, ..., 0240 - 0246 +DCLM_DIR="$DATA_ROOT/dclm" +max_num=246 +increment=10 +for i in $(seq 0 $increment $max_num); do + # cat "$DCLM_DIR/0000/*.json" "$DCLM_DIR/0001/*.json" ... 
"$DCLM_DIR/0009/*.json" > "$OUTPUT_ROOT/dclm/dclm-0000-0009.jsonl" + start=$i + end=$(min $(($i + $increment - 1)) $max_num) + echo "Merging DCLM files from $start to $end" + dir_list=$(seq -f "${DCLM_DIR}/%04g" -s " " $start $end) + concat_files=$(find $dir_list -name "*.json" | sort) + output_file="$OUTPUT_ROOT/dclm/dclm-$(printf '%04d' $start)-$(printf '%04d' $end).jsonl" + mkdir -p "$(dirname "$output_file")" + cat $concat_files > $output_file + echo "Output file: $output_file" +done + +# flan +echo "Merging FLAN files" +output_flan="$OUTPUT_ROOT/flan/flan-all.jsonl" +mkdir -p "$(dirname "$output_flan")" +cat $DATA_ROOT/flan/*.json > "$output_flan" +echo "Output file: $output_flan" + +# pes2o +echo "Merging PES2O files" +output_pes2o="$OUTPUT_ROOT/pes2o/pes2o-all.jsonl" +mkdir -p "$(dirname "$output_pes2o")" +cat $DATA_ROOT/pes2o/*.json > "$output_pes2o" +echo "Output file: $output_pes2o" + +# stackexchange +echo "Merging StackExchange files" +output_stackexchange="$OUTPUT_ROOT/stackexchange/stackexchange-all.jsonl" +mkdir -p "$(dirname "$output_stackexchange")" +cat $DATA_ROOT/stackexchange/*.json > "$output_stackexchange" +echo "Output file: $output_stackexchange" + +# wiki +echo "Merging Wiki files" +output_wiki="$OUTPUT_ROOT/wiki/wiki-all.jsonl" +mkdir -p "$(dirname "$output_wiki")" +cat $DATA_ROOT/wiki/*.json > "$output_wiki" +echo "Output file: $output_wiki" + +# math +## codesearchnet-owmfilter +echo "Merging codesearchnet-owmfilter files" +output_codesearchnet="$OUTPUT_ROOT/math/codesearchnet-owmfilter-all.jsonl" +mkdir -p "$(dirname "$output_codesearchnet")" +merge_jsonl_nl $DATA_ROOT/math/codesearchnet-owmfilter/**/*.jsonl > "$output_codesearchnet" +echo "Output file: $output_codesearchnet" + +## gsm8k +echo "Merging gsm8k files" +output_gsm8k="$OUTPUT_ROOT/math/gsm8k-all.jsonl" +mkdir -p "$(dirname "$output_gsm8k")" +merge_jsonl_nl $DATA_ROOT/math/gsm8k/**/*.jsonl > "$output_gsm8k" +echo "Output file: $output_gsm8k" + +## metamath-owmfilter +echo 
"Merging metamath-owmfilter files" +output_metamath="$OUTPUT_ROOT/math/metamath-owmfilter-all.jsonl" +mkdir -p "$(dirname "$output_metamath")" +merge_jsonl_nl $DATA_ROOT/math/metamath-owmfilter/**/*.jsonl > "$output_metamath" +echo "Output file: $output_metamath" + +## tulu_math +echo "Merging tulu_math files" +output_tulu_math="$OUTPUT_ROOT/math/tulu_math-all.jsonl" +mkdir -p "$(dirname "$output_tulu_math")" +merge_jsonl_nl $DATA_ROOT/math/tulu_math/**/*.jsonl > "$output_tulu_math" +echo "Output file: $output_tulu_math" + +## dolmino_math_synth +echo "Merging dolmino_math_synth files" +output_dolmino_math_synth="$OUTPUT_ROOT/math/dolmino_math_synth-all.jsonl" +mkdir -p "$(dirname "$output_dolmino_math_synth")" +merge_jsonl_nl $DATA_ROOT/math/dolmino_math_synth/**/*.jsonl > "$output_dolmino_math_synth" +echo "Output file: $output_dolmino_math_synth" + +## mathcoder2-synthmath +echo "Merging mathcoder2-synthmath files" +output_mathcoder2="$OUTPUT_ROOT/math/mathcoder2-synthmath-all.jsonl" +mkdir -p "$(dirname "$output_mathcoder2")" +merge_jsonl_nl $DATA_ROOT/math/mathcoder2-synthmath/**/*.jsonl > "$output_mathcoder2" +echo "Output file: $output_mathcoder2" + +## tinyGSM-MIND +echo "Merging tinyGSM-MIND files" +output_tinygsm_mind="$OUTPUT_ROOT/math/tinyGSM-MIND-all.jsonl" +mkdir -p "$(dirname "$output_tinygsm_mind")" +merge_jsonl_nl $DATA_ROOT/math/tinyGSM-MIND/**/*.jsonl > "$output_tinygsm_mind" +echo "Output file: $output_tinygsm_mind" diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh new file mode 100755 index 00000000..4902f504 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh @@ -0,0 +1,72 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -l walltime=240:00:00 +#PBS -N 0193_tokenize +#PBS -l select=1 +#PBS -o /dev/null +#PBS -e /dev/null +#PBS -m n +#PBS -v RTYPE=rt_HF + +cd $PBS_O_WORKDIR + 
+EXPERIMENT_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction +SCRIPT_DIR=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer +ENV_DIR=${EXPERIMENT_DIR}/environment3 +MEGATRON_PATH=${ENV_DIR}/src/Megatron-LM + +JOBID=${PBS_JOBID%%.*} +TASK_DIR="$EXPERIMENT_DIR/task" +TOKENIZE_LOG_DIR="${TASK_DIR}/logs/tokenize-$JOBID/" +mkdir -p ${TOKENIZE_LOG_DIR} +LOGFILE=${TOKENIZE_LOG_DIR}/stdout.log +ERRFILE=${TOKENIZE_LOG_DIR}/stderr.log +exec > $LOGFILE 2> $ERRFILE + +set -eu -o pipefail + +# Arguments +echo "EXPERIMENT_DIR=${EXPERIMENT_DIR}" +echo "SCRIPT_DIR=${SCRIPT_DIR}" + +# Load environments +source ${ENV_DIR}/venv/bin/activate +source ${ENV_DIR}/scripts/environment.sh + +# Tokenizer config +export TOKENIZER_MODEL="${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.1/llm-jp-tokenizer-100k.ver3.1.model" # TODO +export TOKENIZER_TYPE=Llama2Tokenizer + +export WORKERS_PER_PROC=16 +N_PROCS=$(($(nproc) / $WORKERS_PER_PROC)) + +export DATA_DIR=${EXPERIMENT_DIR}/dolmino-mix-1124-extracted-merged +export OUTPUT_DIR=${EXPERIMENT_DIR}/dolmino-mix-1124-tokenized +mkdir -p ${OUTPUT_DIR} +export MEGATRON_PATH +export TOKENIZE_LOG_DIR + +# Tokenize +find ${DATA_DIR} -name "*.jsonl" -print0 | \ + sort -z | \ + xargs -0 -P${N_PROCS} -I "{}" bash -c ' + file="{}" + echo "Tokenizing ${file}" + relative_path="${file#${DATA_DIR}/}" + output_path="${OUTPUT_DIR}/${relative_path}" + tokenize_log_file="${TOKENIZE_LOG_DIR}/${relative_path}.log" + mkdir -p "$(dirname "$output_path")" + mkdir -p "$(dirname "$tokenize_log_file")" + + python $MEGATRON_PATH/tools/preprocess_data.py \ + --input "$file" \ + --output-prefix "${output_path%.jsonl}" \ + --tokenizer-model "$TOKENIZER_MODEL" \ + --tokenizer-type "$TOKENIZER_TYPE" \ + --workers "$WORKERS_PER_PROC" \ + --append-eod > "$tokenize_log_file" 2>&1 + + echo "Tokenization completed for ${file}" + ' + diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/.gitignore 
b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/.gitignore new file mode 100644 index 00000000..0d5b44a8 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/.gitignore @@ -0,0 +1,6 @@ +cache/ +checkpoints/ +checkpoints_hf/ +logs/ +checkpoints_bak/ +train_iters.txt diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh new file mode 100644 index 00000000..8c606b1c --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh @@ -0,0 +1,3 @@ +export TRAIN_DATA_PATH=( + 83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +) From 828b7c10c5f5b0665bdda8be2476b313bc082a1c Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Sun, 3 Aug 2025 23:12:59 +0900 Subject: [PATCH 3/9] Fix param script --- .../memo.md | 40 +++++++++++++++++++ .../params/7.7b_v4_3.5t_tokenizer_v3.1.sh | 14 +++---- .../preprocess/tokenize.sh | 2 +- .../v4-megamath-pro-max/train_data.all.sh | 1 + .../v4-megamath-pro-max/train_data_80B.sh | 3 ++ 5 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md index 7432cd02..c21f21bb 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md @@ -16,3 +16,43 @@ - `/groups/gcg51557/experiments/0138_corpus_v4_pretrain/src/llm-jp-tokenizer/hf/ver3.1` - `/groups/gcg51557/experiments/0138_corpus_v4_pretrain/src/llm-jp-tokenizer/models/ver3.1` の内容をコピーする + +## checkpoint読み込めねえ + 
+`/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/environment3/src/Megatron-LM/read_metadata.py`を作成 +```py +# read_ckpt_metadata.py +import os +from megatron.core import dist_checkpointing + +def read_parallel_sizes(ckpt_dir: str): + cs = dist_checkpointing.load_common_state_dict(ckpt_dir) + # まずどのキーに入っているか確認 + if "metadata" in cs: + meta = cs["metadata"] + elif "args" in cs: + meta = cs["args"] + elif "megatron_args" in cs: + meta = cs["megatron_args"] + else: + raise KeyError(f"No metadata-like key in common_state: {list(cs.keys())}") + + print(f"Checkpoint dir : {ckpt_dir}") + print(f"Tensor parallel size : {meta.tensor_model_parallel_size}") + print(f"Pipeline parallel size: {meta.pipeline_model_parallel_size}") + print(f"Data parallel size : {meta.data_parallel_size}") + +if __name__ == "__main__": + # チェックポイントの親ディレクトリを指定 + # 例: /…/iter_0432581 + ckpt_dir = "/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/checkpoints_bak/7.7b_v4_3.5t_tokenizer_v3.1/iter_0432581/" + if not os.path.isdir(ckpt_dir): + raise FileNotFoundError(f"{ckpt_dir} が見つかりません") + read_parallel_sizes(ckpt_dir) +``` + +```stdout +Tensor parallel size : 1 +Pipeline parallel size: 1 +Data parallel size : 512 +``` diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh index 79f6828a..7ad9fc15 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh @@ -43,16 +43,14 @@ ALL_PARAMS+=( --attention-dropout 0.0 --hidden-dropout 0.0 --override-opt_param-scheduler - # --no-load-optim + --no-load-optim ) # pretrain_iters: 432,581 # 80B: ceil( 83,527,699,000/ 8192 / 1024) == 9958 -# 80B sum: 432,581 + 
9958 = 442,539 -# 50B: ceil( 55,797,411,281 / 8192 / 1024 ) == 6652 -# 50B sum: 432,581 + 6,652 = 1,866,317 +# 80B sum: 432,581 + 9,958 = 442,539 MIDTRAIN_START=432581 -TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) +TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) # 442539 MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) # Scheduler @@ -73,14 +71,16 @@ ALL_PARAMS+=( # Batch sizes ALL_PARAMS+=( - --micro-batch-size 2 + --micro-batch-size 1 + # --global-batch-size 512 + # --micro-batch-size 2 --global-batch-size 1024 ) # Parallelism ALL_PARAMS+=( --tensor-model-parallel-size 1 - --pipeline-model-parallel-size 2 + --pipeline-model-parallel-size 1 --context-parallel-size 1 --sequence-parallel --use-distributed-optimizer diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh index 4902f504..d7aee765 100755 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh @@ -42,7 +42,7 @@ export WORKERS_PER_PROC=16 N_PROCS=$(($(nproc) / $WORKERS_PER_PROC)) export DATA_DIR=${EXPERIMENT_DIR}/dolmino-mix-1124-extracted-merged -export OUTPUT_DIR=${EXPERIMENT_DIR}/dolmino-mix-1124-tokenized +export OUTPUT_DIR=${EXPERIMENT_DIR}/dolmino-mix-1124-v3.1-tokenized mkdir -p ${OUTPUT_DIR} export MEGATRON_PATH export TOKENIZE_LOG_DIR diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh index 8c606b1c..eb86c89a 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh @@ -1,3 +1,4 @@ export TRAIN_DATA_PATH=( 83527699000 
/groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document + # TODO: Add dolmino ) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh new file mode 100644 index 00000000..8c606b1c --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh @@ -0,0 +1,3 @@ +export TRAIN_DATA_PATH=( + 83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +) From 78efa93a6ba366f90383850c396b3b5d354375c8 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Mon, 4 Aug 2025 22:45:42 +0900 Subject: [PATCH 4/9] Add dolmino with 3.1 tokenizer --- .../params/7.7b_v4_3.5t_tokenizer_v3.1.sh | 5 ++- .../preprocess/build_train_data.sh | 4 +- .../preprocess/update_train_data_to_100B.sh | 35 ++++++++++++++++ .../preprocess/update_train_data_to_300B.sh | 37 +++++++++++++++++ .../preprocess/update_train_data_to_50B.sh | 35 ++++++++++++++++ .../v4-megamath-pro-max/train_data.all.sh | 39 +++++++++++++++++- .../v4-megamath-pro-max/train_data_100B.sh | 39 ++++++++++++++++++ .../v4-megamath-pro-max/train_data_130B.sh | 40 +++++++++++++++++++ .../v4-megamath-pro-max/train_data_180B.sh | 40 +++++++++++++++++++ .../v4-megamath-pro-max/train_data_300B.sh | 39 ++++++++++++++++++ .../v4-megamath-pro-max/train_data_380B.sh | 40 +++++++++++++++++++ .../v4-megamath-pro-max/train_data_50B.sh | 39 ++++++++++++++++++ 12 files changed, 387 insertions(+), 5 deletions(-) create mode 100755 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_100B.sh create mode 100755 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_300B.sh create mode 100755 
pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_50B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh index 7ad9fc15..6acfc743 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh @@ -47,8 +47,11 @@ ALL_PARAMS+=( ) # pretrain_iters: 432,581 -# 80B: ceil( 83,527,699,000/ 8192 / 1024) == 9958 +# 80B: ceil( 83,527,699,000 / 8192 / 1024) == 9958 # 80B sum: 432,581 + 9,958 = 442,539 +# 130B: ceil((57,872,011,633+83,527,699,000)/8192/1024) == 16857 +# 130B sum: 432,581 + 16857 = 449438 +# MIDTRAIN_START=449438 MIDTRAIN_START=432581 TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) # 442539 MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh index 9aaf99b9..c90281e9 100755 --- 
a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/build_train_data.sh @@ -2,8 +2,8 @@ set -euo pipefail -ROOT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-tokenized" -OUT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-midtrainig/tasks/v4-dolmino-mix-1124" +ROOT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized" +OUT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max" OUT_FILE="${OUT_DIR}/train_data.all.sh" mkdir -p "${OUT_DIR}" diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_100B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_100B.sh new file mode 100755 index 00000000..a4dab60c --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_100B.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +update_train_data () { + local IN="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh" + local OUT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh" + + awk ' + BEGIN { + FS = OFS = " " + } + function ceil(x) { return (x == int(x) ? 
x : int(x) + 1) } + + /^[[:space:]]*[0-9]/ { + tok = $1 + path = $2 + + ratio = 1 # default 100% + if (path ~ /\/dclm\//) ratio = 0.0685 # 6.85 % + else if (path ~ /\/pes2o\//) ratio = 0.167 # 16.7 % + else if (path ~ /\/math\//) ratio = 2.0 # 200 % + # flan / stackexchange / wiki は ratio = 1 + + newtok = ceil(tok * ratio) + $1 = newtok + print + next + } + + { print } + ' "$IN" > "$OUT" + + echo "Created $OUT" +} + +update_train_data diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_300B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_300B.sh new file mode 100755 index 00000000..39f486c2 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_300B.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +update_train_data () { + local IN="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh" + local OUT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh" + + awk ' + BEGIN { + FS = OFS = " " + } + function ceil(x) { return (x == int(x) ? 
x : int(x) + 1) } + + /^[[:space:]]*[0-9]/ { + tok = $1 + path = $2 + + ratio = 1 # default 100% + if (path ~ /\/dclm\//) ratio = 0.2078 # 20.78 % + else if (path ~ /\/flan\//) ratio = 2.0 # 200 % + else if (path ~ /\/stackexchange\//) ratio = 4.0 # 400 % + else if (path ~ /\/math\//) ratio = 4.0 # 400 % + else if (path ~ /\/wiki\//) ratio = 4.0 # 400 % + # peS2o ratio = 1 + + newtok = ceil(tok * ratio) + $1 = newtok + print + next + } + + { print } + ' "$IN" > "$OUT" + + echo "Created $OUT" +} + +update_train_data diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_50B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_50B.sh new file mode 100755 index 00000000..11420ffd --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/update_train_data_to_50B.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +update_train_data () { + local IN="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh" + local OUT="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh" + + awk ' + BEGIN { + FS = OFS = " " + } + function ceil(x) { return (x == int(x) ? 
x : int(x) + 1) } + + /^[[:space:]]*[0-9]/ { + tok = $1 + path = $2 + + ratio = 1 # default 100% + if (path ~ /\/dclm\//) ratio = 0.0323 # 3.23 % + else if (path ~ /\/flan\//) ratio = 0.5 # 50 % + else if (path ~ /\/pes2o\//) ratio = 0.0515 # 5.15 % + # math / stackexchange / wiki: ratio = 1 + + newtok = ceil(tok * ratio) + $1 = newtok + print + next + } + + { print } + ' "$IN" > "$OUT" + + echo "Created $OUT" +} + +update_train_data diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh index eb86c89a..5d380fa4 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh @@ -1,4 +1,39 @@ +# Auto-generated: 2025-08-04 22:27:29 export TRAIN_DATA_PATH=( - 83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document - # TODO: Add dolmino + 34024238242 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document + 34350909111 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document + 34144149137 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document + 34168429457 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document + 34463761854 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document + 34025285070 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document + 34032832514 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document + 33958926097 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document + 34242839474 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document + 34310201068 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document + 34383227118 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document + 34327466501 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document + 33753035160 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document + 34181910065 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document + 34077214822 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document + 34272697869 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document + 34198552799 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document + 34301276299 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document + 34105355242 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document + 34134875229 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document + 34175053243 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document + 34220951267 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document + 33871464987 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document + 33794406503 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document + 21151196985 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document + 19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document + 2254531 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document + 36095523 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document + 1048444 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document + 4208362913 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document + 96121651 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document + 7585407056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document + 269992014 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document + 65556621655 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document + 1504642032 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document + 4127779989 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document ) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh new file mode 100644 index 00000000..300ff0e7 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh @@ -0,0 +1,39 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +2330660320 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +2353037275 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +2338874216 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +2340537418 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +2360767687 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +2330732028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +2331249028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +2326186438 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +2345634504 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document 
+2350248774 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +2355251058 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +2351431456 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +2312082909 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +2341460840 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +2334289216 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +2347679805 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +2342600867 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +2349637427 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +2336216835 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +2338238954 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +2340991148 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +2344135162 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +2320195352 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +2314916846 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +1448856994 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +4509062 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +72191046 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +2096888 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +8416725826 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +192243302 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +15170814112 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +539984028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +10947955817 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +1504642032 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +4127779989 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh 
b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh new file mode 100644 index 00000000..5f2defe1 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh @@ -0,0 +1,40 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +1098982896 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +1109534365 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +1102856018 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +1103640272 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +1113179508 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +1099016708 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +1099260491 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +1096873313 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +1106043716 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +1108219495 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +1110578236 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +1108777168 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +1090223036 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +1104075696 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +1100694039 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +1107008142 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +1104613256 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +1107931225 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +1101602975 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +1102556470 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +1103854220 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +1105336726 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +1094048320 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +1091559331 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +683183663 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +9510492179 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +2254531 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +36095523 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +1048444 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +4208362913 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +96121651 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +7585407056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +269992014 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +3376166016 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +1504642032 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +4127779989 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh new file mode 100644 index 00000000..357e4931 --- /dev/null +++ 
b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh @@ -0,0 +1,40 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +2330660320 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +2353037275 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +2338874216 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +2340537418 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +2360767687 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +2330732028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +2331249028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +2326186438 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +2345634504 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +2350248774 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +2355251058 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +2351431456 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +2312082909 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +2341460840 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +2334289216 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +2347679805 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +2342600867 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +2349637427 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +2336216835 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +2338238954 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +2340991148 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +2344135162 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +2320195352 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +2314916846 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +1448856994 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +4509062 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +72191046 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +2096888 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +8416725826 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +192243302 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +15170814112 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +539984028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +10947955817 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +1504642032 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +4127779989 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh new file mode 100644 index 00000000..f845a1d2 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh @@ -0,0 +1,39 @@ +# Auto-generated: 2025-08-04 22:27:29 
+export TRAIN_DATA_PATH=( +7070236707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +7138118914 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +7095154191 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +7100199642 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +7161569714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +7070454238 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +7072022597 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +7056664843 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +7115662043 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +7129659782 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +7144834596 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +7133247539 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +7013880707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +7103000912 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +7081245241 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +7121866618 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +7106459272 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +7127805215 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +7087092820 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +7093227073 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +7101576064 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +7111113674 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +7038490425 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +7022477672 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +4395218734 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +38041968714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +9018124 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +144382092 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +4193776 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +16833451652 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +384486604 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +30341628224 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +1079968056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +65556621655 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +6018568128 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +16511119956 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh new file mode 100644 index 00000000..7a5147c1 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh @@ -0,0 +1,40 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +7070236707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +7138118914 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +7095154191 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +7100199642 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +7161569714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +7070454238 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +7072022597 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +7056664843 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +7115662043 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +7129659782 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +7144834596 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +7133247539 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +7013880707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +7103000912 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +7081245241 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +7121866618 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +7106459272 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +7127805215 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +7087092820 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +7093227073 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +7101576064 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +7111113674 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +7038490425 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +7022477672 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +4395218734 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +38041968714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +9018124 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +144382092 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +4193776 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +16833451652 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +384486604 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +30341628224 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +1079968056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +65556621655 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +6018568128 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +16511119956 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh new file mode 100644 index 00000000..3d34c88d --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh @@ -0,0 +1,39 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +1098982896 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +1109534365 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +1102856018 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +1103640272 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document 
+1113179508 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +1099016708 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +1099260491 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +1096873313 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +1106043716 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +1108219495 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +1110578236 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +1108777168 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +1090223036 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +1104075696 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +1100694039 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +1107008142 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +1104613256 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +1107931225 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +1101602975 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +1102556470 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +1103854220 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +1105336726 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +1094048320 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +1091559331 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +683183663 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +9510492179 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +2254531 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +36095523 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +1048444 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +4208362913 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +96121651 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +7585407056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +269992014 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +3376166016 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +1504642032 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +4127779989 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +) From 12726b853900b1c8ec86996bc97fed1774253ad0 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Tue, 5 Aug 2025 13:23:35 +0900 Subject: [PATCH 5/9] Add gsm8k tokenize --- .../README.md | 6 +- .../convert/convert_latest.sh | 2 +- .../memo.md | 10 +++ .../midtrain/run_train.sh | 2 +- .../preprocess/gsm8k_tokenize.sh | 77 +++++++++++++++++++ 5 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/gsm8k_tokenize.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md index bc7e838c..c3b6e8b7 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/README.md @@ -8,7 +8,7 @@ MegaMathPro-Maxを含めた実験 ```bash export EXP_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/" -export EXP_SCRIPT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining" +export EXP_SCRIPT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer" cd $EXP_DIR # 1. 
Huggingfaceからdolmino-mix-1124をダウンロード @@ -67,7 +67,7 @@ bash run_setup.sh /path/to/target_dir ## job実行 ```sh -cd /path/to/v4-midtraining +cd /path/to/v4-midtraining-with-v3.1-tokenizer # example: # 1.3b-llama3-ecjk @@ -92,7 +92,7 @@ bash midtrain/run_train.sh $(realpath tasks/v4-megamath-pro-max) 7.7b_v4_3.5t_to > 下のスクリプトを実行する前に、`scripts/pretrain/scripts/v4-midtraining/midtrain/params`の`--no-load-optim`を外してください。 ```sh -cd /path/to/v4-midtraining +cd /path/to/v4-midtraining-with-v3.1-tokenizer bash convert/convert_latest.sh {TASK_DIR} {PARAM_NAME} {DATASET_SIZE} diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh index ec6a287a..339a4b93 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/convert/convert_latest.sh @@ -13,7 +13,7 @@ param_name=$1; shift dataset_size=$1; shift # 80B iter=$(cat ${task_dir}/${param_name}/${dataset_size}/checkpoints/latest_checkpointed_iteration.txt) -script_root=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokzenir +script_root=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer qsub \ -v TASK_DIR=${task_dir},PARAM_NAME=${param_name},DATASET_SIZE=${dataset_size},ITER=${iter},RTYPE=rt_HF \ diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md index c21f21bb..d953ecaa 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md @@ -56,3 +56,13 @@ Tensor parallel size : 1 Pipeline parallel size: 1 Data parallel size : 512 ``` + + +## GSM8Kのファイルに空行が混じっていたので、アドホックなスクリプトを作って修正 + +- script:
`/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/gsm8k_tokenize.sh` +- directory: `/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-tmp/math/gsm8k-all_clean.jsonl` + +```sh +qsub ./scripts/gsm8k_tokenize.sh +``` diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh index 08caa8ef..8f3e1402 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/run_train.sh @@ -4,7 +4,7 @@ set -eu -o pipefail if [ $# -ne 4 ]; then >&2 echo "Usage: $0 " - >&2 echo "Example: $0 v4-high-quality v3-13b 32" + >&2 echo "Example: $0 v4-high-quality v3-13b 50B 32" exit 1 fi diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/gsm8k_tokenize.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/gsm8k_tokenize.sh new file mode 100644 index 00000000..c2cc559b --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/gsm8k_tokenize.sh @@ -0,0 +1,77 @@ +#!/bin/bash +#PBS -P gcg51557 +#PBS -q R9920251000 +#PBS -l walltime=240:00:00 +#PBS -N 0193_tokenize_gsm8k_clean +#PBS -l select=1 +#PBS -o /dev/null +#PBS -e /dev/null +#PBS -m n +#PBS -v RTYPE=rt_HF + +# --- 初期設定 --- +cd $PBS_O_WORKDIR + +# 実験ディレクトリなどの設定 (元のスクリプトから流用) +EXPERIMENT_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction +ENV_DIR=${EXPERIMENT_DIR}/environment3 +MEGATRON_PATH=${ENV_DIR}/src/Megatron-LM + +# ログディレクトリの設定 +JOBID=${PBS_JOBID%%.*} +TASK_DIR="$EXPERIMENT_DIR/task" +TOKENIZE_LOG_DIR="${TASK_DIR}/logs/tokenize-$JOBID/" +mkdir -p ${TOKENIZE_LOG_DIR} +LOGFILE=${TOKENIZE_LOG_DIR}/stdout.log +ERRFILE=${TOKENIZE_LOG_DIR}/stderr.log +exec > $LOGFILE 2> $ERRFILE # スクリプト全体の出力をログファイルにリダイレクト + +set -eu -o pipefail # エラー時にスクリプトを終了 + +# --- 環境のロード --- 
+echo "Loading environments..." +source ${ENV_DIR}/venv/bin/activate +source ${ENV_DIR}/scripts/environment.sh +echo "Environments loaded." + +# --- Tokenizer settings --- +export TOKENIZER_MODEL="${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.1/llm-jp-tokenizer-100k.ver3.1.model" +export TOKENIZER_TYPE=Llama2Tokenizer + +# --- [Change] Specify input/output paths directly --- +# Single input file to process +INPUT_FILE="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-tmp/math/gsm8k-all_clean.jsonl" + +# Output directory +OUTPUT_DIR="/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math" + +# Output filename prefix (input name without the .jsonl extension) +OUTPUT_FILENAME="gsm8k-all_clean" +OUTPUT_PREFIX="${OUTPUT_DIR}/${OUTPUT_FILENAME}" + +# Create the output directory if it does not exist +mkdir -p ${OUTPUT_DIR} + +echo "--- Configuration ---" +echo "INPUT_FILE: ${INPUT_FILE}" +echo "OUTPUT_PREFIX: ${OUTPUT_PREFIX}" +echo "TOKENIZER_MODEL: ${TOKENIZER_MODEL}" +echo "TOKENIZER_TYPE: ${TOKENIZER_TYPE}" +echo "MEGATRON_PATH: ${MEGATRON_PATH}" +echo "---------------------" + +# --- [Change] Simplified tokenization --- +# Replaces the find/xargs loop with a single command + +echo "Starting tokenization for ${INPUT_FILE}..." + +python $MEGATRON_PATH/tools/preprocess_data.py \ + --input "${INPUT_FILE}" \ + --output-prefix "${OUTPUT_PREFIX}" \ + --tokenizer-model "${TOKENIZER_MODEL}" \ + --tokenizer-type "${TOKENIZER_TYPE}" \ + --workers "$(nproc)" \ + --append-eod + +echo "Tokenization completed successfully."
+echo "Output files have been saved as ${OUTPUT_PREFIX}_text_document.bin and ${OUTPUT_PREFIX}_text_document.idx" From bbedf4a73b89934c72a4d4b3cd27375996793610 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Tue, 5 Aug 2025 13:44:52 +0900 Subject: [PATCH 6/9] Update memo.md --- pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md index d953ecaa..927c4423 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md @@ -10,6 +10,12 @@ - `ls -al /path/to/data.bin`の値を4で割れば良い 3. data pathの値を合計して、midtrain/paramsに学習iter値を書き込む 4. train_iters.txtに全体の学習量を書く +5. checkpointのシンボリックリンクを貼る +```sh +export TARGET_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/7.7b_v4_3.5t_tokenizer_v3.1/80B/checkpoints/iter_0432581 +export SRC_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/checkpoints_bak/7.7b_v4_3.5t_tokenizer_v3.1/iter_0432581 +ln -s $SRC_DIR $TARGET_DIR +``` ## Tokenizerのコピー From 3660a89eadd9ec8683624cc29582961054aa4a79 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Tue, 5 Aug 2025 22:19:28 +0900 Subject: [PATCH 7/9] Update train data scripts --- .../v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh | 1 + .../tasks/v4-megamath-pro-max/train_data.all.sh | 4 ++-- .../tasks/v4-megamath-pro-max/train_data_100B.sh | 4 ++-- .../tasks/v4-megamath-pro-max/train_data_130B.sh | 2 +- .../tasks/v4-megamath-pro-max/train_data_180B.sh | 2 +- .../tasks/v4-megamath-pro-max/train_data_300B.sh | 4 ++-- .../tasks/v4-megamath-pro-max/train_data_380B.sh | 2 +- .../tasks/v4-megamath-pro-max/train_data_50B.sh | 4 ++-- .../tasks/v4-megamath-pro-max/train_data_80B.sh | 2 +- 9
files changed, 13 insertions(+), 12 deletions(-) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh index d7aee765..9503c134 100755 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/preprocess/tokenize.sh @@ -48,6 +48,7 @@ export MEGATRON_PATH export TOKENIZE_LOG_DIR # Tokenize +# find ${DATA_DIR} -name "gsm8k-all.jsonl" -print0 | \ find ${DATA_DIR} -name "*.jsonl" -print0 | \ sort -z | \ xargs -0 -P${N_PROCS} -I "{}" bash -c ' diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh index 5d380fa4..f56c024f 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data.all.sh @@ -1,4 +1,4 @@ -# Auto-generated: 2025-08-04 22:27:29 +# Auto-generated: 2025-08-05 22:16:59 export TRAIN_DATA_PATH=( 34024238242 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document 34350909111 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 2254531 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 36095523 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document - 1048444 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document + 3279374 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 4208362913 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 96121651 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 7585407056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh index 300ff0e7..9a4bb46c 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_100B.sh @@ -1,4 +1,4 @@ -# Auto-generated: 2025-08-04 22:27:29 +# Auto-generated: 2025-08-05 22:16:59 export TRAIN_DATA_PATH=( 2330660320 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document 2353037275 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 4509062 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 72191046 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document -2096888 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +6558748 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 8416725826 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 192243302 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 15170814112 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh index 5f2defe1..43aaafc5 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_130B.sh @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 9510492179 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 2254531 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 36095523 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document -1048444 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +3279374 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 4208362913 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 96121651 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 7585407056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh index 357e4931..8e251f74 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_180B.sh @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 4509062 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 72191046 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document -2096888 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +6558748 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 8416725826 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 192243302 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 15170814112 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh index f845a1d2..1c6161c5 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_300B.sh @@ -1,4 +1,4 @@ -# Auto-generated: 2025-08-04 22:27:29 +# Auto-generated: 2025-08-05 22:16:59 export TRAIN_DATA_PATH=( 7070236707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document 7138118914 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 38041968714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 9018124 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 144382092 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document -4193776 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +13117496 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 16833451652 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 384486604 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 30341628224 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh index 7a5147c1..5baa3f0e 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_380B.sh @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 38041968714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 9018124 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 144382092 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document -4193776 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +13117496 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 16833451652 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 384486604 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 30341628224 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh index 3d34c88d..eb44d522 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_50B.sh @@ -1,4 +1,4 @@ -# Auto-generated: 2025-08-04 22:27:29 +# Auto-generated: 2025-08-05 22:16:59 export TRAIN_DATA_PATH=( 1098982896 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document 1109534365 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document @@ -28,7 +28,7 @@ export TRAIN_DATA_PATH=( 9510492179 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document 2254531 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document 36095523 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document -1048444 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +3279374 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document 4208362913 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document 96121651 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document 
7585407056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh index 8c606b1c..e4fca886 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_80B.sh @@ -1,3 +1,3 @@ export TRAIN_DATA_PATH=( - 83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document ) From 5736e903b8dd05e558a846a0813ec0700253a6cf Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Fri, 8 Aug 2025 10:19:56 +0900 Subject: [PATCH 8/9] Update the number of tokens --- .../midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh index 6acfc743..ebb447a4 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh @@ -47,10 +47,14 @@ ALL_PARAMS+=( ) # pretrain_iters: 432,581 -# 80B: ceil( 83,527,699,000 / 8192 / 1024) == 9958 +# 80B: ceil( 83,527,699,000 / 8192 / 1024) == 9,958 # 80B sum: 432,581 + 9,958 = 442,539 -# 130B: ceil((57,872,011,633+83,527,699,000)/8192/1024) == 16857 -# 130B sum: 432,581 + 16857 = 449438 +# 130B: 
ceil((57,872,011,633+83,527,699,000)/8192/1024) == 16,857 +# 130B sum: 432,581 + 16,857 = 449,438 +# 180B: ceil((201,117,999,876+83,527,699,000)/8192/1024) == 33,933 +# 180B sum: 432,581 + 33,933 = 466,514 +# 380B: ceil((433,153,308,934+83,527,699,000)/8192/1024) == 61,594 +# 380B sum: 432,581 + 61,594 = 494,175 # MIDTRAIN_START=449438 MIDTRAIN_START=432581 TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) # 442539 From 76b40ff346504dc39c730d17ea9f17fba8ab3765 Mon Sep 17 00:00:00 2001 From: koshieguchi Date: Mon, 1 Sep 2025 16:39:27 +0900 Subject: [PATCH 9/9] Add 190B and 420B --- .../memo.md | 54 ++++++ .../params/7.7b_v4_3.5t_tokenizer_v3.1.sh | 25 ++- .../7.7b_v4_3.5t_tokenizer_v3.1_curriculum.sh | 178 ++++++++++++++++++ .../midtrain/qsub_train.sh | 2 +- .../v4-megamath-pro-max/train_data_190B.sh | 40 ++++ .../v4-megamath-pro-max/train_data_420B.sh | 40 ++++ 6 files changed, 333 insertions(+), 6 deletions(-) create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1_curriculum.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_190B.sh create mode 100644 pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_420B.sh diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md index 927c4423..d5d2c1fb 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/memo.md @@ -17,6 +17,13 @@ export SRC_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scr ln -s $SRC_DIR $TARGET_DIR ``` +50Bシンボリックリンク: +```sh +export SRC_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/7.7b_v4_3.5t_tokenizer_v3.1/300B/checkpoints/iter_0474260 +export
TARGET_DIR=/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/scripts/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/7.7b_v4_3.5t_tokenizer_v3.1_curriculum/380B/checkpoints/iter_0474260 +ln -s $SRC_DIR $TARGET_DIR +``` + ## Tokenizerのコピー - `/groups/gcg51557/experiments/0138_corpus_v4_pretrain/src/llm-jp-tokenizer/hf/ver3.1` @@ -72,3 +79,50 @@ Data parallel size : 512 ```sh qsub ./scripts/gsm8k_tokenize.sh ``` + + +## 学習率設定 + +stable (成功) +--no-load-optim必要 +```sh +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + --min-lr 3e-5 # End LR + # --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + # --lr-decay-iters ${TRAIN_ITERS} + --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) +``` + +linear decay (まだ試していない) +--no-load-optim必要 +```sh +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + # --min-lr 3e-5 # End LR + --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + # --lr-decay-iters ${TRAIN_ITERS} + --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) +``` + +999356.pbs1 0193_midtrain-m* ach17725rd 0 Q R9920251000 # 50B +999357.pbs1 0193_midtrain-m* ach17725rd 0 Q R9920251000 # 100B +999358.pbs1 0193_midtrain-m* ach17725rd 0 Q R9920251000 # 300B diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh index ebb447a4..b53ac1b8 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1.sh @@ -43,19 +43,34 @@ 
ALL_PARAMS+=( --attention-dropout 0.0 --hidden-dropout 0.0 --override-opt_param-scheduler - --no-load-optim + # --no-load-optim ) # pretrain_iters: 432,581 -# 80B: ceil( 83,527,699,000 / 8192 / 1024) == 9,958 -# 80B sum: 432,581 + 9,958 = 442,539 + +# * Updated: 2025.08.14 +# 50B: ceil(57874242563/8192/1024) == 6900 +# 50B sum: 432581+6900 = 439,481 +# 80B: ceil(83,527,699,000 / 8192 / 1024) == 9,958 +# 80B sum: 432,581 + 9,958 = 442,539 # Same +# 100B: ceil(117,590,300,876/8192/1024) == 14,018 +# 100B sum: 432,581 + 14,018 = 446,599 +# 130B: ceil((57874242563+83,527,699,000)/8192/1024) == 16,857 +# 130B sum: 432,581 + 16,857 = 449,438 # Same +# 180B: ceil((117,590,300,876+83,527,699,000)/8192/1024) == 23976 +# 180B sum: 432,581 + 23976 = 456,557 # Different (190B) +# 300B: ceil(349,625,609,934/8192/1024) == 41679 +# 300B sum: 432,581 + 41679 = 474,260 +# 380B: ceil((349,625,609,934+83,527,699,000)/8192/1024) == 51,636 +# 380B sum: 432,581 + 51,636 = 484,217 # Different (420B) + +# * Before: 2025.08.14 # 130B: ceil((57,872,011,633+83,527,699,000)/8192/1024) == 16,857 # 130B sum: 432,581 + 16,857 = 449,438 # 180B: ceil((201,117,999,876+83,527,699,000)/8192/1024) == 33,933 # 180B sum: 432,581 + 33,933 = 466,514 # 380B: ceil((433,153,308,934+83,527,699,000)/8192/1024) == 61,594 # 380B sum: 432,581 + 61,594 = 494,175 -# MIDTRAIN_START=449438 MIDTRAIN_START=432581 TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) # 442539 MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) @@ -127,7 +142,7 @@ else echo "Start new training from scratch" fi ALL_PARAMS+=( - --save-interval 1000 + --save-interval 300 ) # Other implementation-related parameters diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1_curriculum.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1_curriculum.sh new file mode 100644 index 00000000..e0e448ba --- /dev/null +++ 
b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/params/7.7b_v4_3.5t_tokenizer_v3.1_curriculum.sh @@ -0,0 +1,178 @@ +# Pretraining hyperparameters for v4 7.7B. +# Model card: https://github.com/llm-jp/model-cards/pull/30 +# Ref: https://github.com/llm-jp/scripts/blob/ec3516a38f93047b7bc0d8305879d62a375e6ee2/pretrain/scripts/v4-training/params/7.7b-cont1.sh + +ALL_PARAMS=() + +# Model hyperparameters +ALL_PARAMS+=( + --num-layers 32 + --hidden-size 4096 + --ffn-hidden-size 14336 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --seq-length 8192 + --max-position-embeddings 8192 + --position-embedding-type rope + --rotary-base 500000 + --untie-embeddings-and-output-weights + --swiglu + --normalization RMSNorm + --norm-epsilon 1e-5 + --disable-bias-linear +) + +# Tokenizer +ALL_PARAMS+=( + --tokenizer-type Llama2Tokenizer + --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.1/llm-jp-tokenizer-100k.ver3.1.model # TODO +) + +# Optimizer hyperparameters +ALL_PARAMS+=( + --optimizer adam + # --lr 3e-4 # will be defined later + # --min-lr 3e-5 # will be defined later + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --clip-grad 1.0 + --weight-decay 0.1 + --init-method-std 0.02 + --attention-dropout 0.0 + --hidden-dropout 0.0 + --override-opt_param-scheduler + # --no-load-optim +) + +# pretrain_iters: 432,581 + +# * Updated: 2025.08.14 +# 50B: ceil(57874242563/8192/1024) == 6900 +# 50B sum: 432581+6900 = 439,481 +# 80B: ceil(83,527,699,000 / 8192 / 1024) == 9,958 +# 80B sum: 432,581 + 9,958 = 442,539 # Same +# 100B: ceil(117,590,300,876/8192/1024) == 14,018 +# 100B sum: 432,581 + 14,018 = 446,599 +# 130B: ceil((57874242563+83,527,699,000)/8192/1024) == 16,857 +# 130B sum: 432,581 + 16,857 = 449,438 # Same +# 180B: ceil((117,590,300,876+83,527,699,000)/8192/1024) == 23976 +# 180B sum: 432,581 + 23976 = 456,557 # Different (190B) +# 300B: ceil(349,625,609,934/8192/1024) == 41679 +# 300B sum: 432,581 + 41679 = 
474,260 +# 380B: ceil((349,625,609,934+83,527,699,000)/8192/1024) == 51,636 +# 380B sum: 432,581 + 51,636 = 484,217 # Different (420B) + +# * Before: 2025.08.14 +# 130B: ceil((57,872,011,633+83,527,699,000)/8192/1024) == 16,857 +# 130B sum: 432,581 + 16,857 = 449,438 +# 180B: ceil((201,117,999,876+83,527,699,000)/8192/1024) == 33,933 +# 180B sum: 432,581 + 33,933 = 466,514 +# 380B: ceil((433,153,308,934+83,527,699,000)/8192/1024) == 61,594 +# 380B sum: 432,581 + 61,594 = 494,175 +MIDTRAIN_START=439481 +TRAIN_ITERS=$(cat ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/train_iters.txt) # 442539 +MIDTRAIN_ITERS=$((TRAIN_ITERS - MIDTRAIN_START)) + +# Scheduler +# Scheduler +ALL_PARAMS+=( + --lr 3e-5 # Start LR + # --min-lr 3e-5 # End LR + --min-lr 0 # End LR + # --lr-warmup-iters ${MIDTRAIN_START} # No warmup + --lr-warmup-iters 0 # No warmup + # --lr-decay-iters ${TRAIN_ITERS} + --lr-decay-iters ${MIDTRAIN_ITERS} + --lr-decay-style linear + --train-iters ${TRAIN_ITERS} + --eval-interval 999999999 + --eval-iters 0 +) + +# Batch sizes +ALL_PARAMS+=( + --micro-batch-size 1 + # --global-batch-size 512 + # --micro-batch-size 2 + --global-batch-size 1024 +) + +# Parallelism +ALL_PARAMS+=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --sequence-parallel + --use-distributed-optimizer + --distributed-backend nccl + # NOTE(odashi): Increasing timeout is required to prepare 15.6T dataset. 
+ --distributed-timeout-minutes 120 + --use-mpi +) + +# Load TRAIN_DATA_PATH +source ${TASK_DIR}/train_data_${DATASET_SIZE}.sh # options: 80B +SEED=42 +# Dataset +ALL_PARAMS+=( + --data-path ${TRAIN_DATA_PATH[@]} + --data-cache-path ${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/cache + --split 1,0,0 + --seed ${SEED} +) + + TASK_CHECKPOINT_DIR=${TASK_DIR}/${PARAM_NAME}/${DATASET_SIZE}/checkpoints +mkdir -p ${TASK_CHECKPOINT_DIR} + +if [ -e "${TASK_CHECKPOINT_DIR}/latest_checkpointed_iteration.txt" ]; then + # Continue existing training (checkpoint marker found directly under TASK_CHECKPOINT_DIR) + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Continue existing training" +else + # Start new training from scratch + ALL_PARAMS+=( + --load ${TASK_CHECKPOINT_DIR} + --save ${TASK_CHECKPOINT_DIR} + ) + echo "Start new training from scratch" +fi +ALL_PARAMS+=( + --save-interval 300 +) + +# Other implementation-related parameters +ALL_PARAMS+=( + --bf16 + --use-mcore-models + --no-masked-softmax-fusion + --use-flash-attn + + # NOTE(odashi): For adjusting throughput + #--recompute-activations + #--recompute-granularity selective + #--overlap-grad-reduce + #--overlap-param-gather + + --attention-softmax-in-fp32 + --transformer-impl transformer_engine + + # NOTE(odashi): Newer implementation requires to set attention backend by parameter. + #--attention-backend flash +) + +# NOTE(odashi): Disable fused attention for Sakura cluster due to some inconsistency.
+export NVTE_FUSED_ATTN=0 + +# Logging +ALL_PARAMS+=( + --log-interval 1 + --log-throughput + --wandb-entity llm-jp + --wandb-project 0193_midtrain + --wandb-exp-name train_$(basename ${TASK_DIR}) +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh index 2032f351..3b9320ee 100644 --- a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/midtrain/qsub_train.sh @@ -1,7 +1,7 @@ #!/bin/bash #PBS -P gcg51557 #PBS -q R9920251000 -#PBS -N 0193_midtrain-megamath +#PBS -N 0193 #PBS -l select=16 #PBS -l walltime=240:00:00 #PBS -m n diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_190B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_190B.sh new file mode 100644 index 00000000..8e251f74 --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_190B.sh @@ -0,0 +1,40 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +2330660320 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +2353037275 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +2338874216 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +2340537418 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +2360767687 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +2330732028 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +2331249028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document +2326186438 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +2345634504 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +2350248774 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +2355251058 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +2351431456 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +2312082909 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +2341460840 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +2334289216 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +2347679805 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +2342600867 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +2349637427 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +2336216835 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +2338238954 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +2340991148 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +2344135162 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +2320195352 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +2314916846 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +1448856994 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +19020984357 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +4509062 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +72191046 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +6558748 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +8416725826 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +192243302 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +15170814112 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +539984028 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +10947955817 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +1504642032 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +4127779989 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +) diff --git a/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_420B.sh b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_420B.sh new file mode 100644 index 00000000..5baa3f0e --- /dev/null +++ b/pretrain/scripts/v4-midtraining-with-v3.1-tokenizer/tasks/v4-megamath-pro-max/train_data_420B.sh @@ -0,0 +1,40 @@ +# Auto-generated: 2025-08-04 22:27:29 +export TRAIN_DATA_PATH=( +7070236707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0000-0009_text_document +7138118914 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0010-0019_text_document +7095154191 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0020-0029_text_document +7100199642 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0030-0039_text_document +7161569714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0040-0049_text_document +7070454238 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0050-0059_text_document +7072022597 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0060-0069_text_document 
+7056664843 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0070-0079_text_document +7115662043 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0080-0089_text_document +7129659782 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0090-0099_text_document +7144834596 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0100-0109_text_document +7133247539 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0110-0119_text_document +7013880707 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0120-0129_text_document +7103000912 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0130-0139_text_document +7081245241 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0140-0149_text_document +7121866618 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0150-0159_text_document +7106459272 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0160-0169_text_document +7127805215 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0170-0179_text_document +7087092820 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0180-0189_text_document +7093227073 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0190-0199_text_document +7101576064 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0200-0209_text_document +7111113674 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0210-0219_text_document +7038490425 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0220-0229_text_document +7022477672 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0230-0239_text_document +4395218734 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/dclm/dclm-0240-0246_text_document +38041968714 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/flan/flan-all_text_document +9018124 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/codesearchnet-owmfilter-all_text_document +144382092 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/dolmino_math_synth-all_text_document +13117496 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/gsm8k-all_text_document +16833451652 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/mathcoder2-synthmath-all_text_document +384486604 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/metamath-owmfilter-all_text_document +30341628224 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tinyGSM-MIND-all_text_document +1079968056 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/math/tulu_math-all_text_document +65556621655 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/pes2o/pes2o-all_text_document +6018568128 /groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/stackexchange/stackexchange-all_text_document +16511119956 
/groups/gcg51557/experiments/0156_olmo2-midtrain-reproduction/dolmino-mix-1124-v3.1-tokenized/wiki/wiki-all_text_document +83527699000 /groups/gcg51557/experiments/0193_llmjpv4_midtraining/datasets/megamath_pro_max_tokenizer_v3.1/megamath_web_pro_max_text_document +)