diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 2749026aeb..fc3bc504b3 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -2,10 +2,7 @@ name: CPU tests on: push: - branches: [main] - pull_request_target: - branches: [main] - types: [opened, reopened, ready_for_review, labeled, synchronize] + branches: [main, pwgardipee/fix-ci-2] pull_request: {} # todo workflow_dispatch: {} @@ -100,6 +97,11 @@ jobs: cache-dependency-path: pyproject.toml cache: "pip" + # - name: Debug with SSH + # uses: mxschmitt/action-tmate@v3 + # with: + # detached: true + # Add caching for HF models and tokenizers - name: HF cache uses: actions/cache@v4 @@ -125,7 +127,9 @@ jobs: - name: Run tests env: HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: pytest -v litgpt/ tests/ --timeout=180 --durations=100 + LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} + LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }} + run: pytest -v litgpt/ tests/test_tokenizer.py::test_tokenizer_against_hf --timeout=600 --durations=100 -s - name: Show cache run: | diff --git a/litgpt/config.py b/litgpt/config.py index 97549a114d..708024a736 100644 --- a/litgpt/config.py +++ b/litgpt/config.py @@ -1,6 +1,5 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. -from copy import deepcopy from dataclasses import dataclass, field from pathlib import Path from typing import Any, List, Literal, Optional, Type, Union @@ -224,2914 +223,2850 @@ def norm_class(self) -> Type: configs = [ # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")), - # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json - dict( - name="stablelm-base-alpha-7b", - hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"), - n_head=48, - n_embd=6144, - padding_multiple=256, - ), - # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json - dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32), - # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json - dict( - name="stablelm-tuned-alpha-7b", - hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"), - n_head=48, - n_embd=6144, - padding_multiple=256, - ), - # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json - dict( - name="stablelm-3b-4e1t", - hf_config=dict(org="stabilityai", name="stablelm-3b-4e1t"), - padded_vocab_size=50304, - n_layer=32, - n_head=32, - n_embd=2560, - parallel_residual=False, - bias=False, - mlp_class_name="LLaMAMLP", - intermediate_size=6912, - ), - # https://huggingface.co/stabilityai/stablelm-zephyr-3b/blob/main/config.json - dict( - name="stablelm-zephyr-3b", - hf_config=dict(org="stabilityai", name="stablelm-zephyr-3b"), - padded_vocab_size=50304, - n_layer=32, - n_head=32, - n_embd=2560, - parallel_residual=False, - bias=False, - mlp_class_name="LLaMAMLP", - intermediate_size=6912, - ), + # # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json + # dict( + # name="stablelm-base-alpha-7b", + # hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"), + # n_head=48, + # n_embd=6144, + # padding_multiple=256, + # ), + # # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json + # dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32), + # # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json + # dict( + # name="stablelm-tuned-alpha-7b", + # hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"), + # n_head=48, + # n_embd=6144, + # padding_multiple=256, + # ), + # # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json + # dict( + # name="stablelm-3b-4e1t", + # hf_config=dict(org="stabilityai", name="stablelm-3b-4e1t"), + # padded_vocab_size=50304, + # n_layer=32, + # n_head=32, + # n_embd=2560, + # parallel_residual=False, + # bias=False, + # mlp_class_name="LLaMAMLP", + # intermediate_size=6912, + # ), + # # https://huggingface.co/stabilityai/stablelm-zephyr-3b/blob/main/config.json + # dict( + # name="stablelm-zephyr-3b", + # hf_config=dict(org="stabilityai", name="stablelm-zephyr-3b"), + # padded_vocab_size=50304, + # n_layer=32, + # n_head=32, + # n_embd=2560, + # parallel_residual=False, + # bias=False, + # mlp_class_name="LLaMAMLP", + # intermediate_size=6912, + # ), + # ] + # ########################## + # # Stability AI StableCode + # ########################## + # stablecode = [ + # # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json + # dict( + # name="stablecode-completion-alpha-3b", + # hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), + # block_size=16384, + # vocab_size=49152, + # n_layer=32, + # n_embd=2560, + # ), + # # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json + # dict( + # name="stablecode-completion-alpha-3b-4k", + # hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), + # vocab_size=49152, + # n_layer=32, + # n_embd=2560, + # ), + # # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json + # # dict( + # # name="stablecode-instruct-alpha-3b", + # # hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"), + # # vocab_size=49152, + # # n_layer=32, + # # n_embd=2560, + # # ), + # # https://huggingface.co/stabilityai/stable-code-3b/blob/main/config.json + # dict( + # name="stable-code-3b", + # hf_config=dict(org="stabilityai", name="stable-code-3b"), + # padded_vocab_size=50304, + # n_layer=32, + # n_embd=2560, + # block_size=16384, + # parallel_residual=False, + # bias=False, + # mlp_class_name="LLaMAMLP", + # intermediate_size=6912, + # ), + # ] + # configs.extend(stablecode) + # #################### + # # EleutherAI Pythia + # #################### + # pythia = [ + # # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json + # dict( + # name="pythia-14m", + # hf_config=dict(org="EleutherAI", name="pythia-14m"), + # block_size=512, + # n_layer=6, + # n_embd=128, + # n_head=4, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json + # dict( + # name="pythia-31m", + # hf_config=dict(org="EleutherAI", name="pythia-31m"), + # block_size=1024, + # n_layer=6, + # n_embd=256, + # n_head=8, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json + # dict( + # name="pythia-70m", + # hf_config=dict(org="EleutherAI", name="pythia-70m"), + # block_size=2048, + # n_layer=6, + # n_embd=512, + # n_head=8, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json + # dict( + # name="pythia-160m", + # hf_config=dict(org="EleutherAI", name="pythia-160m"), + # block_size=2048, + # n_layer=12, + # n_embd=768, + # n_head=12, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json + # dict( + # name="pythia-410m", + # hf_config=dict(org="EleutherAI", name="pythia-410m"), + # block_size=2048, + # n_layer=24, + # n_embd=1024, + # n_head=16, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json + # dict( + # name="pythia-1b", + # hf_config=dict(org="EleutherAI", name="pythia-1b"), + # block_size=2048, + # n_embd=2048, + # n_head=8, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json + # dict( + # name="pythia-1.4b", + # hf_config=dict(org="EleutherAI", name="pythia-1.4b"), + # block_size=2048, + # n_layer=24, + # n_embd=2048, + # n_head=16, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json + # dict( + # name="pythia-2.8b", + # hf_config=dict(org="EleutherAI", name="pythia-2.8b"), + # block_size=2048, + # n_layer=32, + # n_embd=2560, + # padding_multiple=128, + # ), + # # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json + # dict( + # name="pythia-6.9b", + # hf_config=dict(org="EleutherAI", name="pythia-6.9b"), + # block_size=2048, + # n_layer=32, + # padding_multiple=256, + # ), + # # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json + # dict( + # name="pythia-12b", + # hf_config=dict(org="EleutherAI", name="pythia-12b"), + # block_size=2048, + # n_layer=36, + # n_embd=5120, + # n_head=40, + # ), + # ] + # configs.extend(pythia) + # for c in pythia: + # # "pythia-14m" and "pythia-31m" don't have deduped version + # if c["name"] in ("pythia-14m", "pythia-31m"): + # continue + # copy = deepcopy(c) + # copy["name"] = f"{c['name']}-deduped" + # copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped" + # configs.append(copy) + # ################# + # # TII UAE Falcon + # ################# + # falcon = [ + # # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json + # dict( + # name="falcon-7b{}", + # hf_config=dict(org="tiiuae", name="falcon-7b{}"), + # block_size=2048, + # vocab_size=65024, + # padded_vocab_size=65024, + # n_layer=32, + # n_head=71, + # n_embd=4544, + # rotary_percentage=1.0, + # n_query_groups=1, + # bias=False, + # # this is not in the config, but in the original model implementation, only for this config + # shared_attention_norm=True, + # ), + # # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json + # dict( + # name="falcon-40b{}", + # hf_config=dict(org="tiiuae", name="falcon-40b{}"), + # block_size=2048, + # vocab_size=65024, + # padded_vocab_size=65024, + # n_layer=60, + # n_head=128, + # n_embd=8192, + # rotary_percentage=1.0, + # n_query_groups=8, + # bias=False, + # ), + # ] + # for c in falcon: + # for kind in ("", "-instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # # https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json + # falcon180b = dict( + # name="falcon-180B{}", + # hf_config=dict(org="tiiuae", name="falcon-180B{}"), + # block_size=2048, + # vocab_size=65024, + # padded_vocab_size=65024, + # n_layer=80, + # n_head=232, + # n_embd=14848, + # rotary_percentage=1.0, + # n_query_groups=8, + # bias=False, + # ) + # for kind in ("", "-chat"): + # copy = deepcopy(falcon180b) + # copy["name"] = falcon180b["name"].format(kind) + # copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind) + # configs.append(copy) + # falcon3 = [ + # # https://huggingface.co/tiiuae/Falcon3-1B-Base/blob/main/config.json + # dict( + # name="Falcon3-1B{}", + # hf_config=dict(org="tiiuae", name="Falcon3-1B{}"), + # block_size=4096, + # vocab_size=131072, + # padded_vocab_size=131072, + # n_layer=18, + # n_head=8, + # n_query_groups=4, + # n_embd=2048, + # rotary_percentage=1.0, + # parallel_residual=False, + # rope_base=1000042, + # norm_eps=1e-6, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8192, + # ), + # # https://huggingface.co/tiiuae/Falcon3-3B-Base/blob/main/config.json + # dict( + # name="Falcon3-3B{}", + # hf_config=dict(org="tiiuae", name="Falcon3-3B{}"), + # block_size=32768, + # vocab_size=131072, + # padded_vocab_size=131072, + # n_layer=22, + # n_head=12, + # n_query_groups=4, + # n_embd=3072, + # rotary_percentage=1.0, + # parallel_residual=False, + # rope_base=1000042, + # norm_eps=1e-6, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=9216, + # ), + # # https://huggingface.co/tiiuae/Falcon3-7B-Base/blob/main/config.json + # dict( + # name="Falcon3-7B{}", + # hf_config=dict(org="tiiuae", name="Falcon3-7B{}"), + # block_size=32768, + # vocab_size=131072, + # padded_vocab_size=131072, + # n_layer=28, + # n_head=12, + # n_query_groups=4, + # n_embd=3072, + # rotary_percentage=1.0, + # parallel_residual=False, + # rope_base=1000042, + # norm_eps=1e-6, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=23040, + # ), + # # https://huggingface.co/tiiuae/Falcon3-10B-Base/blob/main/config.json + # dict( + # name="Falcon3-10B{}", + # hf_config=dict(org="tiiuae", name="Falcon3-10B{}"), + # block_size=32768, + # vocab_size=131072, + # padded_vocab_size=131072, + # n_layer=40, + # n_head=12, + # n_query_groups=4, + # n_embd=3072, + # rotary_percentage=1.0, + # parallel_residual=False, + # rope_base=1000042, + # norm_eps=1e-6, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=23040, + # ), + # ] + # for c in falcon3: + # for kind in ("-Base", "-Instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ############################# + # # OpenLM Research Open LLaMA + # ############################# + # open_LLaMA = [ + # # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json + # dict( + # name="open_llama_3b", + # hf_config=dict(org="openlm-research", name="open_llama_3b"), + # block_size=2048, + # vocab_size=32000, + # padding_multiple=64, + # n_layer=26, + # n_embd=3200, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-6, + # mlp_class_name="LLaMAMLP", + # intermediate_size=8640, + # ), + # # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json + # dict( + # name="open_llama_7b", + # hf_config=dict(org="openlm-research", name="open_llama_7b"), + # block_size=2048, + # vocab_size=32000, + # padding_multiple=64, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-6, + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # ), + # # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json + # dict( + # name="open_llama_13b", + # hf_config=dict(org="openlm-research", name="open_llama_13b"), + # block_size=2048, + # vocab_size=32000, + # padding_multiple=64, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-6, + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # ), + # ] + # configs.extend(open_LLaMA) + # ############### + # # Meta LLaMA 2 + # ############### + # llama_2 = [ + # # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json + # dict( + # name="Llama-2-7b{}-hf", + # hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"), + # vocab_size=32000, + # padding_multiple=64, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # ), + # # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json + # dict( + # name="Llama-2-13b{}-hf", + # hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"), + # vocab_size=32000, + # padding_multiple=64, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # ), + # # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json + # dict( + # name="Llama-2-70b{}-hf", + # hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"), + # vocab_size=32000, + # padding_multiple=64, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ), + # ] + # for c in llama_2: + # for kind in ("", "-chat"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ############### + # # Meta LLaMA 3 + # ############### + # llama_3 = [ + # # https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/config.json + # dict( + # name="Llama-3-8B{}", + # hf_config=dict(org="meta-llama", name="Meta-Llama-3-8B{}"), + # block_size=8192, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=32, + # n_head=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # rope_base=500000, + # ), + # # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/blob/main/config.json + # dict( + # name="Llama-3.1-8B{}", + # hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-8B{}"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=32, + # n_head=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # # https://huggingface.co/meta-llama/Meta-Llama-3-70B/blob/main/config.json + # dict( + # name="Llama-3-70B{}", + # hf_config=dict(org="meta-llama", name="Meta-Llama-3-70B{}"), + # block_size=8192, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=500000, + # ), + # # https://huggingface.co/meta-llama/Meta-Llama-3.1-70B/blob/main/config.json + # dict( + # name="Llama-3.1-70B{}", + # hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-70B{}"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # # https://huggingface.co/meta-llama/Meta-Llama-3.1-405B/blob/main/config.json + # dict( + # name="Llama-3.1-405B{}", + # hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-405B{}"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=126, + # n_head=128, + # n_embd=16384, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=53248, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # # https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json + # dict( + # name="Llama-3.2-1B{}", + # hf_config=dict(org="meta-llama", name="Llama-3.2-1B{}"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=16, + # n_embd=2048, + # n_head=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8192, + # rope_base=500000, + # rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # # https://huggingface.co/meta-llama/Llama-3.2-3B/blob/main/config.json + # dict( + # name="Llama-3.2-3B{}", + # hf_config=dict(org="meta-llama", name="Llama-3.2-3B{}"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=28, + # n_embd=3072, + # n_head=24, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8192, + # rope_base=500000, + # rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # # https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/config.json + # dict( + # name="Llama-3.3-70B-Instruct", + # hf_config=dict(org="meta-llama", name="Llama-3.3-70B-Instruct"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # ] + # for c in llama_3: + # if c["name"] == "Llama-3.3-70B-Instruct": + # configs.append(c) + # continue + # for kind in ("", "-Instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ######################### + # # NVIDIA Llama Nemotron + # ######################### + # configs.append( + # dict( + # name="Llama-3.1-Nemotron-70B-Instruct-HF", + # hf_config=dict(org="nvidia", name="Llama-3.1-Nemotron-70B-Instruct-HF"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # ) + # ################# + # # Allen AI OLMo + # ################# + # olmo = [ + # # https://huggingface.co/allenai/OLMo-1B-hf/blob/main/config.json + # dict( + # name="OLMo-1B-hf", + # hf_config=dict(org="allenai", name="OLMo-1B-hf"), + # vocab_size=50280, + # padded_vocab_size=50304, + # block_size=2048, + # n_embd=2048, + # n_layer=16, + # n_head=16, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="LayerNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8192, + # ), + # # https://huggingface.co/allenai/OLMo-7B-hf/blob/main/config.json + # dict( + # name="OLMo-7B-hf", + # hf_config=dict(org="allenai", name="OLMo-7B-hf"), + # vocab_size=50280, + # padded_vocab_size=50304, + # block_size=2048, + # n_layer=32, + # n_head=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="LayerNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # ), + # # https://huggingface.co/allenai/OLMo-7B-Instruct-hf/blob/main/config.json + # dict( + # name="OLMo-7B-Instruct-hf", + # hf_config=dict(org="allenai", name="OLMo-7B-Instruct-hf"), + # vocab_size=50280, + # padded_vocab_size=50304, + # block_size=2048, + # n_layer=32, + # n_head=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="LayerNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # ), + # ] + # configs.extend(olmo) + # olmo2 = [ + # # https://huggingface.co/allenai/OLMo-2-1124-7B/blob/main/config.json + # dict( + # name="OLMo-2-1124-7B{}", + # hf_config=dict(org="allenai", name="OLMo-2-1124-7B{}"), + # vocab_size=100278, + # padded_vocab_size=100352, + # block_size=4096, + # n_embd=4096, + # n_layer=32, + # n_head=32, + # n_query_groups=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # norm_eps=1e-06, + # intermediate_size=11008, + # rope_base=500000, + # norm_qk=True, + # post_mlp_norm=True, + # norm_1=False, + # norm_2=False, + # norm_qk_type="olmo2", + # post_attention_norm=True, + # ), + # # https://huggingface.co/allenai/OLMo-2-1124-13B/blob/main/config.json + # dict( + # name="OLMo-2-1124-13B{}", + # hf_config=dict(org="allenai", name="OLMo-2-1124-13B{}"), + # vocab_size=100278, + # padded_vocab_size=100352, + # block_size=4096, + # n_embd=5120, + # n_layer=40, + # n_head=40, + # n_query_groups=40, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # norm_eps=1e-06, + # intermediate_size=13824, + # rope_base=500000, + # norm_qk=True, + # post_mlp_norm=True, + # norm_1=False, + # norm_2=False, + # norm_qk_type="olmo2", + # post_attention_norm=True, + # ), + # ] + # for c in olmo2: + # for kind in ("", "-SFT", "-DPO", "-Instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ############### + # # Google Gemma + # ############### + # gemma = [ + # # https://huggingface.co/google/gemma-2b/blob/main/config.json + # dict( + # name="Gemma-2b", + # hf_config=dict(org="google", name="gemma-2b"), + # scale_embeddings=True, + # vocab_size=256000, + # padding_multiple=64, + # n_embd=2048, + # n_layer=18, + # n_head=8, + # n_query_groups=1, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # intermediate_size=16384, + # ), + # # https://huggingface.co/google/gemma-7b/blob/main/config.json + # dict( + # name="Gemma-7b", + # hf_config=dict(org="google", name="gemma-7b"), + # scale_embeddings=True, + # vocab_size=256000, + # padding_multiple=64, + # n_embd=3072, + # n_layer=28, + # n_head=16, + # head_size=256, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # intermediate_size=24576, + # ), + # # https://huggingface.co/google/gemma-2-2b/blob/main/config.json + # dict( + # name="Gemma-2-2b", + # hf_config=dict(org="google", name="gemma-2-2b"), + # scale_embeddings=True, + # attention_scores_scalar=256, + # vocab_size=256000, + # block_size=8192, + # sliding_window_size=4096, + # # only layer with idx 0, 2, 4, ... have sliding window attention + # sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(26)], + # intermediate_size=9216, + # n_embd=2304, + # n_layer=26, + # n_head=8, + # n_query_groups=4, + # head_size=256, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # attention_logit_softcapping=50.0, + # final_logit_softcapping=30.0, + # ), + # # https://huggingface.co/google/gemma-2-9b/blob/main/config.json + # dict( + # name="Gemma-2-9b", + # hf_config=dict(org="google", name="gemma-2-9b"), + # scale_embeddings=True, + # attention_scores_scalar=256, + # vocab_size=256000, + # block_size=8192, + # sliding_window_size=4096, + # # only layer with idx 0, 2, 4, ... have sliding window attention + # sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(42)], + # intermediate_size=14336, + # n_embd=3584, + # n_layer=42, + # n_head=16, + # n_query_groups=8, + # head_size=256, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # attention_logit_softcapping=50.0, + # final_logit_softcapping=30.0, + # ), + # # https://huggingface.co/google/gemma-2-27b/blob/main/config.json + # dict( + # name="Gemma-2-27b", + # hf_config=dict(org="google", name="gemma-2-27b"), + # scale_embeddings=True, + # # In Gemma 2 27B attention scores are scaled not by `sqrt(head_size)` (11.31), + # # but by `sqrt(n_emb // n_head)` = sqrt(4608 // 32) = 12 + # attention_scores_scalar=144, + # vocab_size=256000, + # block_size=8192, + # sliding_window_size=4096, + # # only layer with idx 0, 2, 4, ... have sliding window attention + # sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(46)], + # intermediate_size=36864, + # n_embd=4608, + # n_layer=46, + # n_head=32, + # n_query_groups=16, + # head_size=128, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # attention_logit_softcapping=50.0, + # final_logit_softcapping=30.0, + # ), + # ] + # configs.extend(gemma) + # for c in gemma: + # copy = deepcopy(c) + # copy["name"] = f"{c['name']}-it" + # copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it" + # configs.append(copy) + # ################## + # # Google Gemma 3 + # ################## + # gemma3 = [ + # # https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json + # dict( + # name="Gemma-3-1b-it", + # hf_config=dict(org="google", name="gemma-3-1b-it"), + # scale_embeddings=True, + # attention_scores_scalar=256, + # vocab_size=262144, + # block_size=131072, + # sliding_window_size=512, + # # 5 local layers for every global layer + # sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)], + # intermediate_size=6912, + # n_embd=1152, + # n_layer=26, + # n_head=4, + # n_query_groups=1, + # head_size=256, + # rotary_percentage=1.0, + # rope_adjustments=None, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # norm_qk=True, + # rope_base=1000000, + # rope_local_base_freq=10000, + # # 5 local layers for every global layer + # rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)], + # ), + # # https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json + # dict( + # name="Gemma-3-4b-it", + # hf_config=dict(org="google", name="gemma-3-4b-it"), + # scale_embeddings=True, + # attention_scores_scalar=256, + # vocab_size=262144, + # block_size=131072, + # sliding_window_size=1024, + # # 5 local layers for every global layer + # sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)], + # intermediate_size=10240, + # n_embd=2560, + # n_layer=34, + # n_head=8, + # n_query_groups=4, + # head_size=256, + # rotary_percentage=1.0, + # rope_adjustments=dict(factor=8.0), + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # norm_qk=True, + # rope_base=1000000, + # rope_local_base_freq=10000, + # # 5 local layers for every global layer + # rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)], + # ), + # # https://huggingface.co/google/gemma-3-12b-it/blob/main/config.json + # dict( + # name="Gemma-3-12b-it", + # hf_config=dict(org="google", name="gemma-3-12b-it"), + # scale_embeddings=True, + # attention_scores_scalar=256, + # vocab_size=262144, + # block_size=131072, + # sliding_window_size=1024, + # # 5 local layers for every global layer + # sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)], + # intermediate_size=15360, + # n_embd=3840, + # n_layer=48, + # n_head=16, + # n_query_groups=8, + # head_size=256, + # rotary_percentage=1.0, + # rope_adjustments=dict(factor=8.0), + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # norm_qk=True, + # rope_base=1000000, + # rope_local_base_freq=10000, + # # 5 local layers for every global layer + # rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)], + # ), + # # https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json + # dict( + # name="Gemma-3-27b-it", + # hf_config=dict(org="google", name="gemma-3-27b-it"), + # scale_embeddings=True, + # attention_scores_scalar=168, + # vocab_size=262144, + # block_size=131072, + # sliding_window_size=1024, + # # 5 local layers for every global layer + # sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)], + # intermediate_size=21504, + # n_embd=5376, + # n_layer=62, + # n_head=32, + # n_query_groups=16, + # head_size=128, + # rotary_percentage=1.0, + # rope_adjustments=dict(factor=8.0), + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # post_attention_norm=True, + # post_mlp_norm=True, + # norm_qk=True, + # rope_base=1000000, + # rope_local_base_freq=10000, + # # 5 local layers for every global layer + # rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)], + # ), + # ] + # configs.extend(gemma3) + # ################## + # # Google CodeGemma + # ################## + # codegemma = [ + # # https://huggingface.co/google/codegemma-7b-it/blob/main/config.json + # dict( + # name="CodeGemma-7b-it", + # hf_config=dict(org="google", name="codegemma-7b-it"), + # scale_embeddings=True, + # vocab_size=256000, + # padding_multiple=64, + # n_embd=3072, + # n_layer=28, + # n_head=16, + # head_size=256, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="GemmaMLP", + # gelu_approximate="tanh", + # intermediate_size=24576, + # ), + # ] + # configs.extend(codegemma) + # ########################## + # # Stability AI FreeWilly2 + # ########################## + # freewilly_2 = [ + # # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json + # dict( + # name="FreeWilly2", + # hf_config=dict(org="stabilityai", name="FreeWilly2"), + # vocab_size=32000, + # padding_multiple=64, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ) + # ] + # configs.extend(freewilly_2) + # ################## + # # Meta Code Llama + # ################## + # code_llama = [ + # # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json + # dict( + # name="CodeLlama-7b-hf", + # hf_config=dict(org="codellama", name="CodeLlama-7b-hf"), + # block_size=16384, + # vocab_size=32016, + # padding_multiple=16, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json + # dict( + # name="CodeLlama-13b-hf", + # hf_config=dict(org="codellama", name="CodeLlama-13b-hf"), + # block_size=16384, + # vocab_size=32016, + # padding_multiple=16, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json + # dict( + # name="CodeLlama-34b-hf", + # hf_config=dict(org="codellama", name="CodeLlama-34b-hf"), + # block_size=16384, + # vocab_size=32000, + # padded_vocab_size=32000, + # n_layer=48, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=22016, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-70b-hf/blob/main/config.json + # dict( + # name="CodeLlama-70b-hf", + # hf_config=dict(org="codellama", name="CodeLlama-70b-hf"), + # block_size=16384, + # vocab_size=32016, + # padding_multiple=16, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json + # dict( + # name="CodeLlama-7b-Python-hf", + # hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"), + # block_size=16384, + # vocab_size=32000, + # padded_vocab_size=32000, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json + # dict( + # name="CodeLlama-13b-Python-hf", + # hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"), + # block_size=16384, + # vocab_size=32000, + # padded_vocab_size=32000, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json + # dict( + # name="CodeLlama-34b-Python-hf", + # hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"), + # block_size=16384, + # vocab_size=32000, + # padded_vocab_size=32000, + # n_layer=48, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=22016, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-70b-Python-hf/blob/main/config.json + # dict( + # name="CodeLlama-70b-Python-hf", + # hf_config=dict(org="codellama", name="CodeLlama-70b-Python-hf"), + # block_size=16384, + # vocab_size=32016, + # padding_multiple=16, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json + # dict( + # name="CodeLlama-7b-Instruct-hf", + # hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"), + # block_size=16384, + # vocab_size=32016, + # padding_multiple=16, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json + # dict( + # name="CodeLlama-13b-Instruct-hf", + # hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"), + # block_size=2048, + # vocab_size=32016, + # padding_multiple=16, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json + # dict( + # name="CodeLlama-34b-Instruct-hf", + # hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"), + # block_size=16384, + # vocab_size=32000, + # padded_vocab_size=32000, + # n_layer=48, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=22016, + # rope_base=1000000, + # ), + # # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/config.json + # dict( + # name="CodeLlama-70b-Instruct-hf", + # hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"), + # block_size=16384, + # # 32016 is an added token, so not reported in vocab_size + # # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/tokenizer_config.json + # vocab_size=32015, + # padding_multiple=16, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=1000000, + # ), + # ] + # configs.extend(code_llama) + # ######################## + # # garage-bAInd Platypus + # ######################## + # platypus = [ + # # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json + # dict( + # name="Platypus-30B", + # hf_config=dict(org="garage-bAInd", name="Platypus-30B"), + # block_size=2048, + # padded_vocab_size=32000, + # n_layer=60, + # n_head=52, + # n_embd=6656, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-06, + # mlp_class_name="LLaMAMLP", + # intermediate_size=17920, + # ), + # # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json + # dict( + # name="Platypus2-7B", + # hf_config=dict(org="garage-bAInd", name="Platypus2-7B"), + # padded_vocab_size=32000, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # ), + # # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json + # dict( + # name="Platypus2-13B", + # hf_config=dict(org="garage-bAInd", name="Platypus2-13B"), + # padded_vocab_size=32000, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # ), + # # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json + # dict( + # name="Platypus2-70B", + # hf_config=dict(org="garage-bAInd", name="Platypus2-70B"), + # padded_vocab_size=32000, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ), + # # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json + # dict( + # name="Camel-Platypus2-13B", + # hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"), + # padded_vocab_size=32000, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # ), + # # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json + # dict( + # name="Camel-Platypus2-70B", + # hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"), + # padded_vocab_size=32000, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ), + # # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json + # dict( + # name="Stable-Platypus2-13B", + # hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"), + # padded_vocab_size=32000, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # ), + # # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json + # dict( + # name="Platypus2-70B-instruct", + # hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"), + # padded_vocab_size=32000, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ), + # ] + # configs.extend(platypus) + # ################################## + # # togethercomputer LLaMA-2-7B-32K + # ################################## + # together_llama2_32k = [ + # # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json + # dict( + # name="LLaMA-2-7B-32K", + # hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"), + # vocab_size=32000, + # padding_multiple=64, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # rope_condense_ratio=8, + # ) + # ] + # configs.extend(together_llama2_32k) + # ################ + # # Microsoft Phi + # ################ + # phi = [ + # # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json + # dict( + # name="phi-1_5", + # hf_config=dict(org="microsoft", name="phi-1_5"), + # vocab_size=50257, + # padded_vocab_size=51200, + # block_size=2048, + # n_embd=2048, + # n_layer=24, + # rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64 + # shared_attention_norm=True, + # lm_head_bias=True, + # gelu_approximate="tanh", + # ), + # # https://huggingface.co/microsoft/phi-2/blob/main/config.json + # dict( + # name="phi-2", + # hf_config=dict(org="microsoft", name="phi-2"), + # vocab_size=50257, + # padded_vocab_size=51200, + # block_size=2048, + # n_embd=2560, + # n_layer=32, + # rotary_percentage=0.4, # 32 / (n_embd / n_head) = 32 / 80 + # shared_attention_norm=True, + # lm_head_bias=True, + # gelu_approximate="tanh", + # ), + # # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json + # dict( + # name="Phi-3-mini-4k-instruct", + # hf_config=dict(org="microsoft", name="Phi-3-mini-4k-instruct"), + # vocab_size=32000, + # padded_vocab_size=32064, + # block_size=4096, + # n_embd=3072, + # n_layer=32, + # rotary_percentage=1.0, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=8192, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # sliding_window_size=2048, + # ), + # # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json + # dict( + # name="Phi-3-mini-128k-instruct", + # hf_config=dict(org="microsoft", name="Phi-3-mini-128k-instruct"), + # vocab_size=32000, + # padded_vocab_size=32064, + # block_size=131072, + # n_embd=3072, + # n_layer=32, + # rotary_percentage=1.0, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=8192, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # sliding_window_size=262145, + # ), + # # https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/config.json + # dict( + # name="Phi-3.5-mini-instruct", + # hf_config=dict(org="microsoft", name="Phi-3.5-mini-instruct"), + # vocab_size=32000, + # padded_vocab_size=32064, + # block_size=4096, + # n_embd=3072, + # n_layer=32, + # rotary_percentage=1.0, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=8192, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # ), + # # https://huggingface.co/microsoft/phi-4/blob/main/config.json + # dict( + # name="phi-4", + # hf_config=dict(org="microsoft", name="phi-4"), + # vocab_size=100352, + # padded_vocab_size=100352, + # block_size=16384, + # n_embd=5120, + # n_layer=40, + # n_head=40, + # n_query_groups=10, + # rotary_percentage=1.0, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=17920, + # rope_base=250000, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # ), + # # https://huggingface.co/microsoft/Phi-4-reasoning/blob/main/config.json + # dict( + # name="Phi-4-reasoning", + # hf_config=dict(org="microsoft", name="Phi-4-reasoning"), + # vocab_size=100352, + # padded_vocab_size=100352, + # block_size=32768, + # n_embd=5120, + # n_layer=40, + # n_head=40, + # n_query_groups=10, + # rotary_percentage=1.0, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=17920, + # rope_base=500000, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # ), + # # https://huggingface.co/microsoft/Phi-4-reasoning-plus/blob/main/config.json + # dict( + # name="Phi-4-reasoning-plus", + # hf_config=dict(org="microsoft", name="Phi-4-reasoning-plus"), + # vocab_size=100352, + # padded_vocab_size=100352, + # block_size=32768, + # n_embd=5120, + # n_layer=40, + # n_head=40, + # n_query_groups=10, + # rotary_percentage=1.0, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=17920, + # rope_base=500000, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # ), + # # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json + # dict( + # name="Phi-4-mini-instruct", + # hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"), + # vocab_size=200019, + # padded_vocab_size=200064, + # block_size=131072, + # n_embd=3072, + # n_layer=32, + # n_head=24, + # n_query_groups=8, + # rotary_percentage=0.75, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=8192, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # sliding_window_size=262145, + # ), + # # https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/config.json + # dict( + # name="Phi-4-mini-reasoning", + # hf_config=dict(org="microsoft", name="Phi-4-mini-reasoning"), + # vocab_size=200019, + # padded_vocab_size=200064, + # block_size=131072, + # n_embd=3072, + # n_layer=32, + # n_head=24, + # n_query_groups=8, + # rotary_percentage=0.75, + # bias=False, + # norm_class_name="RMSNorm", + # intermediate_size=8192, + # mlp_class_name="LLaMAMLP", + # parallel_residual=False, + # sliding_window_size=262145, + # ), + # ] + # configs.extend(phi) + # ############# + # # Mistral AI + # ############# + # configs.append( + # # https://huggingface.co/mistralai/mathstral-7B-v0.1/blob/main/config.json + # dict( + # name="Mathstral-7B-v0.1", + # hf_config=dict(org="mistralai", name="mathstral-7B-v0.1"), + # padded_vocab_size=32768, + # block_size=32768, + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # sliding_window_size=4096, + # ) + # ) + # mistral = [ + # # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + # dict( + # name="Mistral-7B-{}v0.1", + # hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"), + # padded_vocab_size=32000, + # block_size=4096, # should be 32768 but sliding window attention is not implemented + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # sliding_window_size=4096, + # ), + # # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json + # dict( + # name="Mixtral-8x7B-{}v0.1", + # hf_config=dict(org="mistralai", name="Mixtral-8x7B-{}v0.1"), + # padded_vocab_size=32000, + # block_size=32768, + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMoE", + # intermediate_size=14336, + # rope_base=1000000, + # n_expert=8, + # n_expert_per_token=2, + # ), + # # https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json + # dict( + # name="Mixtral-8x22B-{}v0.1", + # hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"), + # padded_vocab_size=32768, + # block_size=65536, + # n_layer=56, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMoE", + # intermediate_size=16384, + # n_head=48, + # n_embd=6144, + # rope_base=1000000, + # n_expert=8, + # n_expert_per_token=2, + # ), + # ] + # for c in mistral: + # for kind in ("", "Instruct-"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # configs.append( + # # https://huggingface.co/unsloth/mistral-7b-v0.2/blob/main/config.json + # dict( + # name="Mistral-7B-v0.2", + # hf_config=dict(org="unsloth", name="Mistral-7B-v0.2"), + # padded_vocab_size=32000, + # block_size=32768, + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # ) + # ) + # configs.append( + # # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/config.json + # dict( + # name="Mistral-7B-Instruct-v0.2", + # hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.2"), + # padded_vocab_size=32000, + # block_size=32768, + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # ) + # ) + # configs.append( + # # https://huggingface.co/mistralai/Mistral-7B-v0.3/blob/main/config.json + # dict( + # name="Mistral-7B-v0.3", + # hf_config=dict(org="mistralai", name="Mistral-7B-v0.3"), + # padded_vocab_size=32768, + # block_size=32768, + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # ) + # ) + # configs.append( + # # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/blob/main/config.json + # dict( + # name="Mistral-7B-Instruct-v0.3", + # hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.3"), + # padded_vocab_size=32768, + # block_size=32768, + # n_layer=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # ) + # ) + # configs.append( + # # https://huggingface.co/mistralai/Mistral-Large-Instruct-2407/blob/main/config.json + # dict( + # name="Mistral-Large-Instruct-2407", + # hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2407"), + # padded_vocab_size=32768, + # block_size=32768, + # n_layer=88, + # n_head=96, + # n_embd=12288, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ) + # ) + # configs.append( + # # https://huggingface.co/mistralai/Mistral-Large-Instruct-2411/blob/main/config.json + # dict( + # name="Mistral-Large-Instruct-2411", + # hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2411"), + # padded_vocab_size=32768, + # block_size=32768, + # n_layer=88, + # n_head=96, + # n_embd=12288, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # norm_eps=1e-05, + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # ) + # ) + # ############ + # # TinyLlama + # ############ + # tiny_llama = [ + # dict( + # name="tiny-llama-1.1b{}", + # hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"), + # block_size=2048, + # vocab_size=32000, + # padding_multiple=64, + # n_layer=22, + # n_head=32, + # n_embd=2048, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", # original TinyLlama use FusedRMSNorm + # norm_eps=1e-5, + # mlp_class_name="LLaMAMLP", + # intermediate_size=5632, + # n_query_groups=4, + # ) + # ] + # for c in tiny_llama: + # for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix) + # configs.append(copy) + # ############ + # # MicroLlama + # ############ + # micro_llama = [ + # dict( + # name="micro-llama-300M", + # hf_config=dict(org="keeeeenw", name="MicroLlama"), + # block_size=2048, + # vocab_size=32000, + # padding_multiple=64, + # n_layer=12, + # n_head=16, + # n_embd=1024, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", # original TinyLlama and MicroLlama use FusedRMSNorm + # norm_eps=1e-5, + # mlp_class_name="LLaMAMLP", + # intermediate_size=5632, + # n_query_groups=4, + # ) + # ] + # configs.extend(micro_llama) + # ########################## + # # Trelis Function Calling + # ########################## + # llama_2_function_calling = [ + # # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json + # dict( + # name="Llama-2-7b-chat-hf-function-calling-v2", + # hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), + # padding_multiple=64, + # n_layer=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # norm_eps=1e-6, + # block_size=4096, + # vocab_size=32000, + # n_head=32, + # n_embd=4096, + # rope_base=10000, + # ) + # ] + # configs.extend(llama_2_function_calling) + # ########## + # # Qwen2.5 + # ########## + # qwen_2_5 = [ + # # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json + # dict( + # name="Qwen2.5-0.5B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=24, + # n_head=14, + # n_embd=896, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=4864, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json + # dict( + # name="Qwen2.5-1.5B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"), + # block_size=131072, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=28, + # n_head=12, + # n_embd=1536, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8960, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json + # dict( + # name="Qwen2.5-3B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=36, + # n_head=16, + # n_embd=2048, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json + # dict( + # name="Qwen2.5-7B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"), + # block_size=131072, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=28, + # n_head=28, + # n_embd=3584, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=18944, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json + # dict( + # name="Qwen2.5-14B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"), + # block_size=131072, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=48, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json + # dict( + # name="Qwen2.5-32B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"), + # block_size=131072, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=64, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=27648, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json + # dict( + # name="Qwen2.5-72B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"), + # block_size=131072, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=29568, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # ] + # qwen_2_5_coder = [ + # # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json + # dict( + # name="Qwen2.5-Coder-0.5B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=24, + # n_head=14, + # n_embd=896, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=4864, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json + # dict( + # name="Qwen2.5-Coder-1.5B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=28, + # n_head=12, + # n_embd=1536, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8960, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json + # dict( + # name="Qwen2.5-Coder-3B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=36, + # n_head=16, + # n_embd=2048, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json + # dict( + # name="Qwen2.5-Coder-7B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=28, + # n_head=28, + # n_embd=3584, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=18944, + # norm_eps=1e-6, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json + # dict( + # name="Qwen2.5-Coder-14B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=48, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json + # dict( + # name="Qwen2.5-Coder-32B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=64, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=27648, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # ] + # qwen_2_5.extend(qwen_2_5_coder) + # qwen_2_5_math = [ + # # https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/config.json + # dict( + # name="Qwen2.5-Math-1.5B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Math-1.5B{}"), + # block_size=4096, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=28, + # n_head=12, + # n_embd=1536, + # n_query_groups=2, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8960, + # norm_eps=1e-6, + # rope_base=10000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Math-7B/blob/main/config.json + # dict( + # name="Qwen2.5-Math-7B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Math-7B{}"), + # block_size=4096, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=28, + # n_head=28, + # n_embd=3584, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=18944, + # norm_eps=1e-6, + # rope_base=10000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-Math-72B/blob/main/config.json + # dict( + # name="Qwen2.5-Math-72B{}", + # hf_config=dict(org="Qwen", name="Qwen2.5-Math-72B{}"), + # block_size=4096, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=29568, + # norm_eps=1e-5, + # rope_base=10000, + # ), + # ] + # qwen_2_5.extend(qwen_2_5_math) + # for c in qwen_2_5: + # for kind in ("", "-Instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # qwen_2_5_1m = [ + # # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json + # dict( + # name="Qwen2.5-7B-Instruct-1M", + # hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"), + # block_size=1010000, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=28, + # n_head=28, + # n_embd=3584, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=18944, + # norm_eps=1e-5, + # rope_base=10000000, + # ), + # # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json + # dict( + # name="Qwen2.5-14B-Instruct-1M", + # hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"), + # block_size=1010000, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=48, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=13824, + # norm_eps=1e-5, + # rope_base=10000000, + # ), + # ] + # configs.extend(qwen_2_5_1m) + # ########## + # # QwQ + # ########## + # qwq = [ + # # https://huggingface.co/Qwen/QwQ-32B/blob/main/config.json + # dict( + # name="QwQ-32B", + # hf_config=dict(org="Qwen", name="QwQ-32B"), + # block_size=131072, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=64, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=27648, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # # https://huggingface.co/Qwen/QwQ-32B-Preview/blob/main/config.json + # dict( + # name="QwQ-32B-Preview", + # hf_config=dict(org="Qwen", name="QwQ-32B-Preview"), + # block_size=32768, + # vocab_size=151643, + # padded_vocab_size=152064, + # n_layer=64, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # attn_bias=True, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=27648, + # norm_eps=1e-5, + # rope_base=1000000, + # ), + # ] + # configs.extend(qwq) + # ########## + # # Qwen3 + # ########## + # qwen_3 = [ + # # https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/config.json + # dict( + # name="Qwen3-0.6B{}", + # hf_config=dict(org="Qwen", name="Qwen3-0.6B{}"), + # block_size=40960, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=28, + # n_head=16, + # n_embd=1024, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=3072, + # norm_eps=1e-6, + # rope_base=1000000, + # head_size=128, + # norm_qk=True, + # ), + # # https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/config.json + # dict( + # name="Qwen3-1.7B{}", + # hf_config=dict(org="Qwen", name="Qwen3-1.7B{}"), + # block_size=40960, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=28, + # n_head=16, + # n_embd=2048, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=6144, + # norm_eps=1e-6, + # rope_base=1000000, + # norm_qk=True, + # ), + # # https://huggingface.co/Qwen/Qwen3-4B/blob/main/config.json + # dict( + # name="Qwen3-4B{}", + # hf_config=dict(org="Qwen", name="Qwen3-4B{}"), + # block_size=40960, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=36, + # n_head=32, + # n_embd=2560, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=9728, + # norm_eps=1e-6, + # rope_base=1000000, + # head_size=128, + # norm_qk=True, + # ), + # # https://huggingface.co/Qwen/Qwen3-8B/blob/main/config.json + # dict( + # name="Qwen3-8B{}", + # hf_config=dict(org="Qwen", name="Qwen3-8B{}"), + # block_size=40960, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=36, + # n_head=32, + # n_embd=4096, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=12288, + # norm_eps=1e-6, + # rope_base=1000000, + # norm_qk=True, + # ), + # # https://huggingface.co/Qwen/Qwen3-14B/blob/main/config.json + # dict( + # name="Qwen3-14B{}", + # hf_config=dict(org="Qwen", name="Qwen3-14B{}"), + # block_size=40960, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=40, + # n_head=40, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=17408, + # norm_eps=1e-6, + # rope_base=1000000, + # norm_qk=True, + # ), + # ] + # for c in qwen_3: + # for kind in ("", "-Base"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # qwen_3_32b = [ + # # https://huggingface.co/Qwen/Qwen3-32B/blob/main/config.json + # dict( + # name="Qwen3-32B", + # hf_config=dict(org="Qwen", name="Qwen3-32B"), + # block_size=40960, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=64, + # n_head=64, + # n_embd=5120, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=25600, + # norm_eps=1e-6, + # rope_base=1000000, + # head_size=128, + # norm_qk=True, + # ), + # ] + # configs.extend(qwen_3_32b) + # qwen_3_moe = [ + # # https://huggingface.co/Qwen/Qwen3-30B-A3B/blob/main/config.json + # dict( + # name="Qwen3-30B-A3B", + # hf_config=dict(org="Qwen", name="Qwen3-30B-A3B"), + # block_size=40960, + # head_size=128, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=48, + # n_head=32, + # n_embd=2048, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMoE", + # intermediate_size=6144, + # moe_intermediate_size=768, + # norm_eps=1e-6, + # rope_base=1000000, + # norm_qk=True, + # n_expert=128, + # n_expert_per_token=8, + # ), + # # https://huggingface.co/Qwen/Qwen3-30B-A3B-Base/blob/main/config.json + # dict( + # name="Qwen3-30B-A3B-Base", + # hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Base"), + # block_size=40960, + # head_size=128, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=48, + # n_head=32, + # n_embd=2048, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMoE", + # intermediate_size=6144, + # moe_intermediate_size=768, + # norm_eps=1e-6, + # rope_base=1000000, + # norm_qk=True, + # n_expert=128, + # n_expert_per_token=8, + # ), + # # https://huggingface.co/Qwen/Qwen3-235B-A22B/blob/main/config.json + # dict( + # name="Qwen3-235B-A22B", + # hf_config=dict(org="Qwen", name="Qwen3-235B-A22B"), + # block_size=40960, + # head_size=128, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=94, + # n_head=64, + # n_embd=4096, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMoE", + # intermediate_size=12288, + # moe_intermediate_size=1536, + # norm_eps=1e-6, + # rope_base=1000000, + # norm_qk=True, + # n_expert=128, + # n_expert_per_token=8, + # ), + # ] + # configs.extend(qwen_3_moe) + # qwen_3_2507_thinking_instruct = [ + # # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json + # dict( + # name="Qwen3-235B-A22B-{}-2507", + # hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"), + # block_size=262144, + # head_size=128, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=94, + # n_head=64, + # n_embd=4096, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMoE", + # intermediate_size=12288, + # moe_intermediate_size=1536, + # norm_eps=1e-6, + # rope_base=5000000, + # norm_qk=True, + # n_expert=128, + # n_expert_per_token=8, + # ), + # # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json + # dict( + # name="Qwen3-30B-A3B-{}-2507", + # hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"), + # block_size=262144, + # head_size=128, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=48, + # n_head=32, + # n_embd=2048, + # n_query_groups=4, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMoE", + # intermediate_size=6144, + # moe_intermediate_size=768, + # norm_eps=1e-6, + # rope_base=10000000, + # norm_qk=True, + # n_expert=128, + # n_expert_per_token=8, + # ), + # # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json + # dict( + # name="Qwen3-4B-{}-2507", + # hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"), + # block_size=262144, + # vocab_size=151643, + # padded_vocab_size=151936, + # n_layer=36, + # n_head=32, + # n_embd=2560, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=9728, + # norm_eps=1e-6, + # rope_base=5000000, + # head_size=128, + # norm_qk=True, + # ), + # ] + # for c in qwen_3_2507_thinking_instruct: + # for kind in ("Thinking", "Instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ############# + # # Salamandra + # ############# + # salamandra = [ + # # https://huggingface.co/BSC-LT/salamandra-2b-instruct/blob/main/config.json + # dict( + # name="salamandra-2b{}", + # hf_config=dict(org="BSC-LT", name="salamandra-2b{}"), + # block_size=8192, + # vocab_size=256000, + # padded_vocab_size=256000, + # n_layer=24, + # n_head=16, + # n_embd=2048, + # n_query_groups=16, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=5440, + # norm_eps=1e-5, + # rope_base=10000, + # ), + # # https://huggingface.co/BSC-LT/salamandra-7b-instruct/blob/main/config.json + # dict( + # name="salamandra-7b{}", + # hf_config=dict(org="BSC-LT", name="salamandra-7b{}"), + # block_size=8192, + # vocab_size=256000, + # padded_vocab_size=256000, + # n_layer=32, + # n_head=32, + # n_embd=4096, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=11008, + # norm_eps=1e-6, + # rope_base=10000, + # ), + # ] + # for c in salamandra: + # for kind in ("", "-instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ############### + # # SmolLM2 + # ############### + # smollm2 = [ + # # https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config.json + # dict( + # name="SmolLM2-135M{}", + # hf_config=dict(org="HuggingFaceTB", name="SmolLM2-135M{}"), + # block_size=8192, + # vocab_size=49152, + # padded_vocab_size=49152, + # n_layer=30, + # n_head=9, + # n_embd=576, + # n_query_groups=3, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=1536, + # rope_base=100000, + # norm_eps=1e-5, + # ), + # # https://huggingface.co/HuggingFaceTB/SmolLM2-360M/blob/main/config.json + # dict( + # name="SmolLM2-360M{}", + # hf_config=dict(org="HuggingFaceTB", name="SmolLM2-360M{}"), + # block_size=8192, + # vocab_size=49152, + # padded_vocab_size=49152, + # n_layer=32, + # n_head=15, + # n_embd=960, + # n_query_groups=5, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=2560, + # rope_base=100000, + # norm_eps=1e-5, + # ), + # # https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B/blob/main/config.json + # dict( + # name="SmolLM2-1.7B{}", + # hf_config=dict(org="HuggingFaceTB", name="SmolLM2-1.7B{}"), + # block_size=8192, + # vocab_size=49152, + # padded_vocab_size=49152, + # n_layer=24, + # n_head=32, + # n_embd=2048, + # n_query_groups=32, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=8192, + # rope_base=130000, + # norm_eps=1e-5, + # ), + # ] + # for c in smollm2: + # for kind in ("", "-Instruct"): + # copy = deepcopy(c) + # copy["name"] = c["name"].format(kind) + # copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) + # configs.append(copy) + # ############### + # # DeepSeek R1 Distill + # ############### + # r1_distill_llama = [ + # # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/blob/main/config.json + # dict( + # name="R1-Distill-Llama-8B", + # hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-8B"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=32, + # n_head=32, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=14336, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), + # # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/config.json + # dict( + # name="R1-Distill-Llama-70B", + # hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-70B"), + # block_size=131072, + # vocab_size=128000, + # padded_vocab_size=128256, + # n_layer=80, + # n_head=64, + # n_embd=8192, + # n_query_groups=8, + # rotary_percentage=1.0, + # parallel_residual=False, + # bias=False, + # norm_class_name="RMSNorm", + # mlp_class_name="LLaMAMLP", + # intermediate_size=28672, + # rope_base=500000, + # rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), + # ), ] - -########################## -# Stability AI StableCode -########################## -stablecode = [ - # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json - dict( - name="stablecode-completion-alpha-3b", - hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"), - block_size=16384, - vocab_size=49152, - n_layer=32, - n_embd=2560, - ), - # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json - dict( - name="stablecode-completion-alpha-3b-4k", - hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"), - vocab_size=49152, - n_layer=32, - n_embd=2560, - ), - # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json - dict( - name="stablecode-instruct-alpha-3b", - hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"), - vocab_size=49152, - n_layer=32, - n_embd=2560, - ), - # https://huggingface.co/stabilityai/stable-code-3b/blob/main/config.json - dict( - name="stable-code-3b", - hf_config=dict(org="stabilityai", name="stable-code-3b"), - padded_vocab_size=50304, - n_layer=32, - n_embd=2560, - block_size=16384, - parallel_residual=False, - bias=False, - mlp_class_name="LLaMAMLP", - intermediate_size=6912, - ), -] -configs.extend(stablecode) - - -#################### -# EleutherAI Pythia -#################### -pythia = [ - # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json - dict( - name="pythia-14m", - hf_config=dict(org="EleutherAI", name="pythia-14m"), - block_size=512, - n_layer=6, - n_embd=128, - n_head=4, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json - dict( - name="pythia-31m", - hf_config=dict(org="EleutherAI", name="pythia-31m"), - block_size=1024, - n_layer=6, - n_embd=256, - n_head=8, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json - dict( - name="pythia-70m", - hf_config=dict(org="EleutherAI", name="pythia-70m"), - block_size=2048, - n_layer=6, - n_embd=512, - n_head=8, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json - dict( - name="pythia-160m", - hf_config=dict(org="EleutherAI", name="pythia-160m"), - block_size=2048, - n_layer=12, - n_embd=768, - n_head=12, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json - dict( - name="pythia-410m", - hf_config=dict(org="EleutherAI", name="pythia-410m"), - block_size=2048, - n_layer=24, - n_embd=1024, - n_head=16, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json - dict( - name="pythia-1b", - hf_config=dict(org="EleutherAI", name="pythia-1b"), - block_size=2048, - n_embd=2048, - n_head=8, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json - dict( - name="pythia-1.4b", - hf_config=dict(org="EleutherAI", name="pythia-1.4b"), - block_size=2048, - n_layer=24, - n_embd=2048, - n_head=16, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json - dict( - name="pythia-2.8b", - hf_config=dict(org="EleutherAI", name="pythia-2.8b"), - block_size=2048, - n_layer=32, - n_embd=2560, - padding_multiple=128, - ), - # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json - dict( - name="pythia-6.9b", - hf_config=dict(org="EleutherAI", name="pythia-6.9b"), - block_size=2048, - n_layer=32, - padding_multiple=256, - ), - # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json - dict( - name="pythia-12b", - hf_config=dict(org="EleutherAI", name="pythia-12b"), - block_size=2048, - n_layer=36, - n_embd=5120, - n_head=40, - ), -] -configs.extend(pythia) -for c in pythia: - # "pythia-14m" and "pythia-31m" don't have deduped version - if c["name"] in ("pythia-14m", "pythia-31m"): - continue - copy = deepcopy(c) - copy["name"] = f"{c['name']}-deduped" - copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped" - configs.append(copy) - - -################# -# TII UAE Falcon -################# -falcon = [ - # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json - dict( - name="falcon-7b{}", - hf_config=dict(org="tiiuae", name="falcon-7b{}"), - block_size=2048, - vocab_size=65024, - padded_vocab_size=65024, - n_layer=32, - n_head=71, - n_embd=4544, - rotary_percentage=1.0, - n_query_groups=1, - bias=False, - # this is not in the config, but in the original model implementation, only for this config - shared_attention_norm=True, - ), - # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json - dict( - name="falcon-40b{}", - hf_config=dict(org="tiiuae", name="falcon-40b{}"), - block_size=2048, - vocab_size=65024, - padded_vocab_size=65024, - n_layer=60, - n_head=128, - n_embd=8192, - rotary_percentage=1.0, - n_query_groups=8, - bias=False, - ), -] -for c in falcon: - for kind in ("", "-instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - -# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json -falcon180b = dict( - name="falcon-180B{}", - hf_config=dict(org="tiiuae", name="falcon-180B{}"), - block_size=2048, - vocab_size=65024, - padded_vocab_size=65024, - n_layer=80, - n_head=232, - n_embd=14848, - rotary_percentage=1.0, - n_query_groups=8, - bias=False, -) - -for kind in ("", "-chat"): - copy = deepcopy(falcon180b) - copy["name"] = falcon180b["name"].format(kind) - copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind) - configs.append(copy) - -falcon3 = [ - # https://huggingface.co/tiiuae/Falcon3-1B-Base/blob/main/config.json - dict( - name="Falcon3-1B{}", - hf_config=dict(org="tiiuae", name="Falcon3-1B{}"), - block_size=4096, - vocab_size=131072, - padded_vocab_size=131072, - n_layer=18, - n_head=8, - n_query_groups=4, - n_embd=2048, - rotary_percentage=1.0, - parallel_residual=False, - rope_base=1000042, - norm_eps=1e-6, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8192, - ), - # https://huggingface.co/tiiuae/Falcon3-3B-Base/blob/main/config.json - dict( - name="Falcon3-3B{}", - hf_config=dict(org="tiiuae", name="Falcon3-3B{}"), - block_size=32768, - vocab_size=131072, - padded_vocab_size=131072, - n_layer=22, - n_head=12, - n_query_groups=4, - n_embd=3072, - rotary_percentage=1.0, - parallel_residual=False, - rope_base=1000042, - norm_eps=1e-6, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=9216, - ), - # https://huggingface.co/tiiuae/Falcon3-7B-Base/blob/main/config.json - dict( - name="Falcon3-7B{}", - hf_config=dict(org="tiiuae", name="Falcon3-7B{}"), - block_size=32768, - vocab_size=131072, - padded_vocab_size=131072, - n_layer=28, - n_head=12, - n_query_groups=4, - n_embd=3072, - rotary_percentage=1.0, - parallel_residual=False, - rope_base=1000042, - norm_eps=1e-6, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=23040, - ), - # https://huggingface.co/tiiuae/Falcon3-10B-Base/blob/main/config.json - dict( - name="Falcon3-10B{}", - hf_config=dict(org="tiiuae", name="Falcon3-10B{}"), - block_size=32768, - vocab_size=131072, - padded_vocab_size=131072, - n_layer=40, - n_head=12, - n_query_groups=4, - n_embd=3072, - rotary_percentage=1.0, - parallel_residual=False, - rope_base=1000042, - norm_eps=1e-6, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=23040, - ), -] -for c in falcon3: - for kind in ("-Base", "-Instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - - -############################# -# OpenLM Research Open LLaMA -############################# -open_LLaMA = [ - # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json - dict( - name="open_llama_3b", - hf_config=dict(org="openlm-research", name="open_llama_3b"), - block_size=2048, - vocab_size=32000, - padding_multiple=64, - n_layer=26, - n_embd=3200, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-6, - mlp_class_name="LLaMAMLP", - intermediate_size=8640, - ), - # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json - dict( - name="open_llama_7b", - hf_config=dict(org="openlm-research", name="open_llama_7b"), - block_size=2048, - vocab_size=32000, - padding_multiple=64, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-6, - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - ), - # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json - dict( - name="open_llama_13b", - hf_config=dict(org="openlm-research", name="open_llama_13b"), - block_size=2048, - vocab_size=32000, - padding_multiple=64, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-6, - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - ), -] -configs.extend(open_LLaMA) - -############### -# Meta LLaMA 2 -############### -llama_2 = [ - # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json - dict( - name="Llama-2-7b{}-hf", - hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"), - vocab_size=32000, - padding_multiple=64, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - ), - # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json - dict( - name="Llama-2-13b{}-hf", - hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"), - vocab_size=32000, - padding_multiple=64, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - ), - # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json - dict( - name="Llama-2-70b{}-hf", - hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"), - vocab_size=32000, - padding_multiple=64, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ), -] -for c in llama_2: - for kind in ("", "-chat"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - - -############### -# Meta LLaMA 3 -############### -llama_3 = [ - # https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/config.json - dict( - name="Llama-3-8B{}", - hf_config=dict(org="meta-llama", name="Meta-Llama-3-8B{}"), - block_size=8192, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=32, - n_head=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - rope_base=500000, - ), - # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/blob/main/config.json - dict( - name="Llama-3.1-8B{}", - hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-8B{}"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=32, - n_head=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), - # https://huggingface.co/meta-llama/Meta-Llama-3-70B/blob/main/config.json - dict( - name="Llama-3-70B{}", - hf_config=dict(org="meta-llama", name="Meta-Llama-3-70B{}"), - block_size=8192, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=500000, - ), - # https://huggingface.co/meta-llama/Meta-Llama-3.1-70B/blob/main/config.json - dict( - name="Llama-3.1-70B{}", - hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-70B{}"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), - # https://huggingface.co/meta-llama/Meta-Llama-3.1-405B/blob/main/config.json - dict( - name="Llama-3.1-405B{}", - hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-405B{}"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=126, - n_head=128, - n_embd=16384, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=53248, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), - # https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json - dict( - name="Llama-3.2-1B{}", - hf_config=dict(org="meta-llama", name="Llama-3.2-1B{}"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=16, - n_embd=2048, - n_head=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8192, - rope_base=500000, - rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), - # https://huggingface.co/meta-llama/Llama-3.2-3B/blob/main/config.json - dict( - name="Llama-3.2-3B{}", - hf_config=dict(org="meta-llama", name="Llama-3.2-3B{}"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=28, - n_embd=3072, - n_head=24, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8192, - rope_base=500000, - rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), - # https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/config.json - dict( - name="Llama-3.3-70B-Instruct", - hf_config=dict(org="meta-llama", name="Llama-3.3-70B-Instruct"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), -] -for c in llama_3: - if c["name"] == "Llama-3.3-70B-Instruct": - configs.append(c) - continue - for kind in ("", "-Instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - -######################### -# NVIDIA Llama Nemotron -######################### -configs.append( - dict( - name="Llama-3.1-Nemotron-70B-Instruct-HF", - hf_config=dict(org="nvidia", name="Llama-3.1-Nemotron-70B-Instruct-HF"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), -) - -################# -# Allen AI OLMo -################# -olmo = [ - # https://huggingface.co/allenai/OLMo-1B-hf/blob/main/config.json - dict( - name="OLMo-1B-hf", - hf_config=dict(org="allenai", name="OLMo-1B-hf"), - vocab_size=50280, - padded_vocab_size=50304, - block_size=2048, - n_embd=2048, - n_layer=16, - n_head=16, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="LayerNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8192, - ), - # https://huggingface.co/allenai/OLMo-7B-hf/blob/main/config.json - dict( - name="OLMo-7B-hf", - hf_config=dict(org="allenai", name="OLMo-7B-hf"), - vocab_size=50280, - padded_vocab_size=50304, - block_size=2048, - n_layer=32, - n_head=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="LayerNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - ), - # https://huggingface.co/allenai/OLMo-7B-Instruct-hf/blob/main/config.json - dict( - name="OLMo-7B-Instruct-hf", - hf_config=dict(org="allenai", name="OLMo-7B-Instruct-hf"), - vocab_size=50280, - padded_vocab_size=50304, - block_size=2048, - n_layer=32, - n_head=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="LayerNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - ), -] - -configs.extend(olmo) - -olmo2 = [ - # https://huggingface.co/allenai/OLMo-2-1124-7B/blob/main/config.json - dict( - name="OLMo-2-1124-7B{}", - hf_config=dict(org="allenai", name="OLMo-2-1124-7B{}"), - vocab_size=100278, - padded_vocab_size=100352, - block_size=4096, - n_embd=4096, - n_layer=32, - n_head=32, - n_query_groups=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - norm_eps=1e-06, - intermediate_size=11008, - rope_base=500000, - norm_qk=True, - post_mlp_norm=True, - norm_1=False, - norm_2=False, - norm_qk_type="olmo2", - post_attention_norm=True, - ), - # https://huggingface.co/allenai/OLMo-2-1124-13B/blob/main/config.json - dict( - name="OLMo-2-1124-13B{}", - hf_config=dict(org="allenai", name="OLMo-2-1124-13B{}"), - vocab_size=100278, - padded_vocab_size=100352, - block_size=4096, - n_embd=5120, - n_layer=40, - n_head=40, - n_query_groups=40, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - norm_eps=1e-06, - intermediate_size=13824, - rope_base=500000, - norm_qk=True, - post_mlp_norm=True, - norm_1=False, - norm_2=False, - norm_qk_type="olmo2", - post_attention_norm=True, - ), -] - -for c in olmo2: - for kind in ("", "-SFT", "-DPO", "-Instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - -############### -# Google Gemma -############### -gemma = [ - # https://huggingface.co/google/gemma-2b/blob/main/config.json - dict( - name="Gemma-2b", - hf_config=dict(org="google", name="gemma-2b"), - scale_embeddings=True, - vocab_size=256000, - padding_multiple=64, - n_embd=2048, - n_layer=18, - n_head=8, - n_query_groups=1, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - intermediate_size=16384, - ), - # https://huggingface.co/google/gemma-7b/blob/main/config.json - dict( - name="Gemma-7b", - hf_config=dict(org="google", name="gemma-7b"), - scale_embeddings=True, - vocab_size=256000, - padding_multiple=64, - n_embd=3072, - n_layer=28, - n_head=16, - head_size=256, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - intermediate_size=24576, - ), - # https://huggingface.co/google/gemma-2-2b/blob/main/config.json - dict( - name="Gemma-2-2b", - hf_config=dict(org="google", name="gemma-2-2b"), - scale_embeddings=True, - attention_scores_scalar=256, - vocab_size=256000, - block_size=8192, - sliding_window_size=4096, - # only layer with idx 0, 2, 4, ... have sliding window attention - sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(26)], - intermediate_size=9216, - n_embd=2304, - n_layer=26, - n_head=8, - n_query_groups=4, - head_size=256, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - attention_logit_softcapping=50.0, - final_logit_softcapping=30.0, - ), - # https://huggingface.co/google/gemma-2-9b/blob/main/config.json - dict( - name="Gemma-2-9b", - hf_config=dict(org="google", name="gemma-2-9b"), - scale_embeddings=True, - attention_scores_scalar=256, - vocab_size=256000, - block_size=8192, - sliding_window_size=4096, - # only layer with idx 0, 2, 4, ... have sliding window attention - sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(42)], - intermediate_size=14336, - n_embd=3584, - n_layer=42, - n_head=16, - n_query_groups=8, - head_size=256, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - attention_logit_softcapping=50.0, - final_logit_softcapping=30.0, - ), - # https://huggingface.co/google/gemma-2-27b/blob/main/config.json - dict( - name="Gemma-2-27b", - hf_config=dict(org="google", name="gemma-2-27b"), - scale_embeddings=True, - # In Gemma 2 27B attention scores are scaled not by `sqrt(head_size)` (11.31), - # but by `sqrt(n_emb // n_head)` = sqrt(4608 // 32) = 12 - attention_scores_scalar=144, - vocab_size=256000, - block_size=8192, - sliding_window_size=4096, - # only layer with idx 0, 2, 4, ... have sliding window attention - sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(46)], - intermediate_size=36864, - n_embd=4608, - n_layer=46, - n_head=32, - n_query_groups=16, - head_size=128, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - attention_logit_softcapping=50.0, - final_logit_softcapping=30.0, - ), -] -configs.extend(gemma) -for c in gemma: - copy = deepcopy(c) - copy["name"] = f"{c['name']}-it" - copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it" - configs.append(copy) - -################## -# Google Gemma 3 -################## -gemma3 = [ - # https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json - dict( - name="Gemma-3-1b-it", - hf_config=dict(org="google", name="gemma-3-1b-it"), - scale_embeddings=True, - attention_scores_scalar=256, - vocab_size=262144, - block_size=131072, - sliding_window_size=512, - # 5 local layers for every global layer - sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)], - intermediate_size=6912, - n_embd=1152, - n_layer=26, - n_head=4, - n_query_groups=1, - head_size=256, - rotary_percentage=1.0, - rope_adjustments=None, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - norm_qk=True, - rope_base=1000000, - rope_local_base_freq=10000, - # 5 local layers for every global layer - rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)], - ), - # https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json - dict( - name="Gemma-3-4b-it", - hf_config=dict(org="google", name="gemma-3-4b-it"), - scale_embeddings=True, - attention_scores_scalar=256, - vocab_size=262144, - block_size=131072, - sliding_window_size=1024, - # 5 local layers for every global layer - sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)], - intermediate_size=10240, - n_embd=2560, - n_layer=34, - n_head=8, - n_query_groups=4, - head_size=256, - rotary_percentage=1.0, - rope_adjustments=dict(factor=8.0), - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - norm_qk=True, - rope_base=1000000, - rope_local_base_freq=10000, - # 5 local layers for every global layer - rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)], - ), - # https://huggingface.co/google/gemma-3-12b-it/blob/main/config.json - dict( - name="Gemma-3-12b-it", - hf_config=dict(org="google", name="gemma-3-12b-it"), - scale_embeddings=True, - attention_scores_scalar=256, - vocab_size=262144, - block_size=131072, - sliding_window_size=1024, - # 5 local layers for every global layer - sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)], - intermediate_size=15360, - n_embd=3840, - n_layer=48, - n_head=16, - n_query_groups=8, - head_size=256, - rotary_percentage=1.0, - rope_adjustments=dict(factor=8.0), - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - norm_qk=True, - rope_base=1000000, - rope_local_base_freq=10000, - # 5 local layers for every global layer - rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)], - ), - # https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json - dict( - name="Gemma-3-27b-it", - hf_config=dict(org="google", name="gemma-3-27b-it"), - scale_embeddings=True, - attention_scores_scalar=168, - vocab_size=262144, - block_size=131072, - sliding_window_size=1024, - # 5 local layers for every global layer - sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)], - intermediate_size=21504, - n_embd=5376, - n_layer=62, - n_head=32, - n_query_groups=16, - head_size=128, - rotary_percentage=1.0, - rope_adjustments=dict(factor=8.0), - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - post_attention_norm=True, - post_mlp_norm=True, - norm_qk=True, - rope_base=1000000, - rope_local_base_freq=10000, - # 5 local layers for every global layer - rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)], - ), -] -configs.extend(gemma3) - -################## -# Google CodeGemma -################## -codegemma = [ - # https://huggingface.co/google/codegemma-7b-it/blob/main/config.json - dict( - name="CodeGemma-7b-it", - hf_config=dict(org="google", name="codegemma-7b-it"), - scale_embeddings=True, - vocab_size=256000, - padding_multiple=64, - n_embd=3072, - n_layer=28, - n_head=16, - head_size=256, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="GemmaMLP", - gelu_approximate="tanh", - intermediate_size=24576, - ), -] -configs.extend(codegemma) - - -########################## -# Stability AI FreeWilly2 -########################## -freewilly_2 = [ - # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json - dict( - name="FreeWilly2", - hf_config=dict(org="stabilityai", name="FreeWilly2"), - vocab_size=32000, - padding_multiple=64, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ) -] -configs.extend(freewilly_2) - - -################## -# Meta Code Llama -################## -code_llama = [ - # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json - dict( - name="CodeLlama-7b-hf", - hf_config=dict(org="codellama", name="CodeLlama-7b-hf"), - block_size=16384, - vocab_size=32016, - padding_multiple=16, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json - dict( - name="CodeLlama-13b-hf", - hf_config=dict(org="codellama", name="CodeLlama-13b-hf"), - block_size=16384, - vocab_size=32016, - padding_multiple=16, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json - dict( - name="CodeLlama-34b-hf", - hf_config=dict(org="codellama", name="CodeLlama-34b-hf"), - block_size=16384, - vocab_size=32000, - padded_vocab_size=32000, - n_layer=48, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=22016, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-70b-hf/blob/main/config.json - dict( - name="CodeLlama-70b-hf", - hf_config=dict(org="codellama", name="CodeLlama-70b-hf"), - block_size=16384, - vocab_size=32016, - padding_multiple=16, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json - dict( - name="CodeLlama-7b-Python-hf", - hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"), - block_size=16384, - vocab_size=32000, - padded_vocab_size=32000, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json - dict( - name="CodeLlama-13b-Python-hf", - hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"), - block_size=16384, - vocab_size=32000, - padded_vocab_size=32000, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json - dict( - name="CodeLlama-34b-Python-hf", - hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"), - block_size=16384, - vocab_size=32000, - padded_vocab_size=32000, - n_layer=48, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=22016, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-70b-Python-hf/blob/main/config.json - dict( - name="CodeLlama-70b-Python-hf", - hf_config=dict(org="codellama", name="CodeLlama-70b-Python-hf"), - block_size=16384, - vocab_size=32016, - padding_multiple=16, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json - dict( - name="CodeLlama-7b-Instruct-hf", - hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"), - block_size=16384, - vocab_size=32016, - padding_multiple=16, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json - dict( - name="CodeLlama-13b-Instruct-hf", - hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"), - block_size=2048, - vocab_size=32016, - padding_multiple=16, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json - dict( - name="CodeLlama-34b-Instruct-hf", - hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"), - block_size=16384, - vocab_size=32000, - padded_vocab_size=32000, - n_layer=48, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=22016, - rope_base=1000000, - ), - # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/config.json - dict( - name="CodeLlama-70b-Instruct-hf", - hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"), - block_size=16384, - # 32016 is an added token, so not reported in vocab_size - # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/tokenizer_config.json - vocab_size=32015, - padding_multiple=16, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=1000000, - ), -] -configs.extend(code_llama) - - -######################## -# garage-bAInd Platypus -######################## -platypus = [ - # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json - dict( - name="Platypus-30B", - hf_config=dict(org="garage-bAInd", name="Platypus-30B"), - block_size=2048, - padded_vocab_size=32000, - n_layer=60, - n_head=52, - n_embd=6656, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-06, - mlp_class_name="LLaMAMLP", - intermediate_size=17920, - ), - # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json - dict( - name="Platypus2-7B", - hf_config=dict(org="garage-bAInd", name="Platypus2-7B"), - padded_vocab_size=32000, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - ), - # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json - dict( - name="Platypus2-13B", - hf_config=dict(org="garage-bAInd", name="Platypus2-13B"), - padded_vocab_size=32000, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - ), - # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json - dict( - name="Platypus2-70B", - hf_config=dict(org="garage-bAInd", name="Platypus2-70B"), - padded_vocab_size=32000, - n_layer=80, - n_head=64, - n_embd=8192, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ), - # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json - dict( - name="Camel-Platypus2-13B", - hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"), - padded_vocab_size=32000, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - ), - # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json - dict( - name="Camel-Platypus2-70B", - hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"), - padded_vocab_size=32000, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ), - # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json - dict( - name="Stable-Platypus2-13B", - hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"), - padded_vocab_size=32000, - n_layer=40, - n_head=40, - n_embd=5120, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - ), - # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json - dict( - name="Platypus2-70B-instruct", - hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"), - padded_vocab_size=32000, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ), -] -configs.extend(platypus) - - -################################## -# togethercomputer LLaMA-2-7B-32K -################################## -together_llama2_32k = [ - # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json - dict( - name="LLaMA-2-7B-32K", - hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"), - vocab_size=32000, - padding_multiple=64, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - rope_condense_ratio=8, - ) -] -configs.extend(together_llama2_32k) - - -################ -# Microsoft Phi -################ -phi = [ - # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json - dict( - name="phi-1_5", - hf_config=dict(org="microsoft", name="phi-1_5"), - vocab_size=50257, - padded_vocab_size=51200, - block_size=2048, - n_embd=2048, - n_layer=24, - rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64 - shared_attention_norm=True, - lm_head_bias=True, - gelu_approximate="tanh", - ), - # https://huggingface.co/microsoft/phi-2/blob/main/config.json - dict( - name="phi-2", - hf_config=dict(org="microsoft", name="phi-2"), - vocab_size=50257, - padded_vocab_size=51200, - block_size=2048, - n_embd=2560, - n_layer=32, - rotary_percentage=0.4, # 32 / (n_embd / n_head) = 32 / 80 - shared_attention_norm=True, - lm_head_bias=True, - gelu_approximate="tanh", - ), - # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json - dict( - name="Phi-3-mini-4k-instruct", - hf_config=dict(org="microsoft", name="Phi-3-mini-4k-instruct"), - vocab_size=32000, - padded_vocab_size=32064, - block_size=4096, - n_embd=3072, - n_layer=32, - rotary_percentage=1.0, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=8192, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - sliding_window_size=2048, - ), - # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json - dict( - name="Phi-3-mini-128k-instruct", - hf_config=dict(org="microsoft", name="Phi-3-mini-128k-instruct"), - vocab_size=32000, - padded_vocab_size=32064, - block_size=131072, - n_embd=3072, - n_layer=32, - rotary_percentage=1.0, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=8192, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - sliding_window_size=262145, - ), - # https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/config.json - dict( - name="Phi-3.5-mini-instruct", - hf_config=dict(org="microsoft", name="Phi-3.5-mini-instruct"), - vocab_size=32000, - padded_vocab_size=32064, - block_size=4096, - n_embd=3072, - n_layer=32, - rotary_percentage=1.0, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=8192, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - ), - # https://huggingface.co/microsoft/phi-4/blob/main/config.json - dict( - name="phi-4", - hf_config=dict(org="microsoft", name="phi-4"), - vocab_size=100352, - padded_vocab_size=100352, - block_size=16384, - n_embd=5120, - n_layer=40, - n_head=40, - n_query_groups=10, - rotary_percentage=1.0, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=17920, - rope_base=250000, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - ), - # https://huggingface.co/microsoft/Phi-4-reasoning/blob/main/config.json - dict( - name="Phi-4-reasoning", - hf_config=dict(org="microsoft", name="Phi-4-reasoning"), - vocab_size=100352, - padded_vocab_size=100352, - block_size=32768, - n_embd=5120, - n_layer=40, - n_head=40, - n_query_groups=10, - rotary_percentage=1.0, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=17920, - rope_base=500000, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - ), - # https://huggingface.co/microsoft/Phi-4-reasoning-plus/blob/main/config.json - dict( - name="Phi-4-reasoning-plus", - hf_config=dict(org="microsoft", name="Phi-4-reasoning-plus"), - vocab_size=100352, - padded_vocab_size=100352, - block_size=32768, - n_embd=5120, - n_layer=40, - n_head=40, - n_query_groups=10, - rotary_percentage=1.0, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=17920, - rope_base=500000, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - ), - # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json - dict( - name="Phi-4-mini-instruct", - hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"), - vocab_size=200019, - padded_vocab_size=200064, - block_size=131072, - n_embd=3072, - n_layer=32, - n_head=24, - n_query_groups=8, - rotary_percentage=0.75, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=8192, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - sliding_window_size=262145, - ), - # https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/config.json - dict( - name="Phi-4-mini-reasoning", - hf_config=dict(org="microsoft", name="Phi-4-mini-reasoning"), - vocab_size=200019, - padded_vocab_size=200064, - block_size=131072, - n_embd=3072, - n_layer=32, - n_head=24, - n_query_groups=8, - rotary_percentage=0.75, - bias=False, - norm_class_name="RMSNorm", - intermediate_size=8192, - mlp_class_name="LLaMAMLP", - parallel_residual=False, - sliding_window_size=262145, - ), -] -configs.extend(phi) - - -############# -# Mistral AI -############# - -configs.append( - # https://huggingface.co/mistralai/mathstral-7B-v0.1/blob/main/config.json - dict( - name="Mathstral-7B-v0.1", - hf_config=dict(org="mistralai", name="mathstral-7B-v0.1"), - padded_vocab_size=32768, - block_size=32768, - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - sliding_window_size=4096, - ) -) - -mistral = [ - # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json - dict( - name="Mistral-7B-{}v0.1", - hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"), - padded_vocab_size=32000, - block_size=4096, # should be 32768 but sliding window attention is not implemented - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - sliding_window_size=4096, - ), - # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json - dict( - name="Mixtral-8x7B-{}v0.1", - hf_config=dict(org="mistralai", name="Mixtral-8x7B-{}v0.1"), - padded_vocab_size=32000, - block_size=32768, - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMoE", - intermediate_size=14336, - rope_base=1000000, - n_expert=8, - n_expert_per_token=2, - ), - # https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json - dict( - name="Mixtral-8x22B-{}v0.1", - hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"), - padded_vocab_size=32768, - block_size=65536, - n_layer=56, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMoE", - intermediate_size=16384, - n_head=48, - n_embd=6144, - rope_base=1000000, - n_expert=8, - n_expert_per_token=2, - ), -] -for c in mistral: - for kind in ("", "Instruct-"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) -configs.append( - # https://huggingface.co/unsloth/mistral-7b-v0.2/blob/main/config.json - dict( - name="Mistral-7B-v0.2", - hf_config=dict(org="unsloth", name="Mistral-7B-v0.2"), - padded_vocab_size=32000, - block_size=32768, - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - ) -) -configs.append( - # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/config.json - dict( - name="Mistral-7B-Instruct-v0.2", - hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.2"), - padded_vocab_size=32000, - block_size=32768, - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - ) -) -configs.append( - # https://huggingface.co/mistralai/Mistral-7B-v0.3/blob/main/config.json - dict( - name="Mistral-7B-v0.3", - hf_config=dict(org="mistralai", name="Mistral-7B-v0.3"), - padded_vocab_size=32768, - block_size=32768, - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - ) -) -configs.append( - # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/blob/main/config.json - dict( - name="Mistral-7B-Instruct-v0.3", - hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.3"), - padded_vocab_size=32768, - block_size=32768, - n_layer=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - ) -) -configs.append( - # https://huggingface.co/mistralai/Mistral-Large-Instruct-2407/blob/main/config.json - dict( - name="Mistral-Large-Instruct-2407", - hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2407"), - padded_vocab_size=32768, - block_size=32768, - n_layer=88, - n_head=96, - n_embd=12288, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ) -) -configs.append( - # https://huggingface.co/mistralai/Mistral-Large-Instruct-2411/blob/main/config.json - dict( - name="Mistral-Large-Instruct-2411", - hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2411"), - padded_vocab_size=32768, - block_size=32768, - n_layer=88, - n_head=96, - n_embd=12288, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - norm_eps=1e-05, - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - ) -) - - -############ -# TinyLlama -############ -tiny_llama = [ - dict( - name="tiny-llama-1.1b{}", - hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"), - block_size=2048, - vocab_size=32000, - padding_multiple=64, - n_layer=22, - n_head=32, - n_embd=2048, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", # original TinyLlama use FusedRMSNorm - norm_eps=1e-5, - mlp_class_name="LLaMAMLP", - intermediate_size=5632, - n_query_groups=4, - ) -] -for c in tiny_llama: - for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix) - configs.append(copy) - - -############ -# MicroLlama -############ -micro_llama = [ - dict( - name="micro-llama-300M", - hf_config=dict(org="keeeeenw", name="MicroLlama"), - block_size=2048, - vocab_size=32000, - padding_multiple=64, - n_layer=12, - n_head=16, - n_embd=1024, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", # original TinyLlama and MicroLlama use FusedRMSNorm - norm_eps=1e-5, - mlp_class_name="LLaMAMLP", - intermediate_size=5632, - n_query_groups=4, - ) -] -configs.extend(micro_llama) - - -########################## -# Trelis Function Calling -########################## -llama_2_function_calling = [ - # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json - dict( - name="Llama-2-7b-chat-hf-function-calling-v2", - hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"), - padding_multiple=64, - n_layer=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - norm_eps=1e-6, - block_size=4096, - vocab_size=32000, - n_head=32, - n_embd=4096, - rope_base=10000, - ) -] - -configs.extend(llama_2_function_calling) - -########## -# Qwen2.5 -########## -qwen_2_5 = [ - # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json - dict( - name="Qwen2.5-0.5B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=24, - n_head=14, - n_embd=896, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=4864, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json - dict( - name="Qwen2.5-1.5B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"), - block_size=131072, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=28, - n_head=12, - n_embd=1536, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8960, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json - dict( - name="Qwen2.5-3B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=36, - n_head=16, - n_embd=2048, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json - dict( - name="Qwen2.5-7B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"), - block_size=131072, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=28, - n_head=28, - n_embd=3584, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=18944, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json - dict( - name="Qwen2.5-14B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"), - block_size=131072, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=48, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - norm_eps=1e-5, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json - dict( - name="Qwen2.5-32B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"), - block_size=131072, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=64, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=27648, - norm_eps=1e-5, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json - dict( - name="Qwen2.5-72B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"), - block_size=131072, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=29568, - norm_eps=1e-5, - rope_base=1000000, - ), -] - -qwen_2_5_coder = [ - # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json - dict( - name="Qwen2.5-Coder-0.5B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=24, - n_head=14, - n_embd=896, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=4864, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json - dict( - name="Qwen2.5-Coder-1.5B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=28, - n_head=12, - n_embd=1536, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8960, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json - dict( - name="Qwen2.5-Coder-3B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=36, - n_head=16, - n_embd=2048, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json - dict( - name="Qwen2.5-Coder-7B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=28, - n_head=28, - n_embd=3584, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=18944, - norm_eps=1e-6, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json - dict( - name="Qwen2.5-Coder-14B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=48, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - norm_eps=1e-5, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json - dict( - name="Qwen2.5-Coder-32B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=64, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=27648, - norm_eps=1e-5, - rope_base=1000000, - ), -] - -qwen_2_5.extend(qwen_2_5_coder) - -qwen_2_5_math = [ - # https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/config.json - dict( - name="Qwen2.5-Math-1.5B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Math-1.5B{}"), - block_size=4096, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=28, - n_head=12, - n_embd=1536, - n_query_groups=2, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8960, - norm_eps=1e-6, - rope_base=10000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Math-7B/blob/main/config.json - dict( - name="Qwen2.5-Math-7B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Math-7B{}"), - block_size=4096, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=28, - n_head=28, - n_embd=3584, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=18944, - norm_eps=1e-6, - rope_base=10000, - ), - # https://huggingface.co/Qwen/Qwen2.5-Math-72B/blob/main/config.json - dict( - name="Qwen2.5-Math-72B{}", - hf_config=dict(org="Qwen", name="Qwen2.5-Math-72B{}"), - block_size=4096, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=29568, - norm_eps=1e-5, - rope_base=10000, - ), -] - -qwen_2_5.extend(qwen_2_5_math) - -for c in qwen_2_5: - for kind in ("", "-Instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - -qwen_2_5_1m = [ - # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json - dict( - name="Qwen2.5-7B-Instruct-1M", - hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"), - block_size=1010000, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=28, - n_head=28, - n_embd=3584, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=18944, - norm_eps=1e-5, - rope_base=10000000, - ), - # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json - dict( - name="Qwen2.5-14B-Instruct-1M", - hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"), - block_size=1010000, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=48, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=13824, - norm_eps=1e-5, - rope_base=10000000, - ), -] - -configs.extend(qwen_2_5_1m) - -########## -# QwQ -########## -qwq = [ - # https://huggingface.co/Qwen/QwQ-32B/blob/main/config.json - dict( - name="QwQ-32B", - hf_config=dict(org="Qwen", name="QwQ-32B"), - block_size=131072, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=64, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=27648, - norm_eps=1e-5, - rope_base=1000000, - ), - # https://huggingface.co/Qwen/QwQ-32B-Preview/blob/main/config.json - dict( - name="QwQ-32B-Preview", - hf_config=dict(org="Qwen", name="QwQ-32B-Preview"), - block_size=32768, - vocab_size=151643, - padded_vocab_size=152064, - n_layer=64, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - attn_bias=True, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=27648, - norm_eps=1e-5, - rope_base=1000000, - ), -] - -configs.extend(qwq) - -########## -# Qwen3 -########## -qwen_3 = [ - # https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/config.json - dict( - name="Qwen3-0.6B{}", - hf_config=dict(org="Qwen", name="Qwen3-0.6B{}"), - block_size=40960, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=28, - n_head=16, - n_embd=1024, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=3072, - norm_eps=1e-6, - rope_base=1000000, - head_size=128, - norm_qk=True, - ), - # https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/config.json - dict( - name="Qwen3-1.7B{}", - hf_config=dict(org="Qwen", name="Qwen3-1.7B{}"), - block_size=40960, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=28, - n_head=16, - n_embd=2048, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=6144, - norm_eps=1e-6, - rope_base=1000000, - norm_qk=True, - ), - # https://huggingface.co/Qwen/Qwen3-4B/blob/main/config.json - dict( - name="Qwen3-4B{}", - hf_config=dict(org="Qwen", name="Qwen3-4B{}"), - block_size=40960, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=36, - n_head=32, - n_embd=2560, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=9728, - norm_eps=1e-6, - rope_base=1000000, - head_size=128, - norm_qk=True, - ), - # https://huggingface.co/Qwen/Qwen3-8B/blob/main/config.json - dict( - name="Qwen3-8B{}", - hf_config=dict(org="Qwen", name="Qwen3-8B{}"), - block_size=40960, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=36, - n_head=32, - n_embd=4096, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=12288, - norm_eps=1e-6, - rope_base=1000000, - norm_qk=True, - ), - # https://huggingface.co/Qwen/Qwen3-14B/blob/main/config.json - dict( - name="Qwen3-14B{}", - hf_config=dict(org="Qwen", name="Qwen3-14B{}"), - block_size=40960, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=40, - n_head=40, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=17408, - norm_eps=1e-6, - rope_base=1000000, - norm_qk=True, - ), -] -for c in qwen_3: - for kind in ("", "-Base"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) -qwen_3_32b = [ - # https://huggingface.co/Qwen/Qwen3-32B/blob/main/config.json - dict( - name="Qwen3-32B", - hf_config=dict(org="Qwen", name="Qwen3-32B"), - block_size=40960, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=64, - n_head=64, - n_embd=5120, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=25600, - norm_eps=1e-6, - rope_base=1000000, - head_size=128, - norm_qk=True, - ), -] -configs.extend(qwen_3_32b) - -qwen_3_moe = [ - # https://huggingface.co/Qwen/Qwen3-30B-A3B/blob/main/config.json - dict( - name="Qwen3-30B-A3B", - hf_config=dict(org="Qwen", name="Qwen3-30B-A3B"), - block_size=40960, - head_size=128, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=48, - n_head=32, - n_embd=2048, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMoE", - intermediate_size=6144, - moe_intermediate_size=768, - norm_eps=1e-6, - rope_base=1000000, - norm_qk=True, - n_expert=128, - n_expert_per_token=8, - ), - # https://huggingface.co/Qwen/Qwen3-30B-A3B-Base/blob/main/config.json - dict( - name="Qwen3-30B-A3B-Base", - hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Base"), - block_size=40960, - head_size=128, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=48, - n_head=32, - n_embd=2048, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMoE", - intermediate_size=6144, - moe_intermediate_size=768, - norm_eps=1e-6, - rope_base=1000000, - norm_qk=True, - n_expert=128, - n_expert_per_token=8, - ), - # https://huggingface.co/Qwen/Qwen3-235B-A22B/blob/main/config.json - dict( - name="Qwen3-235B-A22B", - hf_config=dict(org="Qwen", name="Qwen3-235B-A22B"), - block_size=40960, - head_size=128, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=94, - n_head=64, - n_embd=4096, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMoE", - intermediate_size=12288, - moe_intermediate_size=1536, - norm_eps=1e-6, - rope_base=1000000, - norm_qk=True, - n_expert=128, - n_expert_per_token=8, - ), -] -configs.extend(qwen_3_moe) - -qwen_3_2507_thinking_instruct = [ - # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json - dict( - name="Qwen3-235B-A22B-{}-2507", - hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"), - block_size=262144, - head_size=128, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=94, - n_head=64, - n_embd=4096, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMoE", - intermediate_size=12288, - moe_intermediate_size=1536, - norm_eps=1e-6, - rope_base=5000000, - norm_qk=True, - n_expert=128, - n_expert_per_token=8, - ), - # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json - dict( - name="Qwen3-30B-A3B-{}-2507", - hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"), - block_size=262144, - head_size=128, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=48, - n_head=32, - n_embd=2048, - n_query_groups=4, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMoE", - intermediate_size=6144, - moe_intermediate_size=768, - norm_eps=1e-6, - rope_base=10000000, - norm_qk=True, - n_expert=128, - n_expert_per_token=8, - ), - # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json - dict( - name="Qwen3-4B-{}-2507", - hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"), - block_size=262144, - vocab_size=151643, - padded_vocab_size=151936, - n_layer=36, - n_head=32, - n_embd=2560, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=9728, - norm_eps=1e-6, - rope_base=5000000, - head_size=128, - norm_qk=True, - ), -] - -for c in qwen_3_2507_thinking_instruct: - for kind in ("Thinking", "Instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - -############# -# Salamandra -############# -salamandra = [ - # https://huggingface.co/BSC-LT/salamandra-2b-instruct/blob/main/config.json - dict( - name="salamandra-2b{}", - hf_config=dict(org="BSC-LT", name="salamandra-2b{}"), - block_size=8192, - vocab_size=256000, - padded_vocab_size=256000, - n_layer=24, - n_head=16, - n_embd=2048, - n_query_groups=16, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=5440, - norm_eps=1e-5, - rope_base=10000, - ), - # https://huggingface.co/BSC-LT/salamandra-7b-instruct/blob/main/config.json - dict( - name="salamandra-7b{}", - hf_config=dict(org="BSC-LT", name="salamandra-7b{}"), - block_size=8192, - vocab_size=256000, - padded_vocab_size=256000, - n_layer=32, - n_head=32, - n_embd=4096, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=11008, - norm_eps=1e-6, - rope_base=10000, - ), -] - -for c in salamandra: - for kind in ("", "-instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - - -############### -# SmolLM2 -############### -smollm2 = [ - # https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config.json - dict( - name="SmolLM2-135M{}", - hf_config=dict(org="HuggingFaceTB", name="SmolLM2-135M{}"), - block_size=8192, - vocab_size=49152, - padded_vocab_size=49152, - n_layer=30, - n_head=9, - n_embd=576, - n_query_groups=3, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=1536, - rope_base=100000, - norm_eps=1e-5, - ), - # https://huggingface.co/HuggingFaceTB/SmolLM2-360M/blob/main/config.json - dict( - name="SmolLM2-360M{}", - hf_config=dict(org="HuggingFaceTB", name="SmolLM2-360M{}"), - block_size=8192, - vocab_size=49152, - padded_vocab_size=49152, - n_layer=32, - n_head=15, - n_embd=960, - n_query_groups=5, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=2560, - rope_base=100000, - norm_eps=1e-5, - ), - # https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B/blob/main/config.json - dict( - name="SmolLM2-1.7B{}", - hf_config=dict(org="HuggingFaceTB", name="SmolLM2-1.7B{}"), - block_size=8192, - vocab_size=49152, - padded_vocab_size=49152, - n_layer=24, - n_head=32, - n_embd=2048, - n_query_groups=32, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=8192, - rope_base=130000, - norm_eps=1e-5, - ), -] - -for c in smollm2: - for kind in ("", "-Instruct"): - copy = deepcopy(c) - copy["name"] = c["name"].format(kind) - copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind) - configs.append(copy) - -############### -# DeepSeek R1 Distill -############### - -r1_distill_llama = [ - # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/blob/main/config.json - dict( - name="R1-Distill-Llama-8B", - hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-8B"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=32, - n_head=32, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=14336, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), - # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/config.json - dict( - name="R1-Distill-Llama-70B", - hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-70B"), - block_size=131072, - vocab_size=128000, - padded_vocab_size=128256, - n_layer=80, - n_head=64, - n_embd=8192, - n_query_groups=8, - rotary_percentage=1.0, - parallel_residual=False, - bias=False, - norm_class_name="RMSNorm", - mlp_class_name="LLaMAMLP", - intermediate_size=28672, - rope_base=500000, - rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192), - ), -] - -configs.extend(r1_distill_llama) +# configs.extend(r1_distill_llama) name_to_config = {config["name"]: config for config in configs} diff --git a/pyproject.toml b/pyproject.toml index ee108ed524..d9e75ab68b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ optional-dependencies.extra = [ # download: "huggingface-hub[hf-transfer]>=0.21", "litdata==0.2.51", + "litmodels>=0.1.8", # litgpt.deploy: "litserve>0.2", "lm-eval>=0.4.2,!=0.4.9.1", diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 7d49a19338..1201d71537 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,18 +1,15 @@ # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file. import os -import shutil -import warnings from types import SimpleNamespace from unittest import mock +import litmodels import pytest from tokenizers import Tokenizer as HFTokenizer from tokenizers.models import BPE -from transformers import AutoTokenizer -from transformers.utils import cached_file import litgpt.config as config_module -from litgpt import PromptStyle, Tokenizer +from litgpt import Tokenizer # @pytest.mark.flaky(reruns=3, rerun_except=["AssertionError", "assert", "TypeError"]) @@ -20,66 +17,83 @@ def test_tokenizer_against_hf(config, tmp_path): config = config_module.Config(**config) - repo_id = f"{config.hf_config['org']}/{config.hf_config['name']}" - theirs = AutoTokenizer.from_pretrained(repo_id, token=os.getenv("HF_TOKEN")) - - # create a checkpoint directory that points to the HF files - hf_files = {} - for filename in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"): - try: # download the HF tokenizer config - hf_file = cached_file(path_or_repo_id=repo_id, filename=filename) - hf_files[filename] = str(hf_file) - except Exception as ex: - warnings.warn(str(ex), RuntimeWarning) - if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files: - raise ConnectionError("Unable to download any tokenizer files from HF") - - # we need to rename the dir to match the model name in testing as well - # since we use to it determine the model in tokenizer.py - tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"]) - - for filename, hf_file in hf_files.items(): - shutil.copy(hf_file, str(tmp_path / filename)) - - ours = Tokenizer(tmp_path) - - assert ours.vocab_size == theirs.vocab_size - if config.name == "Mixtral-8x22B-v0.1": - pytest.xfail(reason="Mixtral certainly lists 32000 vocab in its config") - else: - assert ours.vocab_size == config.vocab_size - - if config.name.startswith(("falcon", "stablecode", "Qwen2.5", "QwQ", "Qwen3")): - # even though their config defines it, it's set as None in HF - assert isinstance(ours.bos_id, int) - assert theirs.bos_token_id is None - elif config.name.startswith("Falcon3"): - if isinstance(ours.bos_id, int): - assert theirs.bos_token_id is None - else: - assert ours.bos_id == theirs.bos_token_id is None - else: - assert ours.bos_id == theirs.bos_token_id - - if config.name.startswith("stablecode"): - # even though their config defines it, it's set as None in HF - assert ours.eos_id == 0 - assert ours.eos_id == theirs.eos_token_id or theirs.eos_token_id is None - else: - assert ours.eos_id == theirs.eos_token_id - - prompt = "Hello, readers of this test!" - prompt = PromptStyle.from_config(config).apply(prompt) - actual = ours.encode(prompt) - expected = theirs.encode(prompt) - assert actual.tolist() == expected - assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True) - - if not config.name.startswith(("Mistral", "Mixtral")): - decoded_output = "".join([ours.decode(x) for x in actual]) - if ours.apply_decoding_fix and decoded_output[0] == " ": - decoded_output = decoded_output[1:] # the "hack" adds an empty space to the beginning - assert decoded_output == ours.decode(actual), type(theirs) + lightning_repo_id = f"lightning-ai/ci/{config.hf_config['name']}" + print(f"DEBUG: Starting download for {lightning_repo_id}") + + # Ensure local-models directory exists + local_models_dir = "./local-models" + os.makedirs(local_models_dir, exist_ok=True) + print(f"DEBUG: Created/verified local-models directory: {local_models_dir}") + + model_path = litmodels.download_model( + name=lightning_repo_id, + download_dir=f"./local-models/{lightning_repo_id}", + progress_bar=False, + ) + print(f"DEBUG: Download completed for {lightning_repo_id}") + + # print(f"DEBUG: Loading AutoTokenizer for {lightning_repo_id}") + # theirs = AutoTokenizer.from_pretrained(f"./local-models/{lightning_repo_id}", use_fast=True) + # print(f"DEBUG: AutoTokenizer loaded for {lightning_repo_id}") + + # # create a checkpoint directory that points to the HF files + # hf_files = {} + # src_dir = f"./local-models/{lightning_repo_id}" + # for filename in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"): + # file_path = os.path.join(src_dir, filename) + # if os.path.isfile(file_path): + # hf_files[filename] = file_path + # else: + # warnings.warn(f"{file_path} not found", RuntimeWarning) + # if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files: + # raise ConnectionError("Unable to find any tokenizer files in the local model directory") + + # # we need to rename the dir to match the model name in testing as well + # # since we use to it determine the model in tokenizer.py + # tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"]) + + # for filename, hf_file in hf_files.items(): + # shutil.copy(hf_file, str(tmp_path / filename)) + + # ours = Tokenizer(tmp_path) + + # assert ours.vocab_size == theirs.vocab_size + # if config.name == "Mixtral-8x22B-v0.1": + # pytest.xfail(reason="Mixtral certainly lists 32000 vocab in its config") + # else: + # assert ours.vocab_size == config.vocab_size + + # if config.name.startswith(("falcon", "stablecode", "Qwen2.5", "QwQ", "Qwen3")): + # # even though their config defines it, it's set as None in HF + # assert isinstance(ours.bos_id, int) + # assert theirs.bos_token_id is None + # elif config.name.startswith("Falcon3"): + # if isinstance(ours.bos_id, int): + # assert theirs.bos_token_id is None + # else: + # assert ours.bos_id == theirs.bos_token_id is None + # else: + # assert ours.bos_id == theirs.bos_token_id + + # if config.name.startswith("stablecode"): + # # even though their config defines it, it's set as None in HF + # assert ours.eos_id == 0 + # assert ours.eos_id == theirs.eos_token_id or theirs.eos_token_id is None + # else: + # assert ours.eos_id == theirs.eos_token_id + + # prompt = "Hello, readers of this test!" + # prompt = PromptStyle.from_config(config).apply(prompt) + # actual = ours.encode(prompt) + # expected = theirs.encode(prompt) + # assert actual.tolist() == expected + # assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True) + + # if not config.name.startswith(("Mistral", "Mixtral")): + # decoded_output = "".join([ours.decode(x) for x in actual]) + # if ours.apply_decoding_fix and decoded_output[0] == " ": + # decoded_output = decoded_output[1:] # the "hack" adds an empty space to the beginning + # assert decoded_output == ours.decode(actual), type(theirs) def test_tokenizer_input_validation():