diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml
index 2749026aeb..fc3bc504b3 100644
--- a/.github/workflows/cpu-tests.yml
+++ b/.github/workflows/cpu-tests.yml
@@ -2,10 +2,7 @@ name: CPU tests
 
 on:
   push:
-    branches: [main]
-  pull_request_target:
-    branches: [main]
-    types: [opened, reopened, ready_for_review, labeled, synchronize]
+    branches: [main, pwgardipee/fix-ci-2]
   pull_request: {} # todo
   workflow_dispatch: {}
 
@@ -100,6 +97,11 @@ jobs:
           cache-dependency-path: pyproject.toml
           cache: "pip"
 
+      # - name: Debug with SSH
+      #   uses: mxschmitt/action-tmate@v3
+      #   with:
+      #     detached: true
+
       # Add caching for HF models and tokenizers
       - name: HF cache
         uses: actions/cache@v4
@@ -125,7 +127,9 @@ jobs:
       - name: Run tests
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: pytest -v litgpt/ tests/ --timeout=180 --durations=100
+          LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }}
+          LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }}
+        run: pytest -v litgpt/ tests/test_tokenizer.py::test_tokenizer_against_hf --timeout=600 --durations=100 -s
 
       - name: Show cache
         run: |
diff --git a/litgpt/config.py b/litgpt/config.py
index 97549a114d..708024a736 100644
--- a/litgpt/config.py
+++ b/litgpt/config.py
@@ -1,6 +1,5 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 
-from copy import deepcopy
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, List, Literal, Optional, Type, Union
@@ -224,2914 +223,2850 @@ def norm_class(self) -> Type:
 configs = [
     # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
     dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
-    # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
-    dict(
-        name="stablelm-base-alpha-7b",
-        hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
-        n_head=48,
-        n_embd=6144,
-        padding_multiple=256,
-    ),
-    # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
-    dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
-    # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
-    dict(
-        name="stablelm-tuned-alpha-7b",
-        hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
-        n_head=48,
-        n_embd=6144,
-        padding_multiple=256,
-    ),
-    # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
-    dict(
-        name="stablelm-3b-4e1t",
-        hf_config=dict(org="stabilityai", name="stablelm-3b-4e1t"),
-        padded_vocab_size=50304,
-        n_layer=32,
-        n_head=32,
-        n_embd=2560,
-        parallel_residual=False,
-        bias=False,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=6912,
-    ),
-    # https://huggingface.co/stabilityai/stablelm-zephyr-3b/blob/main/config.json
-    dict(
-        name="stablelm-zephyr-3b",
-        hf_config=dict(org="stabilityai", name="stablelm-zephyr-3b"),
-        padded_vocab_size=50304,
-        n_layer=32,
-        n_head=32,
-        n_embd=2560,
-        parallel_residual=False,
-        bias=False,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=6912,
-    ),
+    #     # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
+    #     dict(
+    #         name="stablelm-base-alpha-7b",
+    #         hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
+    #         n_head=48,
+    #         n_embd=6144,
+    #         padding_multiple=256,
+    #     ),
+    #     # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
+    #     dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
+    #     # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
+    #     dict(
+    #         name="stablelm-tuned-alpha-7b",
+    #         hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
+    #         n_head=48,
+    #         n_embd=6144,
+    #         padding_multiple=256,
+    #     ),
+    #     # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
+    #     dict(
+    #         name="stablelm-3b-4e1t",
+    #         hf_config=dict(org="stabilityai", name="stablelm-3b-4e1t"),
+    #         padded_vocab_size=50304,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_embd=2560,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=6912,
+    #     ),
+    #     # https://huggingface.co/stabilityai/stablelm-zephyr-3b/blob/main/config.json
+    #     dict(
+    #         name="stablelm-zephyr-3b",
+    #         hf_config=dict(org="stabilityai", name="stablelm-zephyr-3b"),
+    #         padded_vocab_size=50304,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_embd=2560,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=6912,
+    #     ),
+    # ]
+    # ##########################
+    # # Stability AI StableCode
+    # ##########################
+    # stablecode = [
+    #     # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
+    #     dict(
+    #         name="stablecode-completion-alpha-3b",
+    #         hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
+    #         block_size=16384,
+    #         vocab_size=49152,
+    #         n_layer=32,
+    #         n_embd=2560,
+    #     ),
+    #     # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
+    #     dict(
+    #         name="stablecode-completion-alpha-3b-4k",
+    #         hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
+    #         vocab_size=49152,
+    #         n_layer=32,
+    #         n_embd=2560,
+    #     ),
+    #     # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
+    #     # dict(
+    #     #     name="stablecode-instruct-alpha-3b",
+    #     #     hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
+    #     #     vocab_size=49152,
+    #     #     n_layer=32,
+    #     #     n_embd=2560,
+    #     # ),
+    #     # https://huggingface.co/stabilityai/stable-code-3b/blob/main/config.json
+    #     dict(
+    #         name="stable-code-3b",
+    #         hf_config=dict(org="stabilityai", name="stable-code-3b"),
+    #         padded_vocab_size=50304,
+    #         n_layer=32,
+    #         n_embd=2560,
+    #         block_size=16384,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=6912,
+    #     ),
+    # ]
+    # configs.extend(stablecode)
+    # ####################
+    # # EleutherAI Pythia
+    # ####################
+    # pythia = [
+    #     # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json
+    #     dict(
+    #         name="pythia-14m",
+    #         hf_config=dict(org="EleutherAI", name="pythia-14m"),
+    #         block_size=512,
+    #         n_layer=6,
+    #         n_embd=128,
+    #         n_head=4,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json
+    #     dict(
+    #         name="pythia-31m",
+    #         hf_config=dict(org="EleutherAI", name="pythia-31m"),
+    #         block_size=1024,
+    #         n_layer=6,
+    #         n_embd=256,
+    #         n_head=8,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
+    #     dict(
+    #         name="pythia-70m",
+    #         hf_config=dict(org="EleutherAI", name="pythia-70m"),
+    #         block_size=2048,
+    #         n_layer=6,
+    #         n_embd=512,
+    #         n_head=8,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
+    #     dict(
+    #         name="pythia-160m",
+    #         hf_config=dict(org="EleutherAI", name="pythia-160m"),
+    #         block_size=2048,
+    #         n_layer=12,
+    #         n_embd=768,
+    #         n_head=12,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
+    #     dict(
+    #         name="pythia-410m",
+    #         hf_config=dict(org="EleutherAI", name="pythia-410m"),
+    #         block_size=2048,
+    #         n_layer=24,
+    #         n_embd=1024,
+    #         n_head=16,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
+    #     dict(
+    #         name="pythia-1b",
+    #         hf_config=dict(org="EleutherAI", name="pythia-1b"),
+    #         block_size=2048,
+    #         n_embd=2048,
+    #         n_head=8,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
+    #     dict(
+    #         name="pythia-1.4b",
+    #         hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
+    #         block_size=2048,
+    #         n_layer=24,
+    #         n_embd=2048,
+    #         n_head=16,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
+    #     dict(
+    #         name="pythia-2.8b",
+    #         hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
+    #         block_size=2048,
+    #         n_layer=32,
+    #         n_embd=2560,
+    #         padding_multiple=128,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
+    #     dict(
+    #         name="pythia-6.9b",
+    #         hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
+    #         block_size=2048,
+    #         n_layer=32,
+    #         padding_multiple=256,
+    #     ),
+    #     # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
+    #     dict(
+    #         name="pythia-12b",
+    #         hf_config=dict(org="EleutherAI", name="pythia-12b"),
+    #         block_size=2048,
+    #         n_layer=36,
+    #         n_embd=5120,
+    #         n_head=40,
+    #     ),
+    # ]
+    # configs.extend(pythia)
+    # for c in pythia:
+    #     # "pythia-14m" and "pythia-31m" don't have deduped version
+    #     if c["name"] in ("pythia-14m", "pythia-31m"):
+    #         continue
+    #     copy = deepcopy(c)
+    #     copy["name"] = f"{c['name']}-deduped"
+    #     copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
+    #     configs.append(copy)
+    # #################
+    # # TII UAE Falcon
+    # #################
+    # falcon = [
+    #     # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
+    #     dict(
+    #         name="falcon-7b{}",
+    #         hf_config=dict(org="tiiuae", name="falcon-7b{}"),
+    #         block_size=2048,
+    #         vocab_size=65024,
+    #         padded_vocab_size=65024,
+    #         n_layer=32,
+    #         n_head=71,
+    #         n_embd=4544,
+    #         rotary_percentage=1.0,
+    #         n_query_groups=1,
+    #         bias=False,
+    #         # this is not in the config, but in the original model implementation, only for this config
+    #         shared_attention_norm=True,
+    #     ),
+    #     # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
+    #     dict(
+    #         name="falcon-40b{}",
+    #         hf_config=dict(org="tiiuae", name="falcon-40b{}"),
+    #         block_size=2048,
+    #         vocab_size=65024,
+    #         padded_vocab_size=65024,
+    #         n_layer=60,
+    #         n_head=128,
+    #         n_embd=8192,
+    #         rotary_percentage=1.0,
+    #         n_query_groups=8,
+    #         bias=False,
+    #     ),
+    # ]
+    # for c in falcon:
+    #     for kind in ("", "-instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # # https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
+    # falcon180b = dict(
+    #     name="falcon-180B{}",
+    #     hf_config=dict(org="tiiuae", name="falcon-180B{}"),
+    #     block_size=2048,
+    #     vocab_size=65024,
+    #     padded_vocab_size=65024,
+    #     n_layer=80,
+    #     n_head=232,
+    #     n_embd=14848,
+    #     rotary_percentage=1.0,
+    #     n_query_groups=8,
+    #     bias=False,
+    # )
+    # for kind in ("", "-chat"):
+    #     copy = deepcopy(falcon180b)
+    #     copy["name"] = falcon180b["name"].format(kind)
+    #     copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
+    #     configs.append(copy)
+    # falcon3 = [
+    #     # https://huggingface.co/tiiuae/Falcon3-1B-Base/blob/main/config.json
+    #     dict(
+    #         name="Falcon3-1B{}",
+    #         hf_config=dict(org="tiiuae", name="Falcon3-1B{}"),
+    #         block_size=4096,
+    #         vocab_size=131072,
+    #         padded_vocab_size=131072,
+    #         n_layer=18,
+    #         n_head=8,
+    #         n_query_groups=4,
+    #         n_embd=2048,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         rope_base=1000042,
+    #         norm_eps=1e-6,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8192,
+    #     ),
+    #     # https://huggingface.co/tiiuae/Falcon3-3B-Base/blob/main/config.json
+    #     dict(
+    #         name="Falcon3-3B{}",
+    #         hf_config=dict(org="tiiuae", name="Falcon3-3B{}"),
+    #         block_size=32768,
+    #         vocab_size=131072,
+    #         padded_vocab_size=131072,
+    #         n_layer=22,
+    #         n_head=12,
+    #         n_query_groups=4,
+    #         n_embd=3072,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         rope_base=1000042,
+    #         norm_eps=1e-6,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=9216,
+    #     ),
+    #     # https://huggingface.co/tiiuae/Falcon3-7B-Base/blob/main/config.json
+    #     dict(
+    #         name="Falcon3-7B{}",
+    #         hf_config=dict(org="tiiuae", name="Falcon3-7B{}"),
+    #         block_size=32768,
+    #         vocab_size=131072,
+    #         padded_vocab_size=131072,
+    #         n_layer=28,
+    #         n_head=12,
+    #         n_query_groups=4,
+    #         n_embd=3072,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         rope_base=1000042,
+    #         norm_eps=1e-6,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=23040,
+    #     ),
+    #     # https://huggingface.co/tiiuae/Falcon3-10B-Base/blob/main/config.json
+    #     dict(
+    #         name="Falcon3-10B{}",
+    #         hf_config=dict(org="tiiuae", name="Falcon3-10B{}"),
+    #         block_size=32768,
+    #         vocab_size=131072,
+    #         padded_vocab_size=131072,
+    #         n_layer=40,
+    #         n_head=12,
+    #         n_query_groups=4,
+    #         n_embd=3072,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         rope_base=1000042,
+    #         norm_eps=1e-6,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=23040,
+    #     ),
+    # ]
+    # for c in falcon3:
+    #     for kind in ("-Base", "-Instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # #############################
+    # # OpenLM Research Open LLaMA
+    # #############################
+    # open_LLaMA = [
+    #     # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
+    #     dict(
+    #         name="open_llama_3b",
+    #         hf_config=dict(org="openlm-research", name="open_llama_3b"),
+    #         block_size=2048,
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=26,
+    #         n_embd=3200,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-6,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8640,
+    #     ),
+    #     # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
+    #     dict(
+    #         name="open_llama_7b",
+    #         hf_config=dict(org="openlm-research", name="open_llama_7b"),
+    #         block_size=2048,
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-6,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #     ),
+    #     # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
+    #     dict(
+    #         name="open_llama_13b",
+    #         hf_config=dict(org="openlm-research", name="open_llama_13b"),
+    #         block_size=2048,
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-6,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #     ),
+    # ]
+    # configs.extend(open_LLaMA)
+    # ###############
+    # # Meta LLaMA 2
+    # ###############
+    # llama_2 = [
+    #     # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
+    #     dict(
+    #         name="Llama-2-7b{}-hf",
+    #         hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #     ),
+    #     # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
+    #     dict(
+    #         name="Llama-2-13b{}-hf",
+    #         hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #     ),
+    #     # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
+    #     dict(
+    #         name="Llama-2-70b{}-hf",
+    #         hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     ),
+    # ]
+    # for c in llama_2:
+    #     for kind in ("", "-chat"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # ###############
+    # # Meta LLaMA 3
+    # ###############
+    # llama_3 = [
+    #     # https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3-8B{}",
+    #         hf_config=dict(org="meta-llama", name="Meta-Llama-3-8B{}"),
+    #         block_size=8192,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #         rope_base=500000,
+    #     ),
+    #     # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3.1-8B{}",
+    #         hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-8B{}"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    #     # https://huggingface.co/meta-llama/Meta-Llama-3-70B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3-70B{}",
+    #         hf_config=dict(org="meta-llama", name="Meta-Llama-3-70B{}"),
+    #         block_size=8192,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=500000,
+    #     ),
+    #     # https://huggingface.co/meta-llama/Meta-Llama-3.1-70B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3.1-70B{}",
+    #         hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-70B{}"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    #     # https://huggingface.co/meta-llama/Meta-Llama-3.1-405B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3.1-405B{}",
+    #         hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-405B{}"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=126,
+    #         n_head=128,
+    #         n_embd=16384,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=53248,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    #     # https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3.2-1B{}",
+    #         hf_config=dict(org="meta-llama", name="Llama-3.2-1B{}"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=16,
+    #         n_embd=2048,
+    #         n_head=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8192,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    #     # https://huggingface.co/meta-llama/Llama-3.2-3B/blob/main/config.json
+    #     dict(
+    #         name="Llama-3.2-3B{}",
+    #         hf_config=dict(org="meta-llama", name="Llama-3.2-3B{}"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=28,
+    #         n_embd=3072,
+    #         n_head=24,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8192,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    #     # https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/config.json
+    #     dict(
+    #         name="Llama-3.3-70B-Instruct",
+    #         hf_config=dict(org="meta-llama", name="Llama-3.3-70B-Instruct"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    # ]
+    # for c in llama_3:
+    #     if c["name"] == "Llama-3.3-70B-Instruct":
+    #         configs.append(c)
+    #         continue
+    #     for kind in ("", "-Instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # #########################
+    # # NVIDIA Llama Nemotron
+    # #########################
+    # configs.append(
+    #     dict(
+    #         name="Llama-3.1-Nemotron-70B-Instruct-HF",
+    #         hf_config=dict(org="nvidia", name="Llama-3.1-Nemotron-70B-Instruct-HF"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    # )
+    # #################
+    # # Allen AI OLMo
+    # #################
+    # olmo = [
+    #     # https://huggingface.co/allenai/OLMo-1B-hf/blob/main/config.json
+    #     dict(
+    #         name="OLMo-1B-hf",
+    #         hf_config=dict(org="allenai", name="OLMo-1B-hf"),
+    #         vocab_size=50280,
+    #         padded_vocab_size=50304,
+    #         block_size=2048,
+    #         n_embd=2048,
+    #         n_layer=16,
+    #         n_head=16,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="LayerNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8192,
+    #     ),
+    #     # https://huggingface.co/allenai/OLMo-7B-hf/blob/main/config.json
+    #     dict(
+    #         name="OLMo-7B-hf",
+    #         hf_config=dict(org="allenai", name="OLMo-7B-hf"),
+    #         vocab_size=50280,
+    #         padded_vocab_size=50304,
+    #         block_size=2048,
+    #         n_layer=32,
+    #         n_head=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="LayerNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #     ),
+    #     # https://huggingface.co/allenai/OLMo-7B-Instruct-hf/blob/main/config.json
+    #     dict(
+    #         name="OLMo-7B-Instruct-hf",
+    #         hf_config=dict(org="allenai", name="OLMo-7B-Instruct-hf"),
+    #         vocab_size=50280,
+    #         padded_vocab_size=50304,
+    #         block_size=2048,
+    #         n_layer=32,
+    #         n_head=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="LayerNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #     ),
+    # ]
+    # configs.extend(olmo)
+    # olmo2 = [
+    #     # https://huggingface.co/allenai/OLMo-2-1124-7B/blob/main/config.json
+    #     dict(
+    #         name="OLMo-2-1124-7B{}",
+    #         hf_config=dict(org="allenai", name="OLMo-2-1124-7B{}"),
+    #         vocab_size=100278,
+    #         padded_vocab_size=100352,
+    #         block_size=4096,
+    #         n_embd=4096,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_query_groups=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         norm_eps=1e-06,
+    #         intermediate_size=11008,
+    #         rope_base=500000,
+    #         norm_qk=True,
+    #         post_mlp_norm=True,
+    #         norm_1=False,
+    #         norm_2=False,
+    #         norm_qk_type="olmo2",
+    #         post_attention_norm=True,
+    #     ),
+    #     # https://huggingface.co/allenai/OLMo-2-1124-13B/blob/main/config.json
+    #     dict(
+    #         name="OLMo-2-1124-13B{}",
+    #         hf_config=dict(org="allenai", name="OLMo-2-1124-13B{}"),
+    #         vocab_size=100278,
+    #         padded_vocab_size=100352,
+    #         block_size=4096,
+    #         n_embd=5120,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_query_groups=40,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         norm_eps=1e-06,
+    #         intermediate_size=13824,
+    #         rope_base=500000,
+    #         norm_qk=True,
+    #         post_mlp_norm=True,
+    #         norm_1=False,
+    #         norm_2=False,
+    #         norm_qk_type="olmo2",
+    #         post_attention_norm=True,
+    #     ),
+    # ]
+    # for c in olmo2:
+    #     for kind in ("", "-SFT", "-DPO", "-Instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # ###############
+    # # Google Gemma
+    # ###############
+    # gemma = [
+    #     # https://huggingface.co/google/gemma-2b/blob/main/config.json
+    #     dict(
+    #         name="Gemma-2b",
+    #         hf_config=dict(org="google", name="gemma-2b"),
+    #         scale_embeddings=True,
+    #         vocab_size=256000,
+    #         padding_multiple=64,
+    #         n_embd=2048,
+    #         n_layer=18,
+    #         n_head=8,
+    #         n_query_groups=1,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         intermediate_size=16384,
+    #     ),
+    #     # https://huggingface.co/google/gemma-7b/blob/main/config.json
+    #     dict(
+    #         name="Gemma-7b",
+    #         hf_config=dict(org="google", name="gemma-7b"),
+    #         scale_embeddings=True,
+    #         vocab_size=256000,
+    #         padding_multiple=64,
+    #         n_embd=3072,
+    #         n_layer=28,
+    #         n_head=16,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         intermediate_size=24576,
+    #     ),
+    #     # https://huggingface.co/google/gemma-2-2b/blob/main/config.json
+    #     dict(
+    #         name="Gemma-2-2b",
+    #         hf_config=dict(org="google", name="gemma-2-2b"),
+    #         scale_embeddings=True,
+    #         attention_scores_scalar=256,
+    #         vocab_size=256000,
+    #         block_size=8192,
+    #         sliding_window_size=4096,
+    #         # only layer with idx 0, 2, 4, ... have sliding window attention
+    #         sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(26)],
+    #         intermediate_size=9216,
+    #         n_embd=2304,
+    #         n_layer=26,
+    #         n_head=8,
+    #         n_query_groups=4,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         attention_logit_softcapping=50.0,
+    #         final_logit_softcapping=30.0,
+    #     ),
+    #     # https://huggingface.co/google/gemma-2-9b/blob/main/config.json
+    #     dict(
+    #         name="Gemma-2-9b",
+    #         hf_config=dict(org="google", name="gemma-2-9b"),
+    #         scale_embeddings=True,
+    #         attention_scores_scalar=256,
+    #         vocab_size=256000,
+    #         block_size=8192,
+    #         sliding_window_size=4096,
+    #         # only layer with idx 0, 2, 4, ... have sliding window attention
+    #         sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(42)],
+    #         intermediate_size=14336,
+    #         n_embd=3584,
+    #         n_layer=42,
+    #         n_head=16,
+    #         n_query_groups=8,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         attention_logit_softcapping=50.0,
+    #         final_logit_softcapping=30.0,
+    #     ),
+    #     # https://huggingface.co/google/gemma-2-27b/blob/main/config.json
+    #     dict(
+    #         name="Gemma-2-27b",
+    #         hf_config=dict(org="google", name="gemma-2-27b"),
+    #         scale_embeddings=True,
+    #         # In Gemma 2 27B attention scores are scaled not by `sqrt(head_size)` (11.31),
+    #         # but by `sqrt(n_emb // n_head)` = sqrt(4608 // 32) = 12
+    #         attention_scores_scalar=144,
+    #         vocab_size=256000,
+    #         block_size=8192,
+    #         sliding_window_size=4096,
+    #         # only layer with idx 0, 2, 4, ... have sliding window attention
+    #         sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(46)],
+    #         intermediate_size=36864,
+    #         n_embd=4608,
+    #         n_layer=46,
+    #         n_head=32,
+    #         n_query_groups=16,
+    #         head_size=128,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         attention_logit_softcapping=50.0,
+    #         final_logit_softcapping=30.0,
+    #     ),
+    # ]
+    # configs.extend(gemma)
+    # for c in gemma:
+    #     copy = deepcopy(c)
+    #     copy["name"] = f"{c['name']}-it"
+    #     copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it"
+    #     configs.append(copy)
+    # ##################
+    # # Google Gemma 3
+    # ##################
+    # gemma3 = [
+    #     # https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json
+    #     dict(
+    #         name="Gemma-3-1b-it",
+    #         hf_config=dict(org="google", name="gemma-3-1b-it"),
+    #         scale_embeddings=True,
+    #         attention_scores_scalar=256,
+    #         vocab_size=262144,
+    #         block_size=131072,
+    #         sliding_window_size=512,
+    #         # 5 local layers for every global layer
+    #         sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)],
+    #         intermediate_size=6912,
+    #         n_embd=1152,
+    #         n_layer=26,
+    #         n_head=4,
+    #         n_query_groups=1,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         rope_adjustments=None,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         norm_qk=True,
+    #         rope_base=1000000,
+    #         rope_local_base_freq=10000,
+    #         # 5 local layers for every global layer
+    #         rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)],
+    #     ),
+    #     # https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
+    #     dict(
+    #         name="Gemma-3-4b-it",
+    #         hf_config=dict(org="google", name="gemma-3-4b-it"),
+    #         scale_embeddings=True,
+    #         attention_scores_scalar=256,
+    #         vocab_size=262144,
+    #         block_size=131072,
+    #         sliding_window_size=1024,
+    #         # 5 local layers for every global layer
+    #         sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)],
+    #         intermediate_size=10240,
+    #         n_embd=2560,
+    #         n_layer=34,
+    #         n_head=8,
+    #         n_query_groups=4,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         rope_adjustments=dict(factor=8.0),
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         norm_qk=True,
+    #         rope_base=1000000,
+    #         rope_local_base_freq=10000,
+    #         # 5 local layers for every global layer
+    #         rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)],
+    #     ),
+    #     # https://huggingface.co/google/gemma-3-12b-it/blob/main/config.json
+    #     dict(
+    #         name="Gemma-3-12b-it",
+    #         hf_config=dict(org="google", name="gemma-3-12b-it"),
+    #         scale_embeddings=True,
+    #         attention_scores_scalar=256,
+    #         vocab_size=262144,
+    #         block_size=131072,
+    #         sliding_window_size=1024,
+    #         # 5 local layers for every global layer
+    #         sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)],
+    #         intermediate_size=15360,
+    #         n_embd=3840,
+    #         n_layer=48,
+    #         n_head=16,
+    #         n_query_groups=8,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         rope_adjustments=dict(factor=8.0),
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         norm_qk=True,
+    #         rope_base=1000000,
+    #         rope_local_base_freq=10000,
+    #         # 5 local layers for every global layer
+    #         rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)],
+    #     ),
+    #     # https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json
+    #     dict(
+    #         name="Gemma-3-27b-it",
+    #         hf_config=dict(org="google", name="gemma-3-27b-it"),
+    #         scale_embeddings=True,
+    #         attention_scores_scalar=168,
+    #         vocab_size=262144,
+    #         block_size=131072,
+    #         sliding_window_size=1024,
+    #         # 5 local layers for every global layer
+    #         sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)],
+    #         intermediate_size=21504,
+    #         n_embd=5376,
+    #         n_layer=62,
+    #         n_head=32,
+    #         n_query_groups=16,
+    #         head_size=128,
+    #         rotary_percentage=1.0,
+    #         rope_adjustments=dict(factor=8.0),
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         post_attention_norm=True,
+    #         post_mlp_norm=True,
+    #         norm_qk=True,
+    #         rope_base=1000000,
+    #         rope_local_base_freq=10000,
+    #         # 5 local layers for every global layer
+    #         rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)],
+    #     ),
+    # ]
+    # configs.extend(gemma3)
+    # ##################
+    # # Google CodeGemma
+    # ##################
+    # codegemma = [
+    #     # https://huggingface.co/google/codegemma-7b-it/blob/main/config.json
+    #     dict(
+    #         name="CodeGemma-7b-it",
+    #         hf_config=dict(org="google", name="codegemma-7b-it"),
+    #         scale_embeddings=True,
+    #         vocab_size=256000,
+    #         padding_multiple=64,
+    #         n_embd=3072,
+    #         n_layer=28,
+    #         n_head=16,
+    #         head_size=256,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="GemmaMLP",
+    #         gelu_approximate="tanh",
+    #         intermediate_size=24576,
+    #     ),
+    # ]
+    # configs.extend(codegemma)
+    # ##########################
+    # # Stability AI FreeWilly2
+    # ##########################
+    # freewilly_2 = [
+    #     # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
+    #     dict(
+    #         name="FreeWilly2",
+    #         hf_config=dict(org="stabilityai", name="FreeWilly2"),
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     )
+    # ]
+    # configs.extend(freewilly_2)
+    # ##################
+    # # Meta Code Llama
+    # ##################
+    # code_llama = [
+    #     # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-7b-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
+    #         block_size=16384,
+    #         vocab_size=32016,
+    #         padding_multiple=16,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-13b-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
+    #         block_size=16384,
+    #         vocab_size=32016,
+    #         padding_multiple=16,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-34b-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
+    #         block_size=16384,
+    #         vocab_size=32000,
+    #         padded_vocab_size=32000,
+    #         n_layer=48,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=22016,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-70b-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-70b-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-70b-hf"),
+    #         block_size=16384,
+    #         vocab_size=32016,
+    #         padding_multiple=16,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-7b-Python-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
+    #         block_size=16384,
+    #         vocab_size=32000,
+    #         padded_vocab_size=32000,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-13b-Python-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
+    #         block_size=16384,
+    #         vocab_size=32000,
+    #         padded_vocab_size=32000,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-34b-Python-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
+    #         block_size=16384,
+    #         vocab_size=32000,
+    #         padded_vocab_size=32000,
+    #         n_layer=48,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=22016,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-70b-Python-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-70b-Python-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-70b-Python-hf"),
+    #         block_size=16384,
+    #         vocab_size=32016,
+    #         padding_multiple=16,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-7b-Instruct-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
+    #         block_size=16384,
+    #         vocab_size=32016,
+    #         padding_multiple=16,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-13b-Instruct-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
+    #         block_size=2048,
+    #         vocab_size=32016,
+    #         padding_multiple=16,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-34b-Instruct-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
+    #         block_size=16384,
+    #         vocab_size=32000,
+    #         padded_vocab_size=32000,
+    #         n_layer=48,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=22016,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/config.json
+    #     dict(
+    #         name="CodeLlama-70b-Instruct-hf",
+    #         hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"),
+    #         block_size=16384,
+    #         # 32016 is an added token, so not reported in vocab_size
+    #         # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/tokenizer_config.json
+    #         vocab_size=32015,
+    #         padding_multiple=16,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=1000000,
+    #     ),
+    # ]
+    # configs.extend(code_llama)
+    # ########################
+    # # garage-bAInd Platypus
+    # ########################
+    # platypus = [
+    #     # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
+    #     dict(
+    #         name="Platypus-30B",
+    #         hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
+    #         block_size=2048,
+    #         padded_vocab_size=32000,
+    #         n_layer=60,
+    #         n_head=52,
+    #         n_embd=6656,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-06,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=17920,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
+    #     dict(
+    #         name="Platypus2-7B",
+    #         hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
+    #         padded_vocab_size=32000,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
+    #     dict(
+    #         name="Platypus2-13B",
+    #         hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
+    #         padded_vocab_size=32000,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
+    #     dict(
+    #         name="Platypus2-70B",
+    #         hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
+    #         padded_vocab_size=32000,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
+    #     dict(
+    #         name="Camel-Platypus2-13B",
+    #         hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
+    #         padded_vocab_size=32000,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
+    #     dict(
+    #         name="Camel-Platypus2-70B",
+    #         hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
+    #         padded_vocab_size=32000,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
+    #     dict(
+    #         name="Stable-Platypus2-13B",
+    #         hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
+    #         padded_vocab_size=32000,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #     ),
+    #     # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
+    #     dict(
+    #         name="Platypus2-70B-instruct",
+    #         hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
+    #         padded_vocab_size=32000,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     ),
+    # ]
+    # configs.extend(platypus)
+    # ##################################
+    # # togethercomputer LLaMA-2-7B-32K
+    # ##################################
+    # together_llama2_32k = [
+    #     # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
+    #     dict(
+    #         name="LLaMA-2-7B-32K",
+    #         hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         rope_condense_ratio=8,
+    #     )
+    # ]
+    # configs.extend(together_llama2_32k)
+    # ################
+    # # Microsoft Phi
+    # ################
+    # phi = [
+    #     # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
+    #     dict(
+    #         name="phi-1_5",
+    #         hf_config=dict(org="microsoft", name="phi-1_5"),
+    #         vocab_size=50257,
+    #         padded_vocab_size=51200,
+    #         block_size=2048,
+    #         n_embd=2048,
+    #         n_layer=24,
+    #         rotary_percentage=0.5,  # 32 / (n_embd / n_head) = 32 / 64
+    #         shared_attention_norm=True,
+    #         lm_head_bias=True,
+    #         gelu_approximate="tanh",
+    #     ),
+    #     # https://huggingface.co/microsoft/phi-2/blob/main/config.json
+    #     dict(
+    #         name="phi-2",
+    #         hf_config=dict(org="microsoft", name="phi-2"),
+    #         vocab_size=50257,
+    #         padded_vocab_size=51200,
+    #         block_size=2048,
+    #         n_embd=2560,
+    #         n_layer=32,
+    #         rotary_percentage=0.4,  # 32 / (n_embd / n_head) = 32 / 80
+    #         shared_attention_norm=True,
+    #         lm_head_bias=True,
+    #         gelu_approximate="tanh",
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json
+    #     dict(
+    #         name="Phi-3-mini-4k-instruct",
+    #         hf_config=dict(org="microsoft", name="Phi-3-mini-4k-instruct"),
+    #         vocab_size=32000,
+    #         padded_vocab_size=32064,
+    #         block_size=4096,
+    #         n_embd=3072,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=8192,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #         sliding_window_size=2048,
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json
+    #     dict(
+    #         name="Phi-3-mini-128k-instruct",
+    #         hf_config=dict(org="microsoft", name="Phi-3-mini-128k-instruct"),
+    #         vocab_size=32000,
+    #         padded_vocab_size=32064,
+    #         block_size=131072,
+    #         n_embd=3072,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=8192,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #         sliding_window_size=262145,
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/config.json
+    #     dict(
+    #         name="Phi-3.5-mini-instruct",
+    #         hf_config=dict(org="microsoft", name="Phi-3.5-mini-instruct"),
+    #         vocab_size=32000,
+    #         padded_vocab_size=32064,
+    #         block_size=4096,
+    #         n_embd=3072,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=8192,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #     ),
+    #     # https://huggingface.co/microsoft/phi-4/blob/main/config.json
+    #     dict(
+    #         name="phi-4",
+    #         hf_config=dict(org="microsoft", name="phi-4"),
+    #         vocab_size=100352,
+    #         padded_vocab_size=100352,
+    #         block_size=16384,
+    #         n_embd=5120,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_query_groups=10,
+    #         rotary_percentage=1.0,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=17920,
+    #         rope_base=250000,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-4-reasoning/blob/main/config.json
+    #     dict(
+    #         name="Phi-4-reasoning",
+    #         hf_config=dict(org="microsoft", name="Phi-4-reasoning"),
+    #         vocab_size=100352,
+    #         padded_vocab_size=100352,
+    #         block_size=32768,
+    #         n_embd=5120,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_query_groups=10,
+    #         rotary_percentage=1.0,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=17920,
+    #         rope_base=500000,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-4-reasoning-plus/blob/main/config.json
+    #     dict(
+    #         name="Phi-4-reasoning-plus",
+    #         hf_config=dict(org="microsoft", name="Phi-4-reasoning-plus"),
+    #         vocab_size=100352,
+    #         padded_vocab_size=100352,
+    #         block_size=32768,
+    #         n_embd=5120,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_query_groups=10,
+    #         rotary_percentage=1.0,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=17920,
+    #         rope_base=500000,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
+    #     dict(
+    #         name="Phi-4-mini-instruct",
+    #         hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"),
+    #         vocab_size=200019,
+    #         padded_vocab_size=200064,
+    #         block_size=131072,
+    #         n_embd=3072,
+    #         n_layer=32,
+    #         n_head=24,
+    #         n_query_groups=8,
+    #         rotary_percentage=0.75,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=8192,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #         sliding_window_size=262145,
+    #     ),
+    #     # https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/config.json
+    #     dict(
+    #         name="Phi-4-mini-reasoning",
+    #         hf_config=dict(org="microsoft", name="Phi-4-mini-reasoning"),
+    #         vocab_size=200019,
+    #         padded_vocab_size=200064,
+    #         block_size=131072,
+    #         n_embd=3072,
+    #         n_layer=32,
+    #         n_head=24,
+    #         n_query_groups=8,
+    #         rotary_percentage=0.75,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         intermediate_size=8192,
+    #         mlp_class_name="LLaMAMLP",
+    #         parallel_residual=False,
+    #         sliding_window_size=262145,
+    #     ),
+    # ]
+    # configs.extend(phi)
+    # #############
+    # # Mistral AI
+    # #############
+    # configs.append(
+    #     # https://huggingface.co/mistralai/mathstral-7B-v0.1/blob/main/config.json
+    #     dict(
+    #         name="Mathstral-7B-v0.1",
+    #         hf_config=dict(org="mistralai", name="mathstral-7B-v0.1"),
+    #         padded_vocab_size=32768,
+    #         block_size=32768,
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #         sliding_window_size=4096,
+    #     )
+    # )
+    # mistral = [
+    #     # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+    #     dict(
+    #         name="Mistral-7B-{}v0.1",
+    #         hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
+    #         padded_vocab_size=32000,
+    #         block_size=4096,  # should be 32768 but sliding window attention is not implemented
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #         sliding_window_size=4096,
+    #     ),
+    #     # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json
+    #     dict(
+    #         name="Mixtral-8x7B-{}v0.1",
+    #         hf_config=dict(org="mistralai", name="Mixtral-8x7B-{}v0.1"),
+    #         padded_vocab_size=32000,
+    #         block_size=32768,
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=14336,
+    #         rope_base=1000000,
+    #         n_expert=8,
+    #         n_expert_per_token=2,
+    #     ),
+    #     # https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json
+    #     dict(
+    #         name="Mixtral-8x22B-{}v0.1",
+    #         hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"),
+    #         padded_vocab_size=32768,
+    #         block_size=65536,
+    #         n_layer=56,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=16384,
+    #         n_head=48,
+    #         n_embd=6144,
+    #         rope_base=1000000,
+    #         n_expert=8,
+    #         n_expert_per_token=2,
+    #     ),
+    # ]
+    # for c in mistral:
+    #     for kind in ("", "Instruct-"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # configs.append(
+    #     # https://huggingface.co/unsloth/mistral-7b-v0.2/blob/main/config.json
+    #     dict(
+    #         name="Mistral-7B-v0.2",
+    #         hf_config=dict(org="unsloth", name="Mistral-7B-v0.2"),
+    #         padded_vocab_size=32000,
+    #         block_size=32768,
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #     )
+    # )
+    # configs.append(
+    #     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/config.json
+    #     dict(
+    #         name="Mistral-7B-Instruct-v0.2",
+    #         hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.2"),
+    #         padded_vocab_size=32000,
+    #         block_size=32768,
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #     )
+    # )
+    # configs.append(
+    #     # https://huggingface.co/mistralai/Mistral-7B-v0.3/blob/main/config.json
+    #     dict(
+    #         name="Mistral-7B-v0.3",
+    #         hf_config=dict(org="mistralai", name="Mistral-7B-v0.3"),
+    #         padded_vocab_size=32768,
+    #         block_size=32768,
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #     )
+    # )
+    # configs.append(
+    #     # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/blob/main/config.json
+    #     dict(
+    #         name="Mistral-7B-Instruct-v0.3",
+    #         hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.3"),
+    #         padded_vocab_size=32768,
+    #         block_size=32768,
+    #         n_layer=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #     )
+    # )
+    # configs.append(
+    #     # https://huggingface.co/mistralai/Mistral-Large-Instruct-2407/blob/main/config.json
+    #     dict(
+    #         name="Mistral-Large-Instruct-2407",
+    #         hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2407"),
+    #         padded_vocab_size=32768,
+    #         block_size=32768,
+    #         n_layer=88,
+    #         n_head=96,
+    #         n_embd=12288,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     )
+    # )
+    # configs.append(
+    #     # https://huggingface.co/mistralai/Mistral-Large-Instruct-2411/blob/main/config.json
+    #     dict(
+    #         name="Mistral-Large-Instruct-2411",
+    #         hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2411"),
+    #         padded_vocab_size=32768,
+    #         block_size=32768,
+    #         n_layer=88,
+    #         n_head=96,
+    #         n_embd=12288,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         norm_eps=1e-05,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #     )
+    # )
+    # ############
+    # # TinyLlama
+    # ############
+    # tiny_llama = [
+    #     dict(
+    #         name="tiny-llama-1.1b{}",
+    #         hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"),
+    #         block_size=2048,
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=22,
+    #         n_head=32,
+    #         n_embd=2048,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",  # original TinyLlama use FusedRMSNorm
+    #         norm_eps=1e-5,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=5632,
+    #         n_query_groups=4,
+    #     )
+    # ]
+    # for c in tiny_llama:
+    #     for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix)
+    #         configs.append(copy)
+    # ############
+    # # MicroLlama
+    # ############
+    # micro_llama = [
+    #     dict(
+    #         name="micro-llama-300M",
+    #         hf_config=dict(org="keeeeenw", name="MicroLlama"),
+    #         block_size=2048,
+    #         vocab_size=32000,
+    #         padding_multiple=64,
+    #         n_layer=12,
+    #         n_head=16,
+    #         n_embd=1024,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",  # original TinyLlama and MicroLlama use FusedRMSNorm
+    #         norm_eps=1e-5,
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=5632,
+    #         n_query_groups=4,
+    #     )
+    # ]
+    # configs.extend(micro_llama)
+    # ##########################
+    # # Trelis Function Calling
+    # ##########################
+    # llama_2_function_calling = [
+    #     # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json
+    #     dict(
+    #         name="Llama-2-7b-chat-hf-function-calling-v2",
+    #         hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"),
+    #         padding_multiple=64,
+    #         n_layer=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         norm_eps=1e-6,
+    #         block_size=4096,
+    #         vocab_size=32000,
+    #         n_head=32,
+    #         n_embd=4096,
+    #         rope_base=10000,
+    #     )
+    # ]
+    # configs.extend(llama_2_function_calling)
+    # ##########
+    # # Qwen2.5
+    # ##########
+    # qwen_2_5 = [
+    #     # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-0.5B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=24,
+    #         n_head=14,
+    #         n_embd=896,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=4864,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-1.5B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"),
+    #         block_size=131072,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=28,
+    #         n_head=12,
+    #         n_embd=1536,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8960,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-3B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=36,
+    #         n_head=16,
+    #         n_embd=2048,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-7B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"),
+    #         block_size=131072,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=28,
+    #         n_head=28,
+    #         n_embd=3584,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=18944,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-14B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"),
+    #         block_size=131072,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=48,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-32B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"),
+    #         block_size=131072,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=64,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=27648,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-72B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"),
+    #         block_size=131072,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=29568,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    # ]
+    # qwen_2_5_coder = [
+    #     # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Coder-0.5B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=24,
+    #         n_head=14,
+    #         n_embd=896,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=4864,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Coder-1.5B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=28,
+    #         n_head=12,
+    #         n_embd=1536,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8960,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Coder-3B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=36,
+    #         n_head=16,
+    #         n_embd=2048,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Coder-7B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=28,
+    #         n_head=28,
+    #         n_embd=3584,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=18944,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Coder-14B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=48,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Coder-32B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=64,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=27648,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    # ]
+    # qwen_2_5.extend(qwen_2_5_coder)
+    # qwen_2_5_math = [
+    #     # https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Math-1.5B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Math-1.5B{}"),
+    #         block_size=4096,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=28,
+    #         n_head=12,
+    #         n_embd=1536,
+    #         n_query_groups=2,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8960,
+    #         norm_eps=1e-6,
+    #         rope_base=10000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Math-7B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Math-7B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Math-7B{}"),
+    #         block_size=4096,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=28,
+    #         n_head=28,
+    #         n_embd=3584,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=18944,
+    #         norm_eps=1e-6,
+    #         rope_base=10000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-Math-72B/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-Math-72B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-Math-72B{}"),
+    #         block_size=4096,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=29568,
+    #         norm_eps=1e-5,
+    #         rope_base=10000,
+    #     ),
+    # ]
+    # qwen_2_5.extend(qwen_2_5_math)
+    # for c in qwen_2_5:
+    #     for kind in ("", "-Instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # qwen_2_5_1m = [
+    #     # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-7B-Instruct-1M",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
+    #         block_size=1010000,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=28,
+    #         n_head=28,
+    #         n_embd=3584,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=18944,
+    #         norm_eps=1e-5,
+    #         rope_base=10000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
+    #     dict(
+    #         name="Qwen2.5-14B-Instruct-1M",
+    #         hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
+    #         block_size=1010000,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=48,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=13824,
+    #         norm_eps=1e-5,
+    #         rope_base=10000000,
+    #     ),
+    # ]
+    # configs.extend(qwen_2_5_1m)
+    # ##########
+    # # QwQ
+    # ##########
+    # qwq = [
+    #     # https://huggingface.co/Qwen/QwQ-32B/blob/main/config.json
+    #     dict(
+    #         name="QwQ-32B",
+    #         hf_config=dict(org="Qwen", name="QwQ-32B"),
+    #         block_size=131072,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=64,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=27648,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    #     # https://huggingface.co/Qwen/QwQ-32B-Preview/blob/main/config.json
+    #     dict(
+    #         name="QwQ-32B-Preview",
+    #         hf_config=dict(org="Qwen", name="QwQ-32B-Preview"),
+    #         block_size=32768,
+    #         vocab_size=151643,
+    #         padded_vocab_size=152064,
+    #         n_layer=64,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         attn_bias=True,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=27648,
+    #         norm_eps=1e-5,
+    #         rope_base=1000000,
+    #     ),
+    # ]
+    # configs.extend(qwq)
+    # ##########
+    # # Qwen3
+    # ##########
+    # qwen_3 = [
+    #     # https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-0.6B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen3-0.6B{}"),
+    #         block_size=40960,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=28,
+    #         n_head=16,
+    #         n_embd=1024,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=3072,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         head_size=128,
+    #         norm_qk=True,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-1.7B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen3-1.7B{}"),
+    #         block_size=40960,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=28,
+    #         n_head=16,
+    #         n_embd=2048,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=6144,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         norm_qk=True,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-4B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-4B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen3-4B{}"),
+    #         block_size=40960,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=36,
+    #         n_head=32,
+    #         n_embd=2560,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=9728,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         head_size=128,
+    #         norm_qk=True,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-8B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-8B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen3-8B{}"),
+    #         block_size=40960,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=36,
+    #         n_head=32,
+    #         n_embd=4096,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=12288,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         norm_qk=True,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-14B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-14B{}",
+    #         hf_config=dict(org="Qwen", name="Qwen3-14B{}"),
+    #         block_size=40960,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=40,
+    #         n_head=40,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=17408,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         norm_qk=True,
+    #     ),
+    # ]
+    # for c in qwen_3:
+    #     for kind in ("", "-Base"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # qwen_3_32b = [
+    #     # https://huggingface.co/Qwen/Qwen3-32B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-32B",
+    #         hf_config=dict(org="Qwen", name="Qwen3-32B"),
+    #         block_size=40960,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=64,
+    #         n_head=64,
+    #         n_embd=5120,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=25600,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         head_size=128,
+    #         norm_qk=True,
+    #     ),
+    # ]
+    # configs.extend(qwen_3_32b)
+    # qwen_3_moe = [
+    #     # https://huggingface.co/Qwen/Qwen3-30B-A3B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-30B-A3B",
+    #         hf_config=dict(org="Qwen", name="Qwen3-30B-A3B"),
+    #         block_size=40960,
+    #         head_size=128,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=48,
+    #         n_head=32,
+    #         n_embd=2048,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=6144,
+    #         moe_intermediate_size=768,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         norm_qk=True,
+    #         n_expert=128,
+    #         n_expert_per_token=8,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-30B-A3B-Base/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-30B-A3B-Base",
+    #         hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Base"),
+    #         block_size=40960,
+    #         head_size=128,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=48,
+    #         n_head=32,
+    #         n_embd=2048,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=6144,
+    #         moe_intermediate_size=768,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         norm_qk=True,
+    #         n_expert=128,
+    #         n_expert_per_token=8,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-235B-A22B/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-235B-A22B",
+    #         hf_config=dict(org="Qwen", name="Qwen3-235B-A22B"),
+    #         block_size=40960,
+    #         head_size=128,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=94,
+    #         n_head=64,
+    #         n_embd=4096,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=12288,
+    #         moe_intermediate_size=1536,
+    #         norm_eps=1e-6,
+    #         rope_base=1000000,
+    #         norm_qk=True,
+    #         n_expert=128,
+    #         n_expert_per_token=8,
+    #     ),
+    # ]
+    # configs.extend(qwen_3_moe)
+    # qwen_3_2507_thinking_instruct = [
+    #     # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-235B-A22B-{}-2507",
+    #         hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"),
+    #         block_size=262144,
+    #         head_size=128,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=94,
+    #         n_head=64,
+    #         n_embd=4096,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=12288,
+    #         moe_intermediate_size=1536,
+    #         norm_eps=1e-6,
+    #         rope_base=5000000,
+    #         norm_qk=True,
+    #         n_expert=128,
+    #         n_expert_per_token=8,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-30B-A3B-{}-2507",
+    #         hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"),
+    #         block_size=262144,
+    #         head_size=128,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=48,
+    #         n_head=32,
+    #         n_embd=2048,
+    #         n_query_groups=4,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMoE",
+    #         intermediate_size=6144,
+    #         moe_intermediate_size=768,
+    #         norm_eps=1e-6,
+    #         rope_base=10000000,
+    #         norm_qk=True,
+    #         n_expert=128,
+    #         n_expert_per_token=8,
+    #     ),
+    #     # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json
+    #     dict(
+    #         name="Qwen3-4B-{}-2507",
+    #         hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"),
+    #         block_size=262144,
+    #         vocab_size=151643,
+    #         padded_vocab_size=151936,
+    #         n_layer=36,
+    #         n_head=32,
+    #         n_embd=2560,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=9728,
+    #         norm_eps=1e-6,
+    #         rope_base=5000000,
+    #         head_size=128,
+    #         norm_qk=True,
+    #     ),
+    # ]
+    # for c in qwen_3_2507_thinking_instruct:
+    #     for kind in ("Thinking", "Instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # #############
+    # # Salamandra
+    # #############
+    # salamandra = [
+    #     # https://huggingface.co/BSC-LT/salamandra-2b-instruct/blob/main/config.json
+    #     dict(
+    #         name="salamandra-2b{}",
+    #         hf_config=dict(org="BSC-LT", name="salamandra-2b{}"),
+    #         block_size=8192,
+    #         vocab_size=256000,
+    #         padded_vocab_size=256000,
+    #         n_layer=24,
+    #         n_head=16,
+    #         n_embd=2048,
+    #         n_query_groups=16,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=5440,
+    #         norm_eps=1e-5,
+    #         rope_base=10000,
+    #     ),
+    #     # https://huggingface.co/BSC-LT/salamandra-7b-instruct/blob/main/config.json
+    #     dict(
+    #         name="salamandra-7b{}",
+    #         hf_config=dict(org="BSC-LT", name="salamandra-7b{}"),
+    #         block_size=8192,
+    #         vocab_size=256000,
+    #         padded_vocab_size=256000,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_embd=4096,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=11008,
+    #         norm_eps=1e-6,
+    #         rope_base=10000,
+    #     ),
+    # ]
+    # for c in salamandra:
+    #     for kind in ("", "-instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # ###############
+    # # SmolLM2
+    # ###############
+    # smollm2 = [
+    #     # https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config.json
+    #     dict(
+    #         name="SmolLM2-135M{}",
+    #         hf_config=dict(org="HuggingFaceTB", name="SmolLM2-135M{}"),
+    #         block_size=8192,
+    #         vocab_size=49152,
+    #         padded_vocab_size=49152,
+    #         n_layer=30,
+    #         n_head=9,
+    #         n_embd=576,
+    #         n_query_groups=3,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=1536,
+    #         rope_base=100000,
+    #         norm_eps=1e-5,
+    #     ),
+    #     # https://huggingface.co/HuggingFaceTB/SmolLM2-360M/blob/main/config.json
+    #     dict(
+    #         name="SmolLM2-360M{}",
+    #         hf_config=dict(org="HuggingFaceTB", name="SmolLM2-360M{}"),
+    #         block_size=8192,
+    #         vocab_size=49152,
+    #         padded_vocab_size=49152,
+    #         n_layer=32,
+    #         n_head=15,
+    #         n_embd=960,
+    #         n_query_groups=5,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=2560,
+    #         rope_base=100000,
+    #         norm_eps=1e-5,
+    #     ),
+    #     # https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B/blob/main/config.json
+    #     dict(
+    #         name="SmolLM2-1.7B{}",
+    #         hf_config=dict(org="HuggingFaceTB", name="SmolLM2-1.7B{}"),
+    #         block_size=8192,
+    #         vocab_size=49152,
+    #         padded_vocab_size=49152,
+    #         n_layer=24,
+    #         n_head=32,
+    #         n_embd=2048,
+    #         n_query_groups=32,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=8192,
+    #         rope_base=130000,
+    #         norm_eps=1e-5,
+    #     ),
+    # ]
+    # for c in smollm2:
+    #     for kind in ("", "-Instruct"):
+    #         copy = deepcopy(c)
+    #         copy["name"] = c["name"].format(kind)
+    #         copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
+    #         configs.append(copy)
+    # ###############
+    # # DeepSeek R1 Distill
+    # ###############
+    # r1_distill_llama = [
+    #     # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/blob/main/config.json
+    #     dict(
+    #         name="R1-Distill-Llama-8B",
+    #         hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-8B"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=32,
+    #         n_head=32,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=14336,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
+    #     # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/config.json
+    #     dict(
+    #         name="R1-Distill-Llama-70B",
+    #         hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-70B"),
+    #         block_size=131072,
+    #         vocab_size=128000,
+    #         padded_vocab_size=128256,
+    #         n_layer=80,
+    #         n_head=64,
+    #         n_embd=8192,
+    #         n_query_groups=8,
+    #         rotary_percentage=1.0,
+    #         parallel_residual=False,
+    #         bias=False,
+    #         norm_class_name="RMSNorm",
+    #         mlp_class_name="LLaMAMLP",
+    #         intermediate_size=28672,
+    #         rope_base=500000,
+    #         rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
+    #     ),
 ]
 
-
-##########################
-# Stability AI StableCode
-##########################
-stablecode = [
-    # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
-    dict(
-        name="stablecode-completion-alpha-3b",
-        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
-        block_size=16384,
-        vocab_size=49152,
-        n_layer=32,
-        n_embd=2560,
-    ),
-    # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
-    dict(
-        name="stablecode-completion-alpha-3b-4k",
-        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
-        vocab_size=49152,
-        n_layer=32,
-        n_embd=2560,
-    ),
-    # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
-    dict(
-        name="stablecode-instruct-alpha-3b",
-        hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
-        vocab_size=49152,
-        n_layer=32,
-        n_embd=2560,
-    ),
-    # https://huggingface.co/stabilityai/stable-code-3b/blob/main/config.json
-    dict(
-        name="stable-code-3b",
-        hf_config=dict(org="stabilityai", name="stable-code-3b"),
-        padded_vocab_size=50304,
-        n_layer=32,
-        n_embd=2560,
-        block_size=16384,
-        parallel_residual=False,
-        bias=False,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=6912,
-    ),
-]
-configs.extend(stablecode)
-
-
-####################
-# EleutherAI Pythia
-####################
-pythia = [
-    # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json
-    dict(
-        name="pythia-14m",
-        hf_config=dict(org="EleutherAI", name="pythia-14m"),
-        block_size=512,
-        n_layer=6,
-        n_embd=128,
-        n_head=4,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json
-    dict(
-        name="pythia-31m",
-        hf_config=dict(org="EleutherAI", name="pythia-31m"),
-        block_size=1024,
-        n_layer=6,
-        n_embd=256,
-        n_head=8,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
-    dict(
-        name="pythia-70m",
-        hf_config=dict(org="EleutherAI", name="pythia-70m"),
-        block_size=2048,
-        n_layer=6,
-        n_embd=512,
-        n_head=8,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
-    dict(
-        name="pythia-160m",
-        hf_config=dict(org="EleutherAI", name="pythia-160m"),
-        block_size=2048,
-        n_layer=12,
-        n_embd=768,
-        n_head=12,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
-    dict(
-        name="pythia-410m",
-        hf_config=dict(org="EleutherAI", name="pythia-410m"),
-        block_size=2048,
-        n_layer=24,
-        n_embd=1024,
-        n_head=16,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
-    dict(
-        name="pythia-1b",
-        hf_config=dict(org="EleutherAI", name="pythia-1b"),
-        block_size=2048,
-        n_embd=2048,
-        n_head=8,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
-    dict(
-        name="pythia-1.4b",
-        hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
-        block_size=2048,
-        n_layer=24,
-        n_embd=2048,
-        n_head=16,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
-    dict(
-        name="pythia-2.8b",
-        hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
-        block_size=2048,
-        n_layer=32,
-        n_embd=2560,
-        padding_multiple=128,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
-    dict(
-        name="pythia-6.9b",
-        hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
-        block_size=2048,
-        n_layer=32,
-        padding_multiple=256,
-    ),
-    # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
-    dict(
-        name="pythia-12b",
-        hf_config=dict(org="EleutherAI", name="pythia-12b"),
-        block_size=2048,
-        n_layer=36,
-        n_embd=5120,
-        n_head=40,
-    ),
-]
-configs.extend(pythia)
-for c in pythia:
-    # "pythia-14m" and "pythia-31m" don't have deduped version
-    if c["name"] in ("pythia-14m", "pythia-31m"):
-        continue
-    copy = deepcopy(c)
-    copy["name"] = f"{c['name']}-deduped"
-    copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
-    configs.append(copy)
-
-
-#################
-# TII UAE Falcon
-#################
-falcon = [
-    # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
-    dict(
-        name="falcon-7b{}",
-        hf_config=dict(org="tiiuae", name="falcon-7b{}"),
-        block_size=2048,
-        vocab_size=65024,
-        padded_vocab_size=65024,
-        n_layer=32,
-        n_head=71,
-        n_embd=4544,
-        rotary_percentage=1.0,
-        n_query_groups=1,
-        bias=False,
-        # this is not in the config, but in the original model implementation, only for this config
-        shared_attention_norm=True,
-    ),
-    # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
-    dict(
-        name="falcon-40b{}",
-        hf_config=dict(org="tiiuae", name="falcon-40b{}"),
-        block_size=2048,
-        vocab_size=65024,
-        padded_vocab_size=65024,
-        n_layer=60,
-        n_head=128,
-        n_embd=8192,
-        rotary_percentage=1.0,
-        n_query_groups=8,
-        bias=False,
-    ),
-]
-for c in falcon:
-    for kind in ("", "-instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
-falcon180b = dict(
-    name="falcon-180B{}",
-    hf_config=dict(org="tiiuae", name="falcon-180B{}"),
-    block_size=2048,
-    vocab_size=65024,
-    padded_vocab_size=65024,
-    n_layer=80,
-    n_head=232,
-    n_embd=14848,
-    rotary_percentage=1.0,
-    n_query_groups=8,
-    bias=False,
-)
-
-for kind in ("", "-chat"):
-    copy = deepcopy(falcon180b)
-    copy["name"] = falcon180b["name"].format(kind)
-    copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
-    configs.append(copy)
-
-falcon3 = [
-    # https://huggingface.co/tiiuae/Falcon3-1B-Base/blob/main/config.json
-    dict(
-        name="Falcon3-1B{}",
-        hf_config=dict(org="tiiuae", name="Falcon3-1B{}"),
-        block_size=4096,
-        vocab_size=131072,
-        padded_vocab_size=131072,
-        n_layer=18,
-        n_head=8,
-        n_query_groups=4,
-        n_embd=2048,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        rope_base=1000042,
-        norm_eps=1e-6,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8192,
-    ),
-    # https://huggingface.co/tiiuae/Falcon3-3B-Base/blob/main/config.json
-    dict(
-        name="Falcon3-3B{}",
-        hf_config=dict(org="tiiuae", name="Falcon3-3B{}"),
-        block_size=32768,
-        vocab_size=131072,
-        padded_vocab_size=131072,
-        n_layer=22,
-        n_head=12,
-        n_query_groups=4,
-        n_embd=3072,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        rope_base=1000042,
-        norm_eps=1e-6,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=9216,
-    ),
-    # https://huggingface.co/tiiuae/Falcon3-7B-Base/blob/main/config.json
-    dict(
-        name="Falcon3-7B{}",
-        hf_config=dict(org="tiiuae", name="Falcon3-7B{}"),
-        block_size=32768,
-        vocab_size=131072,
-        padded_vocab_size=131072,
-        n_layer=28,
-        n_head=12,
-        n_query_groups=4,
-        n_embd=3072,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        rope_base=1000042,
-        norm_eps=1e-6,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=23040,
-    ),
-    # https://huggingface.co/tiiuae/Falcon3-10B-Base/blob/main/config.json
-    dict(
-        name="Falcon3-10B{}",
-        hf_config=dict(org="tiiuae", name="Falcon3-10B{}"),
-        block_size=32768,
-        vocab_size=131072,
-        padded_vocab_size=131072,
-        n_layer=40,
-        n_head=12,
-        n_query_groups=4,
-        n_embd=3072,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        rope_base=1000042,
-        norm_eps=1e-6,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=23040,
-    ),
-]
-for c in falcon3:
-    for kind in ("-Base", "-Instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-
-#############################
-# OpenLM Research Open LLaMA
-#############################
-open_LLaMA = [
-    # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
-    dict(
-        name="open_llama_3b",
-        hf_config=dict(org="openlm-research", name="open_llama_3b"),
-        block_size=2048,
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=26,
-        n_embd=3200,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-6,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8640,
-    ),
-    # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
-    dict(
-        name="open_llama_7b",
-        hf_config=dict(org="openlm-research", name="open_llama_7b"),
-        block_size=2048,
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-6,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-    ),
-    # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
-    dict(
-        name="open_llama_13b",
-        hf_config=dict(org="openlm-research", name="open_llama_13b"),
-        block_size=2048,
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-6,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-    ),
-]
-configs.extend(open_LLaMA)
-
-###############
-# Meta LLaMA 2
-###############
-llama_2 = [
-    # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
-    dict(
-        name="Llama-2-7b{}-hf",
-        hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-    ),
-    # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
-    dict(
-        name="Llama-2-13b{}-hf",
-        hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-    ),
-    # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
-    dict(
-        name="Llama-2-70b{}-hf",
-        hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    ),
-]
-for c in llama_2:
-    for kind in ("", "-chat"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-
-###############
-# Meta LLaMA 3
-###############
-llama_3 = [
-    # https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/config.json
-    dict(
-        name="Llama-3-8B{}",
-        hf_config=dict(org="meta-llama", name="Meta-Llama-3-8B{}"),
-        block_size=8192,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=32,
-        n_head=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-        rope_base=500000,
-    ),
-    # https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/blob/main/config.json
-    dict(
-        name="Llama-3.1-8B{}",
-        hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-8B{}"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=32,
-        n_head=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-    # https://huggingface.co/meta-llama/Meta-Llama-3-70B/blob/main/config.json
-    dict(
-        name="Llama-3-70B{}",
-        hf_config=dict(org="meta-llama", name="Meta-Llama-3-70B{}"),
-        block_size=8192,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=500000,
-    ),
-    # https://huggingface.co/meta-llama/Meta-Llama-3.1-70B/blob/main/config.json
-    dict(
-        name="Llama-3.1-70B{}",
-        hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-70B{}"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-    # https://huggingface.co/meta-llama/Meta-Llama-3.1-405B/blob/main/config.json
-    dict(
-        name="Llama-3.1-405B{}",
-        hf_config=dict(org="meta-llama", name="Meta-Llama-3.1-405B{}"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=126,
-        n_head=128,
-        n_embd=16384,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=53248,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-    # https://huggingface.co/meta-llama/Llama-3.2-1B/blob/main/config.json
-    dict(
-        name="Llama-3.2-1B{}",
-        hf_config=dict(org="meta-llama", name="Llama-3.2-1B{}"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=16,
-        n_embd=2048,
-        n_head=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8192,
-        rope_base=500000,
-        rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-    # https://huggingface.co/meta-llama/Llama-3.2-3B/blob/main/config.json
-    dict(
-        name="Llama-3.2-3B{}",
-        hf_config=dict(org="meta-llama", name="Llama-3.2-3B{}"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=28,
-        n_embd=3072,
-        n_head=24,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8192,
-        rope_base=500000,
-        rope_adjustments=dict(factor=32.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-    # https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/config.json
-    dict(
-        name="Llama-3.3-70B-Instruct",
-        hf_config=dict(org="meta-llama", name="Llama-3.3-70B-Instruct"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-]
-for c in llama_3:
-    if c["name"] == "Llama-3.3-70B-Instruct":
-        configs.append(c)
-        continue
-    for kind in ("", "-Instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-#########################
-# NVIDIA Llama Nemotron
-#########################
-configs.append(
-    dict(
-        name="Llama-3.1-Nemotron-70B-Instruct-HF",
-        hf_config=dict(org="nvidia", name="Llama-3.1-Nemotron-70B-Instruct-HF"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-)
-
-#################
-# Allen AI OLMo
-#################
-olmo = [
-    # https://huggingface.co/allenai/OLMo-1B-hf/blob/main/config.json
-    dict(
-        name="OLMo-1B-hf",
-        hf_config=dict(org="allenai", name="OLMo-1B-hf"),
-        vocab_size=50280,
-        padded_vocab_size=50304,
-        block_size=2048,
-        n_embd=2048,
-        n_layer=16,
-        n_head=16,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="LayerNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8192,
-    ),
-    # https://huggingface.co/allenai/OLMo-7B-hf/blob/main/config.json
-    dict(
-        name="OLMo-7B-hf",
-        hf_config=dict(org="allenai", name="OLMo-7B-hf"),
-        vocab_size=50280,
-        padded_vocab_size=50304,
-        block_size=2048,
-        n_layer=32,
-        n_head=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="LayerNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-    ),
-    # https://huggingface.co/allenai/OLMo-7B-Instruct-hf/blob/main/config.json
-    dict(
-        name="OLMo-7B-Instruct-hf",
-        hf_config=dict(org="allenai", name="OLMo-7B-Instruct-hf"),
-        vocab_size=50280,
-        padded_vocab_size=50304,
-        block_size=2048,
-        n_layer=32,
-        n_head=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="LayerNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-    ),
-]
-
-configs.extend(olmo)
-
-olmo2 = [
-    # https://huggingface.co/allenai/OLMo-2-1124-7B/blob/main/config.json
-    dict(
-        name="OLMo-2-1124-7B{}",
-        hf_config=dict(org="allenai", name="OLMo-2-1124-7B{}"),
-        vocab_size=100278,
-        padded_vocab_size=100352,
-        block_size=4096,
-        n_embd=4096,
-        n_layer=32,
-        n_head=32,
-        n_query_groups=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        norm_eps=1e-06,
-        intermediate_size=11008,
-        rope_base=500000,
-        norm_qk=True,
-        post_mlp_norm=True,
-        norm_1=False,
-        norm_2=False,
-        norm_qk_type="olmo2",
-        post_attention_norm=True,
-    ),
-    # https://huggingface.co/allenai/OLMo-2-1124-13B/blob/main/config.json
-    dict(
-        name="OLMo-2-1124-13B{}",
-        hf_config=dict(org="allenai", name="OLMo-2-1124-13B{}"),
-        vocab_size=100278,
-        padded_vocab_size=100352,
-        block_size=4096,
-        n_embd=5120,
-        n_layer=40,
-        n_head=40,
-        n_query_groups=40,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        norm_eps=1e-06,
-        intermediate_size=13824,
-        rope_base=500000,
-        norm_qk=True,
-        post_mlp_norm=True,
-        norm_1=False,
-        norm_2=False,
-        norm_qk_type="olmo2",
-        post_attention_norm=True,
-    ),
-]
-
-for c in olmo2:
-    for kind in ("", "-SFT", "-DPO", "-Instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-###############
-# Google Gemma
-###############
-gemma = [
-    # https://huggingface.co/google/gemma-2b/blob/main/config.json
-    dict(
-        name="Gemma-2b",
-        hf_config=dict(org="google", name="gemma-2b"),
-        scale_embeddings=True,
-        vocab_size=256000,
-        padding_multiple=64,
-        n_embd=2048,
-        n_layer=18,
-        n_head=8,
-        n_query_groups=1,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        intermediate_size=16384,
-    ),
-    # https://huggingface.co/google/gemma-7b/blob/main/config.json
-    dict(
-        name="Gemma-7b",
-        hf_config=dict(org="google", name="gemma-7b"),
-        scale_embeddings=True,
-        vocab_size=256000,
-        padding_multiple=64,
-        n_embd=3072,
-        n_layer=28,
-        n_head=16,
-        head_size=256,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        intermediate_size=24576,
-    ),
-    # https://huggingface.co/google/gemma-2-2b/blob/main/config.json
-    dict(
-        name="Gemma-2-2b",
-        hf_config=dict(org="google", name="gemma-2-2b"),
-        scale_embeddings=True,
-        attention_scores_scalar=256,
-        vocab_size=256000,
-        block_size=8192,
-        sliding_window_size=4096,
-        # only layer with idx 0, 2, 4, ... have sliding window attention
-        sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(26)],
-        intermediate_size=9216,
-        n_embd=2304,
-        n_layer=26,
-        n_head=8,
-        n_query_groups=4,
-        head_size=256,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        attention_logit_softcapping=50.0,
-        final_logit_softcapping=30.0,
-    ),
-    # https://huggingface.co/google/gemma-2-9b/blob/main/config.json
-    dict(
-        name="Gemma-2-9b",
-        hf_config=dict(org="google", name="gemma-2-9b"),
-        scale_embeddings=True,
-        attention_scores_scalar=256,
-        vocab_size=256000,
-        block_size=8192,
-        sliding_window_size=4096,
-        # only layer with idx 0, 2, 4, ... have sliding window attention
-        sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(42)],
-        intermediate_size=14336,
-        n_embd=3584,
-        n_layer=42,
-        n_head=16,
-        n_query_groups=8,
-        head_size=256,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        attention_logit_softcapping=50.0,
-        final_logit_softcapping=30.0,
-    ),
-    # https://huggingface.co/google/gemma-2-27b/blob/main/config.json
-    dict(
-        name="Gemma-2-27b",
-        hf_config=dict(org="google", name="gemma-2-27b"),
-        scale_embeddings=True,
-        # In Gemma 2 27B attention scores are scaled not by `sqrt(head_size)` (11.31),
-        # but by `sqrt(n_emb // n_head)` = sqrt(4608 // 32) = 12
-        attention_scores_scalar=144,
-        vocab_size=256000,
-        block_size=8192,
-        sliding_window_size=4096,
-        # only layer with idx 0, 2, 4, ... have sliding window attention
-        sliding_window_indices=[1 if i % 2 == 0 else 0 for i in range(46)],
-        intermediate_size=36864,
-        n_embd=4608,
-        n_layer=46,
-        n_head=32,
-        n_query_groups=16,
-        head_size=128,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        attention_logit_softcapping=50.0,
-        final_logit_softcapping=30.0,
-    ),
-]
-configs.extend(gemma)
-for c in gemma:
-    copy = deepcopy(c)
-    copy["name"] = f"{c['name']}-it"
-    copy["hf_config"]["name"] = f"{c['hf_config']['name']}-it"
-    configs.append(copy)
-
-##################
-# Google Gemma 3
-##################
-gemma3 = [
-    # https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json
-    dict(
-        name="Gemma-3-1b-it",
-        hf_config=dict(org="google", name="gemma-3-1b-it"),
-        scale_embeddings=True,
-        attention_scores_scalar=256,
-        vocab_size=262144,
-        block_size=131072,
-        sliding_window_size=512,
-        # 5 local layers for every global layer
-        sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)],
-        intermediate_size=6912,
-        n_embd=1152,
-        n_layer=26,
-        n_head=4,
-        n_query_groups=1,
-        head_size=256,
-        rotary_percentage=1.0,
-        rope_adjustments=None,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        norm_qk=True,
-        rope_base=1000000,
-        rope_local_base_freq=10000,
-        # 5 local layers for every global layer
-        rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(26)],
-    ),
-    # https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
-    dict(
-        name="Gemma-3-4b-it",
-        hf_config=dict(org="google", name="gemma-3-4b-it"),
-        scale_embeddings=True,
-        attention_scores_scalar=256,
-        vocab_size=262144,
-        block_size=131072,
-        sliding_window_size=1024,
-        # 5 local layers for every global layer
-        sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)],
-        intermediate_size=10240,
-        n_embd=2560,
-        n_layer=34,
-        n_head=8,
-        n_query_groups=4,
-        head_size=256,
-        rotary_percentage=1.0,
-        rope_adjustments=dict(factor=8.0),
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        norm_qk=True,
-        rope_base=1000000,
-        rope_local_base_freq=10000,
-        # 5 local layers for every global layer
-        rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(34)],
-    ),
-    # https://huggingface.co/google/gemma-3-12b-it/blob/main/config.json
-    dict(
-        name="Gemma-3-12b-it",
-        hf_config=dict(org="google", name="gemma-3-12b-it"),
-        scale_embeddings=True,
-        attention_scores_scalar=256,
-        vocab_size=262144,
-        block_size=131072,
-        sliding_window_size=1024,
-        # 5 local layers for every global layer
-        sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)],
-        intermediate_size=15360,
-        n_embd=3840,
-        n_layer=48,
-        n_head=16,
-        n_query_groups=8,
-        head_size=256,
-        rotary_percentage=1.0,
-        rope_adjustments=dict(factor=8.0),
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        norm_qk=True,
-        rope_base=1000000,
-        rope_local_base_freq=10000,
-        # 5 local layers for every global layer
-        rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(48)],
-    ),
-    # https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json
-    dict(
-        name="Gemma-3-27b-it",
-        hf_config=dict(org="google", name="gemma-3-27b-it"),
-        scale_embeddings=True,
-        attention_scores_scalar=168,
-        vocab_size=262144,
-        block_size=131072,
-        sliding_window_size=1024,
-        # 5 local layers for every global layer
-        sliding_window_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)],
-        intermediate_size=21504,
-        n_embd=5376,
-        n_layer=62,
-        n_head=32,
-        n_query_groups=16,
-        head_size=128,
-        rotary_percentage=1.0,
-        rope_adjustments=dict(factor=8.0),
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        post_attention_norm=True,
-        post_mlp_norm=True,
-        norm_qk=True,
-        rope_base=1000000,
-        rope_local_base_freq=10000,
-        # 5 local layers for every global layer
-        rope_indices=[0 if (i + 1) % 6 == 0 else 1 for i in range(62)],
-    ),
-]
-configs.extend(gemma3)
-
-##################
-# Google CodeGemma
-##################
-codegemma = [
-    # https://huggingface.co/google/codegemma-7b-it/blob/main/config.json
-    dict(
-        name="CodeGemma-7b-it",
-        hf_config=dict(org="google", name="codegemma-7b-it"),
-        scale_embeddings=True,
-        vocab_size=256000,
-        padding_multiple=64,
-        n_embd=3072,
-        n_layer=28,
-        n_head=16,
-        head_size=256,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="GemmaMLP",
-        gelu_approximate="tanh",
-        intermediate_size=24576,
-    ),
-]
-configs.extend(codegemma)
-
-
-##########################
-# Stability AI FreeWilly2
-##########################
-freewilly_2 = [
-    # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
-    dict(
-        name="FreeWilly2",
-        hf_config=dict(org="stabilityai", name="FreeWilly2"),
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    )
-]
-configs.extend(freewilly_2)
-
-
-##################
-# Meta Code Llama
-##################
-code_llama = [
-    # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-7b-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
-        block_size=16384,
-        vocab_size=32016,
-        padding_multiple=16,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-13b-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
-        block_size=16384,
-        vocab_size=32016,
-        padding_multiple=16,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-34b-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
-        block_size=16384,
-        vocab_size=32000,
-        padded_vocab_size=32000,
-        n_layer=48,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=22016,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-70b-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-70b-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-70b-hf"),
-        block_size=16384,
-        vocab_size=32016,
-        padding_multiple=16,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-7b-Python-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
-        block_size=16384,
-        vocab_size=32000,
-        padded_vocab_size=32000,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-13b-Python-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
-        block_size=16384,
-        vocab_size=32000,
-        padded_vocab_size=32000,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-34b-Python-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
-        block_size=16384,
-        vocab_size=32000,
-        padded_vocab_size=32000,
-        n_layer=48,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=22016,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-70b-Python-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-70b-Python-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-70b-Python-hf"),
-        block_size=16384,
-        vocab_size=32016,
-        padding_multiple=16,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-7b-Instruct-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
-        block_size=16384,
-        vocab_size=32016,
-        padding_multiple=16,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-13b-Instruct-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
-        block_size=2048,
-        vocab_size=32016,
-        padding_multiple=16,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-34b-Instruct-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
-        block_size=16384,
-        vocab_size=32000,
-        padded_vocab_size=32000,
-        n_layer=48,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=22016,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/config.json
-    dict(
-        name="CodeLlama-70b-Instruct-hf",
-        hf_config=dict(org="codellama", name="CodeLlama-70b-Instruct-hf"),
-        block_size=16384,
-        # 32016 is an added token, so not reported in vocab_size
-        # https://huggingface.co/codellama/CodeLlama-70b-Instruct-hf/blob/main/tokenizer_config.json
-        vocab_size=32015,
-        padding_multiple=16,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=1000000,
-    ),
-]
-configs.extend(code_llama)
-
-
-########################
-# garage-bAInd Platypus
-########################
-platypus = [
-    # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
-    dict(
-        name="Platypus-30B",
-        hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
-        block_size=2048,
-        padded_vocab_size=32000,
-        n_layer=60,
-        n_head=52,
-        n_embd=6656,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-06,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=17920,
-    ),
-    # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
-    dict(
-        name="Platypus2-7B",
-        hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
-        padded_vocab_size=32000,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-    ),
-    # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
-    dict(
-        name="Platypus2-13B",
-        hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
-        padded_vocab_size=32000,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-    ),
-    # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
-    dict(
-        name="Platypus2-70B",
-        hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
-        padded_vocab_size=32000,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    ),
-    # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
-    dict(
-        name="Camel-Platypus2-13B",
-        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
-        padded_vocab_size=32000,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-    ),
-    # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
-    dict(
-        name="Camel-Platypus2-70B",
-        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
-        padded_vocab_size=32000,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    ),
-    # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
-    dict(
-        name="Stable-Platypus2-13B",
-        hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
-        padded_vocab_size=32000,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-    ),
-    # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
-    dict(
-        name="Platypus2-70B-instruct",
-        hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
-        padded_vocab_size=32000,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    ),
-]
-configs.extend(platypus)
-
-
-##################################
-# togethercomputer LLaMA-2-7B-32K
-##################################
-together_llama2_32k = [
-    # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
-    dict(
-        name="LLaMA-2-7B-32K",
-        hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        rope_condense_ratio=8,
-    )
-]
-configs.extend(together_llama2_32k)
-
-
-################
-# Microsoft Phi
-################
-phi = [
-    # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
-    dict(
-        name="phi-1_5",
-        hf_config=dict(org="microsoft", name="phi-1_5"),
-        vocab_size=50257,
-        padded_vocab_size=51200,
-        block_size=2048,
-        n_embd=2048,
-        n_layer=24,
-        rotary_percentage=0.5,  # 32 / (n_embd / n_head) = 32 / 64
-        shared_attention_norm=True,
-        lm_head_bias=True,
-        gelu_approximate="tanh",
-    ),
-    # https://huggingface.co/microsoft/phi-2/blob/main/config.json
-    dict(
-        name="phi-2",
-        hf_config=dict(org="microsoft", name="phi-2"),
-        vocab_size=50257,
-        padded_vocab_size=51200,
-        block_size=2048,
-        n_embd=2560,
-        n_layer=32,
-        rotary_percentage=0.4,  # 32 / (n_embd / n_head) = 32 / 80
-        shared_attention_norm=True,
-        lm_head_bias=True,
-        gelu_approximate="tanh",
-    ),
-    # https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/config.json
-    dict(
-        name="Phi-3-mini-4k-instruct",
-        hf_config=dict(org="microsoft", name="Phi-3-mini-4k-instruct"),
-        vocab_size=32000,
-        padded_vocab_size=32064,
-        block_size=4096,
-        n_embd=3072,
-        n_layer=32,
-        rotary_percentage=1.0,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=8192,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-        sliding_window_size=2048,
-    ),
-    # https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json
-    dict(
-        name="Phi-3-mini-128k-instruct",
-        hf_config=dict(org="microsoft", name="Phi-3-mini-128k-instruct"),
-        vocab_size=32000,
-        padded_vocab_size=32064,
-        block_size=131072,
-        n_embd=3072,
-        n_layer=32,
-        rotary_percentage=1.0,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=8192,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-        sliding_window_size=262145,
-    ),
-    # https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/config.json
-    dict(
-        name="Phi-3.5-mini-instruct",
-        hf_config=dict(org="microsoft", name="Phi-3.5-mini-instruct"),
-        vocab_size=32000,
-        padded_vocab_size=32064,
-        block_size=4096,
-        n_embd=3072,
-        n_layer=32,
-        rotary_percentage=1.0,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=8192,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-    ),
-    # https://huggingface.co/microsoft/phi-4/blob/main/config.json
-    dict(
-        name="phi-4",
-        hf_config=dict(org="microsoft", name="phi-4"),
-        vocab_size=100352,
-        padded_vocab_size=100352,
-        block_size=16384,
-        n_embd=5120,
-        n_layer=40,
-        n_head=40,
-        n_query_groups=10,
-        rotary_percentage=1.0,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=17920,
-        rope_base=250000,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-    ),
-    # https://huggingface.co/microsoft/Phi-4-reasoning/blob/main/config.json
-    dict(
-        name="Phi-4-reasoning",
-        hf_config=dict(org="microsoft", name="Phi-4-reasoning"),
-        vocab_size=100352,
-        padded_vocab_size=100352,
-        block_size=32768,
-        n_embd=5120,
-        n_layer=40,
-        n_head=40,
-        n_query_groups=10,
-        rotary_percentage=1.0,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=17920,
-        rope_base=500000,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-    ),
-    # https://huggingface.co/microsoft/Phi-4-reasoning-plus/blob/main/config.json
-    dict(
-        name="Phi-4-reasoning-plus",
-        hf_config=dict(org="microsoft", name="Phi-4-reasoning-plus"),
-        vocab_size=100352,
-        padded_vocab_size=100352,
-        block_size=32768,
-        n_embd=5120,
-        n_layer=40,
-        n_head=40,
-        n_query_groups=10,
-        rotary_percentage=1.0,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=17920,
-        rope_base=500000,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-    ),
-    # https://huggingface.co/microsoft/Phi-4-mini-instruct/blob/main/config.json
-    dict(
-        name="Phi-4-mini-instruct",
-        hf_config=dict(org="microsoft", name="Phi-4-mini-instruct"),
-        vocab_size=200019,
-        padded_vocab_size=200064,
-        block_size=131072,
-        n_embd=3072,
-        n_layer=32,
-        n_head=24,
-        n_query_groups=8,
-        rotary_percentage=0.75,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=8192,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-        sliding_window_size=262145,
-    ),
-    # https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/config.json
-    dict(
-        name="Phi-4-mini-reasoning",
-        hf_config=dict(org="microsoft", name="Phi-4-mini-reasoning"),
-        vocab_size=200019,
-        padded_vocab_size=200064,
-        block_size=131072,
-        n_embd=3072,
-        n_layer=32,
-        n_head=24,
-        n_query_groups=8,
-        rotary_percentage=0.75,
-        bias=False,
-        norm_class_name="RMSNorm",
-        intermediate_size=8192,
-        mlp_class_name="LLaMAMLP",
-        parallel_residual=False,
-        sliding_window_size=262145,
-    ),
-]
-configs.extend(phi)
-
-
-#############
-# Mistral AI
-#############
-
-configs.append(
-    # https://huggingface.co/mistralai/mathstral-7B-v0.1/blob/main/config.json
-    dict(
-        name="Mathstral-7B-v0.1",
-        hf_config=dict(org="mistralai", name="mathstral-7B-v0.1"),
-        padded_vocab_size=32768,
-        block_size=32768,
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-        sliding_window_size=4096,
-    )
-)
-
-mistral = [
-    # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-    dict(
-        name="Mistral-7B-{}v0.1",
-        hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
-        padded_vocab_size=32000,
-        block_size=4096,  # should be 32768 but sliding window attention is not implemented
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-        sliding_window_size=4096,
-    ),
-    # https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/blob/main/config.json
-    dict(
-        name="Mixtral-8x7B-{}v0.1",
-        hf_config=dict(org="mistralai", name="Mixtral-8x7B-{}v0.1"),
-        padded_vocab_size=32000,
-        block_size=32768,
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=14336,
-        rope_base=1000000,
-        n_expert=8,
-        n_expert_per_token=2,
-    ),
-    # https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1/blob/main/config.json
-    dict(
-        name="Mixtral-8x22B-{}v0.1",
-        hf_config=dict(org="mistralai", name="Mixtral-8x22B-{}v0.1"),
-        padded_vocab_size=32768,
-        block_size=65536,
-        n_layer=56,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=16384,
-        n_head=48,
-        n_embd=6144,
-        rope_base=1000000,
-        n_expert=8,
-        n_expert_per_token=2,
-    ),
-]
-for c in mistral:
-    for kind in ("", "Instruct-"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-configs.append(
-    # https://huggingface.co/unsloth/mistral-7b-v0.2/blob/main/config.json
-    dict(
-        name="Mistral-7B-v0.2",
-        hf_config=dict(org="unsloth", name="Mistral-7B-v0.2"),
-        padded_vocab_size=32000,
-        block_size=32768,
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-    )
-)
-configs.append(
-    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/blob/main/config.json
-    dict(
-        name="Mistral-7B-Instruct-v0.2",
-        hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.2"),
-        padded_vocab_size=32000,
-        block_size=32768,
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-    )
-)
-configs.append(
-    # https://huggingface.co/mistralai/Mistral-7B-v0.3/blob/main/config.json
-    dict(
-        name="Mistral-7B-v0.3",
-        hf_config=dict(org="mistralai", name="Mistral-7B-v0.3"),
-        padded_vocab_size=32768,
-        block_size=32768,
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-    )
-)
-configs.append(
-    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/blob/main/config.json
-    dict(
-        name="Mistral-7B-Instruct-v0.3",
-        hf_config=dict(org="mistralai", name="Mistral-7B-Instruct-v0.3"),
-        padded_vocab_size=32768,
-        block_size=32768,
-        n_layer=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-    )
-)
-configs.append(
-    # https://huggingface.co/mistralai/Mistral-Large-Instruct-2407/blob/main/config.json
-    dict(
-        name="Mistral-Large-Instruct-2407",
-        hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2407"),
-        padded_vocab_size=32768,
-        block_size=32768,
-        n_layer=88,
-        n_head=96,
-        n_embd=12288,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    )
-)
-configs.append(
-    # https://huggingface.co/mistralai/Mistral-Large-Instruct-2411/blob/main/config.json
-    dict(
-        name="Mistral-Large-Instruct-2411",
-        hf_config=dict(org="mistralai", name="Mistral-Large-Instruct-2411"),
-        padded_vocab_size=32768,
-        block_size=32768,
-        n_layer=88,
-        n_head=96,
-        n_embd=12288,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        norm_eps=1e-05,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-    )
-)
-
-
-############
-# TinyLlama
-############
-tiny_llama = [
-    dict(
-        name="tiny-llama-1.1b{}",
-        hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"),
-        block_size=2048,
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=22,
-        n_head=32,
-        n_embd=2048,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",  # original TinyLlama use FusedRMSNorm
-        norm_eps=1e-5,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=5632,
-        n_query_groups=4,
-    )
-]
-for c in tiny_llama:
-    for kind, hf_postfix in (("", "-intermediate-step-1431k-3T"), ("-chat", "-Chat-v1.0")):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix)
-        configs.append(copy)
-
-
-############
-# MicroLlama
-############
-micro_llama = [
-    dict(
-        name="micro-llama-300M",
-        hf_config=dict(org="keeeeenw", name="MicroLlama"),
-        block_size=2048,
-        vocab_size=32000,
-        padding_multiple=64,
-        n_layer=12,
-        n_head=16,
-        n_embd=1024,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",  # original TinyLlama and MicroLlama use FusedRMSNorm
-        norm_eps=1e-5,
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=5632,
-        n_query_groups=4,
-    )
-]
-configs.extend(micro_llama)
-
-
-##########################
-# Trelis Function Calling
-##########################
-llama_2_function_calling = [
-    # https://huggingface.co/Trelis/Llama-2-7b-chat-hf-function-calling-v2/blob/main/config.json
-    dict(
-        name="Llama-2-7b-chat-hf-function-calling-v2",
-        hf_config=dict(org="Trelis", name="Llama-2-7b-chat-hf-function-calling-v2"),
-        padding_multiple=64,
-        n_layer=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        norm_eps=1e-6,
-        block_size=4096,
-        vocab_size=32000,
-        n_head=32,
-        n_embd=4096,
-        rope_base=10000,
-    )
-]
-
-configs.extend(llama_2_function_calling)
-
-##########
-# Qwen2.5
-##########
-qwen_2_5 = [
-    # https://huggingface.co/Qwen/Qwen2.5-0.5B/blob/main/config.json
-    dict(
-        name="Qwen2.5-0.5B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-0.5B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=24,
-        n_head=14,
-        n_embd=896,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=4864,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-1.5B/blob/main/config.json
-    dict(
-        name="Qwen2.5-1.5B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-1.5B{}"),
-        block_size=131072,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=28,
-        n_head=12,
-        n_embd=1536,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8960,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-3B/blob/main/config.json
-    dict(
-        name="Qwen2.5-3B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-3B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=36,
-        n_head=16,
-        n_embd=2048,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-7B/blob/main/config.json
-    dict(
-        name="Qwen2.5-7B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-7B{}"),
-        block_size=131072,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=28,
-        n_head=28,
-        n_embd=3584,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=18944,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-14B/blob/main/config.json
-    dict(
-        name="Qwen2.5-14B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-14B{}"),
-        block_size=131072,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=48,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-32B/blob/main/config.json
-    dict(
-        name="Qwen2.5-32B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-32B{}"),
-        block_size=131072,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=64,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=27648,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-72B/blob/main/config.json
-    dict(
-        name="Qwen2.5-72B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-72B{}"),
-        block_size=131072,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=29568,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-]
-
-qwen_2_5_coder = [
-    # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Coder-0.5B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-0.5B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=24,
-        n_head=14,
-        n_embd=896,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=4864,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Coder-1.5B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-1.5B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=28,
-        n_head=12,
-        n_embd=1536,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8960,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Coder-3B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Coder-3B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-3B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=36,
-        n_head=16,
-        n_embd=2048,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Coder-7B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Coder-7B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-7B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=28,
-        n_head=28,
-        n_embd=3584,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=18944,
-        norm_eps=1e-6,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Coder-14B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Coder-14B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-14B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=48,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Coder-32B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Coder-32B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Coder-32B{}"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=64,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=27648,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-]
-
-qwen_2_5.extend(qwen_2_5_coder)
-
-qwen_2_5_math = [
-    # https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Math-1.5B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Math-1.5B{}"),
-        block_size=4096,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=28,
-        n_head=12,
-        n_embd=1536,
-        n_query_groups=2,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8960,
-        norm_eps=1e-6,
-        rope_base=10000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Math-7B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Math-7B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Math-7B{}"),
-        block_size=4096,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=28,
-        n_head=28,
-        n_embd=3584,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=18944,
-        norm_eps=1e-6,
-        rope_base=10000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-Math-72B/blob/main/config.json
-    dict(
-        name="Qwen2.5-Math-72B{}",
-        hf_config=dict(org="Qwen", name="Qwen2.5-Math-72B{}"),
-        block_size=4096,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=29568,
-        norm_eps=1e-5,
-        rope_base=10000,
-    ),
-]
-
-qwen_2_5.extend(qwen_2_5_math)
-
-for c in qwen_2_5:
-    for kind in ("", "-Instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-qwen_2_5_1m = [
-    # https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M/blob/main/config.json
-    dict(
-        name="Qwen2.5-7B-Instruct-1M",
-        hf_config=dict(org="Qwen", name="Qwen2.5-7B-Instruct-1M"),
-        block_size=1010000,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=28,
-        n_head=28,
-        n_embd=3584,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=18944,
-        norm_eps=1e-5,
-        rope_base=10000000,
-    ),
-    # https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M/blob/main/config.json
-    dict(
-        name="Qwen2.5-14B-Instruct-1M",
-        hf_config=dict(org="Qwen", name="Qwen2.5-14B-Instruct-1M"),
-        block_size=1010000,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=48,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=13824,
-        norm_eps=1e-5,
-        rope_base=10000000,
-    ),
-]
-
-configs.extend(qwen_2_5_1m)
-
-##########
-# QwQ
-##########
-qwq = [
-    # https://huggingface.co/Qwen/QwQ-32B/blob/main/config.json
-    dict(
-        name="QwQ-32B",
-        hf_config=dict(org="Qwen", name="QwQ-32B"),
-        block_size=131072,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=64,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=27648,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-    # https://huggingface.co/Qwen/QwQ-32B-Preview/blob/main/config.json
-    dict(
-        name="QwQ-32B-Preview",
-        hf_config=dict(org="Qwen", name="QwQ-32B-Preview"),
-        block_size=32768,
-        vocab_size=151643,
-        padded_vocab_size=152064,
-        n_layer=64,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        attn_bias=True,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=27648,
-        norm_eps=1e-5,
-        rope_base=1000000,
-    ),
-]
-
-configs.extend(qwq)
-
-##########
-# Qwen3
-##########
-qwen_3 = [
-    # https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/config.json
-    dict(
-        name="Qwen3-0.6B{}",
-        hf_config=dict(org="Qwen", name="Qwen3-0.6B{}"),
-        block_size=40960,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=28,
-        n_head=16,
-        n_embd=1024,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=3072,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        head_size=128,
-        norm_qk=True,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/config.json
-    dict(
-        name="Qwen3-1.7B{}",
-        hf_config=dict(org="Qwen", name="Qwen3-1.7B{}"),
-        block_size=40960,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=28,
-        n_head=16,
-        n_embd=2048,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=6144,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        norm_qk=True,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-4B/blob/main/config.json
-    dict(
-        name="Qwen3-4B{}",
-        hf_config=dict(org="Qwen", name="Qwen3-4B{}"),
-        block_size=40960,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=36,
-        n_head=32,
-        n_embd=2560,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=9728,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        head_size=128,
-        norm_qk=True,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-8B/blob/main/config.json
-    dict(
-        name="Qwen3-8B{}",
-        hf_config=dict(org="Qwen", name="Qwen3-8B{}"),
-        block_size=40960,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=36,
-        n_head=32,
-        n_embd=4096,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=12288,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        norm_qk=True,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-14B/blob/main/config.json
-    dict(
-        name="Qwen3-14B{}",
-        hf_config=dict(org="Qwen", name="Qwen3-14B{}"),
-        block_size=40960,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=40,
-        n_head=40,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=17408,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        norm_qk=True,
-    ),
-]
-for c in qwen_3:
-    for kind in ("", "-Base"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-qwen_3_32b = [
-    # https://huggingface.co/Qwen/Qwen3-32B/blob/main/config.json
-    dict(
-        name="Qwen3-32B",
-        hf_config=dict(org="Qwen", name="Qwen3-32B"),
-        block_size=40960,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=64,
-        n_head=64,
-        n_embd=5120,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=25600,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        head_size=128,
-        norm_qk=True,
-    ),
-]
-configs.extend(qwen_3_32b)
-
-qwen_3_moe = [
-    # https://huggingface.co/Qwen/Qwen3-30B-A3B/blob/main/config.json
-    dict(
-        name="Qwen3-30B-A3B",
-        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B"),
-        block_size=40960,
-        head_size=128,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=48,
-        n_head=32,
-        n_embd=2048,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=6144,
-        moe_intermediate_size=768,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        norm_qk=True,
-        n_expert=128,
-        n_expert_per_token=8,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-30B-A3B-Base/blob/main/config.json
-    dict(
-        name="Qwen3-30B-A3B-Base",
-        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-Base"),
-        block_size=40960,
-        head_size=128,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=48,
-        n_head=32,
-        n_embd=2048,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=6144,
-        moe_intermediate_size=768,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        norm_qk=True,
-        n_expert=128,
-        n_expert_per_token=8,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-235B-A22B/blob/main/config.json
-    dict(
-        name="Qwen3-235B-A22B",
-        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B"),
-        block_size=40960,
-        head_size=128,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=94,
-        n_head=64,
-        n_embd=4096,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=12288,
-        moe_intermediate_size=1536,
-        norm_eps=1e-6,
-        rope_base=1000000,
-        norm_qk=True,
-        n_expert=128,
-        n_expert_per_token=8,
-    ),
-]
-configs.extend(qwen_3_moe)
-
-qwen_3_2507_thinking_instruct = [
-    # https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507/blob/main/config.json
-    dict(
-        name="Qwen3-235B-A22B-{}-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-235B-A22B-{}-2507"),
-        block_size=262144,
-        head_size=128,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=94,
-        n_head=64,
-        n_embd=4096,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=12288,
-        moe_intermediate_size=1536,
-        norm_eps=1e-6,
-        rope_base=5000000,
-        norm_qk=True,
-        n_expert=128,
-        n_expert_per_token=8,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507/blob/main/config.json
-    dict(
-        name="Qwen3-30B-A3B-{}-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-30B-A3B-{}-2507"),
-        block_size=262144,
-        head_size=128,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=48,
-        n_head=32,
-        n_embd=2048,
-        n_query_groups=4,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMoE",
-        intermediate_size=6144,
-        moe_intermediate_size=768,
-        norm_eps=1e-6,
-        rope_base=10000000,
-        norm_qk=True,
-        n_expert=128,
-        n_expert_per_token=8,
-    ),
-    # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/config.json
-    dict(
-        name="Qwen3-4B-{}-2507",
-        hf_config=dict(org="Qwen", name="Qwen3-4B-{}-2507"),
-        block_size=262144,
-        vocab_size=151643,
-        padded_vocab_size=151936,
-        n_layer=36,
-        n_head=32,
-        n_embd=2560,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=9728,
-        norm_eps=1e-6,
-        rope_base=5000000,
-        head_size=128,
-        norm_qk=True,
-    ),
-]
-
-for c in qwen_3_2507_thinking_instruct:
-    for kind in ("Thinking", "Instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-#############
-# Salamandra
-#############
-salamandra = [
-    # https://huggingface.co/BSC-LT/salamandra-2b-instruct/blob/main/config.json
-    dict(
-        name="salamandra-2b{}",
-        hf_config=dict(org="BSC-LT", name="salamandra-2b{}"),
-        block_size=8192,
-        vocab_size=256000,
-        padded_vocab_size=256000,
-        n_layer=24,
-        n_head=16,
-        n_embd=2048,
-        n_query_groups=16,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=5440,
-        norm_eps=1e-5,
-        rope_base=10000,
-    ),
-    # https://huggingface.co/BSC-LT/salamandra-7b-instruct/blob/main/config.json
-    dict(
-        name="salamandra-7b{}",
-        hf_config=dict(org="BSC-LT", name="salamandra-7b{}"),
-        block_size=8192,
-        vocab_size=256000,
-        padded_vocab_size=256000,
-        n_layer=32,
-        n_head=32,
-        n_embd=4096,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=11008,
-        norm_eps=1e-6,
-        rope_base=10000,
-    ),
-]
-
-for c in salamandra:
-    for kind in ("", "-instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-
-###############
-# SmolLM2
-###############
-smollm2 = [
-    # https://huggingface.co/HuggingFaceTB/SmolLM2-135M/blob/main/config.json
-    dict(
-        name="SmolLM2-135M{}",
-        hf_config=dict(org="HuggingFaceTB", name="SmolLM2-135M{}"),
-        block_size=8192,
-        vocab_size=49152,
-        padded_vocab_size=49152,
-        n_layer=30,
-        n_head=9,
-        n_embd=576,
-        n_query_groups=3,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=1536,
-        rope_base=100000,
-        norm_eps=1e-5,
-    ),
-    # https://huggingface.co/HuggingFaceTB/SmolLM2-360M/blob/main/config.json
-    dict(
-        name="SmolLM2-360M{}",
-        hf_config=dict(org="HuggingFaceTB", name="SmolLM2-360M{}"),
-        block_size=8192,
-        vocab_size=49152,
-        padded_vocab_size=49152,
-        n_layer=32,
-        n_head=15,
-        n_embd=960,
-        n_query_groups=5,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=2560,
-        rope_base=100000,
-        norm_eps=1e-5,
-    ),
-    # https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B/blob/main/config.json
-    dict(
-        name="SmolLM2-1.7B{}",
-        hf_config=dict(org="HuggingFaceTB", name="SmolLM2-1.7B{}"),
-        block_size=8192,
-        vocab_size=49152,
-        padded_vocab_size=49152,
-        n_layer=24,
-        n_head=32,
-        n_embd=2048,
-        n_query_groups=32,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=8192,
-        rope_base=130000,
-        norm_eps=1e-5,
-    ),
-]
-
-for c in smollm2:
-    for kind in ("", "-Instruct"):
-        copy = deepcopy(c)
-        copy["name"] = c["name"].format(kind)
-        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
-        configs.append(copy)
-
-###############
-# DeepSeek R1 Distill
-###############
-
-r1_distill_llama = [
-    # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/blob/main/config.json
-    dict(
-        name="R1-Distill-Llama-8B",
-        hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-8B"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=32,
-        n_head=32,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=14336,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-    # https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/blob/main/config.json
-    dict(
-        name="R1-Distill-Llama-70B",
-        hf_config=dict(org="deepseek-ai", name="DeepSeek-R1-Distill-Llama-70B"),
-        block_size=131072,
-        vocab_size=128000,
-        padded_vocab_size=128256,
-        n_layer=80,
-        n_head=64,
-        n_embd=8192,
-        n_query_groups=8,
-        rotary_percentage=1.0,
-        parallel_residual=False,
-        bias=False,
-        norm_class_name="RMSNorm",
-        mlp_class_name="LLaMAMLP",
-        intermediate_size=28672,
-        rope_base=500000,
-        rope_adjustments=dict(factor=8.0, low_freq_factor=1.0, high_freq_factor=4.0, original_max_seq_len=8192),
-    ),
-]
-
-configs.extend(r1_distill_llama)
+# configs.extend(r1_distill_llama)
 
 name_to_config = {config["name"]: config for config in configs}
diff --git a/pyproject.toml b/pyproject.toml
index ee108ed524..d9e75ab68b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ optional-dependencies.extra = [
   # download:
   "huggingface-hub[hf-transfer]>=0.21",
   "litdata==0.2.51",
+  "litmodels>=0.1.8",
   # litgpt.deploy:
   "litserve>0.2",
   "lm-eval>=0.4.2,!=0.4.9.1",
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 7d49a19338..1201d71537 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,18 +1,15 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 import os
-import shutil
-import warnings
 from types import SimpleNamespace
 from unittest import mock
 
+import litmodels
 import pytest
 from tokenizers import Tokenizer as HFTokenizer
 from tokenizers.models import BPE
-from transformers import AutoTokenizer
-from transformers.utils import cached_file
 
 import litgpt.config as config_module
-from litgpt import PromptStyle, Tokenizer
+from litgpt import Tokenizer
 
 
 # @pytest.mark.flaky(reruns=3, rerun_except=["AssertionError", "assert", "TypeError"])
@@ -20,66 +17,83 @@
 def test_tokenizer_against_hf(config, tmp_path):
     config = config_module.Config(**config)
 
-    repo_id = f"{config.hf_config['org']}/{config.hf_config['name']}"
-    theirs = AutoTokenizer.from_pretrained(repo_id, token=os.getenv("HF_TOKEN"))
-
-    # create a checkpoint directory that points to the HF files
-    hf_files = {}
-    for filename in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"):
-        try:  # download the HF tokenizer config
-            hf_file = cached_file(path_or_repo_id=repo_id, filename=filename)
-            hf_files[filename] = str(hf_file)
-        except Exception as ex:
-            warnings.warn(str(ex), RuntimeWarning)
-    if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
-        raise ConnectionError("Unable to download any tokenizer files from HF")
-
-    # we need to rename the dir to match the model name in testing as well
-    # since we use to it determine the model in tokenizer.py
-    tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"])
-
-    for filename, hf_file in hf_files.items():
-        shutil.copy(hf_file, str(tmp_path / filename))
-
-    ours = Tokenizer(tmp_path)
-
-    assert ours.vocab_size == theirs.vocab_size
-    if config.name == "Mixtral-8x22B-v0.1":
-        pytest.xfail(reason="Mixtral certainly lists 32000 vocab in its config")
-    else:
-        assert ours.vocab_size == config.vocab_size
-
-    if config.name.startswith(("falcon", "stablecode", "Qwen2.5", "QwQ", "Qwen3")):
-        # even though their config defines it, it's set as None in HF
-        assert isinstance(ours.bos_id, int)
-        assert theirs.bos_token_id is None
-    elif config.name.startswith("Falcon3"):
-        if isinstance(ours.bos_id, int):
-            assert theirs.bos_token_id is None
-        else:
-            assert ours.bos_id == theirs.bos_token_id is None
-    else:
-        assert ours.bos_id == theirs.bos_token_id
-
-    if config.name.startswith("stablecode"):
-        # even though their config defines it, it's set as None in HF
-        assert ours.eos_id == 0
-        assert ours.eos_id == theirs.eos_token_id or theirs.eos_token_id is None
-    else:
-        assert ours.eos_id == theirs.eos_token_id
-
-    prompt = "Hello, readers of this test!"
-    prompt = PromptStyle.from_config(config).apply(prompt)
-    actual = ours.encode(prompt)
-    expected = theirs.encode(prompt)
-    assert actual.tolist() == expected
-    assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True)
-
-    if not config.name.startswith(("Mistral", "Mixtral")):
-        decoded_output = "".join([ours.decode(x) for x in actual])
-        if ours.apply_decoding_fix and decoded_output[0] == " ":
-            decoded_output = decoded_output[1:]  # the "hack" adds an empty space to the beginning
-        assert decoded_output == ours.decode(actual), type(theirs)
+    lightning_repo_id = f"lightning-ai/ci/{config.hf_config['name']}"
+    print(f"DEBUG: Starting download for {lightning_repo_id}")
+
+    # Ensure local-models directory exists
+    local_models_dir = "./local-models"
+    os.makedirs(local_models_dir, exist_ok=True)
+    print(f"DEBUG: Created/verified local-models directory: {local_models_dir}")
+
+    model_path = litmodels.download_model(
+        name=lightning_repo_id,
+        download_dir=f"./local-models/{lightning_repo_id}",
+        progress_bar=False,
+    )
+    print(f"DEBUG: Download completed for {lightning_repo_id}")
+
+    # print(f"DEBUG: Loading AutoTokenizer for {lightning_repo_id}")
+    # theirs = AutoTokenizer.from_pretrained(f"./local-models/{lightning_repo_id}", use_fast=True)
+    # print(f"DEBUG: AutoTokenizer loaded for {lightning_repo_id}")
+
+    # # create a checkpoint directory that points to the HF files
+    # hf_files = {}
+    # src_dir = f"./local-models/{lightning_repo_id}"
+    # for filename in ("tokenizer.json", "generation_config.json", "tokenizer.model", "tokenizer_config.json"):
+    #     file_path = os.path.join(src_dir, filename)
+    #     if os.path.isfile(file_path):
+    #         hf_files[filename] = file_path
+    #     else:
+    #         warnings.warn(f"{file_path} not found", RuntimeWarning)
+    # if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
+    #     raise ConnectionError("Unable to find any tokenizer files in the local model directory")
+
+    # # we need to rename the dir to match the model name in testing as well
+    # # since we use to it determine the model in tokenizer.py
+    # tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"])
+
+    # for filename, hf_file in hf_files.items():
+    #     shutil.copy(hf_file, str(tmp_path / filename))
+
+    # ours = Tokenizer(tmp_path)
+
+    # assert ours.vocab_size == theirs.vocab_size
+    # if config.name == "Mixtral-8x22B-v0.1":
+    #     pytest.xfail(reason="Mixtral certainly lists 32000 vocab in its config")
+    # else:
+    #     assert ours.vocab_size == config.vocab_size
+
+    # if config.name.startswith(("falcon", "stablecode", "Qwen2.5", "QwQ", "Qwen3")):
+    #     # even though their config defines it, it's set as None in HF
+    #     assert isinstance(ours.bos_id, int)
+    #     assert theirs.bos_token_id is None
+    # elif config.name.startswith("Falcon3"):
+    #     if isinstance(ours.bos_id, int):
+    #         assert theirs.bos_token_id is None
+    #     else:
+    #         assert ours.bos_id == theirs.bos_token_id is None
+    # else:
+    #     assert ours.bos_id == theirs.bos_token_id
+
+    # if config.name.startswith("stablecode"):
+    #     # even though their config defines it, it's set as None in HF
+    #     assert ours.eos_id == 0
+    #     assert ours.eos_id == theirs.eos_token_id or theirs.eos_token_id is None
+    # else:
+    #     assert ours.eos_id == theirs.eos_token_id
+
+    # prompt = "Hello, readers of this test!"
+    # prompt = PromptStyle.from_config(config).apply(prompt)
+    # actual = ours.encode(prompt)
+    # expected = theirs.encode(prompt)
+    # assert actual.tolist() == expected
+    # assert ours.decode(actual) == theirs.decode(expected, skip_special_tokens=True)
+
+    # if not config.name.startswith(("Mistral", "Mixtral")):
+    #     decoded_output = "".join([ours.decode(x) for x in actual])
+    #     if ours.apply_decoding_fix and decoded_output[0] == " ":
+    #         decoded_output = decoded_output[1:]  # the "hack" adds an empty space to the beginning
+    #     assert decoded_output == ours.decode(actual), type(theirs)
 
 
 def test_tokenizer_input_validation():