Commit 00e3f99

Qualcomm AI Engine Direct - backward compatibility CI (#12748)
### Summary
- add README instructions for updating Story LLaMA artifacts
- add backward compatibility validation for Story LLaMA in CI pipeline

### Test plan
./.ci/scripts/test_qnn_static_llama.sh

cc: @haowhsu-quic, @cccclai
1 parent 6d4b68a commit 00e3f99

6 files changed: +207 −9 lines changed

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 9 additions & 1 deletion
@@ -41,6 +41,10 @@ exit_code1=$?
 $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?
 
+# Check BC
+bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
+exit_code3=$?
+
 # Check the exit codes and print messages
 if [ $exit_code1 -ne 0 ]; then
   echo "Static Llama compile only with weight sharing test failed. $exit_code1."
@@ -50,8 +54,12 @@ if [ $exit_code2 -ne 0 ]; then
   echo "Static Llama accuracy test failed. $exit_code2."
 fi
 
+if [ $exit_code3 -ne 0 ]; then
+  echo "Static Llama BACKWARD COMPATIBILITY test failed. $exit_code3."
+fi
+
 # Return failure if either program failed
-if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ] || [ $exit_code3 -ne 0 ]; then
   exit 1
 else
   exit 0

backends/qualcomm/bc/test_qnn_static_llama_bc.sh

Lines changed: 57 additions & 0 deletions (new file)

#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"


llama_artifacts="."
PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"

# Download stories260K.pt and the tokenizer from Hugging Face
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
# Create params.json file
touch params.json
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json

# Checks e2e accuracy
expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
exit_code1=$?

# Checks accuracy with precompiled
output=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir $PTE_ARTIFACT --llama_artifacts $llama_artifacts --enable_x86_64 --pre_gen_pte $PTE_ARTIFACT | grep "Model CI result:")
exit_code2=$?

if [[ "$output" == "$expected" ]]; then
  echo "[BACKWARD COMPATIBILITY CHECK] Output matches expected result."
else
  echo "[BACKWARD COMPATIBILITY CHECK] Output mismatch!"
  echo "[BACKWARD COMPATIBILITY CHECK] Expected: $expected"
  echo "[BACKWARD COMPATIBILITY CHECK] Actual: $output"
  exit 1
fi

# Check the exit codes and print messages
if [ $exit_code1 -ne 0 ]; then
  echo "Static Llama compile only test failed. $exit_code1."
fi

if [ $exit_code2 -ne 0 ]; then
  echo "Static Llama execute precompiled test failed. $exit_code2."
fi

# Return failure if either program failed
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
  exit 1
else
  exit 0
fi
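
The strings this script compares come from the `Model CI result:` line printed by `test_llama_stories_260k` (shown in the next file): both the freshly compiled run and the run against the checked-in `.pte` print the leading characters of their generated text, and backward compatibility simply means the two grepped lines match. A minimal sketch of what gets printed and compared (the generated text below is hypothetical):

```python
# Mirrors the print statement in test_llama_stories_260k; the model output here is made up.
golden_start_with = "Once upon a time,"
model_out = "Once upon a time, there was a tiny robot."  # hypothetical generation

# Both BC runs emit this line; the bash script greps it and compares the two runs verbatim.
print(f"Model CI result:{model_out[: len(golden_start_with)]}")
# -> Model CI result:Once upon a time,
```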

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 78 additions & 0 deletions
@@ -4094,6 +4094,84 @@ def test_llama3_2_1b(self):
                 if not self.compile_only and not self.enable_x86_64:
                     self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
 
+    def test_llama_stories_260k(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories260K.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--decoder_model",
+            "stories260k",
+            "--model_mode",
+            "hybrid",
+            "--prefill_ar_len",
+            "32",
+            "--max_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    print(f"Model CI result:{model_out[: len(golden_start_with)]}")
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 2020000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 1600)  # Lanai
+
     def test_llama_stories_110m(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
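
The new test talks to `llama.py` over a `multiprocessing.connection` channel rather than parsing stdout: it launches the script with `--ip`/`--port`, listens on that address, and expects the script to connect back and send a JSON payload carrying keys such as `result`, `pte_size`, and `inference_speed` (or `Error`). A minimal, self-contained sketch of that handshake, with a stand-in sender and hypothetical values:

```python
import json
from multiprocessing.connection import Client, Listener
from threading import Thread

ADDRESS = ("127.0.0.1", 54321)  # hypothetical --ip/--port values

def fake_llama_side():
    # Stands in for llama.py: connect back to the test and report results as a JSON string.
    payload = {
        "result": ["Once upon a time, ..."],  # hypothetical generation
        "pte_size": 1_900_000,
        "inference_speed": 1700.0,
    }
    with Client(ADDRESS) as conn:
        conn.send(json.dumps(payload))

with Listener(ADDRESS) as listener:
    Thread(target=fake_llama_side).start()
    conn = listener.accept()
    msg = json.loads(conn.recv())  # same decoding the test performs
    print(msg["result"][0], msg["pte_size"], msg["inference_speed"])
```

In the test itself, `subprocess.Popen` plus `p.communicate()` keeps the child process reaped while the listener waits for this payload.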
Lines changed: 47 additions & 0 deletions (new file)

# Artifacts folder for LLaMA backward compatibility validation
This folder contains the stories260K (a smaller LLaMA variant) .pte artifact for backward compatibility (BC) validation in CI pipelines.

Model source: [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)

## Purpose
The .pte files stored here serve as reference PTEs to ensure that changes to ExecuTorch do not introduce backward-incompatible changes.

These files are used in CI to:
1. Compile Story LLaMA with the previous (n-1) commit.
2. Run and validate it with the current (n) commit.

We use the stories260K model because it is a minimal LLaMA variant, making it ideal for efficient validation in CI pipelines.

## File Structure
- stories260k_hybrid_llama_qnn.pte: precompiled Story LLaMA used for backward compatibility validation.

## Updating Artifacts
To update the .pte file, follow these steps:

1. Check out the latest commit before all your changes.

2. Download and prepare the stories260K model:

```bash
# tokenizer.model & stories260K.pt:
wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt"
wget -O tokenizer.model "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model"

# tokenizer.bin:
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

# params.json:
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
```

3. Run the following command to regenerate the .pte file:

```bash
# Compile with weight sharing disabled, since x86 does not support weight sharing.
python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./examples/qualcomm/oss_scripts/llama/artifacts --llama_artifacts . --enable_x86_64 --compile_only
```

4. Commit the updated stories260k_hybrid_llama_qnn.pte file to the repository.

5. Update this README if necessary, then commit your changes.

Note: The .pte file is large (~2MB). In the future, we may host it on Hugging Face and download it during CI to reduce repository size.

Binary file not shown (the stories260k_hybrid_llama_qnn.pte artifact described in the README above).
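
Before committing a regenerated artifact, it is worth sanity-checking its size against the bound the on-device accuracy test asserts (`assertLessEqual(pte_size, 2020000)` in `test_llama_stories_260k`). A minimal sketch, assuming the artifact path implied by the README and the regeneration command above:

```python
import os

# Artifact path assumed from the README's File Structure section and --artifact_dir.
pte_path = "examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte"

size = os.path.getsize(pte_path)
print(f"{pte_path}: {size} bytes")
# The device-side test caps the .pte at 2,020,000 bytes (~2 MB), so a freshly
# regenerated artifact should land in roughly the same range.
assert size <= 2_020_000, "Regenerated .pte is unexpectedly large"
```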

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 16 additions & 8 deletions
@@ -616,6 +616,9 @@ def compile(args, pte_filename, tokenizer):
     if "model" in state_dict:
         state_dict = state_dict["model"]
 
+    if args.decoder_model == "stories260k":
+        state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+
     # Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
     def permute(w, heads):
         dim_0 = w.size(0)
@@ -751,7 +754,7 @@ def permute(w, heads):
             annotate_conv=args.ptq != "16a8w",
         ),
     )
-    if args.decoder_model == "stories110m":
+    if args.decoder_model in {"stories110m", "stories260k"}:
         custom_annotations = custom_annotations + (
             annotate_linear_16a8w_in_affine_layer,
         )
@@ -946,7 +949,7 @@ def post_process():
         f"--model_path {pte_path}",
         f"--seq_len {seq_len}",
         f"--output_path {args.artifact}/outputs/outputs.txt",
-        f"--performance_output_path {performance_output_path}",
+        f"--performance_output_path {args.artifact}/{performance_output_path}",
         f"--kv_updater ShiftPointer",
         runner_args,
     ]
@@ -995,7 +998,9 @@ def post_process():
     adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
         inference_speed = 0
-        with open(f"{args.artifact}/{performance_output_path}", "r") as f:
+        with open(
+            f"{os.path.abspath(args.artifact)}/{performance_output_path}", "r"
+        ) as f:
             inference_speed = float(f.read())
 
         pte_size = os.path.getsize(pte_path)
@@ -1033,8 +1038,8 @@ def _build_parser():
 
     parser.add_argument(
         "--decoder_model",
-        choices=["stories110m", "llama3_2", "qwen2_5"],
-        help="The Llama model to export. Current available options are: [stories110m, llama3_2, qwen2_5]",
+        choices=["stories260k", "stories110m", "llama3_2", "qwen2_5"],
+        help="The Llama model to export. Current available options are: [stories260k, stories110m, llama3_2, qwen2_5]",
         required=True,
     )
 
@@ -1208,16 +1213,19 @@ def export_llama(args) -> None:
     else:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
 
+    if args.decoder_model == "stories260k":
+        pte_filename = f"{args.decoder_model}_" + pte_filename
+
     tokenizer = None
     runtime_tokenizer_path, decoder_model_version = "", ""
-    if args.decoder_model == "stories110m":
+    if args.decoder_model in {"stories110m", "stories260k"}:
         tokenizer = get_tokenizer(args.tokenizer_model)
         assert isinstance(
             tokenizer, SentencePieceTokenizer
-        ), f"Wrong tokenizer provided for stories110m."
+        ), f"Wrong tokenizer provided for stories."
         assert (
             args.tokenizer_bin is not None
-        ), "Please provide tokenizer_bin for stories110m."
+        ), "Please provide tokenizer_bin for stories."
         runtime_tokenizer_path = args.tokenizer_bin
         decoder_model_version = "llama2"
     elif args.decoder_model == "llama3_2":
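
The `_orig_mod.` key rewrite in the first hunk suggests the stories260K checkpoint was saved from a model wrapped by `torch.compile`, whose `state_dict()` keys are prefixed with `_orig_mod.`. A small illustrative sketch of that effect and of the same normalization (the toy module below is not the stories260K architecture):

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 4)          # toy stand-in for the real decoder
compiled = torch.compile(model)  # wraps the module as an OptimizedModule

state_dict = compiled.state_dict()
print(list(state_dict))          # ['_orig_mod.weight', '_orig_mod.bias']

# The same normalization llama.py applies before loading the checkpoint.
state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
model.load_state_dict(state_dict)  # keys now match the plain module
```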

0 commit comments