Commit 00e3f99

Qualcomm AI Engine Direct - backward compatibility CI (#12748)
### Summary
- add README instructions for updating Story LLaMA artifacts
- add backward compatibility validation for Story LLaMA in CI pipeline

### Test plan
./.ci/scripts/test_qnn_static_llama.sh

cc: @haowhsu-quic, @cccclai
1 parent 6d4b68a commit 00e3f99

6 files changed: +207 −9 lines changed

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 9 additions & 1 deletion
@@ -41,6 +41,10 @@ exit_code1=$?
 $PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?
 
+# Check BC
+bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
+exit_code3=$?
+
 # Check the exit codes and print messages
 if [ $exit_code1 -ne 0 ]; then
   echo "Static Llama compile only with weight sharing test failed. $exit_code1."
@@ -50,8 +54,12 @@ if [ $exit_code2 -ne 0 ]; then
   echo "Static Llama accuracy test failed. $exit_code2."
 fi
 
+if [ $exit_code3 -ne 0 ]; then
+  echo "Static Llama BACKWARD COMPATIBILITY test failed. $exit_code3."
+fi
+
 # Return failure if either program failed
-if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ] || [ $exit_code3 -ne 0 ]; then
   exit 1
 else
   exit 0

backends/qualcomm/bc/test_qnn_static_llama_bc.sh

Lines changed: 57 additions & 0 deletions (new file)

#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"


llama_artifacts="."
PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"

# Download stories260K.pt and the tokenizer from Hugging Face
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
# Create params.json file
touch params.json
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json

# Checks e2e accuracy
expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
exit_code1=$?

# Checks accuracy with precompiled
output=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir $PTE_ARTIFACT --llama_artifacts $llama_artifacts --enable_x86_64 --pre_gen_pte $PTE_ARTIFACT | grep "Model CI result:")
exit_code2=$?

if [[ "$output" == "$expected" ]]; then
  echo "[BACKWARD COMPATIBILITY CHECK] Output matches expected result."
else
  echo "[BACKWARD COMPATIBILITY CHECK] Output mismatch!"
  echo "[BACKWARD COMPATIBILITY CHECK] Expected: $expected"
  echo "[BACKWARD COMPATIBILITY CHECK] Actual: $output"
  exit 1
fi

# Check the exit codes and print messages
if [ $exit_code1 -ne 0 ]; then
  echo "Static Llama compile only test failed. $exit_code1."
fi

if [ $exit_code2 -ne 0 ]; then
  echo "Static Llama execute precompiled test failed. $exit_code2."
fi

# Return failure if either program failed
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
  exit 1
else
  exit 0
fi
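
The strings this script compares come from the `Model CI result:` line printed by `test_llama_stories_260k` (shown in the next file): both the freshly compiled run and the run against the checked-in `.pte` print the leading characters of their generated text, and backward compatibility simply means the two grepped lines match. A minimal sketch of what gets printed and compared (the generated text below is hypothetical):

```python
# Mirrors the print statement in test_llama_stories_260k; the model output here is made up.
golden_start_with = "Once upon a time,"
model_out = "Once upon a time, there was a tiny robot."  # hypothetical generation

# Both BC runs emit this line; the bash script greps it and compares the two runs verbatim.
print(f"Model CI result:{model_out[: len(golden_start_with)]}")
# -> Model CI result:Once upon a time,
```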

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 78 additions & 0 deletions
@@ -4094,6 +4094,84 @@ def test_llama3_2_1b(self):
                 if not self.compile_only and not self.enable_x86_64:
                     self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
 
+    def test_llama_stories_260k(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories260K.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--decoder_model",
+            "stories260k",
+            "--model_mode",
+            "hybrid",
+            "--prefill_ar_len",
+            "32",
+            "--max_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    print(f"Model CI result:{model_out[: len(golden_start_with)]}")
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 2020000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 1600)  # Lanai
+
     def test_llama_stories_110m(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
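
The new test talks to `llama.py` over a `multiprocessing.connection` channel rather than parsing stdout: it launches the script with `--ip`/`--port`, listens on that address, and expects the script to connect back and send a JSON payload carrying keys such as `result`, `pte_size`, and `inference_speed` (or `Error`). A minimal, self-contained sketch of that handshake, with a stand-in sender and hypothetical values:

```python
import json
from multiprocessing.connection import Client, Listener
from threading import Thread

ADDRESS = ("127.0.0.1", 54321)  # hypothetical --ip/--port values

def fake_llama_side():
    # Stands in for llama.py: connect back to the test and report results as a JSON string.
    payload = {
        "result": ["Once upon a time, ..."],  # hypothetical generation
        "pte_size": 1_900_000,
        "inference_speed": 1700.0,
    }
    with Client(ADDRESS) as conn:
        conn.send(json.dumps(payload))

with Listener(ADDRESS) as listener:
    Thread(target=fake_llama_side).start()
    conn = listener.accept()
    msg = json.loads(conn.recv())  # same decoding the test performs
    print(msg["result"][0], msg["pte_size"], msg["inference_speed"])
```

In the test itself, `subprocess.Popen` plus `p.communicate()` keeps the child process reaped while the listener waits for this payload.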
Lines changed: 47 additions & 0 deletions (new file)

# Artifacts folder for LLaMA backward compatibility validation
This folder contains the stories260K (a smaller LLaMA variant) .pte artifact for backward compatibility (BC) validation in CI pipelines.

Model source: [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)

## Purpose
The .pte files stored here serve as reference PTEs to ensure that changes to ExecuTorch do not introduce backward-incompatible changes.

These files are used in CI to:
1. Compile Story LLaMA with the previous (n-1) commit.
2. Run and validate it with the current (n) commit.

We use the stories260K model because it is a minimal LLaMA variant, making it ideal for efficient validation in CI pipelines.

## File Structure
- stories260k_hybrid_llama_qnn.pte: precompiled Story LLaMA used for backward compatibility validation.

## Updating Artifacts
To update the .pte file, follow these steps:

1. Check out the latest commit before all your changes.

2. Download and prepare the stories260K model:

```bash
# tokenizer.model & stories260K.pt:
wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt"
wget -O tokenizer.model "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model"

# tokenizer.bin:
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

# params.json:
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
```

3. Run the following command to regenerate the .pte file:

```bash
# Compile with weight sharing disabled, since x86 does not support weight sharing.
python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./examples/qualcomm/oss_scripts/llama/artifacts --llama_artifacts . --enable_x86_64 --compile_only
```

4. Commit the updated stories260k_hybrid_llama_qnn.pte file to the repository.

5. Update this README if necessary, then commit your changes.

Note: The .pte file is large (~2MB). In the future, we may host it on Hugging Face and download it during CI to reduce repository size.

Binary file not shown (the stories260k_hybrid_llama_qnn.pte artifact described in the README above).
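
Before committing a regenerated artifact, it is worth sanity-checking its size against the bound the on-device accuracy test asserts (`assertLessEqual(pte_size, 2020000)` in `test_llama_stories_260k`). A minimal sketch, assuming the artifact path implied by the README and the regeneration command above:

```python
import os

# Artifact path assumed from the README's File Structure section and --artifact_dir.
pte_path = "examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte"

size = os.path.getsize(pte_path)
print(f"{pte_path}: {size} bytes")
# The device-side test caps the .pte at 2,020,000 bytes (~2 MB), so a freshly
# regenerated artifact should land in roughly the same range.
assert size <= 2_020_000, "Regenerated .pte is unexpectedly large"
```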

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 16 additions & 8 deletions
@@ -616,6 +616,9 @@ def compile(args, pte_filename, tokenizer):
     if "model" in state_dict:
         state_dict = state_dict["model"]
 
+    if args.decoder_model == "stories260k":
+        state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
+
     # Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
     def permute(w, heads):
         dim_0 = w.size(0)
@@ -751,7 +754,7 @@ def permute(w, heads):
             annotate_conv=args.ptq != "16a8w",
         ),
     )
-    if args.decoder_model == "stories110m":
+    if args.decoder_model in {"stories110m", "stories260k"}:
         custom_annotations = custom_annotations + (
             annotate_linear_16a8w_in_affine_layer,
         )
@@ -946,7 +949,7 @@ def post_process():
         f"--model_path {pte_path}",
         f"--seq_len {seq_len}",
         f"--output_path {args.artifact}/outputs/outputs.txt",
-        f"--performance_output_path {performance_output_path}",
+        f"--performance_output_path {args.artifact}/{performance_output_path}",
         f"--kv_updater ShiftPointer",
         runner_args,
     ]
@@ -995,7 +998,9 @@ def post_process():
     adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
         inference_speed = 0
-        with open(f"{args.artifact}/{performance_output_path}", "r") as f:
+        with open(
+            f"{os.path.abspath(args.artifact)}/{performance_output_path}", "r"
+        ) as f:
             inference_speed = float(f.read())
 
         pte_size = os.path.getsize(pte_path)
@@ -1033,8 +1038,8 @@ def _build_parser():
 
     parser.add_argument(
         "--decoder_model",
-        choices=["stories110m", "llama3_2", "qwen2_5"],
-        help="The Llama model to export. Current available options are: [stories110m, llama3_2, qwen2_5]",
+        choices=["stories260k", "stories110m", "llama3_2", "qwen2_5"],
+        help="The Llama model to export. Current available options are: [stories260k, stories110m, llama3_2, qwen2_5]",
         required=True,
     )
 
@@ -1208,16 +1213,19 @@ def export_llama(args) -> None:
     else:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
 
+    if args.decoder_model == "stories260k":
+        pte_filename = f"{args.decoder_model}_" + pte_filename
+
     tokenizer = None
     runtime_tokenizer_path, decoder_model_version = "", ""
-    if args.decoder_model == "stories110m":
+    if args.decoder_model in {"stories110m", "stories260k"}:
         tokenizer = get_tokenizer(args.tokenizer_model)
         assert isinstance(
             tokenizer, SentencePieceTokenizer
-        ), f"Wrong tokenizer provided for stories110m."
+        ), f"Wrong tokenizer provided for stories."
         assert (
             args.tokenizer_bin is not None
-        ), "Please provide tokenizer_bin for stories110m."
+        ), "Please provide tokenizer_bin for stories."
         runtime_tokenizer_path = args.tokenizer_bin
         decoder_model_version = "llama2"
     elif args.decoder_model == "llama3_2":
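
The `_orig_mod.` key rewrite in the first hunk suggests the stories260K checkpoint was saved from a model wrapped by `torch.compile`, whose `state_dict()` keys are prefixed with `_orig_mod.`. A small illustrative sketch of that effect and of the same normalization (the toy module below is not the stories260K architecture):

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 4)          # toy stand-in for the real decoder
compiled = torch.compile(model)  # wraps the module as an OptimizedModule

state_dict = compiled.state_dict()
print(list(state_dict))          # ['_orig_mod.weight', '_orig_mod.bias']

# The same normalization llama.py applies before loading the checkpoint.
state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
model.load_state_dict(state_dict)  # keys now match the plain module
```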

0 commit comments