Merge pull request #708 from NexaAI/feat/mengsheng/pythonbind-npu

mengshengwu · web-flow · commit 05c2235bb266 · 2025-10-21T14:19:54.000+08:00
feat: refactor python and add npu doc
diff --git a/examples/python/README.md b/examples/python/README.md
@@ -66,3 +66,66 @@ nexa pull NexaAI/paddle-ocr-mlx
 
 python cv_ocr.py
 ```
+## Running Examples (Windows ARM64, Snapdragon X Elite)
+
+### LLM
+```bash
+nexa pull NexaAI/Llama3.2-3B-NPU-Turbo
+
+python llm.py --model NexaAI/Llama3.2-3B-NPU-Turbo --plugin-id npu --device npu --max-tokens 100 --system "You are a helpful assistant."
+```
+
+### Multi-Modal
+
+```bash
+nexa pull NexaAI/OmniNeural-4B
+
+python vlm.py --model NexaAI/OmniNeural-4B --plugin-id npu --device npu --max-tokens 100 --system "You are a helpful assistant."
+```
+
+### Reranker
+```bash
+nexa pull NexaAI/jina-v2-rerank-npu
+
+python rerank.py --model NexaAI/jina-v2-rerank-npu --plugin-id npu --query "Where is on-device AI?" --documents "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud." "edge computing" "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality." "The capital of France is Paris."
+```
+
+### Embedder
+```bash
+nexa pull NexaAI/embeddinggemma-300m-npu
+
+python embedder.py --model NexaAI/embeddinggemma-300m-npu --plugin-id npu --texts "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud." "edge computing" "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality." "The capital of France is Paris." --query "what is on device AI" --batch-size 2
+```
+
+### CV
+
+#### OCR
+```bash
+nexa pull NexaAI/paddleocr-npu
+
+python cv_ocr.py --det-model NexaAI/paddleocr-npu --rec-model NexaAI/paddleocr-npu --image path/to/image.png --plugin-id npu
+```
+
+### ASR
+```bash
+nexa pull NexaAI/parakeet-npu
+
+python asr.py --model NexaAI/parakeet-npu --audio path/to/audio.wav
+```
+
+## Common Arguments
+
+- `--model`: Model ID (e.g. `NexaAI/Llama3.2-3B-NPU-Turbo`) or path to the model file
+- `--device`: Device to run on (cpu, gpu, npu, etc.)
+- `--max-tokens`: Maximum tokens to generate (for LLM/VLM)
+- `--batch-size`: Batch size for processing
+- `--system`: System message for chat models
+- `--plugin-id`: Plugin ID to use (default: `cpu_gpu`; `asr.py` defaults to `npu`)
+
+## Plugin ID Options
+
+The `--plugin-id` parameter selects the inference backend (the Windows ARM64 examples above use `npu`); other supported values:
+- `cpu_gpu`: Default, supports both CPU and GPU
+- `mlx`: Apple Silicon optimized (for supported models)
+- `llama_cpp`: For GGUF format models
+- `onnx`: ONNX runtime backend
diff --git a/examples/python/asr.py b/examples/python/asr.py
@@ -0,0 +1,46 @@
+"""
+NexaAI ASR Example - Speech to Text (non-streaming)
+
+This example demonstrates how to use the NexaAI SDK to transcribe an audio file.
+"""
+
+import argparse
+import os
+
+from nexaai.asr import ASR, ASRConfig
+
+def main():
+    parser = argparse.ArgumentParser(description="NexaAI ASR Example")
+    parser.add_argument("--model",
+                       default="NexaAI/parakeet-npu",
+                       help="Model id or path")
+    parser.add_argument("--audio",
+                       required=True,
+                       help="Path to the input audio file")
+    parser.add_argument("--language", default="en",
+                       help="Language code (e.g., en, zh). Empty for auto-detect if supported")
+    parser.add_argument("--beam-size", type=int, default=5,
+                       help="Beam size for decoding")
+    parser.add_argument("--timestamps", default="segment",
+                       help="Timestamps granularity: none|segment|word (if supported)")
+    parser.add_argument("--plugin-id", default="npu", help="Plugin ID to use")
+    parser.add_argument("--device", default="npu", help="Device to run on (e.g., cpu, gpu, 0)")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
+    audio_path = os.path.expanduser(args.audio)
+
+    if not os.path.exists(audio_path):
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+    asr = ASR.from_(name_or_path=model_path, plugin_id=args.plugin_id, device_id=args.device)
+
+    cfg = ASRConfig(timestamps=args.timestamps, beam_size=args.beam_size, stream=False)
+    result = asr.transcribe(audio_path=audio_path, language=args.language, config=cfg)
+    print(result.transcript)
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/examples/python/cv_ocr.py b/examples/python/cv_ocr.py
@@ -4,25 +4,34 @@
 This example demonstrates how to use the NexaAI SDK to perform OCR on an image.
 """
 
+import argparse
 import os
 from nexaai.cv import CVCapabilities, CVModel, CVModelConfig, CVResults
 
 
 def main():
-    det_model_path = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_det_infer.safetensors")
-    rec_model_path = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_rec_infer.safetensors")
+    parser = argparse.ArgumentParser(description="NexaAI CV OCR Example")
+    parser.add_argument("--det-model", 
+                       default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_det_infer.safetensors",
+                       help="Path to detection model")
+    parser.add_argument("--rec-model",
+                       default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_rec_infer.safetensors", 
+                       help="Path to recognition model")
+    parser.add_argument("--image",
+                       default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/test_input.jpg",
+                       help="Path to input image")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    det_model_path = os.path.expanduser(args.det_model)
+    rec_model_path = os.path.expanduser(args.rec_model)
+    image_path = os.path.expanduser(args.image)
 
     config = CVModelConfig(capabilities=CVCapabilities.OCR,
                            det_model_path=det_model_path, rec_model_path=rec_model_path)
 
-    # For now, this modality is only supported in MLX. 
-    cv: CVModel = CVModel.from_(
-        name_or_path=det_model_path, config=config, plugin_id="mlx")
-
-    results: CVResults = cv.infer(os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/test_input.jpg"))
+    cv = CVModel.from_(name_or_path=det_model_path, config=config, plugin_id=args.plugin_id)
+    results = cv.infer(image_path)
 
     print(f"Number of results: {results.result_count}")
     for result in results.results:
diff --git a/examples/python/embedder.py b/examples/python/embedder.py
@@ -7,65 +7,66 @@
 It includes basic model initialization, single and batch embedding generation, and embedding analysis.
 """
 
+import argparse
 import os
 import numpy as np
 
 from nexaai.embedder import Embedder, EmbeddingConfig
 
 def main():
-    model_path = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-fp16-mlx/model.safetensors")
+    parser = argparse.ArgumentParser(description="NexaAI Embedding Example")
+    parser.add_argument("--model", default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-fp16-mlx/model.safetensors",
+                       help="Path to the embedding model")
+    parser.add_argument("--texts", nargs="+", 
+                       default=["On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
+                               "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU — from instant use cases to production deployments.",
+                               "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
+                               "The capital of France is Paris."],
+                       help="Texts to embed")
+    parser.add_argument("--query", default="what is on device AI",
+                       help="Query text for similarity analysis")
+    parser.add_argument("--batch-size", type=int, help="Batch size for processing")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
 
-    # For now, this modality is only supported in MLX.
-    embedder: Embedder = Embedder.from_(
-        name_or_path=model_path, plugin_id="mlx")
+    model_path = os.path.expanduser(args.model)
+    embedder = Embedder.from_(name_or_path=model_path, plugin_id=args.plugin_id)
     print('Embedder loaded successfully!')
 
     dim = embedder.get_embedding_dim()
     print(f"Dimension: {dim}")
 
-    texts = [
-        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
-        "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU — from instant use cases to production deployments.",
-        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
-        "The capital of France is Paris."
-    ]
+    batch_size = args.batch_size or len(args.texts)
     embeddings = embedder.generate(
-        texts=texts, config=EmbeddingConfig(batch_size=len(texts)))
+        texts=args.texts, config=EmbeddingConfig(batch_size=batch_size))
     
     print("\n" + "="*80)
     print("GENERATED EMBEDDINGS")
     print("="*80)
     
-    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
+    for i, (text, embedding) in enumerate(zip(args.texts, embeddings)):
         print(f"\nText {i+1}:")
         print(f"  Content: {text}")
         print(f"  Embedding shape: {len(embedding)} dimensions")
         print(f"  First 10 elements: {embedding[:10]}")
         print("-" * 70)
 
-    # Generate embedding for query
-    query = "what is on device AI"
     print(f"\n" + "="*80)
     print("QUERY PROCESSING")
     print("="*80)
-    print(f"Query: '{query}'")
+    print(f"Query: '{args.query}'")
     
     query_embedding = embedder.generate(
-        texts=[query], config=EmbeddingConfig(batch_size=1))[0]
+        texts=[args.query], config=EmbeddingConfig(batch_size=1))[0]
     print(f"Query embedding shape: {len(query_embedding)} dimensions")
     
-    # Compute inner product between query and all texts
     print(f"\n" + "="*80)
     print("SIMILARITY ANALYSIS (Inner Product)")
     print("="*80)
     
-    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
-        # Convert to numpy arrays for easier computation
+    for i, (text, embedding) in enumerate(zip(args.texts, embeddings)):
         query_vec = np.array(query_embedding)
         text_vec = np.array(embedding)
-        
-        # Compute inner product (dot product)
         inner_product = np.dot(query_vec, text_vec)
         
         print(f"\nText {i+1}:")
diff --git a/examples/python/llm.py b/examples/python/llm.py
@@ -4,6 +4,7 @@
 This example demonstrates how to use the NexaAI SDK to work with LLM models.
 """
 
+import argparse
 import io
 import os
 from typing import List
@@ -13,19 +14,23 @@
 
 
 def main():
-    # Your model path
-    model = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf")
-
-    # Model configuration
+    parser = argparse.ArgumentParser(description="NexaAI LLM Example")
+    parser.add_argument("--model",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf",
+                        help="Path to the LLM model")
+    parser.add_argument("--device", default="cpu", help="Device to run on")
+    parser.add_argument("--max-tokens", type=int, default=100, help="Maximum tokens to generate")
+    parser.add_argument("--system", default="You are a helpful assistant.", 
+                       help="System message")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
     m_cfg = ModelConfig()
 
-    # Load model
-    instance: LLM = LLM.from_(
-        model, plugin_id="llama_cpp", device_id="cpu", m_cfg=m_cfg)
+    instance = LLM.from_(model_path, plugin_id=args.plugin_id, device_id=args.device, m_cfg=m_cfg)
 
-    conversation: List[ChatMessage] = [ChatMessage(
-        role="system", content="You are a helpful assistant.")]
+    conversation: List[ChatMessage] = [ChatMessage(role="system", content=args.system)]
     strbuff = io.StringIO()
 
     print("Multi-round conversation started. Type '/quit' or '/exit' to end.")
@@ -60,26 +65,21 @@ def main():
             continue
 
         conversation.append(ChatMessage(role="user", content=user_input))
-
-        # Apply the chat template
         prompt = instance.apply_chat_template(conversation)
 
         strbuff.truncate(0)
         strbuff.seek(0)
 
         print("Assistant: ", end="", flush=True)
-        # Generate the model response
-        for token in instance.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
+        for token in instance.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=args.max_tokens)):
             print(token, end="", flush=True)
             strbuff.write(token)
 
-        # Get profiling data
         profiling_data = instance.get_profiling_data()
         if profiling_data is not None:
             print(profiling_data)
 
-        conversation.append(ChatMessage(
-            role="assistant", content=strbuff.getvalue()))
+        conversation.append(ChatMessage(role="assistant", content=strbuff.getvalue()))
 
 
 if __name__ == "__main__":
diff --git a/examples/python/rerank.py b/examples/python/rerank.py
@@ -7,31 +7,40 @@
 It includes basic model initialization, document reranking, and score analysis.
 """
 
+import argparse
 import os
 from nexaai.rerank import Reranker, RerankConfig
 
 
 def main():
-    model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-rerank-mlx/jina-reranker-v2-base-multilingual-f16.safetensors")
+    parser = argparse.ArgumentParser(description="NexaAI Rerank Example")
+    parser.add_argument("--model", 
+                       default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-rerank-mlx/jina-reranker-v2-base-multilingual-f16.safetensors",
+                       help="Path to the rerank model")
+    parser.add_argument("--query", default="Where is on-device AI?",
+                       help="Query text for reranking")
+    parser.add_argument("--documents", nargs="+",
+                       default=["On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
+                               "edge computing",
+                               "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
+                               "The capital of France is Paris."],
+                       help="Documents to rerank")
+    parser.add_argument("--batch-size", type=int, help="Batch size for processing")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
+    reranker = Reranker.from_(name_or_path=model_path, plugin_id=args.plugin_id)
     
-    # For now, this modality is only supported in MLX.
-    reranker: Reranker = Reranker.from_(name_or_path=model_path, plugin_id="mlx")
-    documents = [
-        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
-        "edge computing",
-        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
-        "The capital of France is Paris."
-    ]
+    batch_size = args.batch_size or len(args.documents)
+    scores = reranker.rerank(query=args.query, documents=args.documents, 
+                           config=RerankConfig(batch_size=batch_size))
 
-    query = "Where is on-device AI?"
-
-    scores = reranker.rerank(query=query, documents=documents, config=RerankConfig(batch_size=len(documents)))
-
-    print(f"Query: {query}")
-    print(f"Documents: {len(documents)} documents")
+    print(f"Query: {args.query}")
+    print(f"Documents: {len(args.documents)} documents")
     print("-" * 50)
     for i, score in enumerate(scores):
-        print(f"[{score:.4f}] : {documents[i]}")
+        print(f"[{score:.4f}] : {args.documents[i]}")
 
 
 if __name__ == "__main__":
diff --git a/examples/python/vlm.py b/examples/python/vlm.py