diff --git a/examples/python/README.md b/examples/python/README.md
index 1757ea931..f557ba65b 100644
--- a/examples/python/README.md
+++ b/examples/python/README.md
@@ -66,3 +66,66 @@
 nexa pull NexaAI/paddle-ocr-mlx
 python cv_ocr.py
 ```
+## Running Examples (Windows ARM64, Snapdragon X Elite)
+
+### LLM
+```bash
+nexa pull NexaAI/Llama3.2-3B-NPU-Turbo
+
+python llm.py --model NexaAI/Llama3.2-3B-NPU-Turbo --plugin-id npu --device npu --max-tokens 100 --system "You are a helpful assistant."
+```
+
+### Multi-Modal
+
+```bash
+nexa pull NexaAI/OmniNeural-4B
+
+python vlm.py --model NexaAI/OmniNeural-4B --plugin-id npu --device npu --max-tokens 100 --system "You are a helpful assistant."
+```
+
+### Reranker
+```bash
+nexa pull NexaAI/jina-v2-rerank-npu
+
+python rerank.py --model NexaAI/jina-v2-rerank-npu --plugin-id npu --query "Where is on-device AI?" --documents "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud." "edge computing" "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality." "The capital of France is Paris."
+```
+
+### Embedder
+```bash
+nexa pull NexaAI/embeddinggemma-300m-npu
+
+python embedder.py --model NexaAI/embeddinggemma-300m-npu --plugin-id npu --texts "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud." "edge computing" "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality." "The capital of France is Paris." --query "what is on device AI" --batch-size 2
+```
+
+### CV
+
+#### OCR
+```bash
+nexa pull NexaAI/paddleocr-npu
+
+python cv_ocr.py --det-model NexaAI/paddleocr-npu --rec-model NexaAI/paddleocr-npu --image path/to/image.png
+```
+
+### ASR
+```bash
+nexa pull NexaAI/parakeet-npu
+
+python asr.py --model NexaAI/parakeet-npu --audio path/to/audio.wav
+```
+
+## Common Arguments
+
+- `--model`: Model ID (e.g. `NexaAI/parakeet-npu`) or local path to the model file
+- `--device`: Device to run on (`cpu`, `gpu`, `npu`, etc.)
+- `--max-tokens`: Maximum number of tokens to generate (for LLM/VLM)
+- `--batch-size`: Batch size for processing (embedder/reranker)
+- `--system`: System message for chat models (LLM/VLM)
+- `--plugin-id`: Plugin ID to use (default: `cpu_gpu`)
+
+## Plugin ID Options
+
+The `--plugin-id` parameter supports different backends:
+- `cpu_gpu`: Default, supports both CPU and GPU
+- `mlx`: Apple Silicon optimized (for supported models)
+- `llama_cpp`: For GGUF format models
+- `onnx`: ONNX runtime backend
+- `npu`: NPU backend used by the Snapdragon X Elite examples above
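+
+For example, the same `llm.py` script can be pointed at different backends by changing `--plugin-id` and `--device` together. A minimal sketch using only models that already appear in this README (adjust the path/ID to whatever you have pulled):
+
+```bash
+# GGUF model through llama.cpp on the CPU
+python llm.py --model ~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf --plugin-id llama_cpp --device cpu
+
+# NPU model on a Snapdragon X Elite machine
+python llm.py --model NexaAI/Llama3.2-3B-NPU-Turbo --plugin-id npu --device npu
+```
\ No newline at end of file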
diff --git a/examples/python/asr.py b/examples/python/asr.py
new file mode 100644
index 000000000..572bb0e23
--- /dev/null
+++ b/examples/python/asr.py
@@ -0,0 +1,46 @@
+"""
+NexaAI ASR Example - Speech to Text (non-streaming)
+
+This example demonstrates how to use the NexaAI SDK to transcribe an audio file.
+"""
+
+import argparse
+import os
+
+from nexaai.asr import ASR, ASRConfig
+
+def main():
+    parser = argparse.ArgumentParser(description="NexaAI ASR Example")
+    parser.add_argument("--model",
+                        default="NexaAI/parakeet-npu",
+                        help="Model ID or path")
+    parser.add_argument("--audio",
+                        required=True,
+                        help="Path to the input audio file")
+    parser.add_argument("--language", default="en",
+                        help="Language code (e.g., en, zh). Empty for auto-detect if supported")
+    parser.add_argument("--beam-size", type=int, default=5,
+                        help="Beam size for decoding")
+    parser.add_argument("--timestamps", default="segment",
+                        help="Timestamp granularity: none|segment|word (if supported)")
+    parser.add_argument("--plugin-id", default="npu", help="Plugin ID to use")
+    parser.add_argument("--device", default="npu", help="Device to run on (e.g., cpu, gpu, npu, 0)")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
+    audio_path = os.path.expanduser(args.audio)
+
+    if not os.path.exists(audio_path):
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+    asr = ASR.from_(name_or_path=model_path, plugin_id=args.plugin_id, device_id=args.device)
+
+    cfg = ASRConfig(timestamps=args.timestamps, beam_size=args.beam_size, stream=False)
+    result = asr.transcribe(audio_path=audio_path, language=args.language, config=cfg)
+    print(result.transcript)
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/examples/python/cv_ocr.py b/examples/python/cv_ocr.py
index 328a6094f..92afcb274 100644
--- a/examples/python/cv_ocr.py
+++ b/examples/python/cv_ocr.py
@@ -4,25 +4,34 @@
 This example demonstrates how to use the NexaAI SDK to perform OCR on an image.
 """
 
+import argparse
 import os
 
 from nexaai.cv import CVCapabilities, CVModel, CVModelConfig, CVResults
 
 
 def main():
-    det_model_path = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_det_infer.safetensors")
-    rec_model_path = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_rec_infer.safetensors")
+    parser = argparse.ArgumentParser(description="NexaAI CV OCR Example")
+    parser.add_argument("--det-model",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_det_infer.safetensors",
+                        help="Path to detection model")
+    parser.add_argument("--rec-model",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/ch_ptocr_v4_rec_infer.safetensors",
+                        help="Path to recognition model")
+    parser.add_argument("--image",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/test_input.jpg",
+                        help="Path to input image")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    det_model_path = os.path.expanduser(args.det_model)
+    rec_model_path = os.path.expanduser(args.rec_model)
+    image_path = os.path.expanduser(args.image)
 
     config = CVModelConfig(capabilities=CVCapabilities.OCR,
                            det_model_path=det_model_path, rec_model_path=rec_model_path)
 
-    # For now, this modality is only supported in MLX.
-    cv: CVModel = CVModel.from_(
-        name_or_path=det_model_path, config=config, plugin_id="mlx")
-
-    results: CVResults = cv.infer(os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/paddle-ocr-mlx/test_input.jpg"))
+    cv = CVModel.from_(name_or_path=det_model_path, config=config, plugin_id=args.plugin_id)
+    results = cv.infer(image_path)
 
     print(f"Number of results: {results.result_count}")
     for result in results.results:
diff --git a/examples/python/embedder.py b/examples/python/embedder.py
index a1f581814..dbf65053e 100644
--- a/examples/python/embedder.py
+++ b/examples/python/embedder.py
@@ -7,65 +7,66 @@
 It includes basic model initialization, single and batch embedding generation,
 and embedding analysis.
""" +import argparse import os import numpy as np from nexaai.embedder import Embedder, EmbeddingConfig def main(): - model_path = os.path.expanduser( - "~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-fp16-mlx/model.safetensors") + parser = argparse.ArgumentParser(description="NexaAI Embedding Example") + parser.add_argument("--model", default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-fp16-mlx/model.safetensors", + help="Path to the embedding model") + parser.add_argument("--texts", nargs="+", + default=["On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.", + "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU — from instant use cases to production deployments.", + "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.", + "The capital of France is Paris."], + help="Texts to embed") + parser.add_argument("--query", default="what is on device AI", + help="Query text for similarity analysis") + parser.add_argument("--batch-size", type=int, help="Batch size for processing") + parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use") + args = parser.parse_args() - # For now, this modality is only supported in MLX. - embedder: Embedder = Embedder.from_( - name_or_path=model_path, plugin_id="mlx") + model_path = os.path.expanduser(args.model) + embedder = Embedder.from_(name_or_path=model_path, plugin_id=args.plugin_id) print('Embedder loaded successfully!') dim = embedder.get_embedding_dim() print(f"Dimension: {dim}") - texts = [ - "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.", - "Nexa AI allows you to run state-of-the-art AI models locally on CPU, GPU, or NPU — from instant use cases to production deployments.", - "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.", - "The capital of France is Paris." 
diff --git a/examples/python/llm.py b/examples/python/llm.py
index 15f13473f..23b96fcd0 100644
--- a/examples/python/llm.py
+++ b/examples/python/llm.py
@@ -4,6 +4,7 @@
 This example demonstrates how to use the NexaAI SDK to work with LLM models.
 """
 
+import argparse
 import io
 import os
 from typing import List
@@ -13,19 +14,23 @@
 
 
 def main():
-    # Your model path
-    model = os.path.expanduser(
-        "~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf")
-
-    # Model configuration
+    parser = argparse.ArgumentParser(description="NexaAI LLM Example")
+    parser.add_argument("--model",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/Qwen/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf",
+                        help="Path to the LLM model")
+    parser.add_argument("--device", default="cpu", help="Device to run on")
+    parser.add_argument("--max-tokens", type=int, default=100, help="Maximum tokens to generate")
+    parser.add_argument("--system", default="You are a helpful assistant.",
+                        help="System message")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
     m_cfg = ModelConfig()
 
-    # Load model
-    instance: LLM = LLM.from_(
-        model, plugin_id="llama_cpp", device_id="cpu", m_cfg=m_cfg)
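+    # Note (added for clarity, not in the original script): pair --plugin-id with a
+    # matching --device, e.g. llama_cpp with cpu for GGUF models, or npu with npu on
+    # Snapdragon X Elite (see the README).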
+    instance = LLM.from_(model_path, plugin_id=args.plugin_id, device_id=args.device, m_cfg=m_cfg)
 
-    conversation: List[ChatMessage] = [ChatMessage(
-        role="system", content="You are a helpful assistant.")]
+    conversation: List[ChatMessage] = [ChatMessage(role="system", content=args.system)]
 
     strbuff = io.StringIO()
     print("Multi-round conversation started. Type '/quit' or '/exit' to end.")
@@ -60,26 +65,21 @@ def main():
             continue
 
         conversation.append(ChatMessage(role="user", content=user_input))
-
-        # Apply the chat template
         prompt = instance.apply_chat_template(conversation)
 
         strbuff.truncate(0)
         strbuff.seek(0)
 
         print("Assistant: ", end="", flush=True)
-        # Generate the model response
-        for token in instance.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100)):
+        for token in instance.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=args.max_tokens)):
             print(token, end="", flush=True)
             strbuff.write(token)
 
-        # Get profiling data
         profiling_data = instance.get_profiling_data()
         if profiling_data is not None:
             print(profiling_data)
 
-        conversation.append(ChatMessage(
-            role="assistant", content=strbuff.getvalue()))
+        conversation.append(ChatMessage(role="assistant", content=strbuff.getvalue()))
 
 
 if __name__ == "__main__":
diff --git a/examples/python/rerank.py b/examples/python/rerank.py
index c7b6535c2..8392e84bb 100644
--- a/examples/python/rerank.py
+++ b/examples/python/rerank.py
@@ -7,31 +7,40 @@
 It includes basic model initialization, document reranking, and score analysis.
 """
 
+import argparse
 import os
 
 from nexaai.rerank import Reranker, RerankConfig
 
 
 def main():
-    model_path = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-rerank-mlx/jina-reranker-v2-base-multilingual-f16.safetensors")
+    parser = argparse.ArgumentParser(description="NexaAI Rerank Example")
+    parser.add_argument("--model",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/jina-v2-rerank-mlx/jina-reranker-v2-base-multilingual-f16.safetensors",
+                        help="Path to the rerank model")
+    parser.add_argument("--query", default="Where is on-device AI?",
+                        help="Query text for reranking")
+    parser.add_argument("--documents", nargs="+",
+                        default=["On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
+                                 "edge computing",
+                                 "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
+                                 "The capital of France is Paris."],
+                        help="Documents to rerank")
+    parser.add_argument("--batch-size", type=int, help="Batch size for processing")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
+    reranker = Reranker.from_(name_or_path=model_path, plugin_id=args.plugin_id)
 
-    # For now, this modality is only supported in MLX.
-    reranker: Reranker = Reranker.from_(name_or_path=model_path, plugin_id="mlx")
-    documents = [
-        "On-device AI is a type of AI that is processed on the device itself, rather than in the cloud.",
-        "edge computing",
-        "A ragdoll is a breed of cat that is known for its long, flowing hair and gentle personality.",
-        "The capital of France is Paris."
-    ]
+    batch_size = args.batch_size or len(args.documents)
+    scores = reranker.rerank(query=args.query, documents=args.documents,
+                             config=RerankConfig(batch_size=batch_size))
 
-    query = "Where is on-device AI?"
-
-    scores = reranker.rerank(query=query, documents=documents, config=RerankConfig(batch_size=len(documents)))
-
-    print(f"Query: {query}")
-    print(f"Documents: {len(documents)} documents")
+    print(f"Query: {args.query}")
+    print(f"Documents: {len(args.documents)} documents")
     print("-" * 50)
 
     for i, score in enumerate(scores):
-        print(f"[{score:.4f}] : {documents[i]}")
+        print(f"[{score:.4f}] : {args.documents[i]}")
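+
+    # Optional extension (not part of the original example): print the documents
+    # again in ranked order, highest score first, using plain Python sorting.
+    ranked = sorted(zip(scores, args.documents), key=lambda pair: pair[0], reverse=True)
+    print("-" * 50)
+    for score, document in ranked:
+        print(f"[{score:.4f}] : {document}")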
 
 
 if __name__ == "__main__":
diff --git a/examples/python/vlm.py b/examples/python/vlm.py
index fbae47ce5..7d507ea1b 100644
--- a/examples/python/vlm.py
+++ b/examples/python/vlm.py
@@ -7,6 +7,7 @@
 It includes basic model initialization, text generation, streaming, and chat template functionality.
 """
 
+import argparse
 import io
 import os
 import re
@@ -47,16 +48,24 @@ def parse_media_from_input(user_input: str) -> tuple[str, Optional[List[str]], O
 
 
 def main():
-    # Your model path
-    model = os.path.expanduser("~/.cache/nexa.ai/nexa_sdk/models/NexaAI/gemma-3n-E4B-it-4bit-MLX/model-00001-of-00002.safetensors")
-
-    # Model configuration
+    parser = argparse.ArgumentParser(description="NexaAI VLM Example")
+    parser.add_argument("--model",
+                        default="~/.cache/nexa.ai/nexa_sdk/models/NexaAI/gemma-3n-E4B-it-4bit-MLX/model-00001-of-00002.safetensors",
+                        help="Path to the VLM model")
+    parser.add_argument("--device", default="", help="Device to run on")
+    parser.add_argument("--max-tokens", type=int, default=100, help="Maximum tokens to generate")
+    parser.add_argument("--system", default="You are a helpful assistant.",
+                        help="System message")
+    parser.add_argument("--plugin-id", default="cpu_gpu", help="Plugin ID to use")
+    args = parser.parse_args()
+
+    model_path = os.path.expanduser(args.model)
     m_cfg = ModelConfig()
 
-    # Load model
-    instance: VLM = VLM.from_(name_or_path=model, mmproj_path="", m_cfg=m_cfg, plugin_id="mlx", device_id="")
+    instance = VLM.from_(name_or_path=model_path, m_cfg=m_cfg, plugin_id=args.plugin_id, device_id=args.device)
 
-    conversation: List[MultiModalMessage] = [MultiModalMessage(role="system", content=[MultiModalMessageContent(type="text", text="You are a helpful assistant.")])]
+    conversation: List[MultiModalMessage] = [MultiModalMessage(role="system",
+                                                               content=[MultiModalMessageContent(type="text", text=args.system)])]
 
     strbuff = io.StringIO()
     print("Multi-round conversation started. Type '/quit' or '/exit' to end.")
@@ -106,8 +115,7 @@ def main():
         strbuff.seek(0)
 
         print("Assistant: ", end="", flush=True)
-        # Generate the model response
-        for token in instance.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=100, image_paths=images, audio_paths=audios)):
+        for token in instance.generate_stream(prompt, g_cfg=GenerationConfig(max_tokens=args.max_tokens, image_paths=images, audio_paths=audios)):
             print(token, end="", flush=True)
             strbuff.write(token)