Update add item (#34)

truman-cyborg · web-flow · commit 0f9a2aa80a9b · 2025-10-09T13:01:47.000-04:00
* add embedding as a parameter for add text

* added test

* update test

* updated formating

* update test

* update test

* update

* update for linter
diff --git a/cyborgdb/integrations/langchain.py b/cyborgdb/integrations/langchain.py
@@ -8,20 +8,21 @@
     pip install cyborgdb-py[langchain]
 """
 
-import uuid
 import json
+import uuid
 import warnings
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
 import numpy as np
-from typing import List, Dict, Any, Optional, Tuple, Union, Iterable
 
 try:
-    from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
     from langchain_core.documents import Document
     from langchain_core.embeddings import Embeddings
+    from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
     from sentence_transformers import SentenceTransformer
 
     # Import CyborgDB components
-    from cyborgdb import Client, EncryptedIndex, IndexIVF, IndexIVFPQ, IndexIVFFlat
+    from cyborgdb import Client, EncryptedIndex, IndexIVF, IndexIVFFlat, IndexIVFPQ
 
     class CyborgVectorStore(VectorStore):
         """
@@ -325,13 +326,16 @@ def add_texts(
                 texts: Texts to add
                 metadatas: Optional metadata for each text
                 ids: Optional IDs for each text (generated if not provided)
-                **kwargs: Additional arguments (unused)
+                **kwargs: Additional arguments including:
+                    - embeddings: Optional pre-computed embeddings for each text. If provided,
+                      skips embedding generation. Should be a list of vectors or numpy array
+                      with shape (num_texts, dimension)
 
             Returns:
                 List of IDs for the added texts
 
             Raises:
-                ValueError: If lengths of texts, metadatas, or ids don't match
+                ValueError: If lengths of texts, metadatas, ids, or embeddings don't match
             """
             texts_list = list(texts)
             num_texts = len(texts_list)
@@ -351,25 +355,41 @@ def add_texts(
             if metadatas is not None and len(metadatas) != num_texts:
                 raise ValueError("Length of metadatas must match length of texts")
 
-            # Generate embeddings
-            embeddings = self.get_embeddings(texts_list)
+            # Handle embeddings - either use provided vectors or generate them
+            embeddings = kwargs.get("embeddings", None)
+            if embeddings is not None:
+                # Validate provided embeddings
+                if isinstance(embeddings, np.ndarray):
+                    if embeddings.shape[0] != num_texts:
+                        raise ValueError(
+                            f"Number of embeddings ({embeddings.shape[0]}) must match number of texts ({num_texts})"
+                        )
+                    vectors = embeddings
+                else:
+                    # Assume it's a list of lists
+                    if len(embeddings) != num_texts:
+                        raise ValueError(
+                            f"Number of embeddings ({len(embeddings)}) must match number of texts ({num_texts})"
+                        )
+                    vectors = np.array(embeddings, dtype=np.float32)
+            else:
+                # Generate embeddings from texts
+                vectors = self.get_embeddings(texts_list)
 
             # Build items for upsert
             items = []
             for i in range(num_texts):
-                # Handle both numpy arrays and lists for embeddings
-                if hasattr(embeddings, "shape"):
+                # Handle both numpy arrays and lists for vectors
+                if hasattr(vectors, "shape"):
                     vector = (
-                        embeddings[i].tolist()
-                        if len(embeddings.shape) > 1
-                        else embeddings.tolist()
+                        vectors[i].tolist()
+                        if len(vectors.shape) > 1
+                        else vectors.tolist()
                     )
                 else:
-                    # embeddings is likely a list of lists or a list
+                    # vectors is likely a list of lists or a list
                     vector = (
-                        embeddings[i]
-                        if isinstance(embeddings[i], list)
-                        else [embeddings[i]]
+                        vectors[i] if isinstance(vectors[i], list) else [vectors[i]]
                     )
                 item = {"id": id_list[i], "vector": vector}
 
@@ -390,15 +410,21 @@ def add_texts(
             return id_list
 
         def add_documents(
-            self, documents: List[Document], ids: Optional[List[str]] = None, **kwargs
+            self,
+            documents: List[Document],
+            ids: Optional[List[str]] = None,
+            **kwargs,
         ) -> List[str]:
             """
             Add documents to the vector store.
 
             Args:
                 documents: Documents to add
                 ids: Optional IDs for documents
-                **kwargs: Additional arguments
+                **kwargs: Additional arguments including:
+                    - embeddings: Optional pre-computed embeddings for each document. If provided,
+                      skips embedding generation. Should be a list of vectors or numpy array
+                      with shape (num_documents, dimension)
 
             Returns:
                 List of IDs for the added documents
@@ -718,17 +744,24 @@ async def aadd_texts(
             ids: Optional[List[str]] = None,
             **kwargs,
         ) -> List[str]:
-            """Async version of add_texts."""
+            """Async version of add_texts with optional pre-computed embeddings."""
             import asyncio
 
             return await asyncio.to_thread(
-                self.add_texts, texts, metadatas=metadatas, ids=ids, **kwargs
+                self.add_texts,
+                texts,
+                metadatas=metadatas,
+                ids=ids,
+                **kwargs,
             )
 
         async def aadd_documents(
-            self, documents: List[Document], ids: Optional[List[str]] = None, **kwargs
+            self,
+            documents: List[Document],
+            ids: Optional[List[str]] = None,
+            **kwargs,
         ) -> List[str]:
-            """Async version of add_documents."""
+            """Async version of add_documents with optional pre-computed embeddings."""
             import asyncio
 
             return await asyncio.to_thread(
@@ -793,6 +826,7 @@ def from_texts(
             texts: List[str],
             embedding: Union[str, Embeddings, SentenceTransformer],
             metadatas: Optional[List[Dict]] = None,
+            embeddings: Optional[Union[List[List[float]], np.ndarray]] = None,
             **kwargs,
         ) -> "CyborgVectorStore":
             """
@@ -802,6 +836,9 @@ def from_texts(
                 texts: List of texts to add
                 embedding: Embedding model
                 metadatas: Optional metadata for each text
+                embeddings: Optional pre-computed embeddings for each text. If provided,
+                    skips embedding generation. Should be a list of vectors or numpy array
+                    with shape (num_texts, dimension)
                 **kwargs: Additional arguments including:
                     - index_name: Name for the index
                     - index_key: 32-byte encryption key
@@ -841,6 +878,19 @@ def from_texts(
                 if key in kwargs:
                     index_config_params[key] = kwargs.pop(key)
 
+            # If embeddings are provided and dimension is not, infer it
+            if embeddings is not None and dimension is None:
+                if isinstance(embeddings, np.ndarray):
+                    dimension = (
+                        embeddings.shape[1]
+                        if len(embeddings.shape) > 1
+                        else len(embeddings)
+                    )
+                elif isinstance(embeddings, list) and len(embeddings) > 0:
+                    dimension = (
+                        len(embeddings[0]) if isinstance(embeddings[0], list) else 1
+                    )
+
             # Create vector store
             store = cls(
                 index_name=index_name,
@@ -857,7 +907,10 @@ def from_texts(
 
             # Add texts if provided
             if texts:
-                store.add_texts(texts, metadatas, ids=ids)
+                if embeddings is not None:
+                    store.add_texts(texts, metadatas, ids=ids, embeddings=embeddings)
+                else:
+                    store.add_texts(texts, metadatas, ids=ids)
 
             if not store.index.is_trained():
                 warnings.warn("Not enough data to train index.")
@@ -877,7 +930,11 @@ def from_documents(
             Args:
                 documents: List of documents to add
                 embedding: Embedding model
-                **kwargs: Additional arguments (see from_texts)
+                **kwargs: Additional arguments including:
+                    - embeddings: Optional pre-computed embeddings for each document. If provided,
+                      skips embedding generation. Should be a list of vectors or numpy array
+                      with shape (num_documents, dimension)
+                    - Other arguments (see from_texts)
 
             Returns:
                 CyborgVectorStore instance
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -63,10 +63,10 @@ def test_upsert_and_query(self):
         query_vector = np.random.rand(dimension).astype(np.float32)
         results = self.index.query(query_vectors=query_vector, top_k=10)
 
-        # Check results
-        self.assertEqual(len(results[0]), 10)
-        self.assertTrue("id" in results[0][0])
-        self.assertTrue("distance" in results[0][0])
+        # Check results - results is a flat list, not nested
+        self.assertEqual(len(results), 10)
+        self.assertTrue("id" in results[0])
+        self.assertTrue("distance" in results[0])
 
     def test_health_check(self):
         """Test the health check endpoint."""
diff --git a/tests/test_langchain_integration.py b/tests/test_langchain_integration.py