
Commit 6a6707f

Merge pull request #41 from mpdimitr/mixed-workload

Fixed a bug with concurrent workers attempting to insert the same doc_id.

2 parents 18e1b9f + dd77ab1 · commit 6a6707f
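
For context on the bug itself: the old implementation kept a single class-level itertools.count as the doc_id source, and every worker process receives its own copy of that counter, so concurrent workers all draw the identical ID sequence. A minimal standalone sketch (illustrative only, not repository code) that reproduces the collision with a fork-based worker:

import itertools
import multiprocessing as mp

# Module-level counter, analogous to the old BaseSearcher._doc_id_counter.
_counter = itertools.count(100000000)

def report(name):
    # Each forked child advances its own *copy* of the counter,
    # so both children print the same three IDs.
    print(name, [next(_counter) for _ in range(3)])

if __name__ == "__main__":
    ctx = mp.get_context("fork")  # fork start method (Linux/macOS)
    workers = [ctx.Process(target=report, args=(f"worker-{i}",)) for i in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    # Both workers print [100000000, 100000001, 100000002] -> duplicate doc_ids

Under the "spawn" start method the module is re-imported in each child, which recreates the counter at the same starting value, with the same duplication.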

4 files changed, +418 -9 lines changed


dataset_reader/ann_h5_reader.py

Lines changed: 4 additions & 2 deletions
@@ -1,3 +1,4 @@
+import itertools
 from typing import Iterator
 
 import h5py
@@ -14,17 +15,18 @@ def __init__(self, path, normalize=False):
 
     def read_queries(self) -> Iterator[Query]:
        data = h5py.File(self.path)
+        distances = data["distances"] if "distances" in data else itertools.repeat(None)
 
         for vector, expected_result, expected_scores in zip(
-            data["test"], data["neighbors"], data["distances"]
+            data["test"], data["neighbors"], distances
         ):
             if self.normalize:
                 vector /= np.linalg.norm(vector)
             yield Query(
                 vector=vector.tolist(),
                 meta_conditions=None,
                 expected_result=expected_result.tolist(),
-                expected_scores=expected_scores.tolist(),
+                expected_scores=expected_scores.tolist() if expected_scores is not None else None,
             )
 
     def read_data(self, *args, **kwargs) -> Iterator[Record]:
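
The fallback works because itertools.repeat(None) is an infinite iterator: zip() still stops at the shortest real dataset ("test" / "neighbors") and simply pairs each query with None for the scores. A standalone illustration with toy values (not repository data):

import itertools

vectors = [[0.1, 0.2], [0.3, 0.4]]
neighbors = [[1, 2], [3, 4]]
distances = itertools.repeat(None)  # stands in for a missing "distances" key

for vector, expected_result, expected_scores in zip(vectors, neighbors, distances):
    print(vector, expected_result, expected_scores)
# [0.1, 0.2] [1, 2] None
# [0.3, 0.4] [3, 4] None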

datasets/datasets.json

Lines changed: 21 additions & 1 deletion
@@ -1296,5 +1296,25 @@
     "path": "random-100-match-kw-small-vocab/random_keywords_1m_vocab_10_no_filters",
     "vector_count": 100,
     "description": "Synthetic data"
+  },
+  {
+    "name": "cohere-768-1M",
+    "vector_size": 768,
+    "distance": "dot",
+    "type": "h5",
+    "path": "cohere-768-1M/cohere-768-1M.hdf5",
+    "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-1m.hdf5.bz2",
+    "vector_count": 1000000,
+    "description": "Wikipedia embeddings"
+  },
+  {
+    "name": "cohere-768-10M",
+    "vector_size": 768,
+    "distance": "dot",
+    "type": "h5",
+    "path": "cohere-768-10M/cohere-768-10M.hdf5",
+    "link": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/vectorsearch/cohere-wikipedia-22-12-en-embeddings/documents-10m.hdf5.bz2",
+    "vector_count": 10000000,
+    "description": "Wikipedia embeddings"
   }
-]
+]
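
Since datasets.json is a top-level JSON array, the new entries can be sanity-checked after merging with a short snippet like the following (hypothetical check, not part of the repo):

import json

with open("datasets/datasets.json") as f:
    datasets = json.load(f)

for entry in datasets:
    if entry["name"] in ("cohere-768-1M", "cohere-768-10M"):
        # Fields taken from the diff above.
        assert entry["vector_size"] == 768 and entry["distance"] == "dot"
        print(entry["name"], entry["vector_count"], entry["path"])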

engine/base_client/search.py

Lines changed: 13 additions & 6 deletions
@@ -20,7 +20,7 @@
 
 
 class BaseSearcher:
-    _doc_id_counter = itertools.count(100000000)
+    _doc_id_counter = None  # Will be initialized per process
     MP_CONTEXT = None
 
     def __init__(self, host, connection_params, search_params):
@@ -67,15 +67,22 @@ def _search_one(cls, query, top: Optional[int] = None):
         precision = len(ids.intersection(query.expected_result[:top])) / top
         return precision, end - start
 
+    @classmethod
+    def _get_doc_id_counter(cls):
+        if cls._doc_id_counter is None:
+            # Use the process ID to give each worker a unique starting point
+            process_id = os.getpid()
+            # Each process gets its own range: 1000000000 + (pid % 1000) * 1000000
+            start_offset = 1000000000 + (process_id % 1000) * 1000000
+            cls._doc_id_counter = itertools.count(start_offset)
+        return cls._doc_id_counter
+
     @classmethod
     def _insert_one(cls, query):
         start = time.perf_counter()
 
-        # Generate unique doc_id here
-        doc_id = next(cls._doc_id_counter)
-
-        # Debug logging to verify inserts are happening
-        #print(f"DEBUG: Inserting vector with doc_id={doc_id}")
+        # Generate a unique doc_id with the process-safe counter
+        doc_id = next(cls._get_doc_id_counter())
 
         cls.insert_one(str(doc_id), query.vector, query.meta_conditions)
         end = time.perf_counter()
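
The offset arithmetic is easiest to see in isolation. A standalone sketch mirroring _get_doc_id_counter, with hard-coded PIDs purely for illustration:

import itertools
import os

def make_counter(pid=None):
    # Each process derives a disjoint starting offset from its PID,
    # so forked workers no longer share one ID sequence.
    pid = os.getpid() if pid is None else pid
    start_offset = 1000000000 + (pid % 1000) * 1000000
    return itertools.count(start_offset)

c1, c2 = make_counter(pid=101), make_counter(pid=102)
print(next(c1), next(c1))  # 1101000000 1101000001
print(next(c2))            # 1102000000 -- a separate 1,000,000-wide window

The windows stay disjoint as long as no worker inserts more than 1,000,000 documents and no two live workers share the same pid % 1000; both limits are worth keeping in mind for very long runs.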
