|
1 | 1 | # -*- coding: utf-8 -*- |
2 | | -# |
3 | | -# MIT License |
4 | | -# |
5 | | -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. |
6 | | -# |
7 | | -# Permission is hereby granted, free of charge, to any person obtaining a copy |
8 | | -# of this software and associated documentation files (the "Software"), to deal |
9 | | -# in the Software without restriction, including without limitation the rights |
10 | | -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | -# copies of the Software, and to permit persons to whom the Software is |
12 | | -# furnished to do so, subject to the following conditions: |
13 | | -# |
14 | | -# The above copyright notice and this permission notice shall be included in all |
15 | | -# copies or substantial portions of the Software. |
16 | | -# |
17 | | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | | -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | | -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
20 | | -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | | -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
22 | | -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
23 | | -# SOFTWARE. |
24 | | -# |
25 | 2 | import csv |
26 | 3 | import os |
27 | 4 | import secrets |
28 | 5 | import time |
29 | | -from typing import Dict, List |
| 6 | +from typing import Dict, List, Tuple |
30 | 7 |
|
31 | 8 | import torch |
32 | 9 |
|
33 | 10 | from ucm.store.nfsstore.nfsstore_connector import UcmNfsStore |
34 | 11 | from ucm.store.ucmstore import UcmKVStoreBase |
35 | 12 |
|
36 | 13 |
|
def setup(
    storage_backends, block_size, device_id, io_size, transferStreamNumber
) -> UcmKVStoreBase:
    """Build a worker-role NFS store instance for the benchmark.

    Args:
        storage_backends: Backend path/spec handed to the store.
        block_size: Size in bytes of one KV block (io_size * layers).
        device_id: CUDA device ordinal the store binds to.
        io_size: Size in bytes of a single I/O operation.
        transferStreamNumber: Number of transfer streams the store uses.

    Returns:
        A configured ``UcmNfsStore`` exposed via the ``UcmKVStoreBase`` API.
    """
    return UcmNfsStore(
        {
            "storage_backends": storage_backends,
            "kv_block_size": block_size,
            "role": "worker",
            "device": device_id,
            "io_size": io_size,
            "transferStreamNumber": transferStreamNumber,
        }
    )
47 | 26 |
|
@@ -150,162 +129,94 @@ def fetch( |
150 | 129 | return total_size, elapsed_time, throughput_gbps |
151 | 130 |
|
152 | 131 |
|
153 | | -def main(): |
154 | | - storage_backends = "." |
155 | | - device_id = 1 |
156 | | - mla = False |
157 | | - repeat = 3 |
158 | | - block_elem_size = 2 |
159 | | - num_tokens_list = [2048, 4096, 8192, 16384, 32768] |
160 | | - |
161 | | - if mla: |
162 | | - block_lens = [64, 128] |
163 | | - block_layer = 61 |
164 | | - head_size = 576 |
165 | | - kv = 1 |
166 | | - model_name = "deepseek-v3" |
167 | | - num_head_list = [1] |
168 | | - else: |
169 | | - block_lens = [128, 256] |
170 | | - block_layer = 64 |
171 | | - head_size = 128 |
172 | | - kv = 2 |
173 | | - model_name = "QwQ-32B" |
174 | | - num_head_list = [1, 2, 4] |
175 | | - |
176 | | - SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) |
177 | | - csv_file = os.path.join(SCRIPT_DIR, "embed_fetch_result.csv") |
178 | | - need_header = not os.path.exists(csv_file) |
179 | | - |
180 | | - with open(csv_file, "a", newline="", encoding="utf-8") as csv_fp: |
181 | | - writer = csv.writer(csv_fp) |
182 | | - |
183 | | - if need_header: |
184 | | - writer.writerow( |
185 | | - [ |
186 | | - "Model", |
187 | | - "Sequence Length", |
188 | | - "Batch Size", |
189 | | - "Layers", |
190 | | - "Element Size", |
191 | | - "KV", |
192 | | - "Num Head", |
193 | | - "Block Size", |
194 | | - "IO Count", |
195 | | - "IO Size(B)", |
196 | | - "Total Size(GB)", |
197 | | - "Write Avg Time(s)", |
198 | | - "Write Avg Bandwidth(GB/s)", |
199 | | - "Read Avg Time(s)", |
200 | | - "Read Avg Bandwidth(GB/s)", |
201 | | - ] |
202 | | - ) |
203 | | - |
204 | | - for num_head in num_head_list: |
205 | | - for block_len in block_lens: |
206 | | - block_dim = head_size * num_head |
207 | | - io_size = block_dim * block_len * block_elem_size |
208 | | - block_size = io_size * block_layer |
209 | | - real_blocks = max(20, 1200 // num_head) |
210 | | - |
211 | | - for num_tokens in num_tokens_list: |
212 | | - sep = "=" * 60 |
213 | | - print( |
214 | | - f"\n{sep}\n= num_head={num_head} | num_tokens={num_tokens:>6} | Repeat {repeat} times =\n{sep}\n" |
215 | | - ) |
216 | | - |
217 | | - batch_size = int(num_tokens / block_len) |
218 | | - io_num = int(num_tokens / block_len * block_layer) |
219 | | - |
220 | | - w_bw_list, r_bw_list = [], [] |
221 | | - w_time_list, r_time_list = [], [] |
222 | | - w_size_sum, r_size_sum = 0.0, 0.0 |
223 | | - |
224 | | - for r in range(repeat): |
225 | | - print(f"\n--- Round {r+1} ---") |
226 | | - store = setup_store( |
227 | | - storage_backends, block_size, device_id, io_size |
228 | | - ) |
229 | | - |
230 | | - hashes, kvcaches = make_buffers( |
231 | | - real_blocks, |
232 | | - device_id, |
233 | | - batch_size, |
234 | | - head_size, |
235 | | - block_len, |
236 | | - block_layer, |
237 | | - num_head, |
238 | | - ) |
239 | | - |
240 | | - results = store.create(hashes[:batch_size]) |
241 | | - assert sum(results) == 0, "Create operation failed" |
242 | | - |
243 | | - w_size, w_time, w_bw = embed( |
244 | | - store, |
245 | | - hashes[:batch_size], |
246 | | - kvcaches, |
247 | | - num_tokens, |
248 | | - block_len, |
249 | | - block_layer, |
250 | | - block_dim, |
251 | | - ) |
252 | | - store.commit(hashes[:batch_size], True) |
253 | | - |
254 | | - store_all_hashes(hashes[:batch_size]) |
255 | | - |
256 | | - r_size, r_time, r_bw = fetch( |
257 | | - store, |
258 | | - hashes[:batch_size], |
259 | | - kvcaches, |
260 | | - num_tokens, |
261 | | - block_len, |
262 | | - block_layer, |
263 | | - block_dim, |
264 | | - ) |
265 | | - |
266 | | - w_bw_list.append(w_bw) |
267 | | - r_bw_list.append(r_bw) |
268 | | - w_time_list.append(w_time) |
269 | | - r_time_list.append(r_time) |
270 | | - w_size_sum += w_size |
271 | | - r_size_sum += r_size |
272 | | - |
273 | | - # Clean up resources |
274 | | - del kvcaches, hashes, store |
275 | | - torch.cuda.empty_cache() |
276 | | - |
277 | | - avg_w_bw = sum(w_bw_list) / repeat |
278 | | - avg_r_bw = sum(r_bw_list) / repeat |
279 | | - avg_w_time = sum(w_time_list) / repeat |
280 | | - avg_r_time = sum(r_time_list) / repeat |
281 | | - avg_w_size = w_size_sum / (1024**3) / repeat |
282 | | - avg_r_size = r_size_sum / (1024**3) / repeat |
283 | | - |
284 | | - writer.writerow( |
285 | | - [ |
286 | | - model_name, |
287 | | - num_tokens, |
288 | | - batch_size, |
289 | | - block_layer, |
290 | | - block_elem_size, |
291 | | - kv, |
292 | | - num_head, |
293 | | - block_len, |
294 | | - io_num, |
295 | | - io_size, |
296 | | - f"{avg_w_size:.4f}", |
297 | | - f"{avg_w_time:.4f}", |
298 | | - f"{avg_w_bw:.4f}", |
299 | | - f"{avg_r_time:.4f}", |
300 | | - f"{avg_r_bw:.4f}", |
301 | | - ] |
302 | | - ) |
303 | | - |
304 | | - csv_fp.flush() |
305 | | - |
306 | | - print("\n" + "=" * 60 + "\n= All combinations tested =\n" + "=" * 60 + "\n") |
307 | | - |
308 | | - |
309 | | -if __name__ == "__main__": |
310 | | - os.environ["UC_LOGGER_LEVEL"] = "debug" |
311 | | - main() |
def run(
    storage_backends: str,
    device_id: int,
    repeat: int,
    num_head: int,
    block_len: int,
    transferStreamNumber: int,
    num_tokens: int,
    block_layer: int,
    head_size: int,
    block_elem_size: int,
) -> Tuple[float, float, float, float, float, float]:
    """
    Run a single benchmark configuration for ``repeat`` rounds and average.

    Each round allocates fresh buffers, writes ``num_tokens`` worth of KV
    cache through the store (``embed``) and reads it back (``fetch``),
    accumulating per-round times/bandwidths/sizes.  One store instance is
    shared across all rounds.

    Returns:
        Tuple of (avg_w_size, avg_w_time, avg_w_bw, avg_r_time, avg_r_bw, avg_r_size);
        sizes are averaged in GiB, times and bandwidths as reported by
        ``embed``/``fetch``.

    Raises:
        ValueError: if ``repeat`` is less than 1.
        RuntimeError: if the store fails to create blocks for a round.
    """
    if repeat < 1:
        # Guard early: repeat == 0 would otherwise surface as a confusing
        # ZeroDivisionError at the averaging step below.
        raise ValueError(f"repeat must be >= 1, got {repeat}")

    block_dim = head_size * num_head
    io_size = block_dim * block_len * block_elem_size
    block_size = io_size * block_layer
    # Floor division: num_tokens is expected to be a multiple of block_len.
    batch_size = num_tokens // block_len
    # Small headroom over the strictly required number of blocks.
    real_blocks = batch_size + 10

    w_bw_list, r_bw_list = [], []
    w_time_list, r_time_list = [], []
    w_size_sum, r_size_sum = 0.0, 0.0

    store = setup(
        storage_backends, block_size, device_id, io_size, transferStreamNumber
    )
    for r in range(repeat):
        print(f"\n--- Round {r+1} ---")

        hashes, kvcaches = make_buffers(
            real_blocks,
            device_id,
            batch_size,
            head_size,
            block_len,
            block_layer,
            num_head,
        )

        # create() returns one status per block; non-zero means failure.
        # Raise explicitly instead of assert, which is stripped under -O.
        results = store.create(hashes[:batch_size])
        if sum(results) != 0:
            raise RuntimeError("Create operation failed")

        w_size, w_time, w_bw = embed(
            store,
            hashes[:batch_size],
            kvcaches,
            num_tokens,
            block_len,
            block_layer,
            block_dim,
        )
        store.commit(hashes[:batch_size], True)

        store_all_hashes(hashes[:batch_size])

        r_size, r_time, r_bw = fetch(
            store,
            hashes[:batch_size],
            kvcaches,
            num_tokens,
            block_len,
            block_layer,
            block_dim,
        )

        w_bw_list.append(w_bw)
        r_bw_list.append(r_bw)
        w_time_list.append(w_time)
        r_time_list.append(r_time)
        w_size_sum += w_size
        r_size_sum += r_size

        # Drop this round's GPU buffers before the next allocation so peak
        # device memory stays flat across rounds.
        del kvcaches, hashes
        torch.cuda.empty_cache()

    # Release the store before computing the summary.
    del store
    avg_w_bw = sum(w_bw_list) / repeat
    avg_r_bw = sum(r_bw_list) / repeat
    avg_w_time = sum(w_time_list) / repeat
    avg_r_time = sum(r_time_list) / repeat
    avg_w_size = w_size_sum / (1024**3) / repeat
    avg_r_size = r_size_sum / (1024**3) / repeat

    return avg_w_size, avg_w_time, avg_w_bw, avg_r_time, avg_r_bw, avg_r_size
0 commit comments