From dfd44eedb4a1a42f2df84cfe2e6fd9d44699d7c4 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Thu, 16 Oct 2025 10:56:26 +0800 Subject: [PATCH 1/3] clean create_py_executor API Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../_torch/pyexecutor/py_executor_creator.py | 8 +- tensorrt_llm/executor/base_worker.py | 14 +--- tensorrt_llm/executor/executor.py | 82 ++++++------------- tensorrt_llm/executor/proxy.py | 5 +- tensorrt_llm/executor/ray_executor.py | 7 +- tensorrt_llm/executor/ray_gpu_worker.py | 7 +- tensorrt_llm/executor/rpc_proxy.py | 3 - tensorrt_llm/executor/rpc_worker.py | 11 +-- tensorrt_llm/executor/worker.py | 11 +-- tensorrt_llm/llmapi/llm.py | 6 +- 10 files changed, 39 insertions(+), 115 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index ff580751976..c698fdd947d 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -17,13 +17,11 @@ ContextChunkingPolicy, GuidedDecodingConfig) from tensorrt_llm.bindings.internal.batch_manager import ContextChunkingConfig -from tensorrt_llm.llmapi.llm_args import (KvCacheConnectorConfig, LoadFormat, - PybindMirror, TorchLlmArgs) +from tensorrt_llm.llmapi.llm_args import LoadFormat, PybindMirror, TorchLlmArgs from tensorrt_llm.llmapi.tokenizer import (TokenizerBase, _llguidance_tokenizer_info, _xgrammar_tokenizer_info) from tensorrt_llm.logger import logger -from tensorrt_llm.lora_helper import LoraConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.quantization import QuantAlgo @@ -205,11 +203,11 @@ def create_py_executor( llm_args: TorchLlmArgs, checkpoint_dir: str = None, tokenizer: Optional[TokenizerBase] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, ) -> PyExecutor: garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold + lora_config = llm_args.lora_config + kv_connector_config = llm_args.kv_connector_config pytorch_backend_config = llm_args.get_pytorch_backend_config() if pytorch_backend_config is None: diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index f2655cafb4e..81f6b749f87 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -16,11 +16,10 @@ nvtx_range_debug) from ..bindings import executor as tllm from ..builder import ConfigEncoder, Engine, EngineConfig -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig, PybindMirror +from ..llmapi.llm_args import BaseLlmArgs, PybindMirror from ..llmapi.tokenizer import TokenizerBase from ..llmapi.tracer import global_tracer from ..llmapi.utils import _SyncQueue, print_colored_debug -from ..lora_helper import LoraConfig from ..lora_manager import LoraManager from ..metrics import RequestEventTiming from ..prompt_adapter_manager import PromptAdapterManager @@ -54,8 +53,6 @@ def __init__( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -73,8 +70,6 @@ def __init__( self._batched_logits_processor = batched_logits_processor 
self._postproc_worker_config = postproc_worker_config self._is_llm_executor = is_llm_executor - self._lora_config = lora_config - self._kv_connector_config = kv_connector_config self._hf_model_dir = hf_model_dir self._tokenizer = tokenizer self.llm_args = llm_args @@ -92,10 +87,7 @@ def __init__( self._is_pytorch_backend = llm_args is not None and llm_args.backend in [ "pytorch", "_autodeploy" ] - - if not self._is_pytorch_backend and kv_connector_config is not None: - raise ValueError( - "KV connector config is only supported for PyTorch backend") + self._lora_config = llm_args.lora_config if self._is_pytorch_backend else None if global_mpi_size() > 1: logger.set_rank(self.global_rank) @@ -130,8 +122,6 @@ def _create_py_executor(): args["llm_args"] = self.llm_args args["checkpoint_dir"] = self._hf_model_dir args["tokenizer"] = self._tokenizer - args["lora_config"] = self._lora_config - args["kv_connector_config"] = self._kv_connector_config elif self.llm_args.backend == "_autodeploy": from tensorrt_llm._torch.auto_deploy.llm_args import \ LlmArgs as ADLlmArgs diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index 188f71fe078..6c697625e95 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -15,13 +15,12 @@ from tensorrt_llm.inputs.multimodal import MultimodalParams from tensorrt_llm.logger import logger, set_level -from tensorrt_llm.lora_helper import LoraConfig from .._utils import mpi_world_size from ..bindings import executor as tllm from ..builder import Engine from ..disaggregated_params import DisaggregatedParams -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig, TorchLlmArgs +from ..llmapi.llm_args import BaseLlmArgs, TorchLlmArgs from ..llmapi.llm_utils import KvCacheRetentionConfig from ..llmapi.mpi_session import (MpiSession, external_mpi_comm_available, need_spawn_mpi_workers) @@ -359,30 +358,22 @@ def aget_kv_events(self, timeout=None) -> IterationResult: return self._iter_kv_events_result @staticmethod - def _create_ray_executor( - worker_kwargs: Dict, - model_world_size: int, - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - tp_size: int, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + def _create_ray_executor(worker_kwargs: Dict, model_world_size: int, + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, tp_size: int): from .ray_executor import RayExecutor return RayExecutor(worker_kwargs, model_world_size=model_world_size, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - tp_size=tp_size, - kv_connector_config=kv_connector_config) + tp_size=tp_size) @staticmethod - def _create_rpc_executor( - worker_kwargs: Dict, - model_world_size: int, - mpi_session: Optional[MpiSession], - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + def _create_rpc_executor(worker_kwargs: Dict, model_world_size: int, + mpi_session: Optional[MpiSession], + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool): """Create RPC-based executor (GenerationExecutorRpcProxy).""" from .rpc_proxy import GenerationExecutorRpcProxy return GenerationExecutorRpcProxy( @@ -390,18 +381,15 @@ def _create_rpc_executor( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + 
is_llm_executor=is_llm_executor) @staticmethod - def _create_ipc_executor( - worker_kwargs: Dict, - model_world_size: int, - mpi_session: Optional[MpiSession], - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - use_worker: bool = False, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + def _create_ipc_executor(worker_kwargs: Dict, + model_world_size: int, + mpi_session: Optional[MpiSession], + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + use_worker: bool = False): """Create IPC-based executor (GenerationExecutorProxy or GenerationExecutorWorker). Args: @@ -410,10 +398,8 @@ def _create_ipc_executor( """ if use_worker: from .worker import GenerationExecutorWorker - return GenerationExecutorWorker( - **worker_kwargs, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + return GenerationExecutorWorker(**worker_kwargs, + is_llm_executor=is_llm_executor) else: from .proxy import GenerationExecutorProxy return GenerationExecutorProxy( @@ -421,8 +407,7 @@ def _create_ipc_executor( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) @staticmethod def create( @@ -436,8 +421,6 @@ def create( return_logits: bool = False, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -469,9 +452,6 @@ def create( "llm_args": llm_args, } - if lora_config: - worker_kwargs["lora_config"] = lora_config - orchestrator_type = None if not isinstance( llm_args, TorchLlmArgs) else llm_args.orchestrator_type if orchestrator_type == "ray": @@ -480,8 +460,7 @@ def create( model_world_size, postproc_worker_config, is_llm_executor=is_llm_executor, - tp_size=args.get("tp_size", 1), - kv_connector_config=kv_connector_config) + tp_size=args.get("tp_size", 1)) elif orchestrator_type is not None and orchestrator_type != "rpc": raise ValueError( f"Unsupported orchestrator_type: {orchestrator_type}") @@ -502,8 +481,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) return GenerationExecutor._create_ipc_executor( worker_kwargs, @@ -511,8 +489,7 @@ def create( mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=False, - kv_connector_config=kv_connector_config) + use_worker=False) # WAR: For the performance of gathering logits, we use single process worker # for TP1 to avoid the large overhead of IPC. 
@@ -528,8 +505,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) return GenerationExecutor._create_ipc_executor( worker_kwargs, @@ -537,8 +513,7 @@ def create( mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=True, - kv_connector_config=kv_connector_config) + use_worker=True) # For single-gpu case: # Partition the workload to multiple process for streaming performance. @@ -551,8 +526,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) return GenerationExecutor._create_ipc_executor( worker_kwargs, @@ -560,8 +534,7 @@ def create( mpi_session=None, # use mpi4py postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=False, - kv_connector_config=kv_connector_config) + use_worker=False) else: ctx = multiprocessing.get_context("spawn") # The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot. @@ -574,8 +547,7 @@ def create( mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=False, - kv_connector_config=kv_connector_config) + use_worker=False) def wait_first_completed( self, futures: List[GenerationResult] diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 785bfd51580..4204d9f3e92 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -12,7 +12,6 @@ from tensorrt_llm.logger import logger from .._utils import customized_gc_thresholds, mpi_rank, nvtx_range_debug -from ..llmapi.llm_args import KvCacheConnectorConfig from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession, RemoteMpiCommSessionClient) from ..llmapi.tracer import enable_llm_tracer, get_tracer, global_tracer @@ -46,7 +45,6 @@ def __init__( worker_cls: type = GenerationExecutorWorker, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, ) -> None: postproc_worker_config = postproc_worker_config or PostprocWorkerConfig( ) @@ -95,8 +93,7 @@ def __init__( worker_kwargs = dict(**worker_kwargs, worker_queues=self._setup_queues(), postproc_worker_config=postproc_worker_config, - is_llm_executor=False, - kv_connector_config=kv_connector_config) + is_llm_executor=False) if "log_level" not in worker_kwargs: worker_kwargs["log_level"] = logger.level diff --git a/tensorrt_llm/executor/ray_executor.py b/tensorrt_llm/executor/ray_executor.py index ee232451e68..5d87fdc9bfc 100644 --- a/tensorrt_llm/executor/ray_executor.py +++ b/tensorrt_llm/executor/ray_executor.py @@ -17,7 +17,6 @@ from tensorrt_llm.logger import logger from .._utils import nvtx_range_debug -from ..llmapi.llm_args import KvCacheConnectorConfig from .executor import GenerationExecutor from .postproc_worker import PostprocWorkerConfig from .ray_gpu_worker import RayGPUWorker, RayWorkerWrapper @@ -36,8 +35,7 @@ def __init__(self, model_world_size: int, postproc_worker_config: PostprocWorkerConfig, is_llm_executor: bool, - tp_size=1, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + tp_size=1): 
os.environ['RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES'] = '1' os.environ["RAY_DEDUP_LOGS"] = "0" # for debug @@ -97,8 +95,7 @@ def __init__(self, worker_kwargs = dict(**worker_kwargs, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) self.create_workers(RayGPUWorker, worker_kwargs) except Exception as e: diff --git a/tensorrt_llm/executor/ray_gpu_worker.py b/tensorrt_llm/executor/ray_gpu_worker.py index efed67561ac..39725a0f56f 100644 --- a/tensorrt_llm/executor/ray_gpu_worker.py +++ b/tensorrt_llm/executor/ray_gpu_worker.py @@ -8,9 +8,8 @@ from ..bindings import executor as tllm from ..builder import Engine -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig +from ..llmapi.llm_args import BaseLlmArgs from ..llmapi.tokenizer import TokenizerBase -from ..lora_helper import LoraConfig from ..sampling_params import BatchedLogitsProcessor from .base_worker import BaseWorker from .postproc_worker import PostprocWorkerConfig @@ -116,8 +115,6 @@ def __init__( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -131,8 +128,6 @@ def __init__( batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, llm_args=llm_args, diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py index 8e1375a1811..01e113f58cc 100644 --- a/tensorrt_llm/executor/rpc_proxy.py +++ b/tensorrt_llm/executor/rpc_proxy.py @@ -5,7 +5,6 @@ import threading from typing import Optional -from ..llmapi.llm_args import KvCacheConnectorConfig from ..llmapi.mpi_session import MpiPoolSession, MpiSession from ..llmapi.tracer import global_tracer from ..llmapi.utils import (AsyncQueue, _SyncQueue, logger_debug, @@ -33,7 +32,6 @@ def __init__( *, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, ): """ Args: @@ -42,7 +40,6 @@ def __init__( mpi_session: the mpi session to use postproc_worker_config: the postproc worker config is_llm_executor: whether this is an llm executor - kv_connector_config: the kv cache connector config """ GenerationExecutorRpcProxy.INSTANCE_COUNTER += 1 self.rpc_addr = self.gen_uniq_rpc_addr() diff --git a/tensorrt_llm/executor/rpc_worker.py b/tensorrt_llm/executor/rpc_worker.py index a9ef9f435d3..b08516ceecc 100644 --- a/tensorrt_llm/executor/rpc_worker.py +++ b/tensorrt_llm/executor/rpc_worker.py @@ -10,10 +10,9 @@ from .._utils import mpi_rank from ..bindings import executor as tllm from ..builder import Engine -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig +from ..llmapi.llm_args import BaseLlmArgs from ..llmapi.tokenizer import TokenizerBase from ..logger import set_level -from ..lora_manager import LoraConfig from ..sampling_params import BatchedLogitsProcessor from .base_worker import BaseWorker from .postproc_worker import PostprocWorkerConfig @@ -42,10 +41,8 @@ def __init__( engine: 
Union[Path, Engine], executor_config: Optional[tllm.ExecutorConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -54,11 +51,9 @@ def __init__( engine=engine, executor_config=executor_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, llm_args=llm_args, batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, ) @@ -198,9 +193,7 @@ def main_task( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, llm_args: Optional[BaseLlmArgs] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, **kwargs, @@ -213,11 +206,9 @@ def main_task( engine=engine, executor_config=executor_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, llm_args=llm_args, batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, ) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 59e3fca19fc..ba8c3f31432 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -14,14 +14,13 @@ from .._utils import mpi_comm, mpi_rank from ..bindings import executor as tllm from ..builder import Engine -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig +from ..llmapi.llm_args import BaseLlmArgs from ..llmapi.mpi_session import set_mpi_session_cpp from ..llmapi.tokenizer import TokenizerBase from ..llmapi.tracer import VizTracer, set_global_tracer from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue, clear_sched_affinity, print_colored_debug, print_traceback_on_error) -from ..lora_helper import LoraConfig from ..sampling_params import BatchedLogitsProcessor from .base_worker import BaseWorker from .executor import IterationResultQueue @@ -47,8 +46,6 @@ def __init__( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -59,8 +56,6 @@ def __init__( batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, llm_args=llm_args, @@ -243,8 +238,6 @@ def worker_main( ready_signal: Optional[str] = None, is_llm_executor: Optional[ bool] = True, # whether it's the main executor instance - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, 
tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -374,8 +367,6 @@ def notify_proxy_threads_to_quit(): batched_logits_processor, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, llm_args=llm_args) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 29ff4a21aea..c9a7aed32b3 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -954,8 +954,7 @@ def _build_model(self): num_postprocess_workers=self.args.num_postprocess_workers, postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, ), - is_llm_executor=True, - lora_config=lora_config) + is_llm_executor=True) @append_docstring(TORCH_LLM_DOCSTRING) @@ -1057,9 +1056,6 @@ def _build_model(self): postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, ), is_llm_executor=True, - lora_config=self.args.lora_config, - # Autodeploy does not support kv_connector_config - kv_connector_config=getattr(self.args, "kv_connector_config", None), hf_model_dir=self._hf_model_dir, tokenizer=self.tokenizer, llm_args=self.args) From 7f34a89cc39992be31e67dd3372748193c87f16f Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:47:15 +0800 Subject: [PATCH 2/3] format Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/executor.py | 35 +++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index 6c697625e95..aef376838f4 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -358,9 +358,13 @@ def aget_kv_events(self, timeout=None) -> IterationResult: return self._iter_kv_events_result @staticmethod - def _create_ray_executor(worker_kwargs: Dict, model_world_size: int, - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, tp_size: int): + def _create_ray_executor( + worker_kwargs: Dict, + model_world_size: int, + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + tp_size: int, + ): from .ray_executor import RayExecutor return RayExecutor(worker_kwargs, @@ -370,10 +374,13 @@ def _create_ray_executor(worker_kwargs: Dict, model_world_size: int, tp_size=tp_size) @staticmethod - def _create_rpc_executor(worker_kwargs: Dict, model_world_size: int, - mpi_session: Optional[MpiSession], - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool): + def _create_rpc_executor( + worker_kwargs: Dict, + model_world_size: int, + mpi_session: Optional[MpiSession], + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + ): """Create RPC-based executor (GenerationExecutorRpcProxy).""" from .rpc_proxy import GenerationExecutorRpcProxy return GenerationExecutorRpcProxy( @@ -384,12 +391,14 @@ def _create_rpc_executor(worker_kwargs: Dict, model_world_size: int, is_llm_executor=is_llm_executor) @staticmethod - def _create_ipc_executor(worker_kwargs: Dict, - model_world_size: int, - mpi_session: Optional[MpiSession], - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - use_worker: bool = False): + def _create_ipc_executor( + worker_kwargs: Dict, + model_world_size: int, + mpi_session: Optional[MpiSession], + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + use_worker: bool = False, + ): """Create IPC-based 
executor (GenerationExecutorProxy or GenerationExecutorWorker). Args: From f13ca58cd54cbbbad32f17e09120345cb9c2e54b Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Fri, 17 Oct 2025 08:53:43 +0800 Subject: [PATCH 3/3] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- 3rdparty/cutlass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/cutlass b/3rdparty/cutlass index 57e3cfb47a2..f3fde58372d 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit 57e3cfb47a2d9e0d46eb6335c3dc411498efa198 +Subproject commit f3fde58372d33e9a5650ba7b80fc48b3b49d40c8
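
For downstream code, the net effect of this series is that LoRA and KV-cache-connector settings now travel inside the llm_args object (llm_args.lora_config, llm_args.kv_connector_config) rather than being threaded through every executor and worker constructor as separate keyword arguments. Below is a minimal sketch of the resulting call pattern, assuming TorchLlmArgs can be constructed directly with a model path; the paths and constructor fields shown are illustrative assumptions, not taken from this series:

    from tensorrt_llm._torch.pyexecutor.py_executor_creator import \
        create_py_executor
    from tensorrt_llm.llmapi.llm_args import TorchLlmArgs

    # LoRA and KV-connector settings ride along inside the args object;
    # create_py_executor now reads llm_args.lora_config and
    # llm_args.kv_connector_config itself, so callers no longer pass
    # them as separate keyword arguments.
    llm_args = TorchLlmArgs(
        model="/path/to/checkpoint",  # hypothetical local checkpoint dir
        # lora_config=...,            # optional: set here, not as a kwarg
        # kv_connector_config=...,    # optional: PyTorch backend only
    )

    executor = create_py_executor(
        llm_args=llm_args,
        checkpoint_dir="/path/to/checkpoint",  # hypothetical
        tokenizer=None,                        # Optional[TokenizerBase]
    )

Two behavioral details are visible in the diff itself: BaseWorker now only honors lora_config for the PyTorch-family backends (self._lora_config = llm_args.lora_config if self._is_pytorch_backend else None), and the previous explicit ValueError raised when a KV-connector config was supplied on a non-PyTorch backend disappears together with the removed parameter.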