diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index c8aafeff429..ce0386df8c4 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -17,13 +17,11 @@ ContextChunkingPolicy, GuidedDecodingConfig) from tensorrt_llm.bindings.internal.batch_manager import ContextChunkingConfig -from tensorrt_llm.llmapi.llm_args import (KvCacheConnectorConfig, LoadFormat, - PybindMirror, TorchLlmArgs) +from tensorrt_llm.llmapi.llm_args import LoadFormat, PybindMirror, TorchLlmArgs from tensorrt_llm.llmapi.tokenizer import (TokenizerBase, _llguidance_tokenizer_info, _xgrammar_tokenizer_info) from tensorrt_llm.logger import logger -from tensorrt_llm.lora_helper import LoraConfig from tensorrt_llm.mapping import Mapping from tensorrt_llm.quantization import QuantAlgo @@ -205,12 +203,12 @@ def create_py_executor( llm_args: TorchLlmArgs, checkpoint_dir: str = None, tokenizer: Optional[TokenizerBase] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, profiling_stage_data: Optional[dict] = None, ) -> PyExecutor: garbage_collection_gen0_threshold = llm_args.garbage_collection_gen0_threshold + lora_config = llm_args.lora_config + kv_connector_config = llm_args.kv_connector_config pytorch_backend_config = llm_args.get_pytorch_backend_config() if pytorch_backend_config is None: diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py index f2655cafb4e..81f6b749f87 100644 --- a/tensorrt_llm/executor/base_worker.py +++ b/tensorrt_llm/executor/base_worker.py @@ -16,11 +16,10 @@ nvtx_range_debug) from ..bindings import executor as tllm from ..builder import ConfigEncoder, Engine, EngineConfig -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig, PybindMirror +from ..llmapi.llm_args import BaseLlmArgs, PybindMirror from ..llmapi.tokenizer import TokenizerBase from ..llmapi.tracer import global_tracer from ..llmapi.utils import _SyncQueue, print_colored_debug -from ..lora_helper import LoraConfig from ..lora_manager import LoraManager from ..metrics import RequestEventTiming from ..prompt_adapter_manager import PromptAdapterManager @@ -54,8 +53,6 @@ def __init__( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -73,8 +70,6 @@ def __init__( self._batched_logits_processor = batched_logits_processor self._postproc_worker_config = postproc_worker_config self._is_llm_executor = is_llm_executor - self._lora_config = lora_config - self._kv_connector_config = kv_connector_config self._hf_model_dir = hf_model_dir self._tokenizer = tokenizer self.llm_args = llm_args @@ -92,10 +87,7 @@ def __init__( self._is_pytorch_backend = llm_args is not None and llm_args.backend in [ "pytorch", "_autodeploy" ] - - if not self._is_pytorch_backend and kv_connector_config is not None: - raise ValueError( - "KV connector config is only supported for PyTorch backend") + self._lora_config = llm_args.lora_config if self._is_pytorch_backend else None if global_mpi_size() > 1: logger.set_rank(self.global_rank) @@ -130,8 +122,6 @@ def _create_py_executor(): args["llm_args"] = self.llm_args args["checkpoint_dir"] = self._hf_model_dir args["tokenizer"] = self._tokenizer - args["lora_config"] = self._lora_config - args["kv_connector_config"] = self._kv_connector_config elif self.llm_args.backend == "_autodeploy": from tensorrt_llm._torch.auto_deploy.llm_args import \ LlmArgs as ADLlmArgs diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py index 188f71fe078..aef376838f4 100644 --- a/tensorrt_llm/executor/executor.py +++ b/tensorrt_llm/executor/executor.py @@ -15,13 +15,12 @@ from tensorrt_llm.inputs.multimodal import MultimodalParams from tensorrt_llm.logger import logger, set_level -from tensorrt_llm.lora_helper import LoraConfig from .._utils import mpi_world_size from ..bindings import executor as tllm from ..builder import Engine from ..disaggregated_params import DisaggregatedParams -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig, TorchLlmArgs +from ..llmapi.llm_args import BaseLlmArgs, TorchLlmArgs from ..llmapi.llm_utils import KvCacheRetentionConfig from ..llmapi.mpi_session import (MpiSession, external_mpi_comm_available, need_spawn_mpi_workers) @@ -360,29 +359,28 @@ def aget_kv_events(self, timeout=None) -> IterationResult: @staticmethod def _create_ray_executor( - worker_kwargs: Dict, - model_world_size: int, - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - tp_size: int, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + worker_kwargs: Dict, + model_world_size: int, + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + tp_size: int, + ): from .ray_executor import RayExecutor return RayExecutor(worker_kwargs, model_world_size=model_world_size, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - tp_size=tp_size, - kv_connector_config=kv_connector_config) + tp_size=tp_size) @staticmethod def _create_rpc_executor( - worker_kwargs: Dict, - model_world_size: int, - mpi_session: Optional[MpiSession], - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + worker_kwargs: Dict, + model_world_size: int, + mpi_session: Optional[MpiSession], + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + ): """Create RPC-based executor (GenerationExecutorRpcProxy).""" from .rpc_proxy import GenerationExecutorRpcProxy return GenerationExecutorRpcProxy( @@ -390,18 +388,17 @@ def _create_rpc_executor( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) @staticmethod def _create_ipc_executor( - worker_kwargs: Dict, - model_world_size: int, - mpi_session: Optional[MpiSession], - postproc_worker_config: PostprocWorkerConfig, - is_llm_executor: bool, - use_worker: bool = False, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + worker_kwargs: Dict, + model_world_size: int, + mpi_session: Optional[MpiSession], + postproc_worker_config: PostprocWorkerConfig, + is_llm_executor: bool, + use_worker: bool = False, + ): """Create IPC-based executor (GenerationExecutorProxy or GenerationExecutorWorker). Args: @@ -410,10 +407,8 @@ def _create_ipc_executor( """ if use_worker: from .worker import GenerationExecutorWorker - return GenerationExecutorWorker( - **worker_kwargs, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + return GenerationExecutorWorker(**worker_kwargs, + is_llm_executor=is_llm_executor) else: from .proxy import GenerationExecutorProxy return GenerationExecutorProxy( @@ -421,8 +416,7 @@ def _create_ipc_executor( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) @staticmethod def create( @@ -436,8 +430,6 @@ def create( return_logits: bool = False, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -469,9 +461,6 @@ def create( "llm_args": llm_args, } - if lora_config: - worker_kwargs["lora_config"] = lora_config - orchestrator_type = None if not isinstance( llm_args, TorchLlmArgs) else llm_args.orchestrator_type if orchestrator_type == "ray": @@ -480,8 +469,7 @@ def create( model_world_size, postproc_worker_config, is_llm_executor=is_llm_executor, - tp_size=args.get("tp_size", 1), - kv_connector_config=kv_connector_config) + tp_size=args.get("tp_size", 1)) elif orchestrator_type is not None and orchestrator_type != "rpc": raise ValueError( f"Unsupported orchestrator_type: {orchestrator_type}") @@ -502,8 +490,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) return GenerationExecutor._create_ipc_executor( worker_kwargs, @@ -511,8 +498,7 @@ def create( mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=False, - kv_connector_config=kv_connector_config) + use_worker=False) # WAR: For the performance of gathering logits, we use single process worker # for TP1 to avoid the large overhead of IPC. @@ -528,8 +514,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) return GenerationExecutor._create_ipc_executor( worker_kwargs, @@ -537,8 +522,7 @@ def create( mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=True, - kv_connector_config=kv_connector_config) + use_worker=True) # For single-gpu case: # Partition the workload to multiple process for streaming performance. @@ -551,8 +535,7 @@ def create( model_world_size=model_world_size, mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) return GenerationExecutor._create_ipc_executor( worker_kwargs, @@ -560,8 +543,7 @@ def create( mpi_session=None, # use mpi4py postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=False, - kv_connector_config=kv_connector_config) + use_worker=False) else: ctx = multiprocessing.get_context("spawn") # The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot. @@ -574,8 +556,7 @@ def create( mpi_session=mpi_session, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - use_worker=False, - kv_connector_config=kv_connector_config) + use_worker=False) def wait_first_completed( self, futures: List[GenerationResult] diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 785bfd51580..4204d9f3e92 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -12,7 +12,6 @@ from tensorrt_llm.logger import logger from .._utils import customized_gc_thresholds, mpi_rank, nvtx_range_debug -from ..llmapi.llm_args import KvCacheConnectorConfig from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession, RemoteMpiCommSessionClient) from ..llmapi.tracer import enable_llm_tracer, get_tracer, global_tracer @@ -46,7 +45,6 @@ def __init__( worker_cls: type = GenerationExecutorWorker, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, ) -> None: postproc_worker_config = postproc_worker_config or PostprocWorkerConfig( ) @@ -95,8 +93,7 @@ def __init__( worker_kwargs = dict(**worker_kwargs, worker_queues=self._setup_queues(), postproc_worker_config=postproc_worker_config, - is_llm_executor=False, - kv_connector_config=kv_connector_config) + is_llm_executor=False) if "log_level" not in worker_kwargs: worker_kwargs["log_level"] = logger.level diff --git a/tensorrt_llm/executor/ray_executor.py b/tensorrt_llm/executor/ray_executor.py index ee232451e68..5d87fdc9bfc 100644 --- a/tensorrt_llm/executor/ray_executor.py +++ b/tensorrt_llm/executor/ray_executor.py @@ -17,7 +17,6 @@ from tensorrt_llm.logger import logger from .._utils import nvtx_range_debug -from ..llmapi.llm_args import KvCacheConnectorConfig from .executor import GenerationExecutor from .postproc_worker import PostprocWorkerConfig from .ray_gpu_worker import RayGPUWorker, RayWorkerWrapper @@ -36,8 +35,7 @@ def __init__(self, model_world_size: int, postproc_worker_config: PostprocWorkerConfig, is_llm_executor: bool, - tp_size=1, - kv_connector_config: Optional[KvCacheConnectorConfig] = None): + tp_size=1): os.environ['RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES'] = '1' os.environ["RAY_DEDUP_LOGS"] = "0" # for debug @@ -97,8 +95,7 @@ def __init__(self, worker_kwargs = dict(**worker_kwargs, postproc_worker_config=postproc_worker_config, - is_llm_executor=is_llm_executor, - kv_connector_config=kv_connector_config) + is_llm_executor=is_llm_executor) self.create_workers(RayGPUWorker, worker_kwargs) except Exception as e: diff --git a/tensorrt_llm/executor/ray_gpu_worker.py b/tensorrt_llm/executor/ray_gpu_worker.py index efed67561ac..39725a0f56f 100644 --- a/tensorrt_llm/executor/ray_gpu_worker.py +++ b/tensorrt_llm/executor/ray_gpu_worker.py @@ -8,9 +8,8 @@ from ..bindings import executor as tllm from ..builder import Engine -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig +from ..llmapi.llm_args import BaseLlmArgs from ..llmapi.tokenizer import TokenizerBase -from ..lora_helper import LoraConfig from ..sampling_params import BatchedLogitsProcessor from .base_worker import BaseWorker from .postproc_worker import PostprocWorkerConfig @@ -116,8 +115,6 @@ def __init__( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -131,8 +128,6 @@ def __init__( batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, llm_args=llm_args, diff --git a/tensorrt_llm/executor/rpc_proxy.py b/tensorrt_llm/executor/rpc_proxy.py index 8e1375a1811..01e113f58cc 100644 --- a/tensorrt_llm/executor/rpc_proxy.py +++ b/tensorrt_llm/executor/rpc_proxy.py @@ -5,7 +5,6 @@ import threading from typing import Optional -from ..llmapi.llm_args import KvCacheConnectorConfig from ..llmapi.mpi_session import MpiPoolSession, MpiSession from ..llmapi.tracer import global_tracer from ..llmapi.utils import (AsyncQueue, _SyncQueue, logger_debug, @@ -33,7 +32,6 @@ def __init__( *, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, ): """ Args: @@ -42,7 +40,6 @@ def __init__( mpi_session: the mpi session to use postproc_worker_config: the postproc worker config is_llm_executor: whether this is an llm executor - kv_connector_config: the kv cache connector config """ GenerationExecutorRpcProxy.INSTANCE_COUNTER += 1 self.rpc_addr = self.gen_uniq_rpc_addr() diff --git a/tensorrt_llm/executor/rpc_worker.py b/tensorrt_llm/executor/rpc_worker.py index a9ef9f435d3..b08516ceecc 100644 --- a/tensorrt_llm/executor/rpc_worker.py +++ b/tensorrt_llm/executor/rpc_worker.py @@ -10,10 +10,9 @@ from .._utils import mpi_rank from ..bindings import executor as tllm from ..builder import Engine -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig +from ..llmapi.llm_args import BaseLlmArgs from ..llmapi.tokenizer import TokenizerBase from ..logger import set_level -from ..lora_manager import LoraConfig from ..sampling_params import BatchedLogitsProcessor from .base_worker import BaseWorker from .postproc_worker import PostprocWorkerConfig @@ -42,10 +41,8 @@ def __init__( engine: Union[Path, Engine], executor_config: Optional[tllm.ExecutorConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -54,11 +51,9 @@ def __init__( engine=engine, executor_config=executor_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, llm_args=llm_args, batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, ) @@ -198,9 +193,7 @@ def main_task( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, llm_args: Optional[BaseLlmArgs] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, **kwargs, @@ -213,11 +206,9 @@ def main_task( engine=engine, executor_config=executor_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, llm_args=llm_args, batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, ) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 59e3fca19fc..ba8c3f31432 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -14,14 +14,13 @@ from .._utils import mpi_comm, mpi_rank from ..bindings import executor as tllm from ..builder import Engine -from ..llmapi.llm_args import BaseLlmArgs, KvCacheConnectorConfig +from ..llmapi.llm_args import BaseLlmArgs from ..llmapi.mpi_session import set_mpi_session_cpp from ..llmapi.tokenizer import TokenizerBase from ..llmapi.tracer import VizTracer, set_global_tracer from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue, clear_sched_affinity, print_colored_debug, print_traceback_on_error) -from ..lora_helper import LoraConfig from ..sampling_params import BatchedLogitsProcessor from .base_worker import BaseWorker from .executor import IterationResultQueue @@ -47,8 +46,6 @@ def __init__( batched_logits_processor: Optional[BatchedLogitsProcessor] = None, postproc_worker_config: Optional[PostprocWorkerConfig] = None, is_llm_executor: Optional[bool] = None, - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -59,8 +56,6 @@ def __init__( batched_logits_processor=batched_logits_processor, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, llm_args=llm_args, @@ -243,8 +238,6 @@ def worker_main( ready_signal: Optional[str] = None, is_llm_executor: Optional[ bool] = True, # whether it's the main executor instance - lora_config: Optional[LoraConfig] = None, - kv_connector_config: Optional[KvCacheConnectorConfig] = None, hf_model_dir: Optional[Path] = None, tokenizer: Optional[TokenizerBase] = None, llm_args: Optional[BaseLlmArgs] = None, @@ -374,8 +367,6 @@ def notify_proxy_threads_to_quit(): batched_logits_processor, postproc_worker_config=postproc_worker_config, is_llm_executor=is_llm_executor, - lora_config=lora_config, - kv_connector_config=kv_connector_config, hf_model_dir=hf_model_dir, tokenizer=tokenizer, llm_args=llm_args) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 29ff4a21aea..c9a7aed32b3 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -954,8 +954,7 @@ def _build_model(self): num_postprocess_workers=self.args.num_postprocess_workers, postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, ), - is_llm_executor=True, - lora_config=lora_config) + is_llm_executor=True) @append_docstring(TORCH_LLM_DOCSTRING) @@ -1057,9 +1056,6 @@ def _build_model(self): postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir, ), is_llm_executor=True, - lora_config=self.args.lora_config, - # Autodeploy does not support kv_connector_config - kv_connector_config=getattr(self.args, "kv_connector_config", None), hf_model_dir=self._hf_model_dir, tokenizer=self.tokenizer, llm_args=self.args)