6161from vllm .v1 .request import Request , RequestStatus
6262from vllm .v1 .serial_utils import MsgpackDecoder , MsgpackEncoder
6363from vllm .v1 .structured_output import StructuredOutputManager
64+ from vllm .v1 .utils import record_function_or_nullcontext
6465from vllm .version import __version__ as VLLM_VERSION
6566
6667logger = init_logger (__name__ )
@@ -315,17 +316,21 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
315316 # or finished and not yet removed from the batch.
316317 if not self .scheduler .has_requests ():
317318 return {}, False
318- scheduler_output = self .scheduler .schedule ()
319- future = self .model_executor .execute_model (scheduler_output , non_block = True )
320- grammar_output = self .scheduler .get_grammar_bitmask (scheduler_output )
321- with self .log_error_detail (scheduler_output ):
322- model_output = future .result ()
323- if model_output is None :
324- model_output = self .model_executor .sample_tokens (grammar_output )
325-
326- engine_core_outputs = self .scheduler .update_from_output (
327- scheduler_output , model_output
328- )
319+ with record_function_or_nullcontext ("core step: schedule" ):
320+ scheduler_output = self .scheduler .schedule ()
321+
322+ with record_function_or_nullcontext ("core step: execute_model" ):
323+ future = self .model_executor .execute_model (scheduler_output , non_block = True )
324+ grammar_output = self .scheduler .get_grammar_bitmask (scheduler_output )
325+ with self .log_error_detail (scheduler_output ):
326+ model_output = future .result ()
327+ if model_output is None :
328+ model_output = self .model_executor .sample_tokens (grammar_output )
329+
330+ with record_function_or_nullcontext ("core step: update_from_output" ):
331+ engine_core_outputs = self .scheduler .update_from_output (
332+ scheduler_output , model_output
333+ )
329334
330335 return engine_core_outputs , scheduler_output .total_num_scheduled_tokens > 0
331336
@@ -363,32 +368,37 @@ def step_with_batch_queue(
363368 model_executed = False
364369 deferred_scheduler_output = None
365370 if self .scheduler .has_requests ():
366- scheduler_output = self .scheduler .schedule ()
367- exec_future = self .model_executor .execute_model (
368- scheduler_output , non_block = True
369- )
371+ with record_function_or_nullcontext ("core step_with_batch_queue: schedule" ):
372+ scheduler_output = self .scheduler .schedule ()
373+ with record_function_or_nullcontext ("core step_with_batch_queue: execute_model" ):
374+ exec_future = self .model_executor .execute_model (
375+ scheduler_output , non_block = True
376+ )
370377 model_executed = scheduler_output .total_num_scheduled_tokens > 0
371378
372379 if scheduler_output .pending_structured_output_tokens :
373- # We need to defer sampling until we have processed the model output
374- # from the prior step.
375- deferred_scheduler_output = scheduler_output
376- # Block-wait for execute to return (continues running async on the GPU).
377- with self .log_error_detail (scheduler_output ):
378- exec_result = exec_future .result ()
379- assert exec_result is None
380+ with record_function_or_nullcontext ("core step_with_batch_queue: pending_structured_output_tokens" ):
381+ # We need to defer sampling until we have processed the model output
382+ # from the prior step.
383+ deferred_scheduler_output = scheduler_output
384+ # Block-wait for execute to return (continues running async on the GPU).
385+ with self .log_error_detail (scheduler_output ):
386+ exec_result = exec_future .result ()
387+ assert exec_result is None
380388 else :
381- # We aren't waiting for any tokens, get any grammar output immediately.
382- grammar_output = self .scheduler .get_grammar_bitmask (scheduler_output )
389+ with record_function_or_nullcontext ("core step_with_batch_queue: get_grammar_bitmask" ):
390+ # We aren't waiting for any tokens, get any grammar output immediately.
391+ grammar_output = self .scheduler .get_grammar_bitmask (scheduler_output )
383392 # Block-wait for execute to return (continues running async on the GPU).
384393 with self .log_error_detail (scheduler_output ):
385394 exec_result = exec_future .result ()
386395
387396 if exec_result is None :
388- # Call sample tokens.
389- future = self .model_executor .sample_tokens (
390- grammar_output , non_block = True
391- )
397+ with record_function_or_nullcontext ("core step_with_batch_queue: sample_tokens" ):
398+ # Call sample tokens.
399+ future = self .model_executor .sample_tokens (
400+ grammar_output , non_block = True
401+ )
392402 else :
393403 # No sampling required (e.g. all requests finished).
394404 future = cast (Future [ModelRunnerOutput ], exec_future )
@@ -408,27 +418,28 @@ def step_with_batch_queue(
408418 # only be called when the scheduler contains requests or the queue
409419 # is non-empty.
410420 return None , False
411-
412- # Block until the next result is available.
413- future , scheduler_output = batch_queue .pop ()
414- with self .log_error_detail (scheduler_output ):
415- model_output = future .result ()
416-
417- engine_core_outputs = self .scheduler .update_from_output (
418- scheduler_output , model_output
419- )
421+ with record_function_or_nullcontext ( "core step_with_batch_queue: model_output" ):
422+ # Block until the next result is available.
423+ future , scheduler_output = batch_queue .pop ()
424+ with self .log_error_detail (scheduler_output ):
425+ model_output = future .result ()
426+ with record_function_or_nullcontext ( "core step_with_batch_queue: update_from_output" ):
427+ engine_core_outputs = self .scheduler .update_from_output (
428+ scheduler_output , model_output
429+ )
420430
421431 # NOTE(nick): We can either handle the deferred tasks here or save
422432 # in a field and do it immediately once step_with_batch_queue is
423433 # re-called. The latter slightly favors TTFT over TPOT/throughput.
424434 if deferred_scheduler_output :
425- # We now have the tokens needed to compute the bitmask for the
426- # deferred request. Get the bitmask and call sample tokens.
427- grammar_output = self .scheduler .get_grammar_bitmask (
428- deferred_scheduler_output
429- )
430- future = self .model_executor .sample_tokens (grammar_output , non_block = True )
431- batch_queue .appendleft ((future , deferred_scheduler_output ))
435+ with record_function_or_nullcontext ("core step_with_batch_queue: deferred_scheduler_output" ):
436+ # We now have the tokens needed to compute the bitmask for the
437+ # deferred request. Get the bitmask and call sample tokens.
438+ grammar_output = self .scheduler .get_grammar_bitmask (
439+ deferred_scheduler_output
440+ )
441+ future = self .model_executor .sample_tokens (grammar_output , non_block = True )
442+ batch_queue .appendleft ((future , deferred_scheduler_output ))
432443
433444 return engine_core_outputs , model_executed
434445
0 commit comments