(nm) lwilkinson@floppy-fan:~/code/neuralmagic-vllm$ python examples/offline_profile.py --model_name nm-testing/llama2.c-stories110M-pruned2.4 --batch_size 8 Run profile with: model_name = nm-testing/llama2.c-stories110M-pruned2.4 model_revision = None is_sparse = False quant_method = None max_seq_len = 1024 prompt_len = 256 batch_size = 8 num_gpus = 1 allow_cuda_graphs = False INFO 03-14 04:08:08 llm_engine.py:81] Initializing an LLM engine with config: model='nm-testing/llama2.c-stories110M-pruned2.4', tokenizer='nm-testing/llama2.c-stories110M-pruned2.4', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, sparsity=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, seed=0) INFO 03-14 04:08:11 weight_utils.py:177] Using model weights format ['*.bin'] INFO 03-14 04:08:13 llm_engine.py:340] # GPU blocks: 76549, # CPU blocks: 7281 STAGE:2024-03-14 04:08:16 3280968:3280968 ActivityProfilerController.cpp:312] Completed Stage: Warm Up STAGE:2024-03-14 04:08:16 3280968:3280968 ActivityProfilerController.cpp:318] Completed Stage: Collection STAGE:2024-03-14 04:08:16 3280968:3280968 ActivityProfilerController.cpp:322] Completed Stage: Post Processing STAGE:2024-03-14 04:08:17 3280968:3280968 ActivityProfilerController.cpp:312] Completed Stage: Warm Up STAGE:2024-03-14 04:08:17 3280968:3280968 ActivityProfilerController.cpp:318] Completed Stage: Collection STAGE:2024-03-14 04:08:17 3280968:3280968 ActivityProfilerController.cpp:322] Completed Stage: Post Processing ================================================================================ = Prefill Model Table (prompt_len=256, batch_size=8) ================================================================================ name | cpu_time_us | cuda_time_us | trace ============================================================================================================================================================================================= LlamaForCausalLM | 10726.25 | 7689.00 | |- LlamaModel | 10699.17 | 7689.00 | ||- VocabParallelEmbedding(weight=float16[32000, 768]) | 163.46 | 19.00 | |||- void at::native::(anonymous namespace)::indexSelectL... | 0.00 | 19.00 | index_select(float16[32000, 768], 0, int64[2048]) <- embedding(float16[32000, 768], int... ||- LlamaDecoderLayer | 4284.74 | 504.00 | |||- RMSNorm(weight=float16[768]) | 77.12 | 18.00 | ||||- void vllm::rms_norm_kernel(c10::Half*, c... | 0.00 | 18.00 | |||- LlamaAttention | 1787.12 | 204.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 252.62 | 72.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 29.88 | 20.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 2348.43 | 259.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 2197.65 | 135.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 135.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 55.79 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 65.18 | 81.00 | |||||- void at::native::(anonymous namespace)::indexSelec... | 0.00 | 19.00 | reshape(float16[8, 256, 2048], None) <- matmul(float16[8, 256, 2048], float16[2048, 768... |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 719.33 | 619.00 | |||- RMSNorm(weight=float16[768]) | 20.20 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 525.42 | 334.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 45.19 | 71.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 71.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- void vllm::rms_norm_kernel(c10::Half*, c... | 0.00 | 18.00 | split_with_sizes(float16[8, 256, 2304], None, -1) ||||- RotaryEmbedding | 20.86 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 129.98 | 240.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 44.88 | 135.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 135.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 27.48 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 39.18 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 589.02 | 664.00 | |||- RMSNorm(weight=float16[768]) | 13.49 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 407.56 | 378.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 40.98 | 111.00 | |||||- void flash_fwd_kernel(c10... | 0.00 | 24.00 | |||- LlamaMLP | 118.53 | 240.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 41.67 | 135.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 135.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 24.23 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 36.23 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 536.39 | 581.00 | |||- RMSNorm(weight=float16[768]) | 12.54 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 368.45 | 297.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 39.56 | 72.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 13.83 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 121.31 | 239.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 38.41 | 134.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 24.69 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 43.01 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 517.73 | 754.00 | |||- RMSNorm(weight=float16[768]) | 12.63 | 21.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 21.00 | |||- LlamaAttention | 353.14 | 399.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 43.94 | 71.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 71.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 13.68 | 20.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 114.54 | 311.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 41.69 | 134.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 22.38 | 115.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 34.97 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 560.47 | 697.00 | |||- RMSNorm(weight=float16[768]) | 12.66 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 401.54 | 277.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 34.98 | 71.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 71.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 13.82 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 113.76 | 375.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 39.82 | 134.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 22.80 | 178.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 135.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 35.92 | 63.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 63.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 526.00 | 637.00 | |||- RMSNorm(weight=float16[768]) | 12.66 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 364.15 | 353.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 36.34 | 115.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | transpose(float16[2304, 768], 0, 1) <- t(float16[2304, 768]) <- linear(float16[8, 256, ... |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 13.81 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c1... | 0.00 | 22.00 | unsqueeze(float16[2048, 12, 64], 0) |||||- void flash_fwd_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 113.58 | 239.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 38.13 | 134.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 22.54 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 37.50 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 515.91 | 589.00 | |||- RMSNorm(weight=float16[768]) | 13.06 | 21.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 21.00 | |||- LlamaAttention | 346.42 | 306.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 42.67 | 72.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 13.43 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c1... | 0.00 | 23.00 | unsqueeze(float16[2048, 12, 64], 0) |||||- void flash_fwd_kernel(c10::Hal... | 0.00 | 43.00 | reshape(float16[8, 256, 768], None) <- matmul(float16[8, 256, 768], float16[768, 768]) ... |||||- ampere_fp16_s16816gemm_fp16_256x128_ldg8_f2f_stage... | 0.00 | 34.00 | mm(float16[2048, 768], float16[768, 768]) <- matmul(float16[8, 256, 768], float16[768, ... |||- RMSNorm(weight=float16[768]) | 13.87 | 23.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 122.96 | 239.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 38.88 | 134.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 21.88 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 47.92 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 508.64 | 599.00 | |||- RMSNorm(weight=float16[768]) | 13.29 | 21.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 21.00 | |||- LlamaAttention | 345.85 | 315.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 39.54 | 72.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x1... | 0.00 | 71.00 | split_with_sizes(float16[8, 256, 2304], None, -1) ||||- RotaryEmbedding | 12.64 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 117.02 | 240.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 44.40 | 135.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 135.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 21.85 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 36.37 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 496.37 | 736.00 | |||- RMSNorm(weight=float16[768]) | 12.36 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 339.88 | 381.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 35.07 | 71.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 71.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x1... | 0.00 | 134.00 | split_with_sizes(float16[8, 256, 2304], None, -1) ||||- RotaryEmbedding | 12.54 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c10::Hal... | 0.00 | 43.00 | |||||- void flash_fwd_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 111.95 | 310.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 39.67 | 205.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 71.00 | reshape(float16[8, 256, 768], None) <- matmul(float16[8, 256, 768], float16[768, 4096])... |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 20.85 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 36.62 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 520.01 | 624.00 | |||- RMSNorm(weight=float16[768]) | 12.79 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 349.06 | 205.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 45.73 | 72.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 72.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 12.90 | 20.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 122.42 | 374.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 44.79 | 268.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | reshape(float16[8, 256, 768], None) <- matmul(float16[8, 256, 768], float16[768, 4096])... |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 23.21 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 36.31 | 63.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 63.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- LlamaDecoderLayer | 568.42 | 644.00 | |||- RMSNorm(weight=float16[768]) | 13.01 | 22.00 | ||||- void vllm::fused_add_rms_norm_kernel(c10... | 0.00 | 22.00 | |||- LlamaAttention | 404.83 | 360.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 63.28 | 71.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x... | 0.00 | 71.00 | mm(float16[2048, 768], float16[768, 2304]) <- matmul(float16[8, 256, 768], float16[768,... ||||- RotaryEmbedding | 13.34 | 21.00 | |||||- void vllm::rotary_embedding_kernel(c10... | 0.00 | 23.00 | |||- LlamaMLP | 115.47 | 239.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 43.23 | 134.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 134.00 | mm(float16[2048, 768], float16[768, 4096]) <- matmul(float16[8, 256, 768], float16[768,... ||||- SiluAndMul | 22.36 | 43.00 | |||||- void vllm::silu_and_mul_kernel(c10::Hal... | 0.00 | 43.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 35.29 | 62.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x... | 0.00 | 62.00 | mm(float16[2048, 2048], float16[2048, 768]) <- matmul(float16[8, 256, 2048], float16[20... ||- RMSNorm(weight=float16[768]) | 13.23 | 22.00 | |||- void vllm::fused_add_rms_norm_kernel(c10:... | 0.00 | 22.00 | Sampler | 64679.53 | 952.00 | |- void at::native::(anonymous namespace)::indexSelectSma... | 0.00 | 5.00 | index_select(float16[2048, 768], 0, int64[8]) |- void cutlass::Kernel(c10::H... | 0.00 | 23.00 | |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x... | 0.00 | 134.00 | |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x... | 0.00 | 62.00 | _to_copy(float16[8], 5, 0, None, None, True, None) <- to(float16[8], 5, 0, None, None, ... |- Memcpy HtoD (Pinned -> Device) | 0.00 | 1.00 | copy_(int64[8, 256], int64[8, 256], True) <- _to_copy(int64[8, 256], 4, 0, None, None, ... |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x128x... | 0.00 | 72.00 | _to_copy(int64[8, 0], 4, 0, None, None, True, None) <- to(int64[8, 0], 4, 0, None, None... |- void at::native::elementwise_kernel<128, 4, at::native... | 0.00 | 3.00 | div_(float16[8, 32000], float16[8, 1]) |- at::native::(anonymous namespace)::fill_index_and_segm... | 0.00 | 3.00 | sort(float16[8, 32000], False, -1, False) <- sort(float16[8, 32000], -1, False) |- void at_cuda_detail::cub::DeviceRadixSortUpsweepKernel... | 0.00 | 3.00 | sort(float16[8, 32000], False, -1, False) <- sort(float16[8, 32000], -1, False) |- void at_cuda_detail::cub::RadixSortScanBinsKernel Device) | 0.00 | 1.00 | copy_(float16[8, 32000], float16[8, 32000], False) <- sort(float16[8, 32000], False, -1... |- void at::native::unrolled_elementwise_kernel Pageable) | 0.00 | 1.00 | copy_(int64[8, 1], int64[8, 1], False) <- _to_copy(int64[8, 1], 4, 0, None, None, False... |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x... | 0.00 | 62.00 | |- void at::native::index_elementwise_kernel<128, 4, at::... | 0.00 | 2.00 | index(float32[8, 32000], None) |- Memcpy DtoH (Device -> Pageable) | 0.00 | 1.00 | copy_(float32[8], float32[8], False) <- _to_copy(float32[8], 6, 0, None, None, False, N... |- void flash_fwd_kernel(c10::Half*, c... | 0.00 | 2.00 | |||- LlamaAttention | 23276.84 | 34.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 2205.64 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 172.67 | 24.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 73.01 | 13.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 40.07 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 170.53 | 32.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 29.05 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 76.11 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 23.09 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 26.10 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 117.46 | 32.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 23.11 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 74.66 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 22.88 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 26.72 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 111.22 | 31.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 21.91 | 7.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 69.98 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 21.87 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 23.95 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 105.24 | 32.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 21.30 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 71.35 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 21.24 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 24.93 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 115.78 | 32.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 21.00 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 67.73 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 21.06 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 23.27 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 107.83 | 31.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 21.03 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 65.48 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 20.52 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 22.95 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 102.58 | 31.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 20.42 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 67.09 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 20.61 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 23.54 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 104.39 | 31.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 20.09 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 92.35 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 43.66 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 25.33 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 103.27 | 31.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 20.32 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 66.97 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 20.66 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 23.61 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 106.37 | 31.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 21.71 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 68.26 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 21.64 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 22.68 | 9.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaAttention | 123.31 | 32.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 21.55 | 8.00 | |||||- void cutlass::Kernel(c10... | 0.00 | 2.00 | |||- LlamaMLP | 65.53 | 23.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 20.85 | 12.00 | |||||- void cutlass::Kernel(c10::Hal... | 0.00 | 2.00 | ||||- RowParallelLinear(weight=float16[768, 2048]) | 22.26 | 9.00 | |||||- void cutlass::Kernel(c10:... | 0.00 | 2.00 | Sampler | 1591.95 | 354.00 | |- void at::native::(anonymous namespace)::indexSelectSma... | 0.00 | 4.00 | index_select(float16[8, 768], 0, int64[8]) |- void cutlass::Kernel Device) | 0.00 | 1.00 | copy_(int64[8, 256], int64[8, 256], True) <- _to_copy(int64[8, 256], 4, 0, None, None, ... |- void at::native::elementwise_kernel<128, 4, at::native... | 0.00 | 3.00 | div_(float16[8, 32000], float16[8, 1]) |- at::native::(anonymous namespace)::fill_index_and_segm... | 0.00 | 3.00 | sort(float16[8, 32000], False, -1, False) <- sort(float16[8, 32000], -1, False) |- void at_cuda_detail::cub::DeviceRadixSortUpsweepKernel... | 0.00 | 4.00 | sort(float16[8, 32000], False, -1, False) <- sort(float16[8, 32000], -1, False) |- void at_cuda_detail::cub::RadixSortScanBinsKernel Device) | 0.00 | 1.00 | copy_(float16[8, 32000], float16[8, 32000], False) <- sort(float16[8, 32000], False, -1... |- void at::native::unrolled_elementwise_kernel(c10::Half*, c10::Half const*, c10... | 18.00 | 1.00 ||||- void vllm::fused_add_rms_norm_kernel(c10::Half*, c10::Half*,... | 516.00 | 23.00 |||- LlamaAttention | 3809.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 941.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x128x32_stage4_warpsi... | 859.00 | 12.00 |||||- void flash_fwd_kernel(c10::Half*, c10::Half const*... | 43.00 | 1.00 ||||- RotaryEmbedding | 248.00 | |||||- void vllm::rotary_embedding_kernel(long const*, c10::... | 248.00 | 12.00 ||||- PagedAttention | 1947.00 | |||||- void vllm::reshape_and_cache_kernel(long const*, c10::... | 63.00 | 3.00 |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x32_stage4_warpsi... | 187.00 | 3.00 |||||- ampere_fp16_s16816gemm_fp16_256x128_ldg8_f2f_stages_32x3_tn | 68.00 | 2.00 |||||- void vllm::fused_add_rms_norm_kernel(c10::Half*, c10::Half*... | 45.00 | 2.00 |||||- void vllm::silu_and_mul_kernel(c10::Half*, c10::Half const*... | 43.00 | 1.00 ||||- RowParallelLinear(weight=float16[768, 768]) | 450.00 | |||||- ampere_fp16_s16816gemm_fp16_256x128_ldg8_f2f_stages_32x3_tn | 407.00 | 12.00 |||||- void vllm::silu_and_mul_kernel(c10::Half*, c10::Half const*... | 43.00 | 1.00 ||||- void vllm::rms_norm_kernel(c10::Half*, c10::Half const*, c10... | 18.00 | 1.00 ||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x128x32_stage4_warpsiz... | 71.00 | 1.00 ||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x32_stage3_warpsiz... | 134.00 | 1.00 |||- LlamaMLP | 3305.00 | ||||- MergedColumnParallelLinear(weight=float16[4096, 768]) | 1817.00 | |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x32_stage3_warpsi... | 1746.00 | 13.00 |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x128x32_stage4_warpsi... | 71.00 | 1.00 ||||- SiluAndMul | 723.00 | |||||- void vllm::silu_and_mul_kernel(c10::Half*, c10::Half const*... | 516.00 | 12.00 |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x128x32_stage4_warpsi... | 72.00 | 1.00 |||||- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x32_stage3_warpsi... | 135.00 | 1.00 ||||- RowParallelLinear(weight=float16[768, 2048]) | 765.00 | |||||- void at::native::(anonymous namespace)::indexSelectLargeIndex(c10::Half*, c10::Half*, ... | 22.00 | 1.00 Sampler | 952.00 | |- void at::native::(anonymous namespace)::indexSelectSmallIndex(c10::Half*, c10::Half*, c1... | 23.00 | 1.00 |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x32_stage3_warpsize4x... | 268.00 | 2.00 |- Memcpy HtoD (Pinned -> Device) | 1.00 | 8.00 |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize192x128x32_stage4_warpsize4x... | 124.00 | 2.00 |- sm80_xmma_gemm_f16f16_f16f32_f32_tn_n_tilesize160x128x32_stage4_warpsize2x... | 72.00 | 1.00 |- void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl... | 6.00 | 1.00 |- Memcpy DtoD (Device -> Device) | 1.00 | 1.00 |- void at::native::unrolled_elementwise_kernel Pageable) | 2.00 | 2.00 |- void flash_fwd_kernel(c10::Half*, c10::Half const*, c10... | 2.00 | 1.00 ||||- void vllm::fused_add_rms_norm_kernel(c10::Half*, c10::Half*,... | 46.00 | 23.00 |||- LlamaAttention | 380.00 | ||||- QKVParallelLinear(weight=float16[2304, 768]) | 95.00 | |||||- void cutlass::Kernel(long const*, c10::... | 24.00 | 12.00 ||||- PagedAttention | 188.00 | |||||- void vllm::reshape_and_cache_kernel(c10::Half*, c10::Half const*... | 24.00 | 12.00 ||||- RowParallelLinear(weight=float16[768, 2048]) | 108.00 | |||||- void cutlass::Kernel(c10::Half*, c10::Half*, ... | 2.00 | 1.00 Sampler | 354.00 | |- void at::native::(anonymous namespace)::indexSelectSmallIndex Device) | 1.00 | 10.00 |- void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl... | 6.00 | 1.00 |- Memcpy DtoD (Device -> Device) | 1.00 | 1.00 |- void at::native::unrolled_elementwise_kernel