diff --git a/examples/offline_inference_enc_dec.py b/examples/offline_inference_enc_dec.py
index f554302fefb8c..83c9d4d0377ee 100644
--- a/examples/offline_inference_enc_dec.py
+++ b/examples/offline_inference_enc_dec.py
@@ -8,11 +8,15 @@
 Output: for several prompts, compare native PyTorch & vLLM prompt completions
 '''
-
+import warnings
 import torch
 from vllm import LLM, SamplingParams
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 
+warnings.filterwarnings("ignore",
+                        category=UserWarning,
+                        module="transformers.generation.utils.*")
+
 hf_model_id = "t5-small"
 dtype = "bfloat16"
 prompts = [
@@ -27,7 +31,7 @@
 # Native PyTorch test
 
 # - Model and tokenizer initialization
-tokenizer = T5Tokenizer.from_pretrained(hf_model_id)
+tokenizer = T5Tokenizer.from_pretrained(hf_model_id, legacy=False)
 model = T5ForConditionalGeneration.from_pretrained(hf_model_id).to(
     dtype=dtype_obj)
diff --git a/vllm/config.py b/vllm/config.py
index b4d48d34a8a72..e260e6a0cb1d6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -317,7 +317,7 @@ def __init__(
         self.num_cpu_blocks = None
 
     def metrics_info(self):
-        # convert cache_config to dict(key: str, value:str) for prometheus metrics info
+        # convert cache_config to dict(key: str, value: str) for prometheus metrics info
         return {key: str(value) for key, value in self.__dict__.items()}
 
     def _verify_args(self) -> None:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 54b09c38f58a5..d31542159e4a4 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -23,6 +23,7 @@ def __init__(self, labelnames: List[str]):
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 REGISTRY.unregister(collector)
 
+        # Config Information
         self.info_cache_config = Info(
             name='vllm:cache_config',
             documentation='information of cache_config')
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index b2f040114a078..3777e0f3a0601 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -62,6 +62,12 @@ def parse_args():
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser.add_argument("--host", type=str, default=None, help="host name")
     parser.add_argument("--port", type=int, default=8000, help="port number")
+    parser.add_argument(
+        "--uvicorn-log-level",
+        type=str,
+        default="info",
+        choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
+        help="log level for uvicorn")
     parser.add_argument("--allow-credentials",
                         action="store_true",
                         help="allow credentials")
@@ -245,7 +251,7 @@ async def authentication(request: Request, call_next):
     uvicorn.run(app,
                 host=args.host,
                 port=args.port,
-                log_level="info",
+                log_level=args.uvicorn_log_level,
                 timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
                 ssl_keyfile=args.ssl_keyfile,
                 ssl_certfile=args.ssl_certfile)
diff --git a/vllm/model_executor/models/t5.py b/vllm/model_executor/models/t5.py
index c16577b50e37c..d6c27ed5b4903 100644
--- a/vllm/model_executor/models/t5.py
+++ b/vllm/model_executor/models/t5.py
@@ -258,26 +258,17 @@ def forward(
         input_metadata: InputMetadata,
         encoder_hidden_states: Optional[torch.Tensor],
     ) -> torch.Tensor:
-        # print("hidden_states shape", hidden_states.shape)
-        # print("hidden_states", hidden_states)
         q, _ = self.q(hidden_states)
-        # print("q shape", q.shape)
-        # print("q", q)
         batch_size = hidden_states.shape[0]
         seq_len = hidden_states.shape[1]
         prompt_len = input_metadata.prompt_lens.max().item()
         context_len = input_metadata.context_lens.max().item()
         context_len = max(context_len, 1)
-        # print("batch_size", batch_size)
print("batch_size", batch_size) - # print("seq_len", seq_len) - # print("prompt_len", prompt_len) - # print("context_len", context_len) block_size = 16 if not self.is_decoder: - # print("encoder self attention!") assert kv_cache is None # Encoder self attention, no cache operations k, _ = self.k(hidden_states) @@ -293,12 +284,9 @@ def forward( input_metadata.prompt_lens[i]:, ] = torch.finfo( input_metadata.attn_bias.dtype).min - # print("input_metadata.attn_bias shape", input_metadata.attn_bias.shape) - # print("input_metadata.attn_bias", input_metadata.attn_bias) attn_output = self.attn(q, k, v, input_metadata) elif not self.is_cross: - # print("decoder self attention!") # Decoder self attention k, _ = self.k(hidden_states) v, _ = self.v(hidden_states) @@ -308,13 +296,9 @@ def forward( 1 if input_metadata.is_prompt else context_len, (context_len + block_size - 1) // block_size * block_size).repeat(batch_size, 1, 1, 1) - # print("position_bias shape", position_bias.shape) - # print("position_bias", position_bias) input_metadata.attn_bias = position_bias[:, :, -seq_len:, :].contiguous( ) - # print("input_metadata.attn_bias shape", input_metadata.attn_bias.shape) - # print("input_metadata.attn_bias", input_metadata.attn_bias) key_cache, value_cache = kv_cache @@ -322,7 +306,6 @@ def forward( input_metadata) else: - # print("cross attention!") # Cross attention key_cache, value_cache = kv_cache @@ -330,10 +313,6 @@ def forward( assert encoder_hidden_states is not None k, _ = self.k(encoder_hidden_states) v, _ = self.v(encoder_hidden_states) - # print("k shape", k.shape) - # for i in range(k.shape[0]): - # for j in range(k.shape[1]): - # print(f"key at batch {i} and pos {j}: ", k[i, j, :].reshape(1, 8, 64)) attn_output = self.attn(q, k, v, key_cache, value_cache, input_metadata) else: @@ -369,16 +348,12 @@ def forward( input_metadata: InputMetadata, ) -> torch.Tensor: normed_hidden_states = self.layer_norm(hidden_states) - print("self attention input shape: ", normed_hidden_states.shape) - print("self_attention input: ", normed_hidden_states) attention_output = self.SelfAttention( hidden_states=normed_hidden_states, kv_cache=kv_cache, input_metadata=input_metadata, encoder_hidden_states=None, ) - print("self attention output shape: ", attention_output.shape) - print("self_attention output: ", attention_output) hidden_states = hidden_states + attention_output return hidden_states @@ -408,16 +383,12 @@ def forward( encoder_hidden_states: Optional[torch.Tensor], ) -> torch.Tensor: normed_hidden_states = self.layer_norm(hidden_states) - print("cross attention input shape: ", normed_hidden_states.shape) - print("cross_attention input: ", normed_hidden_states) attention_output = self.EncDecAttention( hidden_states=normed_hidden_states, kv_cache=kv_cache, input_metadata=input_metadata, encoder_hidden_states=encoder_hidden_states, ) - print("cross attention output shape: ", attention_output.shape) - print("cross_attention output: ", attention_output) hidden_states = hidden_states + attention_output return hidden_states @@ -521,11 +492,8 @@ def forward( input_metadata: InputMetadata, encoder_hidden_states: Optional[torch.Tensor], ) -> torch.Tensor: - # print("input_ids: ", input_ids) hidden_states = self.embed_tokens(input_ids) - # print("hidden_states shape: ", hidden_states.shape) - # print("hidden_states: ", hidden_states) for i, layer_module in enumerate(self.block): kv_cache = kv_caches[i] if self.is_decoder else None @@ -539,15 +507,6 @@ def forward( hidden_states = layer_outputs hidden_states = 
-        # if encoder_hidden_states is not None:
-        #     print("hidden_states shape:" , hidden_states.shape)
-        #     print("encoder_hidden_states shape:" , encoder_hidden_states.shape)
-        #     # Attach encoder hidden states
-        #     hidden_states = torch.cat(
-        #         [encoder_hidden_states, hidden_states], dim=1
-        #     )
-        print("final_hidden_states shape: ", hidden_states.shape)
-        print("final_hidden_states: ", hidden_states)
 
         return hidden_states
@@ -579,9 +538,6 @@
         kv_caches: List[KVCache],
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        # print("input_ids shape: ", input_ids.shape)
-        # print("input_ids: ", input_ids)
-        # print("input_metadata: ", input_metadata)
         if input_metadata.is_prompt:
             # prompt run, need to run encoder once
             hidden_states = self.encoder(input_ids, kv_caches, input_metadata,