From 25402ae47215e85062196eda80c859973653d18c Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 18 Jul 2024 14:03:54 +0800 Subject: [PATCH 01/12] Reduce unnecessary compute when logprobs=None --- vllm/model_executor/layers/sampler.py | 98 +++++++++++++++------------ 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c376797a054f..04c8817885bef 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -775,7 +775,7 @@ def _get_logprobs( next_token_ids: List[int] = [] # The largest requested number of logprobs. We find logprobs as many as the # largest num logprobs in this API. - largest_num_logprobs = 1 + largest_num_logprobs = 0 # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. @@ -786,7 +786,7 @@ def _get_logprobs( # Update indices and tokens for prompt logprobs. if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): - largest_num_logprobs = max(largest_num_logprobs, + largest_num_logprobs = max(1, largest_num_logprobs, sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) @@ -805,7 +805,7 @@ def _get_logprobs( next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: - largest_num_logprobs = max(largest_num_logprobs, + largest_num_logprobs = max(1, largest_num_logprobs, sampling_params.logprobs) assert len(next_token_ids) == len(query_indices) @@ -815,36 +815,39 @@ def _get_logprobs( empty_prompt_logprob: Optional[PromptLogprobs] = None return [empty_prompt_logprob], [empty_sampled_logprob] - query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) - next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) - - # (num_selected_query_tokens, num_logprobs). Note that query_indices can - # contain duplicates if beam search is enabled. - selected_logprobs = logprobs[[ - query_indices_gpu, - next_token_ids_gpu, - ]] - ranks = _get_ranks( - logprobs[query_indices_gpu], - next_token_ids_gpu, - ) - assert selected_logprobs.shape[0] == ranks.shape[0] - - # Logprobs of topk tokens for a batch of sequence groups. - # (num_query_tokens_across_batch). if largest_num_logprobs > 0: + query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) + next_token_ids_gpu = torch.tensor(next_token_ids, + device=logprobs.device) + + # (num_selected_query_tokens, num_logprobs). Note that query_indices can + # contain duplicates if beam search is enabled. + selected_logprobs = logprobs[[ + query_indices_gpu, + next_token_ids_gpu, + ]] + ranks = _get_ranks( + logprobs[query_indices_gpu], + next_token_ids_gpu, + ) + assert selected_logprobs.shape[0] == ranks.shape[0] + + # Logprobs of topk tokens for a batch of sequence groups. + # (num_query_tokens_across_batch). 
top_logprobs, top_token_ids = torch.topk(logprobs, largest_num_logprobs, dim=-1) - else: - top_logprobs, top_token_ids = None, None - selected_logprobs = selected_logprobs.to('cpu') - ranks = ranks.to('cpu') - if top_logprobs is not None and top_token_ids is not None: + selected_logprobs = selected_logprobs.to('cpu') + ranks = ranks.to('cpu') top_logprobs = top_logprobs.to('cpu') top_token_ids = top_token_ids.to('cpu') + else: + # We do not need these if sampling_params.(prompt_)logprobs is None for all seq_groups + selected_logprobs, ranks = None, None + top_logprobs, top_token_ids = None, None + # Find prompt/sample logprobs. prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] sample_logprobs_per_seq_group: List[SampleLogprobs] = [] @@ -946,21 +949,26 @@ def _get_sampled_logprob_if_needed( if seq_group.do_sample: assert len(next_token_ids) > 0 - # Pre-select items from tensor. tolist() is faster than repetitive - # `.item()` calls. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - for idx, (next_token_id, - parent_id) in enumerate(zip(next_token_ids, parent_seq_ids)): - # Get the logprob of a sampled token. - sampled_logprobs_dict = { - next_token_id: (selected_logprob_items[idx], rank_items[idx]) - } - # Get top K logprobs. - if num_logprobs > 0: + if num_logprobs == 0: + for next_token_id in next_token_ids: + # Use a dummy logprob + sampled_logprobs.append({next_token_id: Logprob(0.0)}) + else: + # Pre-select items from tensor. tolist() is faster than repetitive + # `.item()` calls. + selected_logprob_items = selected_logprobs[ + selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() + for idx, (next_token_id, parent_id) in enumerate( + zip(next_token_ids, parent_seq_ids)): + # Get the logprob of a sampled token. + sampled_logprobs_dict = { + next_token_id: + (selected_logprob_items[idx], rank_items[idx]) + } + # Get top K logprobs. top_ids = top_token_ids[top_logprob_idx + parent_id, :num_logprobs].tolist() top_probs = top_logprobs[top_logprob_idx + @@ -974,11 +982,11 @@ def _get_sampled_logprob_if_needed( top_ranks) }) - sampled_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in - sampled_logprobs_dict.items() - }) + sampled_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in + sampled_logprobs_dict.items() + }) # NOTE: This part of code is not intuitive. 
`selected_logprobs` include # logprobs for the current step, which has len(next_token_ids) tokens From b941184de2157851f31ef4aad408d40251fffd45 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 18 Jul 2024 14:26:16 +0800 Subject: [PATCH 02/12] Wrap long comment --- vllm/model_executor/layers/sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 04c8817885bef..5ba6e83d0ab0f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -844,7 +844,8 @@ def _get_logprobs( top_token_ids = top_token_ids.to('cpu') else: - # We do not need these if sampling_params.(prompt_)logprobs is None for all seq_groups + # We do not need these if sampling_params.(prompt_)logprobs is None for + # all seq_groups selected_logprobs, ranks = None, None top_logprobs, top_token_ids = None, None From c93f451926ad386e0c40480d6215a7fa239f88ca Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 18 Jul 2024 14:34:59 +0800 Subject: [PATCH 03/12] Fix logic --- vllm/model_executor/layers/sampler.py | 31 ++++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5ba6e83d0ab0f..12dc93b536a86 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -944,13 +944,13 @@ def _get_sampled_logprob_if_needed( ): """Compute the sample logprob if needed.""" seq_ids = seq_group.seq_ids - num_logprobs = seq_group.sampling_params.logprobs or 0 + num_logprobs = seq_group.sampling_params.logprobs sampled_logprobs: SampleLogprobs = [] next_token_ids, parent_seq_ids = sample_result if seq_group.do_sample: assert len(next_token_ids) > 0 - if num_logprobs == 0: + if num_logprobs is None: for next_token_id in next_token_ids: # Use a dummy logprob sampled_logprobs.append({next_token_id: Logprob(0.0)}) @@ -969,19 +969,20 @@ def _get_sampled_logprob_if_needed( next_token_id: (selected_logprob_items[idx], rank_items[idx]) } - # Get top K logprobs. - top_ids = top_token_ids[top_logprob_idx + - parent_id, :num_logprobs].tolist() - top_probs = top_logprobs[top_logprob_idx + - parent_id, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, - top_ranks) - }) + if num_logprobs > 0: + # Get top K logprobs. + top_ids = top_token_ids[top_logprob_idx + + parent_id, :num_logprobs].tolist() + top_probs = top_logprobs[ + top_logprob_idx + parent_id, :num_logprobs].tolist() + # Top K is already sorted by rank, so we can use 1 ~ + # num_logprobs + 1 for rank. 
+ top_ranks = range(1, num_logprobs + 1) + sampled_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip( + top_ids, top_probs, top_ranks) + }) sampled_logprobs.append({ token_id: Logprob(*logprob_and_rank) From c8c296b61e5b210c3989fbf7dc9aa9576676f556 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 18 Jul 2024 16:05:15 +0800 Subject: [PATCH 04/12] Fix beam_search cases --- vllm/model_executor/layers/sampler.py | 39 ++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 12dc93b536a86..64350f9cf0a0c 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -776,6 +776,8 @@ def _get_logprobs( # The largest requested number of logprobs. We find logprobs as many as the # largest num logprobs in this API. largest_num_logprobs = 0 + # If beam search is enabled. + use_beam_search = False # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. @@ -808,6 +810,8 @@ def _get_logprobs( largest_num_logprobs = max(1, largest_num_logprobs, sampling_params.logprobs) + use_beam_search = use_beam_search or sampling_params.use_beam_search + assert len(next_token_ids) == len(query_indices) if len(query_indices) == 0: @@ -815,7 +819,10 @@ def _get_logprobs( empty_prompt_logprob: Optional[PromptLogprobs] = None return [empty_prompt_logprob], [empty_sampled_logprob] - if largest_num_logprobs > 0: + selected_logprobs, ranks = None, None + top_logprobs, top_token_ids = None, None + + if largest_num_logprobs > 0 or use_beam_search: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) @@ -832,22 +839,17 @@ def _get_logprobs( ) assert selected_logprobs.shape[0] == ranks.shape[0] - # Logprobs of topk tokens for a batch of sequence groups. - # (num_query_tokens_across_batch). - top_logprobs, top_token_ids = torch.topk(logprobs, - largest_num_logprobs, - dim=-1) + if largest_num_logprobs > 0: + # Logprobs of topk tokens for a batch of sequence groups. + # (num_query_tokens_across_batch). + top_logprobs, top_token_ids = torch.topk(logprobs, + largest_num_logprobs, + dim=-1) - selected_logprobs = selected_logprobs.to('cpu') - ranks = ranks.to('cpu') - top_logprobs = top_logprobs.to('cpu') - top_token_ids = top_token_ids.to('cpu') - - else: - # We do not need these if sampling_params.(prompt_)logprobs is None for - # all seq_groups - selected_logprobs, ranks = None, None - top_logprobs, top_token_ids = None, None + selected_logprobs = selected_logprobs.to('cpu') + ranks = ranks.to('cpu') + top_logprobs = top_logprobs.to('cpu') + top_token_ids = top_token_ids.to('cpu') # Find prompt/sample logprobs. 
prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] @@ -945,12 +947,13 @@ def _get_sampled_logprob_if_needed( """Compute the sample logprob if needed.""" seq_ids = seq_group.seq_ids num_logprobs = seq_group.sampling_params.logprobs + use_beam_search = seq_group.sampling_params.use_beam_search sampled_logprobs: SampleLogprobs = [] next_token_ids, parent_seq_ids = sample_result if seq_group.do_sample: assert len(next_token_ids) > 0 - if num_logprobs is None: + if num_logprobs is None and not use_beam_search: for next_token_id in next_token_ids: # Use a dummy logprob sampled_logprobs.append({next_token_id: Logprob(0.0)}) @@ -969,7 +972,7 @@ def _get_sampled_logprob_if_needed( next_token_id: (selected_logprob_items[idx], rank_items[idx]) } - if num_logprobs > 0: + if num_logprobs is not None and num_logprobs > 0: # Get top K logprobs. top_ids = top_token_ids[top_logprob_idx + parent_id, :num_logprobs].tolist() From 5fe67d8fc364f8bd1d7b744e7632279832bd8e31 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 18 Jul 2024 19:28:20 +0800 Subject: [PATCH 05/12] Small refactor to make code more readable --- vllm/model_executor/layers/sampler.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 64350f9cf0a0c..7f69fe4bff72b 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -774,8 +774,9 @@ def _get_logprobs( # The next token ids to get the logprob value from. next_token_ids: List[int] = [] # The largest requested number of logprobs. We find logprobs as many as the - # largest num logprobs in this API. - largest_num_logprobs = 0 + # largest num logprobs in this API. If every logprobs is None, it will be + # set to -1. + largest_num_logprobs = -1 # If beam search is enabled. use_beam_search = False @@ -788,7 +789,7 @@ def _get_logprobs( # Update indices and tokens for prompt logprobs. if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): - largest_num_logprobs = max(1, largest_num_logprobs, + largest_num_logprobs = max(largest_num_logprobs, sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) @@ -807,7 +808,7 @@ def _get_logprobs( next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: - largest_num_logprobs = max(1, largest_num_logprobs, + largest_num_logprobs = max(largest_num_logprobs, sampling_params.logprobs) use_beam_search = use_beam_search or sampling_params.use_beam_search @@ -822,7 +823,9 @@ def _get_logprobs( selected_logprobs, ranks = None, None top_logprobs, top_token_ids = None, None - if largest_num_logprobs > 0 or use_beam_search: + # If largest_num_logprobs == -1, i.e. no logprobs are requested, we can + # skip the whole logprob calculation. + if largest_num_logprobs >= 0 or use_beam_search: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) @@ -839,18 +842,19 @@ def _get_logprobs( ) assert selected_logprobs.shape[0] == ranks.shape[0] + # We need to compute top k only if there exists logprobs > 0. if largest_num_logprobs > 0: # Logprobs of topk tokens for a batch of sequence groups. # (num_query_tokens_across_batch). 
top_logprobs, top_token_ids = torch.topk(logprobs, largest_num_logprobs, dim=-1) - - selected_logprobs = selected_logprobs.to('cpu') - ranks = ranks.to('cpu') top_logprobs = top_logprobs.to('cpu') top_token_ids = top_token_ids.to('cpu') + selected_logprobs = selected_logprobs.to('cpu') + ranks = ranks.to('cpu') + # Find prompt/sample logprobs. prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] sample_logprobs_per_seq_group: List[SampleLogprobs] = [] From 7325e5f98303689cb0125a278719e5a91f86c3e9 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 22 Jul 2024 09:59:13 +0800 Subject: [PATCH 06/12] Use nan instead of 0.0 --- vllm/model_executor/layers/sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 7f69fe4bff72b..809e44faf084a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,5 +1,6 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +from math import nan from typing import Dict, List, Optional, Tuple import torch @@ -960,7 +961,7 @@ def _get_sampled_logprob_if_needed( if num_logprobs is None and not use_beam_search: for next_token_id in next_token_ids: # Use a dummy logprob - sampled_logprobs.append({next_token_id: Logprob(0.0)}) + sampled_logprobs.append({next_token_id: Logprob(nan)}) else: # Pre-select items from tensor. tolist() is faster than repetitive # `.item()` calls. From 63e091586ba6f2b474f1cb426c8f31070a153d69 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 22 Jul 2024 10:09:00 +0800 Subject: [PATCH 07/12] Add tests --- tests/samplers/test_logprobs.py | 38 +++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index f7bcd4c855799..5129e7d26b705 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,4 @@ +from math import isnan from typing import List import pytest @@ -14,7 +15,7 @@ @pytest.mark.parametrize("dtype", ["float"]) # needed for comparing logprobs with HF @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("num_top_logprobs", [6]) # 32000 == vocab_size +@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size @pytest.mark.parametrize("detokenize", [True, False]) def test_get_prompt_logprobs( hf_runner, @@ -63,7 +64,8 @@ def test_get_prompt_logprobs( assert result.outputs[0].logprobs is not None assert len(result.outputs[0].logprobs) == max_tokens for logprobs in result.outputs[0].logprobs: - assert len(logprobs) == num_top_logprobs + assert (len(logprobs) == num_top_logprobs + or len(logprobs) == num_top_logprobs + 1) output_text = result.outputs[0].text output_string_from_most_likely_tokens_lst: List[str] = [] for top_logprobs in result.outputs[0].logprobs: @@ -135,3 +137,35 @@ def test_max_logprobs(): bad_sampling_params = SamplingParams(logprobs=2) with pytest.raises(ValueError): runner.generate(["Hello world"], sampling_params=bad_sampling_params) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) +@pytest.mark.parametrize("detokenize", [True, False]) +def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, + detokenize: bool, example_prompts): + max_num_seqs = 256 + enable_chunked_prefill = False + max_num_batched_tokens = None + if 
chunked_prefill_token_size != -1: + enable_chunked_prefill = True + max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) + max_num_batched_tokens = chunked_prefill_token_size + max_tokens = 5 + + with vllm_runner( + model, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_seqs, + ) as vllm_model: + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + temperature=0.0, + detokenize=detokenize) + results_logprobs_none = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_none) + + for i in range(len(results_logprobs_none)): + assert results_logprobs_none[i].outputs[0].logprobs is None + assert isnan(results_logprobs_none[i].outputs[0].cumulative_logprob) From fcc37f83284659d2d4c07bccea200ca5c7c3b265 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 22 Jul 2024 17:53:58 +0800 Subject: [PATCH 08/12] Use None in outputs.cumulative_logprobs if not requested --- tests/samplers/test_logprobs.py | 3 +-- vllm/outputs.py | 17 +++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 5129e7d26b705..dadee70a0b5dc 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,4 +1,3 @@ -from math import isnan from typing import List import pytest @@ -168,4 +167,4 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, for i in range(len(results_logprobs_none)): assert results_logprobs_none[i].outputs[0].logprobs is None - assert isnan(results_logprobs_none[i].outputs[0].cumulative_logprob) + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None diff --git a/vllm/outputs.py b/vllm/outputs.py index 4cb7f06bdb8c7..b1cb1cd07fbb1 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -29,7 +29,7 @@ class CompletionOutput: index: int text: str token_ids: Tuple[int, ...] - cumulative_logprob: float + cumulative_logprob: Optional[float] logprobs: Optional[SampleLogprobs] finish_reason: Optional[str] = None stop_reason: Union[int, str, None] = None @@ -124,13 +124,14 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": include_logprobs = seq_group.sampling_params.logprobs is not None text_buffer_length = seq_group.sampling_params.output_text_buffer_length outputs = [ - CompletionOutput(seqs.index(seq), - seq.get_output_text_to_return(text_buffer_length), - seq.get_output_token_ids(), - seq.get_cumulative_logprob(), - seq.output_logprobs if include_logprobs else None, - SequenceStatus.get_finished_reason(seq.status), - seq.stop_reason) for seq in top_n_seqs + CompletionOutput( + seqs.index(seq), + seq.get_output_text_to_return(text_buffer_length), + seq.get_output_token_ids(), + seq.get_cumulative_logprob() if include_logprobs else None, + seq.output_logprobs if include_logprobs else None, + SequenceStatus.get_finished_reason(seq.status), + seq.stop_reason) for seq in top_n_seqs ] # Every sequence in the sequence group should have the same prompt. 
From bbd775705bb81a98dfca9c465f0e9655ecb0e956 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 23 Jul 2024 10:09:47 +0800 Subject: [PATCH 09/12] Update docstring --- vllm/sampling_params.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index ebe5e0fd34135..9afd4a7bb89de 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -93,11 +93,12 @@ class SamplingParams: min_tokens: Minimum number of tokens to generate per output sequence before EOS or stop_token_ids can be generated logprobs: Number of log probabilities to return per output token. - Note that the implementation follows the OpenAI API: The return - result includes the log probabilities on the `logprobs` most likely - tokens, as well the chosen tokens. The API will always return the - log probability of the sampled token, so there may be up to - `logprobs+1` elements in the response. + When set to None, no probability is returned. If set to a non-None + value, the result includes the log probabilities of the specified + number of most likely tokens, as well as the chosen tokens. + Note that the implementation follows the OpenAI API: The API will + always return the log probability of the sampled token, so there + may be up to `logprobs+1` elements in the response. prompt_logprobs: Number of log probabilities to return per prompt token. detokenize: Whether to detokenize the output. Defaults to True. skip_special_tokens: Whether to skip special tokens in the output. From 83612579c4ea76c0e225593fe15410cb32e1a28e Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 23 Jul 2024 18:19:47 +0800 Subject: [PATCH 10/12] Use inf instead of nan to avoid some test failure --- vllm/model_executor/layers/sampler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 809e44faf084a..3196db73d3767 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,6 +1,6 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools -from math import nan +from math import inf from typing import Dict, List, Optional, Tuple import torch @@ -961,7 +961,7 @@ def _get_sampled_logprob_if_needed( if num_logprobs is None and not use_beam_search: for next_token_id in next_token_ids: # Use a dummy logprob - sampled_logprobs.append({next_token_id: Logprob(nan)}) + sampled_logprobs.append({next_token_id: Logprob(inf)}) else: # Pre-select items from tensor. tolist() is faster than repetitive # `.item()` calls. 
From fded571c610608e54080b741a63bff0c998bdea6 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Wed, 24 Jul 2024 14:11:22 +0800 Subject: [PATCH 11/12] Add comment as per review --- tests/samplers/test_logprobs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index dadee70a0b5dc..c07c71e38233f 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -63,6 +63,8 @@ def test_get_prompt_logprobs( assert result.outputs[0].logprobs is not None assert len(result.outputs[0].logprobs) == max_tokens for logprobs in result.outputs[0].logprobs: + # If the output token is not included in the top X + # logprob, it can return 1 more data assert (len(logprobs) == num_top_logprobs or len(logprobs) == num_top_logprobs + 1) output_text = result.outputs[0].text From 1eb5260411babf510ff6e3a5dfc71e8ff5485a55 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Wed, 24 Jul 2024 16:38:43 +0800 Subject: [PATCH 12/12] Fix the undocumented logprob=True usage --- vllm/sampling_params.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 51bde197dd8d0..2598325439ebf 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -169,8 +169,8 @@ def __init__( self.ignore_eos = ignore_eos self.max_tokens = max_tokens self.min_tokens = min_tokens - self.logprobs = logprobs - self.prompt_logprobs = prompt_logprobs + self.logprobs = 1 if logprobs is True else logprobs + self.prompt_logprobs = 1 if prompt_logprobs is True else prompt_logprobs # NOTE: This parameter is only exposed at the engine level for now. # It is not exposed in the OpenAI API server, as the OpenAI API does # not support returning only a list of token IDs.
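
Taken together, the series gives `SamplingParams.logprobs` the following user-visible behavior: `None` skips the logprob gather and top-k work in the sampler and returns `logprobs=None` and `cumulative_logprob=None` in the output, `0` returns only the sampled token's own logprob, and the previously undocumented `logprobs=True` is normalized to `1`. Below is a minimal sketch of that behavior, assuming a local vLLM build that contains these patches; the model name is only an illustrative choice, any supported model works.

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # assumed example model
    prompts = ["Hello, my name is"]

    # logprobs=None: the sampler skips the logprob computation entirely, and
    # the output carries None instead of dummy values (PATCH 01-08).
    out = llm.generate(prompts, SamplingParams(max_tokens=5, temperature=0.0,
                                               logprobs=None))[0].outputs[0]
    assert out.logprobs is None
    assert out.cumulative_logprob is None

    # logprobs=0: each per-token dict holds at least the sampled token's
    # logprob, while the top-k pass is still skipped (PATCH 05).
    out = llm.generate(prompts, SamplingParams(max_tokens=5, temperature=0.0,
                                               logprobs=0))[0].outputs[0]
    assert out.logprobs is not None
    assert all(len(step) >= 1 for step in out.logprobs)

    # logprobs=True (undocumented usage) is normalized to 1 (PATCH 12).
    assert SamplingParams(logprobs=True).logprobs == 1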