From 19399b7c2b3681cf19ff87203e16bfbf406cbe66 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Fri, 19 Jul 2024 10:34:55 -0400 Subject: [PATCH] Updates for pydantic serialization / deserialization to rebase on current main, address review comments, and finalize last pieces as well as test fixes --- .pre-commit-config.yaml | 3 + pyproject.toml | 9 +- src/guidellm/backend/base.py | 8 +- src/guidellm/core/distribution.py | 33 ++-- src/guidellm/core/request.py | 40 ++--- src/guidellm/core/result.py | 182 ++++++++++++++------- src/guidellm/core/serializable.py | 13 +- src/guidellm/executor/profile_generator.py | 18 +- src/guidellm/main.py | 8 +- src/guidellm/scheduler/scheduler.py | 5 +- tests/unit/backend/test_openai_backend.py | 1 - tests/unit/core/test_distribution.py | 10 +- tests/unit/core/test_request.py | 32 ++-- 13 files changed, 226 insertions(+), 136 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 16fd4d1..ec7b37e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,6 +28,8 @@ repos: datasets, loguru, numpy, + pydantic, + pyyaml, openai, requests, transformers, @@ -38,4 +40,5 @@ repos: # types types-click, types-requests, + types-PyYAML, ] diff --git a/pyproject.toml b/pyproject.toml index 0a48d13..fcd6d2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ dependencies = [ "loguru", "numpy", "openai", + "pydantic>=2.0.0", + "pyyaml>=6.0.0", "requests", "transformers", ] @@ -48,7 +50,12 @@ dev = [ "pytest-mock~=3.14.0", "ruff~=0.5.2", "tox~=4.16.0", - "types-requests~=2.32.0" + "types-requests~=2.32.0", + + # type-checking + "types-click", + "types-requests", + "types-PyYAML", ] diff --git a/src/guidellm/backend/base.py b/src/guidellm/backend/base.py index 644d28c..031f6a2 100644 --- a/src/guidellm/backend/base.py +++ b/src/guidellm/backend/base.py @@ -91,7 +91,9 @@ def submit(self, request: TextGenerationRequest) -> TextGenerationResult: logger.info(f"Submitting request with prompt: {request.prompt}") - result = TextGenerationResult(TextGenerationRequest(prompt=request.prompt)) + result = TextGenerationResult( + request=TextGenerationRequest(prompt=request.prompt) + ) result.start(request.prompt) for response in self.make_request(request): # GenerativeResponse @@ -99,8 +101,8 @@ def submit(self, request: TextGenerationRequest) -> TextGenerationResult: result.output_token(response.add_token) elif response.type_ == "final": result.end( - response.prompt_token_count, - response.output_token_count, + prompt_token_count=response.prompt_token_count, + output_token_count=response.output_token_count, ) logger.info(f"Request completed with output: {result.output}") diff --git a/src/guidellm/core/distribution.py b/src/guidellm/core/distribution.py index a48291d..cefe39a 100644 --- a/src/guidellm/core/distribution.py +++ b/src/guidellm/core/distribution.py @@ -1,7 +1,8 @@ -from typing import List, Optional, Union +from typing import List, Sequence import numpy as np from loguru import logger +from pydantic import Field from guidellm.core.serializable import Serializable @@ -14,24 +15,12 @@ class Distribution(Serializable): statistical analyses. """ - def __init__(self, **data): - super().__init__(**data) - logger.debug(f"Initialized Distribution with data: {self.data}") + data: Sequence[float] = Field( + default_factory=list, description="The data points of the distribution." + ) - def __str__(self) -> str: - """ - Return a string representation of the Distribution. 
- """ - return ( - f"Distribution(mean={self.mean:.2f}, median={self.median:.2f}, " - f"min={self.min}, max={self.max}, count={len(self.data)})" - ) - - def __repr__(self) -> str: - """ - Return an unambiguous string representation of the Distribution for debugging. - """ - return f"Distribution(data={self.data})" + def __str__(self): + return f"Distribution({self.describe()})" @property def mean(self) -> float: @@ -99,7 +88,7 @@ def percentile(self, percentile: float) -> float: logger.warning("No data points available to calculate percentile.") return 0.0 - percentile_value = np.percentile(self._data, percentile).item() + percentile_value = np.percentile(self.data, percentile).item() logger.debug(f"Calculated {percentile}th percentile: {percentile_value}") return percentile_value @@ -180,15 +169,15 @@ def describe(self) -> dict: logger.debug(f"Generated description: {description}") return description - def add_data(self, new_data: Union[List[int], List[float]]): + def add_data(self, new_data: Sequence[float]): """ Add new data points to the distribution. :param new_data: A list of new numerical data points to add. """ - self.data.extend(new_data) + self.data = list(self.data) + list(new_data) logger.debug(f"Added new data: {new_data}") - def remove_data(self, remove_data: Union[List[int], List[float]]): + def remove_data(self, remove_data: Sequence[float]): """ Remove specified data points from the distribution. :param remove_data: A list of numerical data points to remove. diff --git a/src/guidellm/core/request.py b/src/guidellm/core/request.py index 98df6c7..89abaf7 100644 --- a/src/guidellm/core/request.py +++ b/src/guidellm/core/request.py @@ -1,5 +1,7 @@ import uuid -from typing import Dict, Optional, Any +from typing import Any, Dict, Optional + +from pydantic import Field from guidellm.core.serializable import Serializable @@ -9,24 +11,18 @@ class TextGenerationRequest(Serializable): A class to represent a text generation request for generative AI workloads. """ - id: str - prompt: str - prompt_token_count: Optional[int] - generated_token_count: Optional[int] - params: Dict[str, Any] - - def __init__( - self, - prompt: str, - prompt_token_count: Optional[int] = None, - generated_token_count: Optional[int] = None, - params: Optional[Dict[str, Any]] = None, - id: Optional[str] = None, - ): - super().__init__( - id=str(uuid.uuid4()) if id is None else id, - prompt=prompt, - prompt_token_count=prompt_token_count, - generated_token_count=generated_token_count, - params=params or {}, - ) + id: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="The unique identifier for the request.", + ) + prompt: str = Field(description="The input prompt for the text generation.") + prompt_token_count: Optional[int] = Field( + default=None, description="The number of tokens in the input prompt." + ) + generate_token_count: Optional[int] = Field( + default=None, description="The number of tokens to generate." 
+ ) + params: Dict[str, Any] = Field( + default_factory=dict, + description="The parameters for the text generation request.", + ) diff --git a/src/guidellm/core/result.py b/src/guidellm/core/result.py index ef536dd..7decb62 100644 --- a/src/guidellm/core/result.py +++ b/src/guidellm/core/result.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Union from loguru import logger +from pydantic import Field from guidellm.core.distribution import Distribution from guidellm.core.request import TextGenerationRequest @@ -22,19 +23,43 @@ class TextGenerationResult(Serializable): for generative AI workloads. """ - request: TextGenerationRequest - prompt: str = "" - prompt_word_count: int = 0 - prompt_token_count: int = 0 - output: str = "" - output_word_count: int = 0 - output_token_count: int = 0 - last_time: Optional[float] = None - first_token_set: bool = False - start_time: Optional[float] = None - end_time: Optional[float] = None - first_token_time: Optional[float] = None - decode_times: Distribution = Distribution() + request: TextGenerationRequest = Field( + description="The text generation request used to generate the result." + ) + prompt: str = Field( + default_factory=str, description="The input prompt for the text generation." + ) + prompt_word_count: int = Field( + default=0, description="The number of words in the input prompt." + ) + prompt_token_count: int = Field( + default=0, description="The number of tokens in the input prompt." + ) + output: str = Field( + default_factory=str, description="The generated output for the text generation." + ) + output_word_count: int = Field( + default=0, description="The number of words in the output." + ) + output_token_count: int = Field( + default=0, description="The number of tokens in the output." + ) + last_time: float = Field(default=None, description="The last time recorded.") + first_token_set: bool = Field( + default=False, description="Whether the first token time is set." + ) + start_time: float = Field( + default=None, description="The start time of the text generation." + ) + end_time: float = Field( + default=None, description="The end time of the text generation." + ) + first_token_time: float = Field( + default=None, description="The time taken to decode the first token." + ) + decode_times: Distribution = Field( + default_factory=Distribution, description="The distribution of decode times." + ) def start(self, prompt: str): """ @@ -52,25 +77,6 @@ def start(self, prompt: str): logger.info("Text generation started with prompt: '{}'", prompt) - def _recording_started(self, raise_exception: bool = True) -> bool: - """ - Ensure that the benchmark text generation recording is started. - - We can assume that if the `self._start_time` exist, - then the `start()` has been called. - """ - - if self._start_time is not None: - return True - else: - if raise_exception is True: - raise ValueError( - "start time is not specified. " - "Did you make the `text_generation_benchmark.start()`?" - ) - else: - return False - def output_token(self, token: str): """ Add a token to the output and record the decode time. 
@@ -80,13 +86,13 @@ def output_token(self, token: str): """ current_counter = time() - if not self._first_token_set: + if not self.first_token_set: self.first_token_time = current_counter - self.last_time self.first_token_set = True - logger.debug(f"First token decode time: {self._first_token_time}") + logger.debug(f"First token decode time: {self.first_token_time}") else: decode_time = current_counter - self.last_time - self._decode_times.add_data([decode_time]) + self.decode_times.add_data([decode_time]) logger.debug(f"Token '{token}' decoded in {decode_time} seconds") self.last_time = current_counter @@ -95,6 +101,7 @@ def output_token(self, token: str): def end( self, + output: Optional[str] = None, prompt_token_count: Optional[int] = None, output_token_count: Optional[int] = None, ): @@ -111,12 +118,35 @@ def end( :type output_token_count: Optional[int] """ self.end_time = time() + + if output: + self.output = output + self.output_word_count = len(self.output.split()) - self.output_token_count = output_token_count or self._output_word_count - self.prompt_token_count = prompt_token_count or self._prompt_word_count + self.output_token_count = output_token_count or self.output_word_count + self.prompt_token_count = prompt_token_count or self.prompt_word_count logger.info(f"Text generation ended with output: '{self.output}'") + def _check_recording_started(self, raise_exception: bool = True) -> bool: + """ + Ensure that the benchmark text generation recording is started. + + We can assume that if the `self._start_time` exist, + then the `start()` has been called. + """ + + if self.start_time is not None: + return True + else: + if raise_exception is True: + raise ValueError( + "start time is not specified. " + "Did you make the `text_generation_benchmark.start()`?" + ) + else: + return False + class TextGenerationError(Serializable): """ @@ -124,8 +154,12 @@ class TextGenerationError(Serializable): for generative AI workloads. """ - request: TextGenerationRequest - error: str + request: TextGenerationRequest = Field( + description="The text generation request that resulted in an error." + ) + error: str = Field( + description="The error message that occurred during text generation." + ) def __init__(self, request: TextGenerationRequest, error: Exception): super().__init__(request=request, error=str(error)) @@ -137,10 +171,10 @@ class RequestConcurrencyMeasurement(Serializable): A dataclass to represent the concurrency measurement of a request. """ - time: float - completed: int - errored: int - processing: int + time: float = Field(description="The time of the measurement.") + completed: int = Field(description="The number of completed requests.") + errored: int = Field(description="The number of errored requests.") + processing: int = Field(description="The number of processing requests.") class TextGenerationBenchmark(Serializable): @@ -150,11 +184,20 @@ class TextGenerationBenchmark(Serializable): This is a set of results and errors for a specific mode and rate. """ - mode: str - rate: Optional[float] - results: List[TextGenerationResult] = [] - errors: List[TextGenerationError] = [] - concurrencies: List[RequestConcurrencyMeasurement] = [] + mode: str = Field(description="The generation mode, either 'async' or 'sync'.") + rate: float = Field( + default=None, description="The requested rate of requests per second." + ) + results: List[TextGenerationResult] = Field( + default_factory=list, description="The results of the text generation requests." 
+ ) + errors: List[TextGenerationError] = Field( + default_factory=list, description="The errors of the text generation requests." + ) + concurrencies: List[RequestConcurrencyMeasurement] = Field( + default_factory=list, + description="The concurrency measurements of the requests.", + ) def __iter__(self): """ @@ -162,7 +205,7 @@ def __iter__(self): :return: An iterator over the results. """ - return iter(self._results) + return iter(self.results) @property def request_count(self) -> int: @@ -185,20 +228,45 @@ def error_count(self) -> int: return len(self.errors) @property - def request_rate(self) -> float: + def completed_request_rate(self) -> float: """ Get the rate of requests per second in the result. :return: The rate of requests per second. :rtype: float """ - if not self._results: + if not self.results: return 0.0 else: return self.request_count / ( - self._results[-1].end_time - self._results[0].start_time + self.results[-1].end_time - self.results[0].start_time ) + @property + def overloaded(self) -> bool: + if not self.results or not self.concurrencies: + raise ValueError("No results or concurrencies to check for overload.") + + if self.rate is None or len(self.concurrencies) < 2: + # if rate was not set, sync mode is assumed, + # or we have less than 2 data points, + # then we cannot be overloaded by definition + return False + + if self.completed_request_rate < 0.60 * self.rate: + # if the calculated rate is less than 60% of the requested rate, + # safe to assume the system is overloaded + return True + + # rate comparisons did not give a clear signal, + # let's double check that we aren't overloaded by comparing the + # compute throughput for the benchmark with the latency for the requests. + # overall this means that a relatively flat or decreasing throughput curve + # over time in addition to a growing processing queue is a sign of overload + + # TODO + return False + def request_started(self): """ Record the start of a generation request. @@ -266,8 +334,12 @@ class TextGenerationBenchmarkReport(Serializable): This is a collection of benchmarks for different modes and rates. """ - benchmarks: List[TextGenerationBenchmark] = [] - args: List[Dict[str, Any]] = [] + benchmarks: List[TextGenerationBenchmark] = Field( + default_factory=list, description="The benchmarks of text generation requests." + ) + args: List[Dict[str, Any]] = Field( + default_factory=list, description="The arguments used for the benchmarks." + ) def __iter__(self): return iter(self.benchmarks) @@ -280,7 +352,7 @@ def benchmarks_sorted(self) -> List[TextGenerationBenchmark]: :return: The sorted list of benchmarks. :rtype: List[TextGenerationBenchmark] """ - benchmarks = sorted(self.benchmarks, key=lambda x: x.request_rate) + benchmarks = sorted(self.benchmarks, key=lambda x: x.completed_request_rate) return benchmarks def add_benchmark(self, benchmark: TextGenerationBenchmark): diff --git a/src/guidellm/core/serializable.py b/src/guidellm/core/serializable.py index afa40dd..7749026 100644 --- a/src/guidellm/core/serializable.py +++ b/src/guidellm/core/serializable.py @@ -2,7 +2,7 @@ import yaml from loguru import logger -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict class Serializable(BaseModel): @@ -11,6 +11,13 @@ class Serializable(BaseModel): deserialization. 
""" + model_config = ConfigDict( + extra="forbid", + use_enum_values=True, + validate_assignment=True, + from_attributes=True, + ) + def __init__(self, /, **data: Any) -> None: super().__init__(**data) logger.debug( @@ -27,7 +34,6 @@ def to_yaml(self) -> str: """ logger.debug("Serializing to YAML... {}", self) yaml_str = yaml.dump(self.model_dump()) - logger.debug("Serialized to YAML: {}", yaml_str) return yaml_str @@ -41,7 +47,6 @@ def from_yaml(cls, data: str): """ logger.debug("Deserializing from YAML... {}", data) obj = cls.model_validate(yaml.safe_load(data)) - logger.debug("Deserialized from YAML: {}", obj) return obj @@ -53,7 +58,6 @@ def to_json(self) -> str: """ logger.debug("Serializing to JSON... {}", self) json_str = self.model_dump_json() - logger.debug("Serialized to JSON: {}", json_str) return json_str @@ -67,6 +71,5 @@ def from_json(cls, data: str): """ logger.debug("Deserializing from JSON... {}", data) obj = cls.model_validate_json(data) - logger.debug("Deserialized from JSON: {}", obj) return obj diff --git a/src/guidellm/executor/profile_generator.py b/src/guidellm/executor/profile_generator.py index 8946a25..fbfde65 100644 --- a/src/guidellm/executor/profile_generator.py +++ b/src/guidellm/executor/profile_generator.py @@ -120,9 +120,9 @@ def next_profile( if not last_benchmark.overloaded: last_rate = ( - last_benchmark.args_rate - if last_benchmark.args_rate - else last_benchmark.request_rate + last_benchmark.rate + if last_benchmark.rate + else last_benchmark.completed_request_rate ) return Profile( load_gen_mode=LoadGenerationModes.CONSTANT, @@ -133,14 +133,14 @@ def next_profile( first_benchmark = current_report.benchmarks[0] min_rate = ( - first_benchmark.args_rate - if first_benchmark.args_rate - else first_benchmark.request_rate + first_benchmark.rate + if first_benchmark.rate + else first_benchmark.completed_request_rate ) max_rate = ( - last_benchmark.args_rate - if last_benchmark.args_rate - else last_benchmark.request_rate + last_benchmark.rate + if last_benchmark.rate + else last_benchmark.completed_request_rate ) self._pending_rates = list(numpy.linspace(min_rate, max_rate, 10)) diff --git a/src/guidellm/main.py b/src/guidellm/main.py index 2fb5508..0e7cc55 100644 --- a/src/guidellm/main.py +++ b/src/guidellm/main.py @@ -1,5 +1,3 @@ -import json - import click from guidellm.backend import Backend @@ -125,13 +123,13 @@ def main( def save_report(report: TextGenerationBenchmarkReport, filename: str): - with open(filename, "w") as f: - json.dump(report.to_dict(), f, indent=4) + with open(filename, "w") as file: + file.write(report.to_json()) def print_report(report: TextGenerationBenchmarkReport): for benchmark in report.benchmarks: - print(f"Rate: {benchmark.request_rate}, Results: {benchmark.results}") + print(f"Rate: {benchmark.completed_request_rate}, Results: {benchmark.results}") if __name__ == "__main__": diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py index 51eee11..3c02b27 100644 --- a/src/guidellm/scheduler/scheduler.py +++ b/src/guidellm/scheduler/scheduler.py @@ -49,7 +49,7 @@ def run(self) -> TextGenerationBenchmark: return result def _run_sync(self) -> TextGenerationBenchmark: - result_set = TextGenerationBenchmark(mode=self._load_gen_mode.value, rate=None) + result_set = TextGenerationBenchmark(mode=self._load_gen_mode.value) start_time = time.time() counter = 0 @@ -68,6 +68,9 @@ def _run_sync(self) -> TextGenerationBenchmark: return result_set async def _run_async(self) -> TextGenerationBenchmark: + if 
self._load_gen_rate is None: + raise ValueError("Rate must be specified for asynchronous load generation") + result_set = TextGenerationBenchmark( mode=self._load_gen_mode.value, rate=self._load_gen_rate ) diff --git a/tests/unit/backend/test_openai_backend.py b/tests/unit/backend/test_openai_backend.py index 93a5a29..39e79da 100644 --- a/tests/unit/backend/test_openai_backend.py +++ b/tests/unit/backend/test_openai_backend.py @@ -82,7 +82,6 @@ def test_make_request( backend_service.make_request(request=request), openai_completion_create_patch, ): - total_generative_responses += 1 expected_token: Optional[str] = getattr(patched_completion, "content") or None diff --git a/tests/unit/core/test_distribution.py b/tests/unit/core/test_distribution.py index fac99bf..111b9c7 100644 --- a/tests/unit/core/test_distribution.py +++ b/tests/unit/core/test_distribution.py @@ -46,7 +46,15 @@ def test_distribution_remove_data(): def test_distribution_str(): data = [1, 2, 3, 4, 5] dist = Distribution(data=data) - assert str(dist) == "Distribution(mean=3.00, median=3.00, min=1, max=5, count=5)" + assert str(dist) == ( + "Distribution({'mean': 3.0, 'median': 3.0, " + "'variance': 2.0, 'std_deviation': 1.4142135623730951, " + "'percentile_indices': " + "[10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99], " + "'percentile_values': " + "[1.4, 1.8, 2.2, 2.6, 3.0, 3.4, 3.8, 4.2, 4.6, 4.8, 4.96], " + "'min': 1, 'max': 5, 'range': 4})" + ) @pytest.mark.regression diff --git a/tests/unit/core/test_request.py b/tests/unit/core/test_request.py index fbd239c..327397d 100644 --- a/tests/unit/core/test_request.py +++ b/tests/unit/core/test_request.py @@ -6,10 +6,10 @@ @pytest.mark.smoke def test_text_generation_request_initialization(): prompt = "Generate a story" - request = TextGenerationRequest(prompt) + request = TextGenerationRequest(prompt=prompt) assert request.prompt == prompt assert request.prompt_token_count is None - assert request.generated_token_count is None + assert request.generate_token_count is None assert request.params == {} @@ -17,17 +17,17 @@ def test_text_generation_request_initialization(): def test_text_generation_request_initialization_with_params(): prompt = "Generate a story" prompt_token_count = 50 - generated_token_count = 100 + generate_token_count = 100 params = {"temperature": 0.7} request = TextGenerationRequest( prompt=prompt, prompt_token_count=prompt_token_count, - generated_token_count=generated_token_count, + generate_token_count=generate_token_count, params=params, ) assert request.prompt == prompt assert request.prompt_token_count == prompt_token_count - assert request.generated_token_count == generated_token_count + assert request.generate_token_count == generate_token_count assert request.params == params @@ -35,9 +35,14 @@ def test_text_generation_request_initialization_with_params(): def test_request_json(): prompt = "Generate text" prompt_token_count = 10 - generated_token_count = 50 + generate_token_count = 50 params = {"temperature": 0.7} - request = TextGenerationRequest(prompt=prompt, prompt_token_count=prompt_token_count, generated_token_count=generated_token_count, params=params) + request = TextGenerationRequest( + prompt=prompt, + prompt_token_count=prompt_token_count, + generate_token_count=generate_token_count, + params=params, + ) json_str = request.to_json() assert '"prompt":"Generate text"' in json_str assert '"id":' in json_str @@ -46,7 +51,7 @@ def test_request_json(): assert request.id == request_restored.id assert request_restored.prompt == prompt assert 
request_restored.prompt_token_count == prompt_token_count - assert request_restored.generated_token_count == generated_token_count + assert request_restored.generate_token_count == generate_token_count assert request_restored.params == params @@ -54,9 +59,14 @@ def test_request_json(): def test_request_yaml(): prompt = "Generate text" prompt_token_count = 15 - generated_token_count = 55 + generate_token_count = 55 params = {"temperature": 0.8} - request = TextGenerationRequest(prompt=prompt, prompt_token_count=prompt_token_count, generated_token_count=generated_token_count, params=params) + request = TextGenerationRequest( + prompt=prompt, + prompt_token_count=prompt_token_count, + generate_token_count=generate_token_count, + params=params, + ) yaml_str = request.to_yaml() assert "prompt: Generate text" in yaml_str assert "id:" in yaml_str @@ -65,5 +75,5 @@ def test_request_yaml(): assert request.id == request_restored.id assert request_restored.prompt == prompt assert request_restored.prompt_token_count == prompt_token_count - assert request_restored.generated_token_count == generated_token_count + assert request_restored.generate_token_count == generate_token_count assert request_restored.params == params
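
A minimal sketch of the JSON/YAML round trip this patch enables, for reviewers who want to sanity-check the pydantic-backed models locally. The `guidellm.core.request` import path and the literal values below are assumptions for illustration, mirroring the tests above rather than prescribing an API.

    # Round-trip sketch; assumes this branch is installed and that
    # TextGenerationRequest is importable from guidellm.core.request.
    from guidellm.core.request import TextGenerationRequest

    request = TextGenerationRequest(
        prompt="Generate a story",
        prompt_token_count=50,
        generate_token_count=100,
        params={"temperature": 0.7},
    )

    # JSON round trip via the Serializable base class added in this patch
    json_str = request.to_json()
    restored = TextGenerationRequest.from_json(json_str)
    assert restored.id == request.id
    assert restored.params == {"temperature": 0.7}

    # YAML round trip works the same way (yaml.dump / yaml.safe_load under the hood)
    yaml_str = request.to_yaml()
    assert TextGenerationRequest.from_yaml(yaml_str).prompt == request.prompt

The same to_json/from_json and to_yaml/from_yaml pair applies to the other Serializable subclasses touched here (TextGenerationResult, TextGenerationBenchmark, TextGenerationBenchmarkReport), which is what save_report in main.py now relies on.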