diff --git a/pyproject.toml b/pyproject.toml index 535a8db2..76b3383b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "together" -version = "1.3.13" +version = "1.3.14" authors = [ "Together AI " ] diff --git a/src/together/abstract/api_requestor.py b/src/together/abstract/api_requestor.py index 32c749b7..e4004f3e 100644 --- a/src/together/abstract/api_requestor.py +++ b/src/together/abstract/api_requestor.py @@ -78,7 +78,7 @@ def parse_stream_helper(line: bytes) -> str | None: line = line[len(b"data: ") :] else: line = line[len(b"data:") :] - if line.strip() == b"[DONE]": + if line.strip().upper() == b"[DONE]": # return here will cause GeneratorExit exception in urllib3 # and it will close http connection with TCP Reset return None @@ -620,7 +620,8 @@ def _interpret_response( self, result: requests.Response, stream: bool ) -> Tuple[TogetherResponse | Iterator[TogetherResponse], bool]: """Returns the response(s) and a bool indicating whether it is a stream.""" - if stream and "text/event-stream" in result.headers.get("Content-Type", ""): + content_type = result.headers.get("Content-Type", "") + if stream and "text/event-stream" in content_type: return ( self._interpret_response_line( line, result.status_code, result.headers, stream=True @@ -628,9 +629,13 @@ def _interpret_response( for line in parse_stream(result.iter_lines()) ), True else: + if content_type in ["application/octet-stream", "audio/wav", "audio/mpeg"]: + content = result.content + else: + content = result.content.decode("utf-8") return ( self._interpret_response_line( - result.content.decode("utf-8"), + content, result.status_code, result.headers, stream=False, @@ -670,7 +675,7 @@ async def _interpret_async_response( ) def _interpret_response_line( - self, rbody: str, rcode: int, rheaders: Any, stream: bool + self, rbody: str | bytes, rcode: int, rheaders: Any, stream: bool ) -> TogetherResponse: # HTTP 204 response code 
does not have any content in the body. if rcode == 204: @@ -684,13 +689,16 @@ def _interpret_response_line( ) try: - if "text/plain" in rheaders.get("Content-Type", ""): - data: Dict[str, Any] = {"message": rbody} + content_type = rheaders.get("Content-Type", "") + if isinstance(rbody, bytes): + data: Dict[str, Any] | bytes = rbody + elif "text/plain" in content_type: + data = {"message": rbody} else: data = json.loads(rbody) except (JSONDecodeError, UnicodeDecodeError) as e: raise error.APIError( - f"Error code: {rcode} -{rbody}", + f"Error code: {rcode} -{rbody if isinstance(rbody, str) else rbody.decode()}", http_status=rcode, headers=rheaders, ) from e diff --git a/src/together/client.py b/src/together/client.py index 91518230..6419581b 100644 --- a/src/together/client.py +++ b/src/together/client.py @@ -19,6 +19,7 @@ class Together: models: resources.Models fine_tuning: resources.FineTuning rerank: resources.Rerank + audio: resources.Audio # client options client: TogetherClient @@ -79,6 +80,7 @@ def __init__( self.models = resources.Models(self.client) self.fine_tuning = resources.FineTuning(self.client) self.rerank = resources.Rerank(self.client) + self.audio = resources.Audio(self.client) class AsyncTogether: diff --git a/src/together/resources/__init__.py b/src/together/resources/__init__.py index e5e85eac..cf4bf3b2 100644 --- a/src/together/resources/__init__.py +++ b/src/together/resources/__init__.py @@ -6,6 +6,7 @@ from together.resources.images import AsyncImages, Images from together.resources.models import AsyncModels, Models from together.resources.rerank import AsyncRerank, Rerank +from together.resources.audio import AsyncAudio, Audio __all__ = [ @@ -25,4 +26,6 @@ "Models", "AsyncRerank", "Rerank", + "AsyncAudio", + "Audio", ] diff --git a/src/together/resources/audio/__init__.py b/src/together/resources/audio/__init__.py new file mode 100644 index 00000000..5703d348 --- /dev/null +++ b/src/together/resources/audio/__init__.py @@ -0,0 +1,24 @@ 
+from functools import cached_property + +from together.resources.audio.speech import AsyncSpeech, Speech +from together.types import ( + TogetherClient, +) + + +class Audio: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + @cached_property + def speech(self) -> Speech: + return Speech(self._client) + + +class AsyncAudio: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + @cached_property + def speech(self) -> AsyncSpeech: + return AsyncSpeech(self._client) diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py new file mode 100644 index 00000000..da01586d --- /dev/null +++ b/src/together/resources/audio/speech.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from typing import Any, AsyncGenerator, Dict, Iterator, List, Union + +from together.abstract import api_requestor +from together.together_response import TogetherResponse +from together.types import ( + AudioSpeechRequest, + AudioResponseFormat, + AudioLanguage, + AudioResponseEncoding, + AudioSpeechStreamChunk, + AudioSpeechStreamEvent, + AudioSpeechStreamResponse, + TogetherClient, + TogetherRequest, +) + + +class Speech: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + def create( + self, + *, + model: str, + input: str, + voice: str | None = None, + response_format: str = "wav", + language: str = "en", + response_encoding: str = "pcm_f32le", + sample_rate: int = 44100, + stream: bool = False, + **kwargs: Any, + ) -> AudioSpeechStreamResponse: + """ + Method to generate audio from input text using a specified model. + + Args: + model (str): The name of the model to query. + input (str): Input text to generate the audio for. + voice (str, optional): The voice to use for generating the audio. + Defaults to None. + response_format (str, optional): The format of audio output. + Defaults to "wav". + language (str, optional): Language of input text. + Defaults to "en". 
+ response_encoding (str, optional): Audio encoding of response. + Defaults to "pcm_f32le". + sample_rate (int, optional): Sampling rate to use for the output audio. + Defaults to 44100. + stream (bool, optional): If true, output is streamed for several characters at a time. + Defaults to False. + + Returns: + AudioSpeechStreamResponse: Wrapper holding the API response (a single response, or an iterator of responses when streaming). + """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + parameter_payload = AudioSpeechRequest( + model=model, + input=input, + voice=voice, + response_format=AudioResponseFormat(response_format), + language=AudioLanguage(language), + response_encoding=AudioResponseEncoding(response_encoding), + sample_rate=sample_rate, + stream=stream, + **kwargs, + ).model_dump(exclude_none=True) + + response, streamed, _ = requestor.request( + options=TogetherRequest( + method="POST", + url="audio/speech", + params=parameter_payload, + ), + stream=stream, + ) + + return AudioSpeechStreamResponse(response=response) + + +class AsyncSpeech: + def __init__(self, client: TogetherClient) -> None: + self._client = client + + async def create( + self, + *, + model: str, + input: str, + voice: str | None = None, + response_format: str = "wav", + language: str = "en", + response_encoding: str = "pcm_f32le", + sample_rate: int = 44100, + stream: bool = False, + **kwargs: Any, + ) -> AudioSpeechStreamResponse: + """ + Async method to generate audio from input text using a specified model. + + Args: + model (str): The name of the model to query. + input (str): Input text to generate the audio for. + voice (str, optional): The voice to use for generating the audio. + Defaults to None. + response_format (str, optional): The format of audio output. + Defaults to "wav". + language (str, optional): Language of input text. + Defaults to "en". + response_encoding (str, optional): Audio encoding of response. + Defaults to "pcm_f32le". 
+ sample_rate (int, optional): Sampling rate to use for the output audio. + Defaults to 44100. + stream (bool, optional): If true, output is streamed for several characters at a time. + Defaults to False. + + Returns: + AudioSpeechStreamResponse: Wrapper holding the response returned by the API request. + """ + + requestor = api_requestor.APIRequestor( + client=self._client, + ) + + parameter_payload = AudioSpeechRequest( + model=model, + input=input, + voice=voice, + response_format=AudioResponseFormat(response_format), + language=AudioLanguage(language), + response_encoding=AudioResponseEncoding(response_encoding), + sample_rate=sample_rate, + stream=stream, + **kwargs, + ).model_dump(exclude_none=True) + + response, _, _ = await requestor.arequest( + options=TogetherRequest( + method="POST", + url="audio/speech", + params=parameter_payload, + ), + stream=stream, + ) + + return AudioSpeechStreamResponse(response=response) diff --git a/src/together/together_response.py b/src/together/together_response.py index 463e9f2e..d97255be 100644 --- a/src/together/together_response.py +++ b/src/together/together_response.py @@ -8,7 +8,7 @@ class TogetherResponse: API Response class. Stores headers and response data. 
""" - def __init__(self, data: Dict[str, Any], headers: Dict[str, Any]): + def __init__(self, data: Any, headers: Dict[str, Any]): self._headers = headers self.data = data diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py index f6f85083..5768d8de 100644 --- a/src/together/types/__init__.py +++ b/src/together/types/__init__.py @@ -42,6 +42,15 @@ RerankRequest, RerankResponse, ) +from together.types.audio_speech import ( + AudioSpeechRequest, + AudioResponseFormat, + AudioLanguage, + AudioResponseEncoding, + AudioSpeechStreamChunk, + AudioSpeechStreamEvent, + AudioSpeechStreamResponse, +) __all__ = [ "TogetherClient", @@ -77,4 +86,11 @@ "RerankRequest", "RerankResponse", "FinetuneTrainingLimits", + "AudioSpeechRequest", + "AudioResponseFormat", + "AudioLanguage", + "AudioResponseEncoding", + "AudioSpeechStreamChunk", + "AudioSpeechStreamEvent", + "AudioSpeechStreamResponse", ] diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py new file mode 100644 index 00000000..fb0cf786 --- /dev/null +++ b/src/together/types/audio_speech.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from enum import Enum +from typing import Iterator +import threading + +from pydantic import BaseModel, ConfigDict + +from together.together_response import TogetherResponse +import base64 + + +class AudioResponseFormat(str, Enum): + MP3 = "mp3" + WAV = "wav" + RAW = "raw" + + +class AudioLanguage(str, Enum): + EN = "en" + DE = "de" + FR = "fr" + ES = "es" + HI = "hi" + IT = "it" + JA = "ja" + KO = "ko" + NL = "nl" + PL = "pl" + PT = "pt" + RU = "ru" + SV = "sv" + TR = "tr" + ZH = "zh" + + +class AudioResponseEncoding(str, Enum): + PCM_F32LE = "pcm_f32le" + PCM_S16LE = "pcm_s16le" + PCM_MULAW = "pcm_mulaw" + PCM_ALAW = "pcm_alaw" + + +class AudioObjectType(str, Enum): + AUDIO_TTS_CHUNK = "audio.tts.chunk" + + +class StreamSentinelType(str, Enum): + DONE = "[DONE]" + + +class AudioSpeechRequest(BaseModel): + model: str + input: 
str + voice: str | None = None + response_format: AudioResponseFormat = AudioResponseFormat.MP3 + language: AudioLanguage = AudioLanguage.EN + response_encoding: AudioResponseEncoding = AudioResponseEncoding.PCM_F32LE + sample_rate: int = 44100 + stream: bool = False + + +class AudioSpeechStreamChunk(BaseModel): + object: AudioObjectType = AudioObjectType.AUDIO_TTS_CHUNK + model: str + b64: str + + +class AudioSpeechStreamEvent(BaseModel): + data: AudioSpeechStreamChunk + + +class StreamSentinel(BaseModel): + data: StreamSentinelType = StreamSentinelType.DONE + + +class AudioSpeechStreamEventResponse(BaseModel): + response: AudioSpeechStreamEvent | StreamSentinel + + +class AudioSpeechStreamResponse(BaseModel): + + response: TogetherResponse | Iterator[TogetherResponse] + + model_config = ConfigDict(arbitrary_types_allowed=True) + + def stream_to_file(self, file_path: str) -> None: + + if isinstance(self.response, TogetherResponse): + # save response to file + with open(file_path, "wb") as f: + f.write(self.response.data) + + elif isinstance(self.response, Iterator): + + with open(file_path, "wb") as f: + for chunk in self.response: + + # Try to parse as stream chunk + stream_event_response = AudioSpeechStreamEventResponse( + response={"data": chunk.data} + ) + + if isinstance(stream_event_response.response, StreamSentinel): + break + + # decode base64 + audio = base64.b64decode(stream_event_response.response.data.b64) + + f.write(audio)