Commit

audio apis (#238)
* do not merge - audio api init

* Get audio stuff working. (#245)

* Initially getting things working.

* More closely match spec

* Formatting fixes.

* Adjust handling of different types to make linter happy.

* Add type definition

* Decode bytes in ternary

* bump to version 1.3.14

---------

Co-authored-by: jdreamerz <[email protected]>
Co-authored-by: Justin Driemeyer <[email protected]>
3 people authored Jan 27, 2025
1 parent c185015 commit 82dba38
Showing 9 changed files with 325 additions and 9 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "together"
version = "1.3.13"
version = "1.3.14"
authors = [
"Together AI <[email protected]>"
]
22 changes: 15 additions & 7 deletions src/together/abstract/api_requestor.py
@@ -78,7 +78,7 @@ def parse_stream_helper(line: bytes) -> str | None:
line = line[len(b"data: ") :]
else:
line = line[len(b"data:") :]
-if line.strip() == b"[DONE]":
+if line.strip().upper() == b"[DONE]":
# return here will cause GeneratorExit exception in urllib3
# and it will close http connection with TCP Reset
return None
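
For illustration, a tiny standalone sketch of the behavior this hunk changes: the SSE terminator check now compares case-insensitively, so a lowercase "[done]" sentinel also ends the stream. This is not the SDK's own helper, just a minimal rephrasing of the new condition.

def is_stream_done(line: bytes) -> bool:
    # Mirrors the updated check: strip surrounding whitespace, then compare case-insensitively.
    return line.strip().upper() == b"[DONE]"

assert is_stream_done(b"[DONE]\n")
assert is_stream_done(b"[done]")  # accepted only after this change
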
@@ -620,17 +620,22 @@ def _interpret_response(
self, result: requests.Response, stream: bool
) -> Tuple[TogetherResponse | Iterator[TogetherResponse], bool]:
"""Returns the response(s) and a bool indicating whether it is a stream."""
-if stream and "text/event-stream" in result.headers.get("Content-Type", ""):
+content_type = result.headers.get("Content-Type", "")
+if stream and "text/event-stream" in content_type:
return (
self._interpret_response_line(
line, result.status_code, result.headers, stream=True
)
for line in parse_stream(result.iter_lines())
), True
else:
+if content_type in ["application/octet-stream", "audio/wav", "audio/mpeg"]:
+content = result.content
+else:
+content = result.content.decode("utf-8")
return (
self._interpret_response_line(
-result.content.decode("utf-8"),
+content,
result.status_code,
result.headers,
stream=False,
@@ -670,7 +675,7 @@ async def _interpret_async_response(
)

def _interpret_response_line(
-self, rbody: str, rcode: int, rheaders: Any, stream: bool
+self, rbody: str | bytes, rcode: int, rheaders: Any, stream: bool
) -> TogetherResponse:
# HTTP 204 response code does not have any content in the body.
if rcode == 204:
@@ -684,13 +689,16 @@ def _interpret_response_line(
)

try:
if "text/plain" in rheaders.get("Content-Type", ""):
data: Dict[str, Any] = {"message": rbody}
content_type = rheaders.get("Content-Type", "")
if isinstance(rbody, bytes):
data: Dict[str, Any] | bytes = rbody
elif "text/plain" in content_type:
data = {"message": rbody}
else:
data = json.loads(rbody)
except (JSONDecodeError, UnicodeDecodeError) as e:
raise error.APIError(
f"Error code: {rcode} -{rbody}",
f"Error code: {rcode} -{rbody if isinstance(rbody, str) else rbody.decode()}",
http_status=rcode,
headers=rheaders,
) from e
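
Taken together, the hunks above make the requestor hand binary audio payloads back as raw bytes while still JSON-decoding ordinary API responses. A minimal standalone sketch of that branching, using the same content types as the diff (not the SDK's actual method):

import json
from typing import Any, Dict, Union

_BINARY_CONTENT_TYPES = ("application/octet-stream", "audio/wav", "audio/mpeg")


def interpret_body(raw: bytes, content_type: str) -> Union[Dict[str, Any], bytes]:
    # Audio payloads pass through untouched instead of being decoded as UTF-8.
    if content_type in _BINARY_CONTENT_TYPES:
        return raw
    text = raw.decode("utf-8")
    # Plain-text bodies are wrapped in a message dict; everything else is parsed as JSON.
    if "text/plain" in content_type:
        return {"message": text}
    return json.loads(text)
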
2 changes: 2 additions & 0 deletions src/together/client.py
@@ -19,6 +19,7 @@ class Together:
models: resources.Models
fine_tuning: resources.FineTuning
rerank: resources.Rerank
+audio: resources.Audio

# client options
client: TogetherClient
@@ -79,6 +80,7 @@ def __init__(
self.models = resources.Models(self.client)
self.fine_tuning = resources.FineTuning(self.client)
self.rerank = resources.Rerank(self.client)
+self.audio = resources.Audio(self.client)


class AsyncTogether:
3 changes: 3 additions & 0 deletions src/together/resources/__init__.py
@@ -6,6 +6,7 @@
from together.resources.images import AsyncImages, Images
from together.resources.models import AsyncModels, Models
from together.resources.rerank import AsyncRerank, Rerank
+from together.resources.audio import AsyncAudio, Audio


__all__ = [
@@ -25,4 +26,6 @@
"Models",
"AsyncRerank",
"Rerank",
"AsyncAudio",
"Audio",
]
24 changes: 24 additions & 0 deletions src/together/resources/audio/__init__.py
@@ -0,0 +1,24 @@
from functools import cached_property

from together.resources.audio.speech import AsyncSpeech, Speech
from together.types import (
    TogetherClient,
)


class Audio:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    @cached_property
    def speech(self) -> Speech:
        return Speech(self._client)


class AsyncAudio:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    @cached_property
    def speech(self) -> AsyncSpeech:
        return AsyncSpeech(self._client)
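
Both accessors use functools.cached_property, so the Speech or AsyncSpeech sub-resource is constructed lazily on first access and the same instance is returned afterwards. A minimal, SDK-independent illustration of that pattern:

from functools import cached_property


class LazyHolder:
    @cached_property
    def child(self) -> list:
        # Runs only on the first attribute access; the result is then cached on the instance.
        return []


holder = LazyHolder()
assert holder.child is holder.child  # second access reuses the cached object
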
153 changes: 153 additions & 0 deletions src/together/resources/audio/speech.py
@@ -0,0 +1,153 @@
from __future__ import annotations

from typing import Any, AsyncGenerator, Dict, Iterator, List, Union

from together.abstract import api_requestor
from together.together_response import TogetherResponse
from together.types import (
    AudioSpeechRequest,
    AudioResponseFormat,
    AudioLanguage,
    AudioResponseEncoding,
    AudioSpeechStreamChunk,
    AudioSpeechStreamEvent,
    AudioSpeechStreamResponse,
    TogetherClient,
    TogetherRequest,
)


class Speech:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    def create(
        self,
        *,
        model: str,
        input: str,
        voice: str | None = None,
        response_format: str = "wav",
        language: str = "en",
        response_encoding: str = "pcm_f32le",
        sample_rate: int = 44100,
        stream: bool = False,
        **kwargs: Any,
    ) -> AudioSpeechStreamResponse:
        """
        Method to generate audio from input text using a specified model.

        Args:
            model (str): The name of the model to query.
            input (str): Input text to generate the audio for.
            voice (str, optional): The voice to use for generating the audio.
                Defaults to None.
            response_format (str, optional): The format of audio output.
                Defaults to "wav".
            language (str, optional): Language of input text.
                Defaults to "en".
            response_encoding (str, optional): Audio encoding of response.
                Defaults to "pcm_f32le".
            sample_rate (int, optional): Sampling rate to use for the output audio.
                Defaults to 44100.
            stream (bool, optional): If true, output is streamed for several characters at a time.
                Defaults to False.

        Returns:
            Union[bytes, Iterator[AudioSpeechStreamChunk]]: The generated audio as bytes or an iterator over audio stream chunks.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        parameter_payload = AudioSpeechRequest(
            model=model,
            input=input,
            voice=voice,
            response_format=AudioResponseFormat(response_format),
            language=AudioLanguage(language),
            response_encoding=AudioResponseEncoding(response_encoding),
            sample_rate=sample_rate,
            stream=stream,
            **kwargs,
        ).model_dump(exclude_none=True)

        response, streamed, _ = requestor.request(
            options=TogetherRequest(
                method="POST",
                url="audio/speech",
                params=parameter_payload,
            ),
            stream=stream,
        )

        return AudioSpeechStreamResponse(response=response)


class AsyncSpeech:
    def __init__(self, client: TogetherClient) -> None:
        self._client = client

    async def create(
        self,
        *,
        model: str,
        input: str,
        voice: str | None = None,
        response_format: str = "wav",
        language: str = "en",
        response_encoding: str = "pcm_f32le",
        sample_rate: int = 44100,
        stream: bool = False,
        **kwargs: Any,
    ) -> AudioSpeechStreamResponse:
        """
        Async method to generate audio from input text using a specified model.

        Args:
            model (str): The name of the model to query.
            input (str): Input text to generate the audio for.
            voice (str, optional): The voice to use for generating the audio.
                Defaults to None.
            response_format (str, optional): The format of audio output.
                Defaults to "wav".
            language (str, optional): Language of input text.
                Defaults to "en".
            response_encoding (str, optional): Audio encoding of response.
                Defaults to "pcm_f32le".
            sample_rate (int, optional): Sampling rate to use for the output audio.
                Defaults to 44100.
            stream (bool, optional): If true, output is streamed for several characters at a time.
                Defaults to False.

        Returns:
            Union[bytes, AsyncGenerator[AudioSpeechStreamChunk, None]]: The generated audio as bytes or an async generator over audio stream chunks.
        """

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        parameter_payload = AudioSpeechRequest(
            model=model,
            input=input,
            voice=voice,
            response_format=AudioResponseFormat(response_format),
            language=AudioLanguage(language),
            response_encoding=AudioResponseEncoding(response_encoding),
            sample_rate=sample_rate,
            stream=stream,
            **kwargs,
        ).model_dump(exclude_none=True)

        response, _, _ = await requestor.arequest(
            options=TogetherRequest(
                method="POST",
                url="audio/speech",
                params=parameter_payload,
            ),
            stream=stream,
        )

        return AudioSpeechStreamResponse(response=response)
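
For orientation, a hedged usage sketch of the endpoint this file adds. The model name, the voice value, and the stream_to_file helper on AudioSpeechStreamResponse are assumptions here; the response type is defined in together/types/audio_speech.py, which is not shown in this diff.

from together import Together

client = Together()  # assumes TOGETHER_API_KEY is set in the environment

speech = client.audio.speech.create(
    model="cartesia/sonic",  # assumption: any text-to-speech model served by Together
    input="Hello from the new audio API!",
    voice="default voice",   # assumption: an available voice name for that model
    response_format="wav",
    sample_rate=44100,
)

# Assumption: the stream response exposes a helper for writing the audio to disk;
# if it does not, consume the wrapped TogetherResponse data directly.
speech.stream_to_file("speech.wav")
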
2 changes: 1 addition & 1 deletion src/together/together_response.py
@@ -8,7 +8,7 @@ class TogetherResponse:
API Response class. Stores headers and response data.
"""

-def __init__(self, data: Dict[str, Any], headers: Dict[str, Any]):
+def __init__(self, data: Any, headers: Dict[str, Any]):
self._headers = headers
self.data = data

16 changes: 16 additions & 0 deletions src/together/types/__init__.py
@@ -42,6 +42,15 @@
RerankRequest,
RerankResponse,
)
+from together.types.audio_speech import (
+AudioSpeechRequest,
+AudioResponseFormat,
+AudioLanguage,
+AudioResponseEncoding,
+AudioSpeechStreamChunk,
+AudioSpeechStreamEvent,
+AudioSpeechStreamResponse,
+)

__all__ = [
"TogetherClient",
@@ -77,4 +86,11 @@
"RerankRequest",
"RerankResponse",
"FinetuneTrainingLimits",
"AudioSpeechRequest",
"AudioResponseFormat",
"AudioLanguage",
"AudioResponseEncoding",
"AudioSpeechStreamChunk",
"AudioSpeechStreamEvent",
"AudioSpeechStreamResponse",
]
(1 additional changed file in this commit did not load in this view)
