diff --git a/pyproject.toml b/pyproject.toml
index 535a8db2..76b3383b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.3.13"
+version = "1.3.14"
 authors = [
     "Together AI <support@together.ai>"
 ]
diff --git a/src/together/abstract/api_requestor.py b/src/together/abstract/api_requestor.py
index 32c749b7..e4004f3e 100644
--- a/src/together/abstract/api_requestor.py
+++ b/src/together/abstract/api_requestor.py
@@ -78,7 +78,7 @@ def parse_stream_helper(line: bytes) -> str | None:
             line = line[len(b"data: ") :]
         else:
             line = line[len(b"data:") :]
-        if line.strip() == b"[DONE]":
+        if line.strip().upper() == b"[DONE]":
             # return here will cause GeneratorExit exception in urllib3
             # and it will close http connection with TCP Reset
             return None
@@ -620,7 +620,8 @@ def _interpret_response(
         self, result: requests.Response, stream: bool
     ) -> Tuple[TogetherResponse | Iterator[TogetherResponse], bool]:
         """Returns the response(s) and a bool indicating whether it is a stream."""
-        if stream and "text/event-stream" in result.headers.get("Content-Type", ""):
+        content_type = result.headers.get("Content-Type", "")
+        if stream and "text/event-stream" in content_type:
             return (
                 self._interpret_response_line(
                     line, result.status_code, result.headers, stream=True
@@ -628,9 +629,13 @@ def _interpret_response(
                 for line in parse_stream(result.iter_lines())
             ), True
         else:
+            if content_type in ["application/octet-stream", "audio/wav", "audio/mpeg"]:
+                content = result.content
+            else:
+                content = result.content.decode("utf-8")
             return (
                 self._interpret_response_line(
-                    result.content.decode("utf-8"),
+                    content,
                     result.status_code,
                     result.headers,
                     stream=False,
@@ -670,7 +675,7 @@ async def _interpret_async_response(
             )
 
     def _interpret_response_line(
-        self, rbody: str, rcode: int, rheaders: Any, stream: bool
+        self, rbody: str | bytes, rcode: int, rheaders: Any, stream: bool
     ) -> TogetherResponse:
         # HTTP 204 response code does not have any content in the body.
         if rcode == 204:
@@ -684,13 +689,16 @@ def _interpret_response_line(
             )
 
         try:
-            if "text/plain" in rheaders.get("Content-Type", ""):
-                data: Dict[str, Any] = {"message": rbody}
+            content_type = rheaders.get("Content-Type", "")
+            if isinstance(rbody, bytes):
+                data: Dict[str, Any] | bytes = rbody
+            elif "text/plain" in content_type:
+                data = {"message": rbody}
             else:
                 data = json.loads(rbody)
         except (JSONDecodeError, UnicodeDecodeError) as e:
             raise error.APIError(
-                f"Error code: {rcode} -{rbody}",
+                f"Error code: {rcode} -{rbody if isinstance(rbody, str) else rbody.decode()}",
                 http_status=rcode,
                 headers=rheaders,
             ) from e
diff --git a/src/together/client.py b/src/together/client.py
index 91518230..6419581b 100644
--- a/src/together/client.py
+++ b/src/together/client.py
@@ -19,6 +19,7 @@ class Together:
     models: resources.Models
     fine_tuning: resources.FineTuning
     rerank: resources.Rerank
+    audio: resources.Audio
 
     # client options
     client: TogetherClient
@@ -79,6 +80,7 @@ def __init__(
         self.models = resources.Models(self.client)
         self.fine_tuning = resources.FineTuning(self.client)
         self.rerank = resources.Rerank(self.client)
+        self.audio = resources.Audio(self.client)
 
 
 class AsyncTogether:
diff --git a/src/together/resources/__init__.py b/src/together/resources/__init__.py
index e5e85eac..cf4bf3b2 100644
--- a/src/together/resources/__init__.py
+++ b/src/together/resources/__init__.py
@@ -6,6 +6,7 @@
 from together.resources.images import AsyncImages, Images
 from together.resources.models import AsyncModels, Models
 from together.resources.rerank import AsyncRerank, Rerank
+from together.resources.audio import AsyncAudio, Audio
 
 
 __all__ = [
@@ -25,4 +26,6 @@
     "Models",
     "AsyncRerank",
     "Rerank",
+    "AsyncAudio",
+    "Audio",
 ]
diff --git a/src/together/resources/audio/__init__.py b/src/together/resources/audio/__init__.py
new file mode 100644
index 00000000..5703d348
--- /dev/null
+++ b/src/together/resources/audio/__init__.py
@@ -0,0 +1,24 @@
+from functools import cached_property
+
+from together.resources.audio.speech import AsyncSpeech, Speech
+from together.types import (
+    TogetherClient,
+)
+
+
+class Audio:  # Sync entry point for audio resources; currently only `speech`.
+    def __init__(self, client: TogetherClient) -> None:
+        self._client = client  # handed on to each sub-resource
+
+    @cached_property  # built on first access, then stored on the instance
+    def speech(self) -> Speech:
+        return Speech(self._client)
+
+
+class AsyncAudio:  # Async entry point for audio resources; currently only `speech`.
+    def __init__(self, client: TogetherClient) -> None:
+        self._client = client  # handed on to each sub-resource
+
+    @cached_property  # built on first access, then stored on the instance
+    def speech(self) -> AsyncSpeech:
+        return AsyncSpeech(self._client)
diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py
new file mode 100644
index 00000000..da01586d
--- /dev/null
+++ b/src/together/resources/audio/speech.py
@@ -0,0 +1,153 @@
+from __future__ import annotations
+
+from typing import Any, AsyncGenerator, Dict, Iterator, List, Union
+
+from together.abstract import api_requestor
+from together.together_response import TogetherResponse
+from together.types import (
+    AudioSpeechRequest,
+    AudioResponseFormat,
+    AudioLanguage,
+    AudioResponseEncoding,
+    AudioSpeechStreamChunk,
+    AudioSpeechStreamEvent,
+    AudioSpeechStreamResponse,
+    TogetherClient,
+    TogetherRequest,
+)
+
+
+class Speech:
+    def __init__(self, client: TogetherClient) -> None:
+        self._client = client
+
+    def create(
+        self,
+        *,
+        model: str,
+        input: str,
+        voice: str | None = None,
+        response_format: str = "wav",
+        language: str = "en",
+        response_encoding: str = "pcm_f32le",
+        sample_rate: int = 44100,
+        stream: bool = False,
+        **kwargs: Any,
+    ) -> AudioSpeechStreamResponse:
+        """
+        Method to generate audio from input text using a specified model.
+
+        Args:
+            model (str): The name of the model to query.
+            input (str): Input text to generate the audio for.
+            voice (str, optional): Voice used to generate the audio. Defaults to None.
+            response_format (str, optional): The format of audio output.
+                Defaults to "wav".
+            language (str, optional): Language of input text.
+                Defaults to "en".
+            response_encoding (str, optional): Audio encoding of response.
+                Defaults to "pcm_f32le".
+            sample_rate (int, optional): Sampling rate to use for the output audio.
+                Defaults to 44100.
+            stream (bool, optional): If true, output is streamed for several characters at a time.
+                Defaults to False.
+            **kwargs: Extra keyword arguments passed to AudioSpeechRequest.
+
+        Returns:
+            AudioSpeechStreamResponse: Wrapper around the requestor's response.
+
+        Raises:
+            ValueError: If response_format, language or response_encoding is unknown.
+        """
+        requestor = api_requestor.APIRequestor(client=self._client)
+
+        parameter_payload = AudioSpeechRequest(
+            model=model,
+            input=input,
+            voice=voice,
+            response_format=AudioResponseFormat(response_format),
+            language=AudioLanguage(language),
+            response_encoding=AudioResponseEncoding(response_encoding),
+            sample_rate=sample_rate,
+            stream=stream,
+            **kwargs,
+        ).model_dump(exclude_none=True)
+
+        response, _, _ = requestor.request(
+            options=TogetherRequest(
+                method="POST",
+                url="audio/speech",
+                params=parameter_payload,
+            ),
+            stream=stream,
+        )
+
+        return AudioSpeechStreamResponse(response=response)
+
+
+class AsyncSpeech:
+    def __init__(self, client: TogetherClient) -> None:
+        self._client = client
+
+    async def create(
+        self,
+        *,
+        model: str,
+        input: str,
+        voice: str | None = None,
+        response_format: str = "wav",
+        language: str = "en",
+        response_encoding: str = "pcm_f32le",
+        sample_rate: int = 44100,
+        stream: bool = False,
+        **kwargs: Any,
+    ) -> AudioSpeechStreamResponse:
+        """
+        Asynchronously generate audio for the given text with the chosen model.
+
+        Args:
+            model (str): Name of the model to query.
+            input (str): Text to generate the audio for.
+            voice (str, optional): Voice used for the generated audio.
+                Defaults to None.
+            response_format (str, optional): Output audio format.
+                Defaults to "wav".
+            language (str, optional): Language of the input text.
+                Defaults to "en".
+            response_encoding (str, optional): Audio encoding of the response.
+                Defaults to "pcm_f32le".
+            sample_rate (int, optional): Sampling rate of the output audio.
+                Defaults to 44100.
+            stream (bool, optional): If true, output is streamed a few characters at a time.
+                Defaults to False.
+            **kwargs: Extra keyword arguments passed to AudioSpeechRequest.
+
+        Returns:
+            AudioSpeechStreamResponse: Wrapper around the requestor's response.
+
+        Raises:
+            ValueError: If response_format, language or response_encoding is unknown.
+        """
+        api = api_requestor.APIRequestor(client=self._client)
+
+        payload = AudioSpeechRequest(
+            model=model,
+            input=input,
+            voice=voice,
+            response_format=AudioResponseFormat(response_format),
+            language=AudioLanguage(language),
+            response_encoding=AudioResponseEncoding(response_encoding),
+            sample_rate=sample_rate,
+            stream=stream,
+            **kwargs,
+        ).model_dump(exclude_none=True)
+
+        request_options = TogetherRequest(
+            method="POST",
+            url="audio/speech",
+            params=payload,
+        )
+
+        result, _, _ = await api.arequest(options=request_options, stream=stream)
+
+        return AudioSpeechStreamResponse(response=result)
diff --git a/src/together/together_response.py b/src/together/together_response.py
index 463e9f2e..d97255be 100644
--- a/src/together/together_response.py
+++ b/src/together/together_response.py
@@ -8,7 +8,7 @@ class TogetherResponse:
     API Response class. Stores headers and response data.
     """
 
-    def __init__(self, data: Dict[str, Any], headers: Dict[str, Any]):
+    def __init__(self, data: Any, headers: Dict[str, Any]):
         self._headers = headers
         self.data = data
 
diff --git a/src/together/types/__init__.py b/src/together/types/__init__.py
index f6f85083..5768d8de 100644
--- a/src/together/types/__init__.py
+++ b/src/together/types/__init__.py
@@ -42,6 +42,15 @@
     RerankRequest,
     RerankResponse,
 )
+from together.types.audio_speech import (
+    AudioSpeechRequest,
+    AudioResponseFormat,
+    AudioLanguage,
+    AudioResponseEncoding,
+    AudioSpeechStreamChunk,
+    AudioSpeechStreamEvent,
+    AudioSpeechStreamResponse,
+)
 
 __all__ = [
     "TogetherClient",
@@ -77,4 +86,11 @@
     "RerankRequest",
     "RerankResponse",
     "FinetuneTrainingLimits",
+    "AudioSpeechRequest",
+    "AudioResponseFormat",
+    "AudioLanguage",
+    "AudioResponseEncoding",
+    "AudioSpeechStreamChunk",
+    "AudioSpeechStreamEvent",
+    "AudioSpeechStreamResponse",
 ]
diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py
new file mode 100644
index 00000000..fb0cf786
--- /dev/null
+++ b/src/together/types/audio_speech.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from enum import Enum
+from typing import Iterator
+import threading
+
+from pydantic import BaseModel, ConfigDict
+
+from together.together_response import TogetherResponse
+import base64
+
+
+class AudioResponseFormat(str, Enum):  # allowed `response_format` values
+    MP3 = "mp3"
+    WAV = "wav"
+    RAW = "raw"
+
+
+class AudioLanguage(str, Enum):  # allowed `language` codes
+    EN = "en"
+    DE = "de"
+    FR = "fr"
+    ES = "es"
+    HI = "hi"
+    IT = "it"
+    JA = "ja"
+    KO = "ko"
+    NL = "nl"
+    PL = "pl"
+    PT = "pt"
+    RU = "ru"
+    SV = "sv"
+    TR = "tr"
+    ZH = "zh"
+
+
+class AudioResponseEncoding(str, Enum):  # allowed `response_encoding` values
+    PCM_F32LE = "pcm_f32le"
+    PCM_S16LE = "pcm_s16le"
+    PCM_MULAW = "pcm_mulaw"
+    PCM_ALAW = "pcm_alaw"
+
+
+class AudioObjectType(str, Enum):  # value of a stream chunk's `object` field
+    AUDIO_TTS_CHUNK = "audio.tts.chunk"
+
+
+class StreamSentinelType(str, Enum):  # payload that marks the end of a stream
+    DONE = "[DONE]"
+
+
+class AudioSpeechRequest(BaseModel):  # request body for POST audio/speech
+    model: str
+    input: str
+    voice: str | None = None
+    response_format: AudioResponseFormat = AudioResponseFormat.MP3
+    language: AudioLanguage = AudioLanguage.EN
+    response_encoding: AudioResponseEncoding = AudioResponseEncoding.PCM_F32LE
+    sample_rate: int = 44100
+    stream: bool = False
+
+
+class AudioSpeechStreamChunk(BaseModel):  # one streamed piece of audio
+    object: AudioObjectType = AudioObjectType.AUDIO_TTS_CHUNK
+    model: str
+    b64: str  # base64-encoded audio; decoded by stream_to_file
+
+
+class AudioSpeechStreamEvent(BaseModel):  # stream event carrying a chunk
+    data: AudioSpeechStreamChunk
+
+
+class StreamSentinel(BaseModel):  # terminal stream event; data is "[DONE]"
+    data: StreamSentinelType = StreamSentinelType.DONE
+
+
+class AudioSpeechStreamEventResponse(BaseModel):  # chunk event or sentinel
+    response: AudioSpeechStreamEvent | StreamSentinel
+
+
+class AudioSpeechStreamResponse(BaseModel):
+    """Wraps the result of an audio/speech call and can write it to disk."""
+
+    response: TogetherResponse | Iterator[TogetherResponse]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def stream_to_file(self, file_path: str) -> None:
+        """Write the audio held in ``response`` to ``file_path`` in binary mode.
+
+        A single ``TogetherResponse`` has its ``data`` written as-is. An
+        iterator of responses is treated as a stream: each item's ``data`` is
+        parsed as a chunk, its ``b64`` field is base64-decoded and appended,
+        stopping early at a ``StreamSentinel``. Other values write nothing.
+        """
+        payload = self.response
+        if isinstance(payload, TogetherResponse):
+            with open(file_path, "wb") as out:
+                out.write(payload.data)
+            return
+        if not isinstance(payload, Iterator):
+            return
+        with open(file_path, "wb") as out:
+            for item in payload:
+                event = AudioSpeechStreamEventResponse(
+                    response={"data": item.data}
+                ).response
+                if isinstance(event, StreamSentinel):
+                    break
+                out.write(base64.b64decode(event.data.b64))