togethercomputer · jdreamerz · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025
diff --git a/speech.mp3 b/speech.mp3
diff --git a/src/together/abstract/api_requestor.py b/src/together/abstract/api_requestor.py
@@ -78,7 +78,7 @@ def parse_stream_helper(line: bytes) -> str | None:
             line = line[len(b"data: ") :]
         else:
             line = line[len(b"data:") :]
-        if line.strip() == b"[DONE]":
+        if line.strip().upper() == b"[DONE]":
             # return here will cause GeneratorExit exception in urllib3
             # and it will close http connection with TCP Reset
             return None
@@ -620,17 +620,22 @@ def _interpret_response(
         self, result: requests.Response, stream: bool
     ) -> Tuple[TogetherResponse | Iterator[TogetherResponse], bool]:
         """Returns the response(s) and a bool indicating whether it is a stream."""
-        if stream and "text/event-stream" in result.headers.get("Content-Type", ""):
+        content_type = result.headers.get("Content-Type", "")
+        if stream and "text/event-stream" in content_type:
             return (
                 self._interpret_response_line(
                     line, result.status_code, result.headers, stream=True
                 )
                 for line in parse_stream(result.iter_lines())
             ), True
         else:
+            if content_type in ["application/octet-stream", "audio/wav", "audio/mpeg"]:
+                content = result.content
+            else:
+                content = result.content.decode("utf-8")
             return (
                 self._interpret_response_line(
-                    result.content.decode("utf-8"),
+                    content,
                     result.status_code,
                     result.headers,
                     stream=False,
@@ -670,7 +675,7 @@ async def _interpret_async_response(
             )
 
     def _interpret_response_line(
-        self, rbody: str, rcode: int, rheaders: Any, stream: bool
+        self, rbody: str | bytes, rcode: int, rheaders: Any, stream: bool
     ) -> TogetherResponse:
         # HTTP 204 response code does not have any content in the body.
         if rcode == 204:
@@ -684,8 +689,11 @@ def _interpret_response_line(
             )
 
         try:
-            if "text/plain" in rheaders.get("Content-Type", ""):
+            content_type = rheaders.get("Content-Type", "")
+            if "text/plain" in content_type:
                 data: Dict[str, Any] = {"message": rbody}
+            elif content_type in ["application/octet-stream", "audio/wav", "audio/mpeg"]:
+                data = rbody
             else:
                 data = json.loads(rbody)
         except (JSONDecodeError, UnicodeDecodeError) as e:

diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py
@@ -150,4 +150,4 @@ async def create(
             stream=stream,
         )
 
-        # return AudioSpeechStreamResponse(response=response)
+        return AudioSpeechStreamResponse(response=response)
diff --git a/src/together/types/audio_speech.py b/src/together/types/audio_speech.py
@@ -65,14 +65,14 @@ class AudioSpeechStreamChunk(BaseModel):
     model: str
     b64: str
 
-
 class AudioSpeechStreamEvent(BaseModel):
     data: AudioSpeechStreamChunk
 
-
 class StreamSentinel(BaseModel):
     data: StreamSentinelType = StreamSentinelType.DONE
 
+class AudioSpeechStreamEventResponse(BaseModel):
+    response: AudioSpeechStreamEvent | StreamSentinel
 
 class AudioSpeechStreamResponse(BaseModel):
 
@@ -92,9 +92,13 @@ def stream_to_file(self, file_path: str) -> None:
             with open(file_path, "wb") as f:
                 for chunk in self.response:
 
-                    data = AudioSpeechStreamChunk(**chunk.data)
+                    # Parse the chunk as either an audio event or the end-of-stream sentinel
+                    stream_event_response = AudioSpeechStreamEventResponse(response={"data": chunk.data})
+
+                    if isinstance(stream_event_response.response, StreamSentinel):
+                        break
 
                     # decode base64
-                    audio = base64.b64decode(data.b64)
+                    audio = base64.b64decode(stream_event_response.response.data.b64)
 
                     f.write(audio)
diff --git a/test.py b/test.py