feat✨(audio): add audio api

Signed-off-by: zjzjzjzj1874 <[email protected]>
zjzjzjzj1874 · Mar 28, 2023 · a8976cc · a8976cc
1 parent 2db6e5d
commit a8976cc
Show file tree

Hide file tree

Showing 8 changed files with 265 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 chatgpt
 .idea
-cmd/gptx
+cmd/gptx
+
+cmd/*.mp3
+cmd/*.mp4
diff --git a/README.md b/README.md
@@ -25,10 +25,11 @@ Usage:
   gptx [command]
 
 Available Commands:
+  audio       turn audio into text.
   chat        creates a completion for the chat message
   help        Help about any command
   img         creates an image given a prompt.
-  model       lists the currently available models
+  model       lists the currently available models,
 
 Flags:
   -h, --help      help for gptx
@@ -70,6 +71,16 @@ Total Image: 2
 Url: https://oaidalleapiprodscus.blob.core.windows.net/private/org-FszeU94XqTOxWst1f2mp5LpO/user-qcjpFAv1q7NKNH42MHry25KB/img-r3lAOCz0DSmypxl3X5w3ZWyE.png?st=2023-03-24T05%3A27%3A14Z&se=2023-03-24T07%3A27%3A14Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2023-03-23T22%3A08%3A23Z&ske=2023-03-24T22%3A08%3A23Z&sks=b&skv=2021-08-06&sig=%2BaFB5nW23BeT6XGdrcSS1M2wvWeWbywJnebdp9wdza8%3D
 Url: https://oaidalleapiprodscus.blob.core.windows.net/private/org-FszeU94XqTOxWst1f2mp5LpO/user-qcjpFAv1q7NKNH42MHry25KB/img-r3XgIswuunVwZ6NlwP0NnUAG.png?st=2023-03-24T05%3A27%3A14Z&se=2023-03-24T07%3A27%3A14Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2023-03-23T22%3A08%3A23Z&ske=2023-03-24T22%3A08%3A23Z&sks=b&skv=2021-08-06&sig=nvVZDD3hsaxPtaS9sxyfvwr2x7u0mF4/9cbts8t60I0%3D
 
+```
+### 音频转文字
+
+```Bash
+-f: 待转文件
+-m: gpt模型,默认使用whisper-1
+-l: 语言,参考 https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+./gptx audio trans -f 5.6.mp3 -l en
+
+翻译结果:John, John, you are so dumb. John, John, you are so dumb. John, John, you are so dumb. John, John, you are so dumb.
 ```
 
 ## TODO list

diff --git a/cmd/audio/audio.go b/cmd/audio/audio.go
@@ -0,0 +1,28 @@
+package audio
+
+import (
+	"github.com/spf13/cobra"
+)
+
+const (
+	defaultModel = "whisper-1" // default value of the --model flag
+)
+
+var (
+	file     string // path of the audio file to upload (--file)
+	model    string // model ID sent with the request (--model)
+	prompt   string // optional prompt text sent with the request (--prompt)
+	language string // Supplying the input language in ISO-639-1 format will improve accuracy and latency. link-at:https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+)
+
+// init attaches the translation and transcription subcommands to Cmd.
+func init() {
+	Cmd.AddCommand(transCmd, transcCmd)
+}
+
+// Cmd is the parent "audio" command; it has no Run of its own and only
+// groups the subcommands registered in init.
+var Cmd = &cobra.Command{
+	Use:   "audio",
+	Short: "turn audio into text.",
+}
diff --git a/cmd/audio/transcription.go b/cmd/audio/transcription.go
@@ -0,0 +1,80 @@
+package audio
+
+import (
+	"bytes"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+
+	"github.com/fatih/color"
+	"github.com/spf13/cobra"
+
+	"github.com/zjzjzjzj1874/chatgpt/pkg"
+)
+
+// init registers the flags of the "transc" subcommand. The values are stored
+// in the package-level variables file, model, prompt and language.
+func init() {
+	transcCmd.Flags().StringVarP(&file, "file", "f", "", "The audio file to transcribe, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.")
+	transcCmd.Flags().StringVarP(&model, "model", "m", defaultModel, "ID of the model to use. Only whisper-1 is currently available.")
+	transcCmd.Flags().StringVarP(&prompt, "prompt", "p", "", "An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.")
+	transcCmd.Flags().StringVarP(&language, "language", "l", "", "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.")
+	// A "required together" group with a single member can never be violated,
+	// so --file is marked as required explicitly instead. MarkFlagRequired only
+	// fails for an unknown flag name, which cannot happen for a flag that was
+	// registered a few lines above.
+	_ = transcCmd.MarkFlagRequired("file")
+}
+
+var (
+	// transcCmd implements "audio transc": it uploads the file named by --file
+	// as multipart/form-data to pkg.AUDIO_TRANSCRIPTION_URL, together with the
+	// optional model, prompt and language fields, and prints the Text field of
+	// the decoded response. Every failure is reported in red and aborts the run.
+	transcCmd = &cobra.Command{
+		Use:   "transc",
+		Short: "Transcribes audio into the input language.",
+		Run: func(cmd *cobra.Command, args []string) {
+			if len(file) == 0 {
+				color.Red("%s", "Please input your file")
+				return
+			}
+			fi, err := os.Open(file)
+			if err != nil {
+				color.Red("Open file(%s) failure:%s", file, err.Error())
+				return
+			}
+			// Release the descriptor on every exit path. The file is only read,
+			// so an error from Close carries nothing worth reporting.
+			defer func() { _ = fi.Close() }()
+
+			body := new(bytes.Buffer)
+			writer := multipart.NewWriter(body)
+			part, err := writer.CreateFormFile("file", file)
+			if err != nil {
+				color.Red("CreateFormFile file(%s) failure:%s", file, err.Error())
+				return
+			}
+			_, err = io.Copy(part, fi)
+			if err != nil {
+				color.Red("Copy file(%s) failure:%s", file, err.Error())
+				return
+			}
+
+			// Optional form fields are sent only when they carry a value.
+			fields := [][2]string{
+				{"model", model},
+				{"prompt", prompt},
+				{"language", language},
+			}
+			for _, field := range fields {
+				if len(field[1]) == 0 {
+					continue
+				}
+				if err = writer.WriteField(field[0], field[1]); err != nil {
+					color.Red("WriteField(%s) failure:%s", field[0], err.Error())
+					return
+				}
+			}
+			// Close writes the terminating boundary; a failure here would leave
+			// the request body malformed, so it must not be ignored.
+			if err = writer.Close(); err != nil {
+				color.Red("Close multipart writer failure:%s", err.Error())
+				return
+			}
+
+			var (
+				resp pkg.AudioTranslationResponse
+			)
+
+			client, err := pkg.NewClient(pkg.WithMethod(http.MethodPost), pkg.WithContentType(writer.FormDataContentType()), pkg.WithUrl(pkg.AUDIO_TRANSCRIPTION_URL), pkg.WithBody(body))
+			if err != nil {
+				color.Red("New Client Err:%s", err.Error())
+				return
+			}
+
+			err = client.Send(&resp)
+			if err != nil {
+				color.Red("Send Err:%s", err.Error())
+				return
+			}
+
+			color.Cyan("转录结果:%s", resp.Text)
+		},
+	}
+)
diff --git a/cmd/audio/translation.go b/cmd/audio/translation.go
@@ -0,0 +1,80 @@
+package audio
+
+import (
+	"bytes"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+
+	"github.com/fatih/color"
+	"github.com/spf13/cobra"
+
+	"github.com/zjzjzjzj1874/chatgpt/pkg"
+)
+
+// init registers the flags of the "trans" subcommand. The values are stored
+// in the package-level variables file, model, prompt and language.
+func init() {
+	transCmd.Flags().StringVarP(&file, "file", "f", "", "The audio file to translate, in one of these formats: mp3, mp4, mpeg, mpga, m4a, wav, or webm.")
+	transCmd.Flags().StringVarP(&model, "model", "m", defaultModel, "ID of the model to use. Only whisper-1 is currently available.")
+	transCmd.Flags().StringVarP(&prompt, "prompt", "p", "", "An optional text to guide the model's style or continue a previous audio segment. The prompt should be in English.")
+	transCmd.Flags().StringVarP(&language, "language", "l", "", "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.")
+	// A "required together" group with a single member can never be violated,
+	// so --file is marked as required explicitly instead. MarkFlagRequired only
+	// fails for an unknown flag name, which cannot happen for a flag that was
+	// registered a few lines above.
+	_ = transCmd.MarkFlagRequired("file")
+}
+
+var (
+	// transCmd implements "audio trans": it uploads the file named by --file as
+	// multipart/form-data to pkg.AUDIO_TRANSLATION_URL, together with the
+	// optional model, prompt and language fields, and prints the Text field of
+	// the decoded response. Every failure is reported in red and aborts the run.
+	transCmd = &cobra.Command{
+		Use:   "trans",
+		Short: "Translates audio into text.",
+		Run: func(cmd *cobra.Command, args []string) {
+			if len(file) == 0 {
+				color.Red("%s", "Please input your file")
+				return
+			}
+			fi, err := os.Open(file)
+			if err != nil {
+				color.Red("Open file(%s) failure:%s", file, err.Error())
+				return
+			}
+			// Release the descriptor on every exit path. The file is only read,
+			// so an error from Close carries nothing worth reporting.
+			defer func() { _ = fi.Close() }()
+
+			body := new(bytes.Buffer)
+			writer := multipart.NewWriter(body)
+			part, err := writer.CreateFormFile("file", file)
+			if err != nil {
+				color.Red("CreateFormFile file(%s) failure:%s", file, err.Error())
+				return
+			}
+			_, err = io.Copy(part, fi)
+			if err != nil {
+				color.Red("Copy file(%s) failure:%s", file, err.Error())
+				return
+			}
+
+			// Optional form fields are sent only when they carry a value.
+			fields := [][2]string{
+				{"model", model},
+				{"prompt", prompt},
+				{"language", language},
+			}
+			for _, field := range fields {
+				if len(field[1]) == 0 {
+					continue
+				}
+				if err = writer.WriteField(field[0], field[1]); err != nil {
+					color.Red("WriteField(%s) failure:%s", field[0], err.Error())
+					return
+				}
+			}
+			// Close writes the terminating boundary; a failure here would leave
+			// the request body malformed, so it must not be ignored.
+			if err = writer.Close(); err != nil {
+				color.Red("Close multipart writer failure:%s", err.Error())
+				return
+			}
+
+			var (
+				resp pkg.AudioTranslationResponse
+			)
+
+			client, err := pkg.NewClient(pkg.WithMethod(http.MethodPost), pkg.WithContentType(writer.FormDataContentType()), pkg.WithUrl(pkg.AUDIO_TRANSLATION_URL), pkg.WithBody(body))
+			if err != nil {
+				color.Red("New Client Err:%s", err.Error())
+				return
+			}
+
+			err = client.Send(&resp)
+			if err != nil {
+				color.Red("Send Err:%s", err.Error())
+				return
+			}
+
+			color.Cyan("翻译结果:%s", resp.Text)
+		},
+	}
+)
diff --git a/cmd/main.go b/cmd/main.go
@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"github.com/fatih/color"
+	"github.com/zjzjzjzj1874/chatgpt/cmd/audio"
 	"github.com/zjzjzjzj1874/chatgpt/cmd/image"
 	"os"
 
@@ -24,6 +25,7 @@ func init() {
 	rootCmd.AddCommand(chat.Cmd)
 	rootCmd.AddCommand(model.Cmd)
 	rootCmd.AddCommand(image.Cmd)
+	rootCmd.AddCommand(audio.Cmd)
 	rootCmd.CompletionOptions.DisableDefaultCmd = true
 }
 

diff --git a/pkg/client.go b/pkg/client.go
@@ -19,6 +19,7 @@ type Client struct {
 	method           string      // 请求方法
 	url              string      // 请求url
 	body             interface{} // 请求body
+	contentType      string      // 类型
 }
 
 type Option func(client *Client)
@@ -29,6 +30,12 @@ func WithPrompt(prompt string) Option {
 	}
 }
 
+// WithContentType returns an Option that stores contentType on the Client.
+func WithContentType(contentType string) Option {
+	return func(cli *Client) { cli.contentType = contentType }
+}
+
 func WithMethod(method string) Option {
 	return func(c *Client) {
 		c.method = method
@@ -58,6 +65,15 @@ func (c *Client) PreNewClient() {
 	if c.clientTimeoutSec <= 0 {
 		c.clientTimeoutSec = default_timeout
 	}
+	if c.contentType == "" {
+	}
+}
+
+// PostClient applies the settings that are handled after the underlying
+// client has been built: a non-empty contentType is set as the client's
+// common Content-Type.
+func (c *Client) PostClient() {
+	if c.contentType == "" {
+		return
+	}
+	c.Client = c.Client.SetCommonContentType(c.contentType)
 }
 
 func NewClient(opts ...Option) (client *Client, err error) {
@@ -76,6 +92,7 @@ func NewClient(opts ...Option) (client *Client, err error) {
 		SetTimeout(time.Duration(client.clientTimeoutSec) * time.Second).
 		SetCommonBearerAuthToken(key).
 		SetCommonContentType("application/json; charset=utf-8")
+	client.PostClient()
 	return
 }
 
@@ -85,7 +102,15 @@ func (c *Client) Send(src interface{}) (err error) {
 	if c.body != nil {
 		request = request.SetBody(c.body)
 	}
+	respErr := ResponseErr{}
+	resp, err := request.SetSuccessResult(src).SetErrorResult(&respErr).Send(c.method, c.url)
+	if resp.IsErrorState() {
+		return respErr.Error
+	}
 
-	_, err = request.SetSuccessResult(src).Send(c.method, c.url)
+	// TODO add a debug var to print below info
+	//color.Cyan("Resp:%v", src)
+	//res, _ := json.Marshal(src)
+	//color.Cyan("Total Res:%v", string(res))
 	return
 }
diff --git a/pkg/gpt.go b/pkg/gpt.go
@@ -13,8 +13,28 @@ const (
 	GPT_URL        = "https://api.openai.com/v1/chat/completions"   // POST&GET:和gpt进行聊天
 	MODEL_URL      = "https://api.openai.com/v1/models"             // GET:请求模型列表
 	IMG_CREATE_URL = "https://api.openai.com/v1/images/generations" // POST:图片生成
+
+	AUDIO_TRANSLATION_URL   = "https://api.openai.com/v1/audio/translations"   // POST: audio translation (was a copy of the transcription URL)
+	AUDIO_TRANSCRIPTION_URL = "https://api.openai.com/v1/audio/transcriptions" // POST: audio transcription
 )
 
+// ResponseErr wraps the "error" object of an error response; RespErr holds that object's fields.
+type (
+	ResponseErr struct {
+		Error RespErr `json:"error"`
+	}
+	RespErr struct {
+		Message string      `json:"message"`
+		Type    string      `json:"type"`
+		Param   interface{} `json:"param"`
+		Code    interface{} `json:"code"`
+	}
+)
+
+// Error implements the error interface by returning the Message field.
+func (r RespErr) Error() string { return r.Message }
+
 type Text2Cmd struct {
 	Model            string   `json:"model"`
 	Prompt           string   `json:"prompt"`
@@ -101,3 +121,16 @@ type (
 		URL string `json:"url"`
 	}
 )
+
+// AudioTranslationRequest lists the parameters of an audio request.
+type AudioTranslationRequest struct {
+	File     string `json:"file"`
+	Model    string `json:"model"`
+	Prompt   string `json:"prompt"`
+	Language string `json:"language" description:"The language of the input audio"`
+}
+
+// AudioTranslationResponse carries the "text" field decoded from a response.
+type AudioTranslationResponse struct {
+	Text string `json:"text"`
+}