feat: integrate cortex.audio engine #860

Draft · wants to merge 1 commit into base: dev
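This draft integrates a cortex.audio engine into cortex-cpp: it extends BaseModel and EngineI with CreateTranscription/CreateTranslation, registers OpenAI-style /v1/audio/transcriptions and /v1/audio/translations routes, and relaxes engine unloading so the llamacpp and audio engines can stay loaded together. As a rough illustration of how a client might call the new route, here is a hedged sketch using drogon's HTTP client; the host, port, model name, and file path are placeholders, and it assumes setParameter() values are sent as multipart form fields alongside the upload, which is what the server's MultiPartParser expects.

    // Hedged client-side sketch (not part of this PR). Assumes a server on
    // localhost:3928 with the audio engine loaded; "whisper" is a placeholder
    // model name.
    #include <drogon/drogon.h>
    #include <iostream>

    int main() {
      auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");
      drogon::UploadFile audio("/tmp/sample.wav");  // exactly one file is required
      auto req = drogon::HttpRequest::newFileUploadRequest({audio});
      req->setMethod(drogon::Post);
      req->setPath("/v1/audio/transcriptions");
      req->setParameter("model", "whisper");         // required form field
      req->setParameter("language", "en");           // optional, server defaults to "en"
      req->setParameter("response_format", "json");  // optional, server defaults to "json"
      client->sendRequest(
          req, [](drogon::ReqResult result, const drogon::HttpResponsePtr& resp) {
            if (result == drogon::ReqResult::Ok) {
              std::cout << resp->getBody() << std::endl;
            }
            drogon::app().quit();
          });
      drogon::app().run();  // drive the client's event loop
    }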
6 changes: 6 additions & 0 deletions cortex-cpp/common/base.h
@@ -26,6 +26,12 @@ class BaseModel {
virtual void FineTuning(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) = 0;
virtual void CreateTranscription(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) = 0;
virtual void CreateTranslation(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) = 0;
};

class BaseChatCompletion {
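BaseModel picks up two new pure virtual methods here, so every deriving controller must override both or it stops compiling. A minimal sketch of a conforming stub follows; the class name is hypothetical, and the "not supported" response simply mirrors the fallback pattern used in server.cc below.

    // Sketch only: a hypothetical BaseModel subclass satisfying the new pure
    // virtuals by reporting "not supported". Other required overrides are
    // elided; assumes the project's usual includes (common/base.h,
    // utils/cortex_utils.h, drogon).
    class StubModel : public BaseModel {
     public:
      void CreateTranscription(
          const HttpRequestPtr& req,
          std::function<void(const HttpResponsePtr&)>&& callback) override {
        Json::Value res;
        res["message"] = "Method is not supported yet";
        auto resp = cortex_utils::nitroHttpJsonResponse(res);
        resp->setStatusCode(k500InternalServerError);
        callback(resp);
      }

      void CreateTranslation(
          const HttpRequestPtr& req,
          std::function<void(const HttpResponsePtr&)>&& callback) override {
        CreateTranscription(req, std::move(callback));  // same stub response
      }
    };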
213 changes: 186 additions & 27 deletions cortex-cpp/controllers/server.cc
@@ -17,6 +17,7 @@ constexpr static auto kLlamaEngine = "cortex.llamacpp";
constexpr static auto kPythonRuntimeEngine = "cortex.python";
constexpr static auto kOnnxEngine = "cortex.onnx";
constexpr static auto kTensorrtLlmEngine = "cortex.tensorrt-llm";
constexpr static auto kAudioEngine = "cortex.audio";
} // namespace

server::server(){
@@ -156,23 +157,33 @@ void server::GetModels(const HttpRequestPtr& req,
}

LOG_TRACE << "Start to get models";
std::vector<std::string> e_types = {cur_engine_type_};
if (cur_engine_type_ == kLlamaEngine) {
e_types.push_back(kAudioEngine);
} else if (cur_engine_type_ == kAudioEngine) {
e_types.push_back(kLlamaEngine);
}
for (auto const& et : e_types) {
if (IsEngineLoaded(et)) {
auto& en = std::get<EngineI*>(engines_[et].engine);
if (en->IsSupported("GetModels")) {
en->GetModels(
req->getJsonObject(),
[cb = std::move(callback)](Json::Value status, Json::Value res) {
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
status["status_code"].asInt()));
cb(resp);
});
} else {
Json::Value res;
res["message"] = "Method is not supported yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k500InternalServerError);
callback(resp);
LOG_WARN << "Method is not supported yet";
}
}
}

LOG_TRACE << "Done get models";
@@ -257,21 +268,24 @@ void server::LoadModel(const HttpRequestPtr& req,

// We have not loaded engine yet, should load it before using it
if (engines_.find(engine_type) == engines_.end()) {
// We normally use a single engine (llamacpp, onnx, tensorrt-llm), so we
// unload all other engines before loading a new one. The exception: the
// llamacpp and audio engines are allowed to be loaded side by side.
UnloadEngines(engine_type);
auto get_engine_path = [](std::string_view e) {
if (e == kLlamaEngine) {
return cortex_utils::kLlamaLibPath;
} else if (e == kOnnxEngine) {
return cortex_utils::kOnnxLibPath;
} else if (e == kTensorrtLlmEngine) {
return cortex_utils::kTensorrtLlmPath;
} else if (e == kAudioEngine) {
return cortex_utils::kAudioLibPath;
}
return cortex_utils::kLlamaLibPath;
};

try {
if (engine_type == kLlamaEngine || engine_type == kAudioEngine) {
cortex::cpuid::CpuInfo cpu_info;
LOG_INFO << "CPU instruction set: " << cpu_info.to_string();
}
@@ -312,6 +326,96 @@ void server::LoadModel(const HttpRequestPtr& req,
LOG_TRACE << "Done load model";
}

void server::CreateTranscription(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) {
if (!IsEngineLoaded(kAudioEngine)) {
Json::Value res;
res["message"] = "Engine is not loaded yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k409Conflict);
callback(resp);
LOG_WARN << "Engine is not loaded yet";
return;
}

LOG_TRACE << "Start transcript";
SyncQueue q;
auto& en = std::get<EngineI*>(engines_[kAudioEngine].engine);
if (en->IsSupported("CreateTranscription")) {
auto req_body = ToAudioReqBody(req);
if (req_body == nullptr) {
Json::Value json_resp;
json_resp["message"] =
"No model field found or too many files in request body";
auto resp = cortex_utils::nitroHttpJsonResponse(json_resp);
resp->setStatusCode(k400BadRequest);
callback(resp);
return;
}
en->CreateTranscription(req_body,
[&q](Json::Value status, Json::Value res) {
q.push(std::make_pair(status, res));
});
LOG_TRACE << "Wait to transcript";
ProcessNonStreamRes(std::move(callback), q);
} else {
Json::Value res;
res["message"] = "Method is not supported yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k500InternalServerError);
callback(resp);
LOG_WARN << "Method is not supported yet";
}
LOG_TRACE << "Done transcript";
}

void server::CreateTranslation(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) {
// This endpoint receives multipart form data, so getJsonObject() may be
// null; default to the audio engine in that case.
std::string engine_type = kAudioEngine;
if (auto json_body = req->getJsonObject()) {
engine_type = json_body->get("engine", kAudioEngine).asString();
}
if (!IsEngineLoaded(engine_type)) {
Json::Value res;
res["message"] = "Engine is not loaded yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k409Conflict);
callback(resp);
LOG_WARN << "Engine is not loaded yet";
return;
}

LOG_TRACE << "Start translate";
SyncQueue q;
auto& en = std::get<EngineI*>(engines_[engine_type].engine);
if (en->IsSupported("CreateTranscription")) {
auto req_body = ToAudioReqBody(req);
if (req_body == nullptr) {
Json::Value json_resp;
json_resp["message"] =
"No model field found or too many files in request body";
auto resp = cortex_utils::nitroHttpJsonResponse(json_resp);
resp->setStatusCode(k400BadRequest);
callback(resp);
return;
}
en->CreateTranslation(req_body,
[&q](Json::Value status, Json::Value res) {
q.push(std::make_pair(status, res));
});
LOG_TRACE << "Wait to translate";
ProcessNonStreamRes(std::move(callback), q);
} else {
Json::Value res;
res["message"] = "Method is not supported yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k500InternalServerError);
callback(resp);
LOG_WARN << "Method is not supported yet";
}
LOG_TRACE << "Done translate";
}

void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
std::shared_ptr<SyncQueue> q) {
auto err_or_done = std::make_shared<std::atomic_bool>(false);
@@ -359,14 +463,69 @@ bool server::IsEngineLoaded(const std::string& e) {
return engines_.find(e) != engines_.end();
}

void server::UnloadEngines(const std::string& e) {
// We unload all engines except:
// - python engine
// - llama engine in case we'd like to load audio engine
// - audio engine in case we'd like to load llama engine
for (auto it = engines_.begin(); it != engines_.end();) {
if ((it->first == kPythonRuntimeEngine) ||
(e == kLlamaEngine && it->first == kAudioEngine) ||
(e == kAudioEngine && it->first == kLlamaEngine)) {
it++;
} else {
it = engines_.erase(it);
}
}
}

std::shared_ptr<Json::Value> server::ToAudioReqBody(const HttpRequestPtr& req) {
auto req_body = std::make_shared<Json::Value>();
MultiPartParser part_parser;
if (part_parser.parse(req) != 0 || part_parser.getFiles().size() != 1) {
LOG_ERROR << "Must have exactly one file";
return nullptr;
}

auto& file = part_parser.getFiles()[0];
const auto& form_fields = part_parser.getParameters();

if (form_fields.find("model") == form_fields.end()) {
LOG_ERROR << "No model field found in request body";
return nullptr;
}

(*req_body)["model"] = form_fields.at("model");
// Parse all other optional parameters from the request
(*req_body)["language"] = form_fields.find("language") != form_fields.end()
? form_fields.at("language")
: "en";
(*req_body)["prompt"] = form_fields.find("prompt") != form_fields.end()
? form_fields.at("prompt")
: "";
(*req_body)["response_format"] =
form_fields.find("response_format") != form_fields.end()
? form_fields.at("response_format")
: "json";
(*req_body)["temperature"] =
form_fields.find("temperature") != form_fields.end()
? std::stof(form_fields.at("temperature"))
: 0;

// Save input file to temp location
std::string temp_dir =
std::filesystem::temp_directory_path().string() + "/" +
std::to_string(std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count());
// Create the directory
std::filesystem::create_directory(temp_dir);
// Save the file to the directory, with its original name
std::string temp_file_path = temp_dir + "/" + file.getFileName();
file.saveAs(temp_file_path);

(*req_body)["file"] = temp_file_path;
return req_body;
}

} // namespace inferences
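One detail worth flagging in ToAudioReqBody: the upload is staged under a temp directory named by a millisecond timestamp, so two requests landing in the same millisecond would race on the same directory. A standalone sketch of the naming scheme (the collision caveat is an observation, not something the PR addresses):

    // Standalone sketch of the temp-path scheme used by ToAudioReqBody.
    #include <chrono>
    #include <filesystem>
    #include <iostream>
    #include <string>

    int main() {
      using namespace std::chrono;
      auto ms = duration_cast<milliseconds>(
                    system_clock::now().time_since_epoch())
                    .count();
      std::string temp_dir =
          std::filesystem::temp_directory_path().string() + "/" +
          std::to_string(ms);
      std::filesystem::create_directory(temp_dir);
      // The uploaded audio keeps its original name inside this directory;
      // concurrent requests in the same millisecond would share temp_dir.
      std::cout << "staging directory: " << temp_dir << "\n";
    }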
14 changes: 13 additions & 1 deletion cortex-cpp/controllers/server.h
@@ -50,6 +50,8 @@ class server : public drogon::HttpController<server>,
METHOD_ADD(server::ModelStatus, "modelstatus", Post);
METHOD_ADD(server::GetModels, "models", Get);
METHOD_ADD(server::GetEngines, "engines", Get);
METHOD_ADD(server::CreateTranscription, "transcriptions", Post);
METHOD_ADD(server::CreateTranslation, "translations", Post);

// cortex.python API
METHOD_ADD(server::FineTuning, "finetuning", Post);
@@ -58,6 +60,8 @@
ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Post);
ADD_METHOD_TO(server::GetModels, "/v1/models", Get);
ADD_METHOD_TO(server::FineTuning, "/v1/fine_tuning/job", Post);
ADD_METHOD_TO(server::CreateTranscription, "/v1/audio/transcriptions", Post);
ADD_METHOD_TO(server::CreateTranslation, "/v1/audio/translations", Post);

// ADD_METHOD_TO(server::handlePrelight, "/v1/chat/completions", Options);
// NOTE: prelight will be added back when browser support is properly planned
@@ -91,6 +95,12 @@ class server : public drogon::HttpController<server>,
void FineTuning(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) override;
void CreateTranscription(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) override;
void CreateTranslation(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) override;

private:
void ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
@@ -99,7 +109,9 @@
SyncQueue& q);
bool IsEngineLoaded(const std::string& e);

void UnloadEngines(const std::string& e);

std::shared_ptr<Json::Value> ToAudioReqBody(const HttpRequestPtr& req);

private:
struct SyncQueue {
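The transcription and translation handlers block on a SyncQueue until the engine callback pushes a (status, result) pair, which ProcessNonStreamRes then turns into the HTTP response. The queue's definition is truncated in this diff, so the following is a minimal sketch of what such a queue typically looks like, assuming a plain mutex/condition-variable design rather than the PR's actual implementation:

    // Minimal sketch of a synchronized queue like the SyncQueue used above;
    // the real definition is elided in this diff, so this shape is an
    // assumption.
    #include <condition_variable>
    #include <mutex>
    #include <queue>
    #include <utility>
    #include <json/json.h>

    struct SyncQueueSketch {
      using Item = std::pair<Json::Value, Json::Value>;  // (status, result)

      void push(Item&& item) {
        {
          std::lock_guard<std::mutex> lk(mtx_);
          q_.push(std::move(item));
        }
        cv_.notify_one();
      }

      // Blocks until the engine callback pushes a result.
      Item wait_and_pop() {
        std::unique_lock<std::mutex> lk(mtx_);
        cv_.wait(lk, [this] { return !q_.empty(); });
        Item item = std::move(q_.front());
        q_.pop();
        return item;
      }

     private:
      std::mutex mtx_;
      std::condition_variable cv_;
      std::queue<Item> q_;
    };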
9 changes: 9 additions & 0 deletions cortex-cpp/cortex-common/EngineI.h
@@ -34,4 +34,13 @@ class EngineI {
virtual void GetModels(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

// cortex.audio interface
virtual void CreateTranscription(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

virtual void CreateTranslation(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
};
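On the engine side, cortex.audio (not included in this diff) is expected to implement these entry points and report a status_code field, which server.cc maps onto the HTTP status. A hedged sketch of the shape such an implementation might take; the body is illustrative, not the real engine:

    // Illustrative only: how an engine might satisfy the new audio interface.
    // The real cortex.audio implementation is not part of this diff.
    #include <functional>
    #include <memory>
    #include <string>
    #include <json/json.h>

    class AudioEngineSketch {
     public:
      void CreateTranscription(
          std::shared_ptr<Json::Value> jsonBody,
          std::function<void(Json::Value&&, Json::Value&&)>&& callback) {
        const std::string model = (*jsonBody)["model"].asString();
        const std::string file = (*jsonBody)["file"].asString();  // temp path set by the server

        Json::Value status;
        status["status_code"] = 200;  // server.cc forwards this as the HTTP status

        Json::Value result;
        result["text"] = "<transcription of " + file + " using " + model + ">";
        callback(std::move(status), std::move(result));
      }
    };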
1 change: 1 addition & 0 deletions cortex-cpp/utils/cortex_utils.h
@@ -29,6 +29,7 @@ constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp";
constexpr static auto kPythonRuntimeLibPath = "/engines/cortex.python";
constexpr static auto kOnnxLibPath = "/engines/cortex.onnx";
constexpr static auto kTensorrtLlmPath = "/engines/cortex.tensorrt-llm";
constexpr static auto kAudioLibPath = "/engines/cortex.audio";

inline std::string models_folder = "./models";
