feat: integrate cortex.audio engine #860

Draft · wants to merge 1 commit into base: dev
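This draft integrates a cortex.audio engine into cortex-cpp: it extends BaseModel and EngineI with CreateTranscription/CreateTranslation, registers OpenAI-style /v1/audio/transcriptions and /v1/audio/translations routes, and relaxes engine unloading so the llamacpp and audio engines can stay loaded together. As a rough illustration of how a client might call the new route, here is a hedged sketch using drogon's HTTP client; the host, port, model name, and file path are placeholders, and it assumes setParameter() values are sent as multipart form fields alongside the upload, which is what the server's MultiPartParser expects.

    // Hedged client-side sketch (not part of this PR). Assumes a server on
    // localhost:3928 with the audio engine loaded; "whisper" is a placeholder
    // model name.
    #include <drogon/drogon.h>
    #include <iostream>

    int main() {
      auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");
      drogon::UploadFile audio("/tmp/sample.wav");  // exactly one file is required
      auto req = drogon::HttpRequest::newFileUploadRequest({audio});
      req->setMethod(drogon::Post);
      req->setPath("/v1/audio/transcriptions");
      req->setParameter("model", "whisper");         // required form field
      req->setParameter("language", "en");           // optional, server defaults to "en"
      req->setParameter("response_format", "json");  // optional, server defaults to "json"
      client->sendRequest(
          req, [](drogon::ReqResult result, const drogon::HttpResponsePtr& resp) {
            if (result == drogon::ReqResult::Ok) {
              std::cout << resp->getBody() << std::endl;
            }
            drogon::app().quit();
          });
      drogon::app().run();  // drive the client's event loop
    }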
6 changes: 6 additions & 0 deletions cortex-cpp/common/base.h
@@ -26,6 +26,12 @@ class BaseModel {
virtual void FineTuning(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) = 0;
virtual void CreateTranscription(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) = 0;
virtual void CreateTranslation(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) = 0;
};

class BaseChatCompletion {
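BaseModel picks up two new pure virtual methods here, so every deriving controller must override both or it stops compiling. A minimal sketch of a conforming stub follows; the class name is hypothetical, and the "not supported" response simply mirrors the fallback pattern used in server.cc below.

    // Sketch only: a hypothetical BaseModel subclass satisfying the new pure
    // virtuals by reporting "not supported". Other required overrides are
    // elided; assumes the project's usual includes (common/base.h,
    // utils/cortex_utils.h, drogon).
    class StubModel : public BaseModel {
     public:
      void CreateTranscription(
          const HttpRequestPtr& req,
          std::function<void(const HttpResponsePtr&)>&& callback) override {
        Json::Value res;
        res["message"] = "Method is not supported yet";
        auto resp = cortex_utils::nitroHttpJsonResponse(res);
        resp->setStatusCode(k500InternalServerError);
        callback(resp);
      }

      void CreateTranslation(
          const HttpRequestPtr& req,
          std::function<void(const HttpResponsePtr&)>&& callback) override {
        CreateTranscription(req, std::move(callback));  // same stub response
      }
    };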
213 changes: 186 additions & 27 deletions cortex-cpp/controllers/server.cc
@@ -17,6 +17,7 @@ constexpr static auto kLlamaEngine = "cortex.llamacpp";
constexpr static auto kPythonRuntimeEngine = "cortex.python";
constexpr static auto kOnnxEngine = "cortex.onnx";
constexpr static auto kTensorrtLlmEngine = "cortex.tensorrt-llm";
constexpr static auto kAudioEngine = "cortex.audio";
} // namespace

server::server(){
@@ -156,23 +157,33 @@ void server::GetModels(const HttpRequestPtr& req,
}

LOG_TRACE << "Start to get models";
std::vector<std::string> e_types = {cur_engine_type_};
if (cur_engine_type_ == kLlamaEngine) {
e_types.push_back(kAudioEngine);
} else if (cur_engine_type_ == kAudioEngine) {
e_types.push_back(kLlamaEngine);
}
for (auto const& et : e_types) {
if (IsEngineLoaded(et)) {
auto& en = std::get<EngineI*>(engines_[et].engine);
if (en->IsSupported("GetModels")) {
en->GetModels(
req->getJsonObject(),
[cb = std::move(callback)](Json::Value status, Json::Value res) {
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
status["status_code"].asInt()));
cb(resp);
});
} else {
Json::Value res;
res["message"] = "Method is not supported yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k500InternalServerError);
callback(resp);
LOG_WARN << "Method is not supported yet";
}
}
}

LOG_TRACE << "Done get models";
@@ -257,21 +268,24 @@ void server::LoadModel(const HttpRequestPtr& req,

// We have not loaded engine yet, should load it before using it
if (engines_.find(engine_type) == engines_.end()) {
// We normally use a single engine (llamacpp, onnx, tensorrt-llm), so we
// unload all other engines before loading a new one. The exception: the
// llamacpp and audio engines are allowed to be loaded side by side.
UnloadEngines(engine_type);
auto get_engine_path = [](std::string_view e) {
if (e == kLlamaEngine) {
return cortex_utils::kLlamaLibPath;
} else if (e == kOnnxEngine) {
return cortex_utils::kOnnxLibPath;
} else if (e == kTensorrtLlmEngine) {
return cortex_utils::kTensorrtLlmPath;
} else if (e == kAudioEngine) {
return cortex_utils::kAudioLibPath;
}
return cortex_utils::kLlamaLibPath;
};

try {
if (engine_type == kLlamaEngine || engine_type == kAudioEngine) {
cortex::cpuid::CpuInfo cpu_info;
LOG_INFO << "CPU instruction set: " << cpu_info.to_string();
}
@@ -312,6 +326,96 @@ void server::LoadModel(const HttpRequestPtr& req,
LOG_TRACE << "Done load model";
}

void server::CreateTranscription(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) {
if (!IsEngineLoaded(kAudioEngine)) {
Json::Value res;
res["message"] = "Engine is not loaded yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k409Conflict);
callback(resp);
LOG_WARN << "Engine is not loaded yet";
return;
}

LOG_TRACE << "Start transcript";
SyncQueue q;
auto& en = std::get<EngineI*>(engines_[kAudioEngine].engine);
if (en->IsSupported("CreateTranscription")) {
auto req_body = ToAudioReqBody(req);
if (req_body == nullptr) {
Json::Value json_resp;
json_resp["message"] =
"No model field found or too many files in request body";
auto resp = cortex_utils::nitroHttpJsonResponse(json_resp);
resp->setStatusCode(k400BadRequest);
callback(resp);
return;
}
en->CreateTranscription(req_body,
[&q](Json::Value status, Json::Value res) {
q.push(std::make_pair(status, res));
});
LOG_TRACE << "Wait to transcript";
ProcessNonStreamRes(std::move(callback), q);
} else {
Json::Value res;
res["message"] = "Method is not supported yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k500InternalServerError);
callback(resp);
LOG_WARN << "Method is not supported yet";
}
LOG_TRACE << "Done transcript";
}

void server::CreateTranslation(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) {
// This endpoint receives multipart form data, so getJsonObject() may be
// null; default to the audio engine in that case.
std::string engine_type = kAudioEngine;
if (auto json_body = req->getJsonObject()) {
engine_type = json_body->get("engine", kAudioEngine).asString();
}
if (!IsEngineLoaded(engine_type)) {
Json::Value res;
res["message"] = "Engine is not loaded yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k409Conflict);
callback(resp);
LOG_WARN << "Engine is not loaded yet";
return;
}

LOG_TRACE << "Start translate";
SyncQueue q;
auto& en = std::get<EngineI*>(engines_[engine_type].engine);
if (en->IsSupported("CreateTranscription")) {
auto req_body = ToAudioReqBody(req);
if (req_body == nullptr) {
Json::Value json_resp;
json_resp["message"] =
"No model field found or too many files in request body";
auto resp = cortex_utils::nitroHttpJsonResponse(json_resp);
resp->setStatusCode(k400BadRequest);
callback(resp);
return;
}
en->CreateTranslation(req_body,
[&q](Json::Value status, Json::Value res) {
q.push(std::make_pair(status, res));
});
LOG_TRACE << "Wait to translate";
ProcessNonStreamRes(std::move(callback), q);
} else {
Json::Value res;
res["message"] = "Method is not supported yet";
auto resp = cortex_utils::nitroHttpJsonResponse(res);
resp->setStatusCode(k500InternalServerError);
callback(resp);
LOG_WARN << "Method is not supported yet";
}
LOG_TRACE << "Done translate";
}

void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
std::shared_ptr<SyncQueue> q) {
auto err_or_done = std::make_shared<std::atomic_bool>(false);
@@ -359,14 +463,69 @@ bool server::IsEngineLoaded(const std::string& e) {
return engines_.find(e) != engines_.end();
}

void server::UnloadEngines(const std::string& e) {
// We unload all engines except:
// - python engine
// - llama engine in case we'd like to load audio engine
// - audio engine in case we'd like to load llama engine
for (auto it = engines_.begin(); it != engines_.end();) {
if ((it->first == kPythonRuntimeEngine) ||
(e == kLlamaEngine && it->first == kAudioEngine) ||
(e == kAudioEngine && it->first == kLlamaEngine)) {
it++;
} else {
it = engines_.erase(it);
}
}
}

std::shared_ptr<Json::Value> server::ToAudioReqBody(const HttpRequestPtr& req) {
auto req_body = std::make_shared<Json::Value>();
MultiPartParser part_parser;
if (part_parser.parse(req) != 0 || part_parser.getFiles().size() != 1) {
LOG_ERROR << "Must have exactly one file";
return nullptr;
}

auto& file = part_parser.getFiles()[0];
const auto& form_fields = part_parser.getParameters();

if (form_fields.find("model") == form_fields.end()) {
LOG_ERROR << "No model field found in request body";
return nullptr;
}

(*req_body)["model"] = form_fields.at("model");
// Parse all other optional parameters from the request
(*req_body)["language"] = form_fields.find("language") != form_fields.end()
? form_fields.at("language")
: "en";
(*req_body)["prompt"] = form_fields.find("prompt") != form_fields.end()
? form_fields.at("prompt")
: "";
(*req_body)["response_format"] =
form_fields.find("response_format") != form_fields.end()
? form_fields.at("response_format")
: "json";
(*req_body)["temperature"] =
form_fields.find("temperature") != form_fields.end()
? std::stof(form_fields.at("temperature"))
: 0;

// Save input file to temp location
std::string temp_dir =
std::filesystem::temp_directory_path().string() + "/" +
std::to_string(std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count());
// Create the directory
std::filesystem::create_directory(temp_dir);
// Save the file to the directory, with its original name
std::string temp_file_path = temp_dir + "/" + file.getFileName();
file.saveAs(temp_file_path);

(*req_body)["file"] = temp_file_path;
return req_body;
}

} // namespace inferences
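One detail worth flagging in ToAudioReqBody: the upload is staged under a temp directory named by a millisecond timestamp, so two requests landing in the same millisecond would race on the same directory. A standalone sketch of the naming scheme (the collision caveat is an observation, not something the PR addresses):

    // Standalone sketch of the temp-path scheme used by ToAudioReqBody.
    #include <chrono>
    #include <filesystem>
    #include <iostream>
    #include <string>

    int main() {
      using namespace std::chrono;
      auto ms = duration_cast<milliseconds>(
                    system_clock::now().time_since_epoch())
                    .count();
      std::string temp_dir =
          std::filesystem::temp_directory_path().string() + "/" +
          std::to_string(ms);
      std::filesystem::create_directory(temp_dir);
      // The uploaded audio keeps its original name inside this directory;
      // concurrent requests in the same millisecond would share temp_dir.
      std::cout << "staging directory: " << temp_dir << "\n";
    }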
14 changes: 13 additions & 1 deletion cortex-cpp/controllers/server.h
@@ -50,6 +50,8 @@ class server : public drogon::HttpController<server>,
METHOD_ADD(server::ModelStatus, "modelstatus", Post);
METHOD_ADD(server::GetModels, "models", Get);
METHOD_ADD(server::GetEngines, "engines", Get);
METHOD_ADD(server::CreateTranscription, "transcriptions", Post);
METHOD_ADD(server::CreateTranslation, "translations", Post);

// cortex.python API
METHOD_ADD(server::FineTuning, "finetuning", Post);
@@ -58,6 +60,8 @@
ADD_METHOD_TO(server::ChatCompletion, "/v1/chat/completions", Post);
ADD_METHOD_TO(server::GetModels, "/v1/models", Get);
ADD_METHOD_TO(server::FineTuning, "/v1/fine_tuning/job", Post);
ADD_METHOD_TO(server::CreateTranscription, "/v1/audio/transcriptions", Post);
ADD_METHOD_TO(server::CreateTranslation, "/v1/audio/translations", Post);

// ADD_METHOD_TO(server::handlePrelight, "/v1/chat/completions", Options);
// NOTE: prelight will be added back when browser support is properly planned
@@ -91,6 +95,12 @@ class server : public drogon::HttpController<server>,
void FineTuning(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) override;
void CreateTranscription(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) override;
void CreateTranslation(
const HttpRequestPtr& req,
std::function<void(const HttpResponsePtr&)>&& callback) override;

private:
void ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
@@ -99,7 +109,9 @@
SyncQueue& q);
bool IsEngineLoaded(const std::string& e);

void UnloadEngines(const std::string& e);

std::shared_ptr<Json::Value> ToAudioReqBody(const HttpRequestPtr& req);

private:
struct SyncQueue {
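The transcription and translation handlers block on a SyncQueue until the engine callback pushes a (status, result) pair, which ProcessNonStreamRes then turns into the HTTP response. The queue's definition is truncated in this diff, so the following is a minimal sketch of what such a queue typically looks like, assuming a plain mutex/condition-variable design rather than the PR's actual implementation:

    // Minimal sketch of a synchronized queue like the SyncQueue used above;
    // the real definition is elided in this diff, so this shape is an
    // assumption.
    #include <condition_variable>
    #include <mutex>
    #include <queue>
    #include <utility>
    #include <json/json.h>

    struct SyncQueueSketch {
      using Item = std::pair<Json::Value, Json::Value>;  // (status, result)

      void push(Item&& item) {
        {
          std::lock_guard<std::mutex> lk(mtx_);
          q_.push(std::move(item));
        }
        cv_.notify_one();
      }

      // Blocks until the engine callback pushes a result.
      Item wait_and_pop() {
        std::unique_lock<std::mutex> lk(mtx_);
        cv_.wait(lk, [this] { return !q_.empty(); });
        Item item = std::move(q_.front());
        q_.pop();
        return item;
      }

     private:
      std::mutex mtx_;
      std::condition_variable cv_;
      std::queue<Item> q_;
    };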
9 changes: 9 additions & 0 deletions cortex-cpp/cortex-common/EngineI.h
@@ -34,4 +34,13 @@ class EngineI {
virtual void GetModels(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

// cortex.audio interface
virtual void CreateTranscription(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

virtual void CreateTranslation(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
};
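On the engine side, cortex.audio (not included in this diff) is expected to implement these entry points and report a status_code field, which server.cc maps onto the HTTP status. A hedged sketch of the shape such an implementation might take; the body is illustrative, not the real engine:

    // Illustrative only: how an engine might satisfy the new audio interface.
    // The real cortex.audio implementation is not part of this diff.
    #include <functional>
    #include <memory>
    #include <string>
    #include <json/json.h>

    class AudioEngineSketch {
     public:
      void CreateTranscription(
          std::shared_ptr<Json::Value> jsonBody,
          std::function<void(Json::Value&&, Json::Value&&)>&& callback) {
        const std::string model = (*jsonBody)["model"].asString();
        const std::string file = (*jsonBody)["file"].asString();  // temp path set by the server

        Json::Value status;
        status["status_code"] = 200;  // server.cc forwards this as the HTTP status

        Json::Value result;
        result["text"] = "<transcription of " + file + " using " + model + ">";
        callback(std::move(status), std::move(result));
      }
    };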
1 change: 1 addition & 0 deletions cortex-cpp/utils/cortex_utils.h
@@ -29,6 +29,7 @@ constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp";
constexpr static auto kPythonRuntimeLibPath = "/engines/cortex.python";
constexpr static auto kOnnxLibPath = "/engines/cortex.onnx";
constexpr static auto kTensorrtLlmPath = "/engines/cortex.tensorrt-llm";
constexpr static auto kAudioLibPath = "/engines/cortex.audio";

inline std::string models_folder = "./models";
