diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9bbe9f3d..19c27189d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
 
-set(SHERPA_ONNX_VERSION "1.9.3")
+set(SHERPA_ONNX_VERSION "1.9.4")
 
 # Disable warning about
 #
@@ -106,10 +106,23 @@
 endif()
 set(CMAKE_CXX_EXTENSIONS OFF)
 message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
+
 include(CheckIncludeFileCXX)
-check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
-if(SHERPA_ONNX_HAS_ALSA)
-  add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
+
+if(UNIX AND NOT APPLE)
+  check_include_file_cxx(alsa/asoundlib.h SHERPA_ONNX_HAS_ALSA)
+  if(SHERPA_ONNX_HAS_ALSA)
+    add_definitions(-DSHERPA_ONNX_ENABLE_ALSA=1)
+  else()
+    message(WARNING "\
+Could not find alsa/asoundlib.h !
+We won't build sherpa-onnx-alsa and sherpa-onnx-offline-tts-play-alsa
+To fix that, please do:
+  (1) sudo apt-get install alsa-utils libasound2-dev
+  (2) rm -rf build
+  (3) re-try
+  ")
+  endif()
 endif()
 
 check_include_file_cxx(cxxabi.h SHERPA_ONNX_HAVE_CXXABI_H)
diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py
index 2981ac047..ed329162d 100644
--- a/cmake/cmake_extension.py
+++ b/cmake/cmake_extension.py
@@ -144,6 +144,8 @@ def build_extension(self, ext: setuptools.extension.Extension):
         binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
         binaries += ["sherpa-onnx-offline-tts"]
         binaries += ["sherpa-onnx-offline-tts-play"]
+        binaries += ["sherpa-onnx-alsa"]
+        binaries += ["sherpa-onnx-offline-tts-play-alsa"]
 
         if is_windows():
             binaries += ["kaldi-native-fbank-core.dll"]
@@ -165,6 +167,11 @@ def build_extension(self, ext: setuptools.extension.Extension):
             src_file = install_dir / "lib" / (f + suffix)
             if not src_file.is_file():
                 src_file = install_dir / ".." / (f + suffix)
+
+            if not src_file.is_file() and 'alsa' in f:
+                print(f'Skipping {f}')
+                continue
+
             print(f"Copying {src_file} to {out_bin_dir}/")
             shutil.copy(f"{src_file}", f"{out_bin_dir}/")
 
diff --git a/setup.py b/setup.py
index 8b7e52bc9..b80ecdd02 100644
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,8 @@ def get_binaries_to_install():
     binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
     binaries += ["sherpa-onnx-offline-tts"]
     binaries += ["sherpa-onnx-offline-tts-play"]
+    binaries += ["sherpa-onnx-alsa"]
+    binaries += ["sherpa-onnx-offline-tts-play-alsa"]
     if is_windows():
         binaries += ["kaldi-native-fbank-core.dll"]
         binaries += ["sherpa-onnx-c-api.dll"]
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index e9717996f..c114e08fb 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -207,14 +207,42 @@ install(
 
 if(SHERPA_ONNX_HAS_ALSA)
   add_executable(sherpa-onnx-alsa sherpa-onnx-alsa.cc alsa.cc)
-  target_link_libraries(sherpa-onnx-alsa sherpa-onnx-core)
+  add_executable(sherpa-onnx-offline-tts-play-alsa sherpa-onnx-offline-tts-play-alsa.cc alsa-play.cc)
 
-  if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
-    target_link_libraries(sherpa-onnx-alsa -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
-  else()
-    target_link_libraries(sherpa-onnx-alsa asound)
+  set(exes
+    sherpa-onnx-alsa
+    sherpa-onnx-offline-tts-play-alsa
+  )
+  foreach(exe IN LISTS exes)
+    target_link_libraries(${exe} sherpa-onnx-core)
+  endforeach()
+
+  foreach(exe IN LISTS exes)
+    if(DEFINED ENV{SHERPA_ONNX_ALSA_LIB_DIR})
+      target_link_libraries(${exe} -L$ENV{SHERPA_ONNX_ALSA_LIB_DIR} -lasound)
+    else()
+      target_link_libraries(${exe} asound)
+    endif()
+  endforeach()
+
+  if(NOT WIN32)
+    foreach(exe IN LISTS exes)
+      target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib")
+      target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../../../sherpa_onnx/lib")
+    endforeach()
+
+    if(SHERPA_ONNX_ENABLE_PYTHON)
+      foreach(exe IN LISTS exes)
+        target_link_libraries(${exe} "-Wl,-rpath,${SHERPA_ONNX_RPATH_ORIGIN}/../lib/python${PYTHON_VERSION}/site-packages/sherpa_onnx/lib")
+      endforeach()
+    endif()
   endif()
-  install(TARGETS sherpa-onnx-alsa DESTINATION bin)
+
+  install(
+    TARGETS ${exes}
+    DESTINATION
+      bin
+  )
 endif()
 
 if(SHERPA_ONNX_ENABLE_PORTAUDIO)
diff --git a/sherpa-onnx/csrc/alsa-play.cc b/sherpa-onnx/csrc/alsa-play.cc
new file mode 100644
index 000000000..5602e389b
--- /dev/null
+++ b/sherpa-onnx/csrc/alsa-play.cc
@@ -0,0 +1,160 @@
+// sherpa-onnx/csrc/alsa-play.cc
+//
+// Copyright (c) 2022-2023 Xiaomi Corporation
+
+#ifdef SHERPA_ONNX_ENABLE_ALSA
+
+#include "sherpa-onnx/csrc/alsa-play.h"
+
+#include <algorithm>
+
+namespace sherpa_onnx {
+
+// Convert a float sample to int16_t. Values outside [-1, 1] are saturated
+// because converting an out-of-range float to an integer is undefined.
+static int16_t ToInt16(float x) {
+  return static_cast<int16_t>(std::max(-1.0f, std::min(1.0f, x)) * 32767);
+}
+
+AlsaPlay::AlsaPlay(const char *device_name, int32_t sample_rate) {
+  int32_t err = snd_pcm_open(&handle_, device_name, SND_PCM_STREAM_PLAYBACK, 0);
+
+  if (err) {
+    fprintf(stderr, "Unable to open: %s. %s\n", device_name, snd_strerror(err));
+    exit(-1);
+  }
+
+  SetParameters(sample_rate);
+}
+
+AlsaPlay::~AlsaPlay() {
+  if (handle_) {
+    int32_t err = snd_pcm_close(handle_);
+    if (err < 0) {
+      printf("Failed to close pcm: %s\n", snd_strerror(err));
+    }
+  }
+}
+
+void AlsaPlay::SetParameters(int32_t sample_rate) {
+  // set the following parameters
+  // 1. sample_rate
+  // 2. sample format: int16_t
+  // 3. num_channels: 1
+  snd_pcm_hw_params_t *params;
+  snd_pcm_hw_params_alloca(&params);
+  snd_pcm_hw_params_any(handle_, params);
+
+  int32_t err = snd_pcm_hw_params_set_access(handle_, params,
+                                             SND_PCM_ACCESS_RW_INTERLEAVED);
+  if (err < 0) {
+    printf("SND_PCM_ACCESS_RW_INTERLEAVED is not supported: %s\n",
+           snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_hw_params_set_format(handle_, params, SND_PCM_FORMAT_S16_LE);
+
+  if (err < 0) {
+    printf("Can't set format to 16-bit: %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  err = snd_pcm_hw_params_set_channels(handle_, params, 1);
+
+  if (err < 0) {
+    printf("Can't set channel number to 1: %s\n", snd_strerror(err));
+  }
+
+  uint32_t rate = sample_rate;
+  err = snd_pcm_hw_params_set_rate_near(handle_, params, &rate, 0);
+  if (err < 0) {
+    printf("Can't set rate to %d. %s\n", rate, snd_strerror(err));
+  }
+
+  err = snd_pcm_hw_params(handle_, params);
+  if (err < 0) {
+    printf("Can't set hardware parameters. %s\n", snd_strerror(err));
+    exit(-1);
+  }
+
+  uint32_t tmp;
+  snd_pcm_hw_params_get_rate(params, &tmp, 0);
+  int32_t actual_sample_rate = tmp;
+  if (actual_sample_rate != sample_rate) {
+    // The device runs at a different rate than requested, so Play() must
+    // resample its input.
+    fprintf(stderr,
+            "Creating a resampler:\n"
+            " in_sample_rate: %d\n"
+            " output_sample_rate: %d\n",
+            sample_rate, actual_sample_rate);
+
+    float min_freq = std::min(actual_sample_rate, sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+
+    int32_t lowpass_filter_width = 6;
+    resampler_ = std::make_unique<LinearResample>(
+        sample_rate, actual_sample_rate, lowpass_cutoff, lowpass_filter_width);
+  }
+
+  // buf_ holds one period; Play() writes to the device in chunks of this size.
+  snd_pcm_uframes_t frames;
+  snd_pcm_hw_params_get_period_size(params, &frames, 0);
+  buf_.resize(frames);
+}
+
+void AlsaPlay::Play(const std::vector<float> &samples) {
+  std::vector<float> tmp;
+  const float *p = samples.data();
+  int32_t num_samples = samples.size();
+  if (resampler_) {
+    resampler_->Resample(samples.data(), samples.size(), false, &tmp);
+    p = tmp.data();
+    num_samples = tmp.size();
+  }
+
+  int32_t frames = buf_.size();
+  int32_t i = 0;
+  for (; i + frames < num_samples; i += frames) {
+    for (int32_t k = 0; k != frames; ++k) {
+      buf_[k] = ToInt16(p[i + k]);
+    }
+
+    int32_t err = snd_pcm_writei(handle_, buf_.data(), frames);
+    if (err == -EPIPE) {
+      printf("XRUN.\n");
+      snd_pcm_prepare(handle_);
+    } else if (err < 0) {
+      printf("Can't write to PCM device: %s\n", snd_strerror(err));
+      exit(-1);
+    }
+  }
+
+  // Write the remaining samples, which fill at most one period.
+  if (i < num_samples) {
+    for (int32_t k = 0; k + i < num_samples; ++k) {
+      buf_[k] = ToInt16(p[i + k]);
+    }
+
+    int32_t err = snd_pcm_writei(handle_, buf_.data(), num_samples - i);
+    if (err == -EPIPE) {
+      printf("XRUN.\n");
+      snd_pcm_prepare(handle_);
+    } else if (err < 0) {
+      printf("Can't write to PCM device: %s\n", snd_strerror(err));
+      exit(-1);
+    }
+  }
+}
+
+void AlsaPlay::Drain() {
+  int32_t err = snd_pcm_drain(handle_);
+  if (err < 0) {
+    printf("Failed to drain pcm. %s\n", snd_strerror(err));
+  }
+}
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_ENABLE_ALSA
diff --git a/sherpa-onnx/csrc/alsa-play.h b/sherpa-onnx/csrc/alsa-play.h
new file mode 100644
index 000000000..de324ca0c
--- /dev/null
+++ b/sherpa-onnx/csrc/alsa-play.h
@@ -0,0 +1,47 @@
+// sherpa-onnx/csrc/alsa-play.h
+//
+// Copyright (c) 2022-2023 Xiaomi Corporation
+
+#ifndef SHERPA_ONNX_CSRC_ALSA_PLAY_H_
+#define SHERPA_ONNX_CSRC_ALSA_PLAY_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "alsa/asoundlib.h"
+#include "sherpa-onnx/csrc/resample.h"
+
+namespace sherpa_onnx {
+
+// Plays mono float samples through an ALSA playback device as 16-bit PCM.
+class AlsaPlay {
+ public:
+  // Opens device_name for playback. Exits the process if the device cannot
+  // be opened or its access mode, format or hardware parameters cannot be set.
+  AlsaPlay(const char *device_name, int32_t sample_rate);
+  ~AlsaPlay();
+
+  // The destructor closes handle_, so a copy would close it twice.
+  AlsaPlay(const AlsaPlay &) = delete;
+  AlsaPlay &operator=(const AlsaPlay &) = delete;
+
+  // Writes samples (at the sample_rate given to the constructor) to the
+  // device, resampling first if the device runs at a different rate.
+  void Play(const std::vector<float> &samples);
+
+  // wait for all the samples to be played
+  void Drain();
+
+ private:
+  void SetParameters(int32_t sample_rate);
+
+ private:
+  snd_pcm_t *handle_ = nullptr;
+  std::unique_ptr<LinearResample> resampler_;
+  std::vector<int16_t> buf_;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_ALSA_PLAY_H_
diff --git a/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc new
file mode 100644
index 000000000..2cb17bd6f
--- /dev/null
+++ b/sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
@@ -0,0 +1,242 @@
+// sherpa-onnx/csrc/sherpa-onnx-offline-tts-play-alsa.cc
+//
+// Copyright (c) 2022-2023 Xiaomi Corporation
+
+// see https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m.html
+// https://www.alsa-project.org/alsa-doc/alsa-lib/group___p_c_m___h_w___params.html
+
+#include <signal.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>  // NOLINT
+#include <condition_variable>  // NOLINT
+#include <fstream>
+#include <mutex>  // NOLINT
+#include <queue>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "sherpa-onnx/csrc/alsa-play.h"
+#include "sherpa-onnx/csrc/offline-tts.h"
+#include "sherpa-onnx/csrc/parse-options.h"
+#include "sherpa-onnx/csrc/wave-writer.h"
+
+// Signalled when new samples are queued and when generation has finished.
+// It is always used together with g_buffer.mutex.
+static std::condition_variable g_cv;
+
+// Chunks of generated samples waiting to be played. `mutex` protects both
+// `samples` and g_stopped.
+struct Buffer {
+  std::queue<std::vector<float>> samples;
+  std::mutex mutex;
+};
+
+static Buffer g_buffer;
+
+// Set by main() once generation has finished. Guarded by g_buffer.mutex.
+static bool g_stopped = false;
+
+// Set by the SIGINT handler; atomic because it is read by other threads.
+static std::atomic<bool> g_killed{false};
+
+// SIGINT handler: the first Ctrl + C asks the playback thread to stop, a
+// second one terminates the process.
+static void Handler(int32_t /*sig*/) {
+  if (g_killed) {
+    exit(0);
+  }
+
+  g_killed = true;
+  fprintf(stderr, "\nCaught Ctrl + C. Exiting\n");
+}
+
+// Passed to tts.Generate() as the callback; copies the n samples at s into
+// the playback queue and wakes the playback thread.
+static void AudioGeneratedCallback(const float *s, int32_t n) {
+  if (n > 0) {
+    std::lock_guard<std::mutex> lock(g_buffer.mutex);
+    g_buffer.samples.push({s, s + n});
+    g_cv.notify_all();
+  }
+}
+
+// Runs on the playback thread: plays queued chunks until generation has
+// finished and the queue is empty, or until Ctrl + C is pressed.
+static void StartPlayback(const std::string &device_name, int32_t sample_rate) {
+  sherpa_onnx::AlsaPlay alsa(device_name.c_str(), sample_rate);
+
+  std::unique_lock<std::mutex> lock(g_buffer.mutex);
+  while (!g_killed) {
+    if (!g_buffer.samples.empty()) {
+      std::vector<float> chunk = std::move(g_buffer.samples.front());
+      g_buffer.samples.pop();
+
+      // Play without holding the lock so that the generator is never blocked
+      // on audio I/O.
+      lock.unlock();
+      alsa.Play(chunk);
+      lock.lock();
+      continue;
+    }
+
+    if (g_stopped) {
+      break;
+    }
+
+    // Handler() does not notify g_cv, so wake up periodically to poll
+    // g_killed.
+    g_cv.wait_for(lock, std::chrono::milliseconds(100));
+  }
+  lock.unlock();
+
+  if (g_killed) {
+    return;
+  }
+
+  alsa.Drain();
+}
+
+int main(int32_t argc, char *argv[]) {
+  signal(SIGINT, Handler);
+
+  const char *kUsageMessage = R"usage(
+Offline text-to-speech with sherpa-onnx.
+
+It plays the generated audio as the model is processing.
+
+Note that it is alsa so it works only on **Linux**. For instance, you can
+use it on Raspberry Pi.
+
+Usage example:
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
+tar xf vits-piper-en_US-amy-low.tar.bz2
+
+./bin/sherpa-onnx-offline-tts-play-alsa \
+  --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
+  --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
+  --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
+  --output-filename=./generated.wav \
+  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+
+It will generate a file ./generated.wav as specified by --output-filename.
+
+You can find more models at
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
+
+Please see
+https://k2-fsa.github.io/sherpa/onnx/tts/index.html
+for details.
+)usage";
+
+  sherpa_onnx::ParseOptions po(kUsageMessage);
+  std::string device_name = "default";
+  std::string output_filename = "./generated.wav";
+  int32_t sid = 0;
+
+  po.Register("output-filename", &output_filename,
+              "Path to save the generated audio");
+
+  po.Register("device-name", &device_name,
+              "Name of the device to play the generated audio");
+
+  po.Register("sid", &sid,
+              "Speaker ID. Used only for multi-speaker models, e.g., models "
+              "trained using the VCTK dataset. Not used for single-speaker "
+              "models, e.g., models trained using the LJSpeech dataset");
+
+  sherpa_onnx::OfflineTtsConfig config;
+
+  config.Register(&po);
+  po.Read(argc, argv);
+
+  if (po.NumArgs() == 0) {
+    fprintf(stderr, "Error: Please provide the text to generate audio.\n\n");
+    po.PrintUsage();
+    exit(EXIT_FAILURE);
+  }
+
+  if (po.NumArgs() > 1) {
+    fprintf(stderr,
+            "Error: Accept only one positional argument. Please use single "
+            "quotes to wrap your text\n");
+    po.PrintUsage();
+    exit(EXIT_FAILURE);
+  }
+
+  if (!config.Validate()) {
+    fprintf(stderr, "Errors in config!\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (config.max_num_sentences != 1) {
+    fprintf(stderr, "Setting config.max_num_sentences to 1\n");
+    config.max_num_sentences = 1;
+  }
+
+  fprintf(stderr, "Loading the model\n");
+  sherpa_onnx::OfflineTts tts(config);
+
+  fprintf(stderr, "Start the playback thread\n");
+  std::thread playback_thread(StartPlayback, device_name, tts.SampleRate());
+
+  float speed = 1.0;
+
+  fprintf(stderr, "Generating ...\n");
+  const auto begin = std::chrono::steady_clock::now();
+  auto audio = tts.Generate(po.GetArg(1), sid, speed, AudioGeneratedCallback);
+  const auto end = std::chrono::steady_clock::now();
+  {
+    // Publish g_stopped under the queue mutex so that the playback thread
+    // cannot miss it between checking the flag and going to sleep.
+    std::lock_guard<std::mutex> lock(g_buffer.mutex);
+    g_stopped = true;
+  }
+  g_cv.notify_all();
+  fprintf(stderr, "Generating done!\n");
+  if (audio.samples.empty()) {
+    fprintf(
+        stderr,
+        "Error in generating audio. Please read previous error messages.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  float elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::milliseconds>(end - begin)
+          .count() /
+      1000.;
+  float duration = audio.samples.size() / static_cast<float>(audio.sample_rate);
+
+  float rtf = elapsed_seconds / duration;
+  fprintf(stderr, "Elapsed seconds: %.3f s\n", elapsed_seconds);
+  fprintf(stderr, "Audio duration: %.3f s\n", duration);
+  fprintf(stderr, "Real-time factor (RTF): %.3f/%.3f = %.3f\n", elapsed_seconds,
+          duration, rtf);
+
+  bool ok = sherpa_onnx::WriteWave(output_filename, audio.sample_rate,
+                                   audio.samples.data(), audio.samples.size());
+  if (!ok) {
+    fprintf(stderr, "Failed to write wave to %s\n", output_filename.c_str());
+    exit(EXIT_FAILURE);
+  }
+
+  fprintf(stderr, "The text is: %s. Speaker ID: %d\n\n", po.GetArg(1).c_str(),
+          sid);
+  fprintf(stderr, "\n**** Saved to %s successfully! ****\n",
+          output_filename.c_str());
+
+  fprintf(stderr, "\n");
+  fprintf(
+      stderr,
+      "Wait for the playback to finish. You can safely press ctrl + C to stop "
+      "the playback.\n");
+  playback_thread.join();
+
+  fprintf(stderr, "Done!\n");
+
+  return 0;
+}