diff --git a/README.md b/README.md
index 9b4ee1c..ef0a6bb 100644
--- a/README.md
+++ b/README.md
@@ -113,6 +113,18 @@ python -m build --wheel # in this repository to build the wheel. Assumes you hav
 pip install dist/.whl
 ```
 
+### Vulkan support
+
+Thanks to [@thewh1teagle](https://github.com/thewh1teagle)
+
+To build and install, clone the repository and run the following commands:
+
+```shell
+export CMAKE_ARGS="-DGGML_VULKAN=1"
+python -m build --wheel # in this repository to build the wheel. Assumes you have installed build with pip install build
+pip install dist/.whl
+```
+
 Then download and convert the appropriate model using the original `whisper.cpp` repository, producing a `.mlmodelc` directory.
 
 You can now verify if everything's working:
@@ -180,7 +192,7 @@ usage: pwcpp [-h] [-m MODEL] [--version] [--processors PROCESSORS] [-otxt] [-ovt
              [--translate TRANSLATE] [--no_context NO_CONTEXT] [--single_segment SINGLE_SEGMENT] [--print_special PRINT_SPECIAL]
              [--print_progress PRINT_PROGRESS] [--print_realtime PRINT_REALTIME] [--print_timestamps PRINT_TIMESTAMPS]
              [--token_timestamps TOKEN_TIMESTAMPS] [--thold_pt THOLD_PT] [--thold_ptsum THOLD_PTSUM] [--max_len MAX_LEN]
-             [--split_on_word SPLIT_ON_WORD] [--max_tokens MAX_TOKENS] [--speed_up SPEED_UP] [--audio_ctx AUDIO_CTX]
+             [--split_on_word SPLIT_ON_WORD] [--max_tokens MAX_TOKENS] [--audio_ctx AUDIO_CTX]
              [--prompt_tokens PROMPT_TOKENS] [--prompt_n_tokens PROMPT_N_TOKENS] [--language LANGUAGE] [--suppress_blank SUPPRESS_BLANK]
              [--suppress_non_speech_tokens SUPPRESS_NON_SPEECH_TOKENS] [--temperature TEMPERATURE] [--max_initial_ts MAX_INITIAL_TS]
              [--length_penalty LENGTH_PENALTY] [--temperature_inc TEMPERATURE_INC] [--entropy_thold ENTROPY_THOLD]
@@ -234,7 +246,6 @@ options:
                         split on word rather than on token (when used with max_len)
   --max_tokens MAX_TOKENS
                         max tokens per segment (0 = no limit)
-  --speed_up SPEED_UP   speed-up the audio by 2x using Phase Vocoder
   --audio_ctx AUDIO_CTX
                         overwrite the audio context size (0 = use default)
   --prompt_tokens PROMPT_TOKENS
diff --git a/pywhispercpp/constants.py b/pywhispercpp/constants.py
index 00b0c0a..0865399 100644
--- a/pywhispercpp/constants.py
+++ b/pywhispercpp/constants.py
@@ -154,14 +154,6 @@
         'options': None,
         'default': 0
     },
-    # [EXPERIMENTAL] speed-up techniques
-    # note: these can significantly reduce the quality of the output
-    'speed_up': {
-        'type': bool,
-        'description': "speed-up the audio by 2x using Phase Vocoder",
-        'options': None,
-        'default': False
-    },
     'audio_ctx': {
         'type': int,
         'description': "overwrite the audio context size (0 = use default)",
diff --git a/pywhispercpp/model.py b/pywhispercpp/model.py
index 4867fab..882b7e3 100644
--- a/pywhispercpp/model.py
+++ b/pywhispercpp/model.py
@@ -52,7 +52,7 @@ class Model:
     Example usage.
     ```python
     model = Model('base.en', n_threads=6)
-    segments = model.transcribe('file.mp3', speed_up=True)
+    segments = model.transcribe('file.mp3')
     for segment in segments:
         print(segment.text)
     ```
diff --git a/src/main.cpp b/src/main.cpp
index 558d29e..8c79e7b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -90,17 +90,6 @@ int whisper_pcm_to_mel_wrapper(
     return whisper_pcm_to_mel(ctx->ptr, samples_ptr, n_samples, n_threads);
 };
 
-int whisper_pcm_to_mel_phase_vocoder_wrapper(
-        struct whisper_context_wrapper* ctx,
-        py::array_t<float> samples,
-        int n_samples,
-        int n_threads){
-    py::buffer_info buf = samples.request();
-    float *samples_ptr = static_cast<float *>(buf.ptr);
-    return whisper_pcm_to_mel_phase_vocoder(ctx->ptr, samples_ptr, n_samples, n_threads);
-
-};
-
 int whisper_set_mel_wrapper(
         struct whisper_context_wrapper * ctx,
         py::array_t<float> data,
@@ -388,9 +377,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
     m.def("whisper_pcm_to_mel", &whisper_pcm_to_mel_wrapper, "Convert RAW PCM audio to log mel spectrogram.\n"
                                                              "The resulting spectrogram is stored inside the provided whisper context.\n"
                                                              "Returns 0 on success");
-    m.def("whisper_pcm_to_mel_phase_vocoder", &whisper_pcm_to_mel_phase_vocoder_wrapper, "Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. \n"
-                                                             "The resulting spectrogram is stored inside the provided whisper context.\n"
-                                                             "Returns 0 on success");
-
     m.def("whisper_set_mel", &whisper_set_mel_wrapper, " This can be used to set a custom log mel spectrogram inside the provided whisper context.\n"
                                                         "Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.\n"
@@ -490,7 +476,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
         .def_readwrite("max_len", &whisper_full_params::max_len)
         .def_readwrite("split_on_word", &whisper_full_params::split_on_word)
         .def_readwrite("max_tokens", &whisper_full_params::max_tokens)
-        .def_readwrite("speed_up", &whisper_full_params::speed_up)
         .def_readwrite("audio_ctx", &whisper_full_params::audio_ctx)
         .def_readwrite("initial_prompt", &whisper_full_params::initial_prompt)
         .def_readwrite("prompt_tokens", &whisper_full_params::prompt_tokens)
diff --git a/whisper.cpp b/whisper.cpp
index c7b6988..9e3c534 160000
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1 +1 @@
-Subproject commit c7b6988678779901d02ceba1a8212d2c9908956e
+Subproject commit 9e3c5345cd46ea718209db53464e426c3fe7a25e
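For reference, here is a minimal, self-contained sketch of the updated Python API after the `speed_up` removal, mirroring the docstring example in `pywhispercpp/model.py` above; the model name, thread count, and audio path come from that example and are illustrative placeholders, not part of the diff:

```python
# Minimal sketch of the post-change API: transcribe() is called without the
# removed speed_up keyword. 'file.mp3' is a placeholder for any audio file the
# library can decode; 'base.en' and n_threads=6 follow the docstring example.
from pywhispercpp.model import Model

model = Model('base.en', n_threads=6)
segments = model.transcribe('file.mp3')
for segment in segments:
    print(segment.text)
```

The old `speed_up=True` keyword has no counterpart in the updated bindings, since the `whisper_full_params::speed_up` field and the Phase Vocoder path were dropped along with the whisper.cpp submodule bump.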