Merge pull request #42 from thewh1teagle/feat/vulkan
feat: add vulkan support
absadiki authored Aug 29, 2024
2 parents 3ee9803 + b67f020 commit 4ddfa8c
Showing 5 changed files with 15 additions and 27 deletions.
15 changes: 13 additions & 2 deletions README.md
@@ -113,6 +113,18 @@ python -m build --wheel # in this repository to build the wheel. Assumes you hav
pip install dist/<generated>.whl
```

### Vulkan support

Thanks to [@thewh1teagle](https://github.com/thewh1teagle)

To build and install, clone the repository and run the following commands:

```shell
export CMAKE_ARGS="-DGGML_VULKAN=1"
python -m build --wheel # in this repository to build the wheel. Assumes you have installed build with pip install build
pip install dist/<generated>.whl
```
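
Not shown in this hunk: as a quick sanity check after installing the Vulkan-enabled wheel, the high-level API documented in `pywhispercpp/model.py` (see the docstring example later in this commit) can be used. The sketch below is illustrative only; `'base.en'` and `'file.mp3'` are placeholders for whichever ggml model and audio file you have.

```python
from pywhispercpp.model import Model

# Placeholders: any supported ggml model name and audio file will do.
# This only confirms the freshly built wheel imports and transcribes;
# the selected backend (Vulkan vs. CPU) is typically reported in
# whisper.cpp's startup log output.
model = Model('base.en', n_threads=6)
for segment in model.transcribe('file.mp3'):
    print(segment.text)
```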

Then download and convert the appropriate model using the original `whisper.cpp` repository, producing a `<model>.mlmodelc` directory.

You can now verify if everything's working:
@@ -180,7 +192,7 @@ usage: pwcpp [-h] [-m MODEL] [--version] [--processors PROCESSORS] [-otxt] [-ovt
[--translate TRANSLATE] [--no_context NO_CONTEXT] [--single_segment SINGLE_SEGMENT] [--print_special PRINT_SPECIAL]
[--print_progress PRINT_PROGRESS] [--print_realtime PRINT_REALTIME] [--print_timestamps PRINT_TIMESTAMPS]
[--token_timestamps TOKEN_TIMESTAMPS] [--thold_pt THOLD_PT] [--thold_ptsum THOLD_PTSUM] [--max_len MAX_LEN]
[--split_on_word SPLIT_ON_WORD] [--max_tokens MAX_TOKENS] [--speed_up SPEED_UP] [--audio_ctx AUDIO_CTX]
[--split_on_word SPLIT_ON_WORD] [--max_tokens MAX_TOKENS] [--audio_ctx AUDIO_CTX]
[--prompt_tokens PROMPT_TOKENS] [--prompt_n_tokens PROMPT_N_TOKENS] [--language LANGUAGE] [--suppress_blank SUPPRESS_BLANK]
[--suppress_non_speech_tokens SUPPRESS_NON_SPEECH_TOKENS] [--temperature TEMPERATURE] [--max_initial_ts MAX_INITIAL_TS]
[--length_penalty LENGTH_PENALTY] [--temperature_inc TEMPERATURE_INC] [--entropy_thold ENTROPY_THOLD]
@@ -234,7 +246,6 @@ options:
split on word rather than on token (when used with max_len)
--max_tokens MAX_TOKENS
max tokens per segment (0 = no limit)
--speed_up SPEED_UP speed-up the audio by 2x using Phase Vocoder
--audio_ctx AUDIO_CTX
overwrite the audio context size (0 = use default)
--prompt_tokens PROMPT_TOKENS
8 changes: 0 additions & 8 deletions pywhispercpp/constants.py
@@ -154,14 +154,6 @@
'options': None,
'default': 0
},
# [EXPERIMENTAL] speed-up techniques
# note: these can significantly reduce the quality of the output
'speed_up': {
'type': bool,
'description': "speed-up the audio by 2x using Phase Vocoder",
'options': None,
'default': False
},
'audio_ctx': {
'type': int,
'description': "overwrite the audio context size (0 = use default)",
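
With the experimental `speed_up` entry removed from the schema above, the remaining parameters (for example `audio_ctx` and `max_tokens`) are still passed as before. A minimal sketch, assuming they map to keyword arguments the same way the `Model` docstring example in `pywhispercpp/model.py` passes `n_threads`; the values shown are just the documented defaults.

```python
from pywhispercpp.model import Model

# Illustrative values: audio_ctx and max_tokens are entries of the schema
# above (0 = use default / no limit). Passing speed_up=... would no longer
# be a valid keyword after this commit.
model = Model('base.en', n_threads=6)
segments = model.transcribe('file.mp3', audio_ctx=0, max_tokens=0)
for segment in segments:
    print(segment.text)
```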
2 changes: 1 addition & 1 deletion pywhispercpp/model.py
Expand Up @@ -52,7 +52,7 @@ class Model:
Example usage.
```python
model = Model('base.en', n_threads=6)
segments = model.transcribe('file.mp3', speed_up=True)
segments = model.transcribe('file.mp3')
for segment in segments:
print(segment.text)
```
15 changes: 0 additions & 15 deletions src/main.cpp
@@ -90,17 +90,6 @@ int whisper_pcm_to_mel_wrapper(
return whisper_pcm_to_mel(ctx->ptr, samples_ptr, n_samples, n_threads);
};

int whisper_pcm_to_mel_phase_vocoder_wrapper(
struct whisper_context_wrapper* ctx,
py::array_t<float> samples,
int n_samples,
int n_threads){
py::buffer_info buf = samples.request();
float *samples_ptr = static_cast<float *>(buf.ptr);
return whisper_pcm_to_mel_phase_vocoder(ctx->ptr, samples_ptr, n_samples, n_threads);

};

int whisper_set_mel_wrapper(
struct whisper_context_wrapper * ctx,
py::array_t<float> data,
@@ -388,9 +377,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
m.def("whisper_pcm_to_mel", &whisper_pcm_to_mel_wrapper, "Convert RAW PCM audio to log mel spectrogram.\n"
"The resulting spectrogram is stored inside the provided whisper context.\n"
"Returns 0 on success");
m.def("whisper_pcm_to_mel_phase_vocoder", &whisper_pcm_to_mel_phase_vocoder_wrapper, "Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. \n"
"The resulting spectrogram is stored inside the provided whisper context.\n"
"Returns 0 on success");

m.def("whisper_set_mel", &whisper_set_mel_wrapper, " This can be used to set a custom log mel spectrogram inside the provided whisper context.\n"
"Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.\n"
@@ -490,7 +476,6 @@ PYBIND11_MODULE(_pywhispercpp, m) {
.def_readwrite("max_len", &whisper_full_params::max_len)
.def_readwrite("split_on_word", &whisper_full_params::split_on_word)
.def_readwrite("max_tokens", &whisper_full_params::max_tokens)
.def_readwrite("speed_up", &whisper_full_params::speed_up)
.def_readwrite("audio_ctx", &whisper_full_params::audio_ctx)
.def_readwrite("initial_prompt", &whisper_full_params::initial_prompt)
.def_readwrite("prompt_tokens", &whisper_full_params::prompt_tokens)
2 changes: 1 addition & 1 deletion whisper.cpp
Submodule whisper.cpp updated 466 files
