
Add Moonshine #34784

Merged
merged 93 commits into main from add-moonshine
Jan 10, 2025
Changes from 18 commits
93 commits
35434da
config draft
eustlb Nov 18, 2024
7e18038
full encoder forward
eustlb Dec 2, 2024
6517251
full decoder forward
eustlb Dec 3, 2024
b0efed1
fix sdpa and FA2
eustlb Dec 12, 2024
b4d18f9
fix sdpa and FA2
eustlb Dec 12, 2024
b3777e0
moonshine model
eustlb Dec 13, 2024
e313ab5
moonshine model forward
eustlb Dec 13, 2024
7a6935a
fix attention with past_key_values
eustlb Dec 13, 2024
8fda426
add MoonshineForConditionalGeneration
eustlb Dec 13, 2024
d0ed917
fix cache handling and causality for cross attention
eustlb Dec 13, 2024
461f210
no causal attention mask for the encoder
eustlb Dec 15, 2024
22dbaae
model addition (imports etc)
eustlb Dec 15, 2024
72ba8c4
small nit
eustlb Dec 15, 2024
f548504
nits
eustlb Dec 16, 2024
3d52b1e
Update src/transformers/models/moonshine/convert_usefulsensors_to_hf.py
eustlb Dec 16, 2024
8f82a40
add rope_theta
eustlb Dec 16, 2024
fc73b37
nits
eustlb Dec 16, 2024
aedccf5
model doc
eustlb Dec 16, 2024
0954133
Update src/transformers/models/auto/configuration_auto.py
eustlb Dec 17, 2024
b1f0909
imports
eustlb Dec 17, 2024
f647a9f
add MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
eustlb Dec 17, 2024
2da255d
updates modular
eustlb Dec 17, 2024
407cd3e
make
eustlb Dec 17, 2024
b3da73d
make fix-copies
eustlb Dec 17, 2024
8cf195d
ruff check examples fix
eustlb Dec 17, 2024
f78db3d
fix check_modular_conversion
eustlb Dec 17, 2024
5927846
Merge branch 'main' into add-moonshine
eustlb Dec 17, 2024
b680f0d
nit
eustlb Dec 17, 2024
01a4ea1
nits
eustlb Dec 17, 2024
5124cbd
nits
eustlb Dec 17, 2024
a528bd3
copied from -> imports
eustlb Dec 18, 2024
5648c38
Merge branch 'main' into add-moonshine
eustlb Dec 20, 2024
338c7c0
imports fix
eustlb Dec 20, 2024
2ec366a
integrate attention refacto
eustlb Dec 20, 2024
5261886
modular edge case
eustlb Dec 20, 2024
467b8e4
remove encoder
eustlb Jan 2, 2025
445dfcb
convolutions params in config
eustlb Jan 2, 2025
a656e8c
run modular_model_converter
eustlb Jan 2, 2025
6547634
make
eustlb Jan 2, 2025
9eec03b
Update docs/source/en/model_doc/moonshine.md
eustlb Jan 2, 2025
53b9b9d
MoonshineModelTest
eustlb Jan 2, 2025
fb2258f
correct typo
eustlb Jan 7, 2025
259c6b3
make style
eustlb Jan 7, 2025
59dd57f
integration tests
eustlb Jan 7, 2025
8fe2cdc
Merge branch 'main' into add-moonshine
eustlb Jan 7, 2025
02c4343
make
eustlb Jan 7, 2025
6c7dc79
modular convert
eustlb Jan 7, 2025
89904f9
Merge branch 'main' into add-moonshine
eustlb Jan 8, 2025
829a2b8
name conversion update (up_proj -> fc1 etc)
eustlb Jan 9, 2025
02a1f1f
update config
eustlb Jan 9, 2025
80c39cd
update MLP
eustlb Jan 9, 2025
55ac7e2
update attention
eustlb Jan 9, 2025
4d540af
update encoder layer
eustlb Jan 9, 2025
4f52158
update decoder layer
eustlb Jan 9, 2025
6568576
update convolutions parameters
eustlb Jan 9, 2025
495bd73
update encoder
eustlb Jan 9, 2025
48492c2
remove INPUTS_DOCSTRING
eustlb Jan 9, 2025
21a743f
update decoder
eustlb Jan 9, 2025
3b7af33
update conditional generation
eustlb Jan 9, 2025
563a360
update pretrained model
eustlb Jan 9, 2025
0ebd9a5
imports
eustlb Jan 9, 2025
d0233a7
modular converted
eustlb Jan 9, 2025
f9a6cac
update doc
eustlb Jan 9, 2025
b8a6e3b
fix
eustlb Jan 9, 2025
bfea59a
typo
eustlb Jan 9, 2025
ea150d3
update doc
eustlb Jan 9, 2025
8170cc8
update license
eustlb Jan 9, 2025
9bc7b35
update init
eustlb Jan 9, 2025
2bbb8ee
split config in file
eustlb Jan 9, 2025
b42d7f6
two classes for MLP
eustlb Jan 9, 2025
08272b9
attention from GLM
eustlb Jan 9, 2025
61462db
from GlmRotaryEmbedding
eustlb Jan 9, 2025
c866e58
split MLP
eustlb Jan 9, 2025
b0183be
apply arthur's review suggestions
eustlb Jan 9, 2025
3aa4f8d
apply arthur's review suggestions
eustlb Jan 9, 2025
d64190c
apply arthur's review suggestions
eustlb Jan 9, 2025
0bfb6bc
auto feature extractor
eustlb Jan 9, 2025
1268e13
convert modular
eustlb Jan 9, 2025
141a70b
fix + make
eustlb Jan 9, 2025
70f16dc
Merge branch 'main' into add-moonshine
eustlb Jan 9, 2025
1f3cc63
convert modular
eustlb Jan 9, 2025
77ca19a
make
eustlb Jan 9, 2025
e6b19db
unsplit config
eustlb Jan 9, 2025
7155fbd
use correct checkpoint
eustlb Jan 9, 2025
15b2552
wrap generate
eustlb Jan 9, 2025
89ceca8
Merge branch 'main' into add-moonshine
eustlb Jan 9, 2025
fc4febc
update tests
eustlb Jan 10, 2025
3e2a84b
typos
eustlb Jan 10, 2025
9a8f91d
make
eustlb Jan 10, 2025
834c364
typo
eustlb Jan 10, 2025
b491d95
update doc
eustlb Jan 10, 2025
d357fff
Merge branch 'main' into add-moonshine
eustlb Jan 10, 2025
2a6c59e
Merge branch 'main' into add-moonshine
eustlb Jan 10, 2025
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,8 @@
title: mLUKE
- local: model_doc/mobilebert
title: MobileBERT
- local: model_doc/moonshine
title: moonshine
- local: model_doc/mpnet
title: MPNet
- local: model_doc/mpt
Expand Down
60 changes: 60 additions & 0 deletions docs/source/en/model_doc/moonshine.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# moonshine

## Overview

The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.

The abstract from the paper is the following:

This paper introduces Moonshine, a family of speech recognition models optimized for live transcription and voice command processing. Moonshine is based on an encoder-decoder transformer architecture and employs Rotary Position Embedding (RoPE) instead of traditional absolute position embeddings. The model is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time. When benchmarked against OpenAI's Whisper tiny-en, Moonshine Tiny demonstrates a 5x reduction in compute requirements for transcribing a 10-second speech segment while incurring no increase in word error rates across standard evaluation datasets. These results highlight Moonshine's potential for real-time and resource-constrained applications.

Tips:

- Moonshine improves upon Whisper's architecture:
1. It uses SwiGLU activation instead of GELU in the decoder layers
2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows.
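The activation swap in point 1 can be sketched scalar-wise. These helper functions are illustrative only, not the library's implementation:

```python
import math

def gelu(x):
    # tanh approximation of GELU, the activation used in Whisper's MLP blocks
    return 0.5 * x * (1.0 + math.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x ** 3)))

def silu(x):
    # SiLU ("swish"): x * sigmoid(x)
    return x / (1.0 + math.exp(-x))

def swiglu(gate, up):
    # SwiGLU gates one linear projection with the SiLU of another, which is
    # why a SwiGLU decoder MLP carries two input projections instead of one
    return silu(gate) * up
```

This is also why the conversion script has to map two projection weights per decoder MLP rather than one.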

This model was contributed by [Eustache Le Bihan (eustlb)](https://huggingface.co/eustlb).
The original code can be found [here](https://github.com/usefulsensors/moonshine).
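The RoPE mechanism mentioned in the tips rotates each feature pair by a position-dependent angle; because rotations compose additively, attention ends up depending only on relative offsets, with no fixed maximum length. A minimal scalar sketch (function name is hypothetical, not the library's API):

```python
import math

def rope_pair(x0, x1, position, inv_freq):
    # rotate the feature pair (x0, x1) by position * inv_freq radians;
    # rotating by p then q is the same as rotating by p + q, which is what
    # makes the encoding relative rather than absolute
    angle = position * inv_freq
    c, s = math.cos(angle), math.sin(angle)
    return x0 * c - x1 * s, x0 * s + x1 * c
```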


## MoonshineConfig

[[autodoc]] MoonshineConfig

<frameworkcontent>
<pt>

## MoonshineModel

[[autodoc]] MoonshineModel
- forward
- _mask_input_features

## MoonshineForConditionalGeneration

[[autodoc]] MoonshineForConditionalGeneration
- forward
- generate

</pt>
<tf>
22 changes: 22 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,7 @@
"models.gpt_bigcode": ["GPTBigCodeConfig"],
"models.gpt_neo": ["GPTNeoConfig"],
"models.gpt_neox": ["GPTNeoXConfig"],
"models.moonshine": ["MoonshineConfig"],
"models.gpt_neox_japanese": ["GPTNeoXJapaneseConfig"],
"models.gpt_sw3": [],
"models.gptj": ["GPTJConfig"],
Expand Down Expand Up @@ -1081,6 +1082,7 @@
_import_structure["models.gemma"].append("GemmaTokenizerFast")
_import_structure["models.gpt2"].append("GPT2TokenizerFast")
_import_structure["models.gpt_neox"].append("GPTNeoXTokenizerFast")
_import_structure["models.moonshine"].append("MoonshineTokenizer")
_import_structure["models.gpt_neox_japanese"].append("GPTNeoXJapaneseTokenizer")
_import_structure["models.herbert"].append("HerbertTokenizerFast")
_import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
Expand Down Expand Up @@ -2360,6 +2362,16 @@
"GPTNeoXPreTrainedModel",
]
)
_import_structure["models.moonshine"].extend(
[
"MoonshineForCausalLM",
"MoonshineForQuestionAnswering",
"MoonshineForSequenceClassification",
"MoonshineForTokenClassification",
"MoonshineModel",
"MoonshinePreTrainedModel",
]
)
_import_structure["models.gpt_neox_japanese"].extend(
[
"GPTNeoXJapaneseForCausalLM",
Expand Down Expand Up @@ -5486,6 +5498,7 @@
from .models.mobilevitv2 import (
MobileViTV2Config,
)
from .models.moonshine import MoonshineConfig
from .models.moshi import (
MoshiConfig,
MoshiDepthConfig,
Expand Down Expand Up @@ -6023,6 +6036,7 @@
from .models.mbart import MBartTokenizerFast
from .models.mbart50 import MBart50TokenizerFast
from .models.mobilebert import MobileBertTokenizerFast
from .models.moonshine import MoonshineTokenizer
from .models.mpnet import MPNetTokenizerFast
from .models.mt5 import MT5TokenizerFast
from .models.mvp import MvpTokenizerFast
Expand Down Expand Up @@ -7441,6 +7455,14 @@
MobileViTV2Model,
MobileViTV2PreTrainedModel,
)
from .models.moonshine import (
MoonshineForCausalLM,
MoonshineForQuestionAnswering,
MoonshineForSequenceClassification,
MoonshineForTokenClassification,
MoonshineModel,
MoonshinePreTrainedModel,
)
from .models.moshi import (
MoshiForCausalLM,
MoshiForConditionalGeneration,
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@
mobilenet_v2,
mobilevit,
mobilevitv2,
moonshine,
moshi,
mpnet,
mpt,
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/configuration_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@
("mobilenet_v2", "MobileNetV2Config"),
("mobilevit", "MobileViTConfig"),
("mobilevitv2", "MobileViTV2Config"),
("moonshine", "MoonshineConfig"),
("moshi", "MoshiConfig"),
("mpnet", "MPNetConfig"),
("mpt", "MptConfig"),
Expand Down Expand Up @@ -494,6 +495,7 @@
("mobilenet_v2", "MobileNetV2"),
("mobilevit", "MobileViT"),
("mobilevitv2", "MobileViTV2"),
("moonshine", "moonshine"),
("moshi", "Moshi"),
("mpnet", "MPNet"),
("mpt", "MPT"),
Expand Down
2 changes: 2 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@
("mobilenet_v2", "MobileNetV2Model"),
("mobilevit", "MobileViTModel"),
("mobilevitv2", "MobileViTV2Model"),
("moonshine", "MoonshineModel"),
("moshi", "MoshiModel"),
("mpnet", "MPNetModel"),
("mpt", "MptModel"),
Expand Down Expand Up @@ -424,6 +425,7 @@
("mega", "MegaForMaskedLM"),
("megatron-bert", "MegatronBertForCausalLM"),
("mobilebert", "MobileBertForMaskedLM"),
("moonshine", "MoonshineForConditionalGeneration"),
("mpnet", "MPNetForMaskedLM"),
("mpt", "MptForCausalLM"),
("mra", "MraForMaskedLM"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
("mctct", "MCTCTProcessor"),
("mgp-str", "MgpstrProcessor"),
("mllama", "MllamaProcessor"),
("moonshine", "Wav2Vec2Processor"),
("oneformer", "OneFormerProcessor"),
("owlv2", "Owlv2Processor"),
("owlvit", "OwlViTProcessor"),
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@
("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
("moonshine", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
Expand Down
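The `(slow, fast)` tuples registered above are resolved against whether the `tokenizers` package is installed — Moonshine registers no slow class, only `PreTrainedTokenizerFast`. A simplified sketch of that selection (helper name is hypothetical; the real `AutoTokenizer` logic also consults the config):

```python
def resolve_tokenizer_class(mapping_entry, tokenizers_installed):
    # mimic how a (slow, fast) tuple is read: prefer the fast class when the
    # `tokenizers` package is available and a fast class is registered,
    # otherwise fall back to the slow class (which may itself be None)
    slow_class, fast_class = mapping_entry
    if tokenizers_installed and fast_class is not None:
        return fast_class
    return slow_class
```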
59 changes: 59 additions & 0 deletions src/transformers/models/moonshine/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)


_import_structure = {
    "configuration_moonshine": ["MoonshineConfig"],
}

try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_moonshine"] = [
        "MoonshineForConditionalGeneration",
        "MoonshineModel",
        "MoonshinePreTrainedModel",
    ]


if TYPE_CHECKING:
    from .configuration_moonshine import MoonshineConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_moonshine import (
            MoonshineForConditionalGeneration,
            MoonshineModel,
            MoonshinePreTrainedModel,
        )

else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
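The `_LazyModule` wiring above defers heavy (torch-backed) imports until an attribute is first accessed. A minimal stdlib-only sketch of the same idea, in the PEP 562 style — names here are illustrative, not transformers' API:

```python
import importlib

# illustrative alias table: public attribute name -> real module imported on demand
_LAZY_MODULES = {"json_tools": "json"}

def lazy_getattr(name):
    # resolve an attribute to a real import only on first access, mirroring
    # how _LazyModule keeps `import transformers` cheap when torch is unused
    if name in _LAZY_MODULES:
        return importlib.import_module(_LAZY_MODULES[name])
    raise AttributeError(f"module has no attribute {name!r}")
```

In a real package this function would be assigned as the module-level `__getattr__`, so plain attribute access triggers the import.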