diff --git a/manim/scene/scene_file_writer.py b/manim/scene/scene_file_writer.py
index 49d8492a9a..6873a06ad1 100644
--- a/manim/scene/scene_file_writer.py
+++ b/manim/scene/scene_file_writer.py
@@ -8,6 +8,7 @@
 import shutil
 from pathlib import Path
 from queue import Queue
+from tempfile import NamedTemporaryFile
 from threading import Thread
 from typing import TYPE_CHECKING, Any
 
@@ -330,7 +331,41 @@ def add_sound(
 
         """
         file_path = get_full_sound_file_path(sound_file)
-        new_segment = AudioSegment.from_file(file_path)
+        # We assume files with a .wav / .raw suffix really are
+        # .wav and .raw files, respectively.
+        if file_path.suffix not in (".wav", ".raw"):
+            # Any other format is first transcoded to a temporary .wav file.
+            # delete=False is needed to work on Windows, so the file is
+            # removed explicitly in the ``finally`` clause below.
+            # TODO: figure out a way to cache the wav file generated (benchmark needed)
+            wav_file = NamedTemporaryFile(suffix=".wav", delete=False)
+            try:
+                with (
+                    av.open(file_path) as input_container,
+                    av.open(wav_file, "w", format="wav") as output_container,
+                ):
+                    for audio_stream in input_container.streams.audio:
+                        output_stream = output_container.add_stream("pcm_s16le")
+                        for frame in input_container.decode(audio_stream):
+                            for packet in output_stream.encode(frame):
+                                output_container.mux(packet)
+
+                        # Calling encode() without a frame flushes the encoder.
+                        for packet in output_stream.encode():
+                            output_container.mux(packet)
+
+                # Close the handle before the file is read back by name so
+                # that everything written through it is on disk.
+                wav_file.close()
+                new_segment = AudioSegment.from_file(wav_file.name)
+                logger.info(f"Automatically converted {file_path} to .wav")
+            finally:
+                # Runs on failure too, so the temporary file never leaks.
+                wav_file.close()
+                Path(wav_file.name).unlink(missing_ok=True)
+        else:
+            new_segment = AudioSegment.from_file(file_path)
+
         if gain:
             new_segment = new_segment.apply_gain(gain)
         self.add_audio_segment(new_segment, time, **kwargs)
diff --git a/tests/test_scene_rendering/click.mp3 b/tests/test_scene_rendering/click.mp3
new file mode 100644
index 0000000000..8abc5221dd
Binary files /dev/null and b/tests/test_scene_rendering/click.mp3 differ
diff --git a/tests/test_scene_rendering/test_file_writer.py b/tests/test_scene_rendering/test_file_writer.py
index 004756280f..f8f12d487a 100644
--- a/tests/test_scene_rendering/test_file_writer.py
+++ b/tests/test_scene_rendering/test_file_writer.py
@@ -145,6 +145,20 @@ def test_codecs(tmp_path, format, transparent, codec, pixel_format):
     np.testing.assert_allclose(first_frame[-1, -1], target_rgba_center, atol=5)
 
 
+def test_scene_with_non_raw_or_wav_audio(manim_caplog):
+    """Adding an .mp3 sound must log its automatic conversion to .wav."""
+    # The fixture sits next to this test module.
+    mp3_path = Path(__file__).with_name("click.mp3")
+
+    class SceneWithMP3(Scene):
+        def construct(self):
+            self.add_sound(mp3_path)
+            self.wait()
+
+    scene = SceneWithMP3()
+    scene.render()
+    assert "click.mp3 to .wav" in manim_caplog.text
+
+
 @pytest.mark.slow
 def test_unicode_partial_movie(tmpdir, simple_scenes_path):
     # Characters that failed for a user on Windows