Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

replace soundfile with librosa #726

Merged
merged 10 commits on Sep 6, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added support for the `ObjectDetector` with FiftyOne ([#727](https://github.com/PyTorchLightning/lightning-flash/pull/727))

- Added support for MP3 files to the `SpeechRecognition` task with librosa ([#726](https://github.com/PyTorchLightning/lightning-flash/pull/726))

### Changed

- Changed how pretrained flag works for loading weights for ImageClassifier task ([#560](https://github.com/PyTorchLightning/lightning-flash/pull/560))
Expand Down
8 changes: 4 additions & 4 deletions flash/audio/speech_recognition/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from flash.core.utilities.imports import _AUDIO_AVAILABLE, requires_extras

if _AUDIO_AVAILABLE:
import soundfile as sf
import librosa
from datasets import Dataset as HFDataset
from datasets import load_dataset
from transformers import Wav2Vec2CTCTokenizer
Expand All @@ -48,7 +48,7 @@ def deserialize(self, sample: Any) -> Dict:
encoded_with_padding = (sample + "===").encode("ascii")
audio = base64.b64decode(encoded_with_padding)
buffer = io.BytesIO(audio)
data, sampling_rate = sf.read(buffer)
data, sampling_rate = librosa.load(buffer, sr=16000)
ethanwharris marked this conversation as resolved.
Show resolved Hide resolved
return {
DefaultDataKeys.INPUT: data,
DefaultDataKeys.METADATA: {"sampling_rate": sampling_rate},
Expand All @@ -69,7 +69,7 @@ def _load_sample(self, sample: Dict[str, Any]) -> Any:
and "root" in sample[DefaultDataKeys.METADATA]
):
path = os.path.join(sample[DefaultDataKeys.METADATA]["root"], path)
speech_array, sampling_rate = sf.read(path)
speech_array, sampling_rate = librosa.load(path, sr=16000)
sample[DefaultDataKeys.INPUT] = speech_array
sample[DefaultDataKeys.METADATA] = {"sampling_rate": sampling_rate}
return sample
Expand Down Expand Up @@ -129,7 +129,7 @@ def load_data(self, data: Dataset, dataset: Optional[Any] = None) -> Union[Seque

class SpeechRecognitionPathsDataSource(PathsDataSource, BaseSpeechRecognition):
def __init__(self):
super().__init__(("wav", "ogg", "flac", "mat"))
super().__init__(("wav", "ogg", "flac", "mat", "mp3"))

def load_sample(self, sample: Dict[str, Any], dataset: Any = None) -> Any:
return self._load_sample(sample)
Expand Down
4 changes: 2 additions & 2 deletions flash/core/utilities/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _compare_version(package: str, op, version) -> bool:
_PIL_AVAILABLE = _module_available("PIL")
_OPEN3D_AVAILABLE = _module_available("open3d")
_SEGMENTATION_MODELS_AVAILABLE = _module_available("segmentation_models_pytorch")
_SOUNDFILE_AVAILABLE = _module_available("soundfile")
_LIBROSA_AVAILABLE = _module_available("librosa")
_TORCH_SCATTER_AVAILABLE = _module_available("torch_scatter")
_TORCH_SPARSE_AVAILABLE = _module_available("torch_sparse")
_TORCH_GEOMETRIC_AVAILABLE = _module_available("torch_geometric")
Expand Down Expand Up @@ -148,7 +148,7 @@ class Image(metaclass=MetaImage):
)
_SERVE_AVAILABLE = _FASTAPI_AVAILABLE and _PYDANTIC_AVAILABLE and _CYTOOLZ_AVAILABLE and _UVICORN_AVAILABLE
_POINTCLOUD_AVAILABLE = _OPEN3D_AVAILABLE and _TORCHVISION_AVAILABLE
_AUDIO_AVAILABLE = all([_TORCHAUDIO_AVAILABLE, _SOUNDFILE_AVAILABLE, _TRANSFORMERS_AVAILABLE])
_AUDIO_AVAILABLE = all([_TORCHAUDIO_AVAILABLE, _LIBROSA_AVAILABLE, _TRANSFORMERS_AVAILABLE])
_GRAPH_AVAILABLE = _TORCH_SCATTER_AVAILABLE and _TORCH_SPARSE_AVAILABLE and _TORCH_GEOMETRIC_AVAILABLE

_EXTRAS_AVAILABLE = {
Expand Down
2 changes: 1 addition & 1 deletion requirements/datatype_audio.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torchaudio
soundfile>=0.10.2
librosa>=0.8.1
transformers>=4.5
datasets>=1.8