forked from PaddlePaddle/Parakeet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset_processors.py
174 lines (143 loc) · 6.34 KB
/
dataset_processors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import List
from pathlib import Path
import multiprocessing as mp
import numpy as np
from tqdm import tqdm
from audio_processor import SpeakerVerificationPreprocessor
def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor):
# Load and preprocess the waveform
input_path, output_path = path_pair
wav = processor.preprocess_wav(input_path)
if len(wav) == 0:
return
# Create the mel spectrogram, discard those that are too short
frames = processor.melspectrogram(wav)
if len(frames) < processor.partial_n_frames:
return
np.save(output_path, frames)
def _process_speaker(speaker_dir: Path,
processor: SpeakerVerificationPreprocessor,
datasets_root: Path,
output_dir: Path,
pattern: str,
skip_existing: bool=False):
# datastes root: a reference path to compute speaker_name
# we prepand dataset name to speaker_id becase we are mixing serveal
# multispeaker datasets together
speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
speaker_output_dir = output_dir / speaker_name
speaker_output_dir.mkdir(parents=True, exist_ok=True)
# load exsiting file set
sources_fpath = speaker_output_dir / "_sources.txt"
if sources_fpath.exists():
try:
with sources_fpath.open("rt") as sources_file:
existing_names = {line.split(",")[0] for line in sources_file}
except Exception as e:
existing_names = {}
else:
existing_names = {}
sources_file = sources_fpath.open("at" if skip_existing else "wt")
for in_fpath in speaker_dir.rglob(pattern):
out_name = "_".join(
in_fpath.relative_to(speaker_dir).with_suffix(".npy").parts)
if skip_existing and out_name in existing_names:
continue
out_fpath = speaker_output_dir / out_name
_process_utterance((in_fpath, out_fpath), processor)
sources_file.write(f"{out_name},{in_fpath}\n")
sources_file.close()
def _process_dataset(processor: SpeakerVerificationPreprocessor,
datasets_root: Path,
speaker_dirs: List[Path],
dataset_name: str,
output_dir: Path,
pattern: str,
skip_existing: bool=False):
print(
f"{dataset_name}: Preprocessing data for {len(speaker_dirs)} speakers.")
_func = partial(
_process_speaker,
processor=processor,
datasets_root=datasets_root,
output_dir=output_dir,
pattern=pattern,
skip_existing=skip_existing)
with mp.Pool(16) as pool:
list(
tqdm(
pool.imap(_func, speaker_dirs),
dataset_name,
len(speaker_dirs),
unit="speakers"))
print(f"Done preprocessing {dataset_name}.")
def process_librispeech(processor,
datasets_root,
output_dir,
skip_existing=False):
dataset_name = "LibriSpeech/train-other-500"
dataset_root = datasets_root / dataset_name
speaker_dirs = list(dataset_root.glob("*"))
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
output_dir, "*.flac", skip_existing)
def process_voxceleb1(processor, datasets_root, output_dir,
skip_existing=False):
dataset_name = "VoxCeleb1"
dataset_root = datasets_root / dataset_name
anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
with dataset_root.joinpath("vox1_meta.csv").open("rt") as metafile:
metadata = [line.strip().split("\t") for line in metafile][1:]
# speaker id -> nationality
nationalities = {line[0]: line[3] for line in metadata if line[-1] == "dev"}
keep_speaker_ids = [
speaker_id for speaker_id, nationality in nationalities.items()
if nationality.lower() in anglophone_nationalites
]
print(
"VoxCeleb1: using samples from {} (presumed anglophone) speakers out of {}."
.format(len(keep_speaker_ids), len(nationalities)))
speaker_dirs = list((dataset_root / "wav").glob("*"))
speaker_dirs = [
speaker_dir for speaker_dir in speaker_dirs
if speaker_dir.name in keep_speaker_ids
]
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
output_dir, "*.wav", skip_existing)
def process_voxceleb2(processor, datasets_root, output_dir,
skip_existing=False):
dataset_name = "VoxCeleb2"
dataset_root = datasets_root / dataset_name
# There is no nationality in meta data for VoxCeleb2
speaker_dirs = list((dataset_root / "wav").glob("*"))
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
output_dir, "*.wav", skip_existing)
def process_aidatatang_200zh(processor,
datasets_root,
output_dir,
skip_existing=False):
dataset_name = "aidatatang_200zh/train"
dataset_root = datasets_root / dataset_name
speaker_dirs = list((dataset_root).glob("*"))
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
output_dir, "*.wav", skip_existing)
def process_magicdata(processor, datasets_root, output_dir,
skip_existing=False):
dataset_name = "magicdata/train"
dataset_root = datasets_root / dataset_name
speaker_dirs = list((dataset_root).glob("*"))
_process_dataset(processor, datasets_root, speaker_dirs, dataset_name,
output_dir, "*.wav", skip_existing)