# rVAD_fast.py
from __future__ import division
import numpy
import pickle
import os
import sys
import math
import code
from scipy.signal import lfilter
import speechproc
from copy import deepcopy
# Refs:
# [1] Z.-H. Tan, A. K. Sarkar and N. Dehak, "rVAD: an unsupervised segment-based robust voice activity detection method," Computer Speech and Language, vol. 59, pp. 1-21, 2020.
# [2] Z.-H. Tan and B. Lindberg, "Low-complexity variable frame rate analysis for speech recognition and voice activity detection."
# IEEE Journal of Selected Topics in Signal Processing, vol. 4, no. 5, pp. 798-807, 2010.
# Version: 2.0
# 02 Dec 2017, Achintya Kumar Sarkar and Zheng-Hua Tan
# Usage: python rVAD_fast.py inWaveFile outputVadLabel
# ---------------------------------------------------------------------------
# Script body: read a wave file, run the two-pass VAD implemented in the
# `speechproc` helpers, and write the resulting labels to a text file
# (one integer per line).
# ---------------------------------------------------------------------------

# Analysis parameters. Only winlen, ovrlen and nftt are used below (they are
# forwarded to speechproc.sflux); pre_coef and nfilter are kept so the
# module-level names stay available.
winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512
ftThres = 0.5   # frames whose `ft` value is <= ftThres are flagged in pv01
vadThres = 0.4  # forwarded to speechproc.snre_vad
opts = 1        # forwarded to speechproc.pitchblockdetect

# Fail with a usage message instead of a bare IndexError when the two
# required command-line arguments are missing.
if len(sys.argv) < 3:
    sys.exit("Usage: python %s inWaveFile outputVadLabel" % sys.argv[0])

finwav = str(sys.argv[1])  # input wave file
fvad = str(sys.argv[2])    # output label file

fs, data = speechproc.speech_wave(finwav)
ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt)

# --spectral flatness --
# Binary per-frame mask: 1 where ft <= ftThres, 0 elsewhere.
pv01 = numpy.zeros(nfr10)
pv01[numpy.less_equal(ft, ftThres)] = 1
# Independent copy of ft, passed to pitchblockdetect in the `pitch` slot.
pitch = deepcopy(ft)

pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)

# --filtering--
ENERGYFLOOR = numpy.exp(-50)
# First-order IIR filter coefficients (numerator b, denominator a), applied
# along the sample axis.
b = numpy.array([0.9770, -0.9770])
a = numpy.array([1.0000, -0.9540])
fdata = lfilter(b, a, data, axis=0)

# --pass 1--
noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy(
    fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk)

# Zero out the samples of every segment reported as noise. Each row of
# noise_samp holds (start, end) sample indices with an inclusive end, hence
# the +1. A slice is used instead of indexing with a range object so no
# index array is materialised per segment.
# NOTE(review): indices are assumed to be non-negative -- confirm against
# speechproc.snre_highenergy.
for j in range(n_noise_samp):
    seg_start = int(noise_samp[j, 0])
    seg_end = int(noise_samp[j, 1])
    fdata[seg_start:seg_end + 1] = 0

# --pass 2-- VAD decision on the noise-suppressed signal.
vad_seg = speechproc.snre_vad(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres)

numpy.savetxt(fvad, vad_seg.astype(int), fmt='%i')
print("%s --> %s " %(finwav, fvad))

# Drop references to the large arrays.
data = None
pv01 = None
pitch = None
fdata = None
pvblk = None
vad_seg = None