@@ -97,31 +97,58 @@ <h2>Main Experiment Results</h2>
97
97
< p >
98
98
Our method reduces the computation of the core step of diffusion-based text-to-audio generation by
99
99
a factor of 400 and enables on-device generation, with only minimal performance degradation in
100
- Fréchet Audio Distance (FAD), Fréchet Distance (FD), KL Divergence, and CLAP Scores.
100
+ Fréchet Audio Distance (FAD), Fréchet Distance (FD), KL Divergence, and CLAP Scores.< br >
101
+ Generation Time is the time in minutes to generate the entire validation set (882 samples).< br >
102
+ < i > ↑: higher is better; ↓: lower is better.</ i >
101
103
</ p >
102
104
< table class ="result-table ">
103
105
< thead >
104
106
< tr class ="result-row ">
105
- < th class ="result-head "> </ th > < th class ="result-head "> # queries (↓)</ th >
106
- < th class ="result-head "> CLAP< sub > T</ sub > (↑)</ th > < th class ="result-head "> CLAP< sub > A</ sub > (↑)</ th >
107
- < th class ="result-head "> FAD (↓)</ th > < th class ="result-head "> FD (↓)</ th > < th class ="result-head "> KLD (↓)</ th >
107
+ < th class ="result-head "> </ th >
108
+ < th class ="result-head-2 "> Model Queries< br > ↓</ th > < th class ="result-head-2 "> Generation Time< br > ↓</ th >
109
+ < th class ="result-head "> Subjective Quality< br > ↑</ th > < th class ="result-head "> Subjective Text Align< br > ↑</ th >
110
+ < th class ="result-head-2 "> CLAP< sub > T</ sub > < br > ↑</ th > < th class ="result-head-2 "> CLAP< sub > A</ sub > < br > ↑</ th >
111
+ < th class ="result-head-2 "> FAD< br > ↓</ th > < th class ="result-head-2 "> FD< br > ↓</ th > < th class ="result-head-2 "> KLD< br > ↓</ th >
108
112
</ tr >
109
113
</ thead >
110
114
< tbody >
111
- < tr class ="result-row " style ="color: #a0a0a0 ">
112
- < td class ="result-data "> Diffusion (Baseline)</ td > < td class ="result-data "> 400</ td >
113
- < td class ="result-data "> 24.57</ td > < td class ="result-data "> 72.79</ td >
114
- < td class ="result-data "> 1.908</ td > < td class ="result-data "> 19.57</ td > < td class ="result-data "> 1.350</ td >
115
+ < tr class ="result-row-2 " style ="color: #898989 ">
116
+ < td class ="result-data-small "> < span style ="font-weight: 400; "> AudioLDM-L (Baseline)</ span > </ td >
117
+ < td class ="result-data-2 "> 400</ td > < td class ="result-data-2 "> -</ td > < td class ="result-data "> -</ td >
118
+ < td class ="result-data "> -</ td > < td class ="result-data-2 "> -</ td > < td class ="result-data-2 "> -</ td >
119
+ < td class ="result-data-2 "> < span style ="font-weight: 400; "> 2.08</ span > </ td > < td class ="result-data-2 "> 27.12</ td >
120
+ < td class ="result-data-2 "> 1.86</ td >
121
+ </ tr >
122
+ < tr class ="result-row-2 " style ="color: #898989 ">
123
+ < td class ="result-data-small "> < span style ="font-weight: 400; "> TANGO (Baseline)</ span > </ td >
124
+ < td class ="result-data-2 "> 400</ td > < td class ="result-data-2 "> 168</ td >
125
+ < td class ="result-data "> < b > 4.136</ b > </ td > < td class ="result-data "> < b > 4.064</ b > </ td >
126
+ < td class ="result-data-2 "> < span style ="font-weight: 400; "> 24.10</ span > </ td > < td class ="result-data-2 "> < b > 72.85</ b > </ td >
127
+ < td class ="result-data-2 "> < b > 1.631</ b > </ td > < td class ="result-data-2 "> < b > 20.11</ b > </ td >
128
+ < td class ="result-data-2 "> 1.362</ td >
115
129
</ tr >
116
130
< tr class ="result-row ">
117
- < td class ="result-data "> Consistency + CLAP FT (Ours)</ td > < td class ="result-data "> 1</ td >
118
- < td class ="result-data "> 24.69</ td > < td class ="result-data "> 72.54</ td >
119
- < td class ="result-data "> 2.406</ td > < td class ="result-data "> 20.97</ td > < td class ="result-data "> 1.358</ td >
131
+ < td class ="result-data-small "> < span style ="font-weight: 400; "> ConsistencyTTA + CLAP-FT</ span > </ td >
132
+ < td class ="result-data-2 "> < b > 1</ b > </ td > < td class ="result-data-2 "> < b > 2.3</ b > </ td >
133
+ < td class ="result-data "> 3.830</ td > < td class ="result-data "> < b > 4.064</ b > </ td >
134
+ < td class ="result-data-2 "> < b > 24.69</ b > </ td > < td class ="result-data-2 "> < span style ="font-weight: 400; "> 72.54</ span > </ td >
135
+ < td class ="result-data-2 "> 2.406</ td > < td class ="result-data-2 "> < span style ="font-weight: 400; "> 20.97</ span > </ td >
136
+ < td class ="result-data-2 "> < span style ="font-weight: 400; "> 1.358</ span > </ td >
120
137
</ tr >
121
138
< tr class ="result-row ">
122
- < td class ="result-data "> Consistency (Ours)</ td > < td class ="result-data "> 1</ td >
123
- < td class ="result-data "> 22.50</ td > < td class ="result-data "> 72.30</ td >
124
- < td class ="result-data "> 2.575</ td > < td class ="result-data "> 22.08</ td > < td class ="result-data "> 1.354</ td >
139
+ < td class ="result-data-small "> < span style ="font-weight: 400; "> ConsistencyTTA</ span > </ td >
140
+ < td class ="result-data-2 "> < b > 1</ b > </ td > < td class ="result-data-2 "> < b > 2.3</ b > </ td >
141
+ < td class ="result-data "> < span style ="font-weight: 400; "> 3.902</ span > </ td > < td class ="result-data "> 4.010</ td >
142
+ < td class ="result-data-2 "> 22.50</ td > < td class ="result-data-2 "> 72.30</ td >
143
+ < td class ="result-data-2 "> 2.575</ td > < td class ="result-data-2 "> 22.08</ td >
144
+ < td class ="result-data-2 "> < b > 1.354</ b > </ td >
145
+ </ tr >
146
+ < tr class ="result-row-2-small " style ="color: #898989 ">
147
+ < td class ="result-data "> < span style ="font-weight: 400; "> Ground Truth</ span > </ td >
148
+ < td class ="result-data-2 "> -</ td > < td class ="result-data-2 "> -</ td >
149
+ < td class ="result-data "> -</ td > < td class ="result-data "> -</ td >
150
+ < td class ="result-data-2 "> 26.71</ td > < td class ="result-data-2 "> 100</ td >
151
+ < td class ="result-data-2 "> -</ td > < td class ="result-data-2 "> -</ td > < td class ="result-data-2 "> -</ td >
125
152
</ tr >
126
153
</ tbody >
127
154
</ table >
0 commit comments