Merge pull request #6 from david-ryan-snyder/chain

xvector: Objf and Deriv
pegahgh · Feb 14, 2016 · 1e1d1ef · 1e1d1ef
2 parents 598e9b1 + 7596d01
commit 1e1d1ef
Show file tree

Hide file tree

Showing 6 changed files with 173 additions and 104 deletions.
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
@@ -2105,11 +2105,11 @@ static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
   int32_cuda scores_index = i + j * scores_dim.stride;
   Real K = 1.0 / (scores_dim.rows - 2.0);
   Real L = scores[scores_index];
-  if (i < scores_dim.cols && j < scores_dim.rows && i < j) {
+  if (i < scores_dim.cols && j < scores_dim.rows) {
     if (i + 1 == j && i % 2 == 0) {
       obfj_terms[scores_index] = log(1.0 + exp(-L));
       obfj_derivs[scores_index] = 1.0 / (1.0 + exp(L));
-    } else if (i != j) {
+    } else if (i < j) {
       obfj_terms[scores_index] = K * log(1.0 + exp(L));
       obfj_derivs[scores_index] = -K / (1.0 + exp(-L));
     }

diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
@@ -206,10 +206,11 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices
   }
 }
 
-template<typename Real>
-void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
-                                  CuMatrixBase<Real> *objf_terms,
-                                  CuMatrixBase<Real> *objf_derivs) {
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs) {
+  KALDI_ASSERT(SameDim(*objf_terms, *objf_derivs)
+               && SameDim(*objf_terms, scores));
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
@@ -226,8 +227,19 @@ void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
   } else
   #endif
   {
-    // TODO: Add the CPU version.
-    KALDI_LOG << "NOT USING CUDA";
+    int32 num_rows = scores.NumRows();
+    BaseFloat K = 1.0 / (num_rows - 2.0);
+    for (int32 i = 0; i < num_rows; i++) {
+      for (int32 j = i + 1; j < num_rows; j++) {
+        if (i + 1 == j && i % 2 == 0) {
+          (*objf_terms)(i, j) = log(1.0 + exp(-scores(i, j)));
+          (*objf_derivs)(i, j) = 1.0 / (1.0 + exp(scores(i, j)));
+        } else {
+          (*objf_terms)(i, j) = K * log(1.0 + exp(scores(i, j)));
+          (*objf_derivs)(i, j) = -K / (1.0 + exp(-scores(i, j)));
+        }
+      }
+    }
   }
 }
 
@@ -259,16 +271,6 @@ void Randomize(const CuMatrixBase<double> &src,
                const CuArray<int32> &copy_from_idx,
                CuMatrixBase<double> *tgt);
 
-template
-void ComputeXvectorObjfFromScores(const CuMatrixBase<float> &scores,
-                                  CuMatrixBase<float> *objf_terms,
-                                  CuMatrixBase<float> *objf_derivs);
-template
-void ComputeXvectorObjfFromScores(const CuMatrixBase<double> &scores,
-                                  CuMatrixBase<double> *objf_terms,
-                                  CuMatrixBase<double> *objf_derivs);
-
-
 
 } //namespace cu
 

diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h
@@ -80,9 +80,31 @@ void Group2norm(const CuMatrixBase<Real> &src,
                 int32 group_stride);
 
 /*
-TODO: Documentation.
+  This function is used in computing the objective function and derivatives
+  in xvector training.
+  @param [in] scores   'scores' is a symmetric matrix of scores which are to
+  be interpreted as log-odds (according to the model) of pairs coming from the
+  same class, so scores(i, j) is the model's log(p(same) / p(different)) for
+  elements i and j of the original minibatch of input. We assume that the data
+  in 'scores' has been arranged in such a way that pairs of indexes of the form
+  (2k, 2k+1), e.g., (0, 1), (2, 3), (4, 5), etc, are from the same class, but
+  indexes of any other form, such as (0, 2), (1, 2), etc, are from different
+  classes.
+  @param [out] objf_terms   'objf_terms' is a matrix of the same dimension as
+  'scores' whose elements we will sum to get the objective function for this
+  minibatch. This function computes the appropriate contributions to the
+  objective function, as follows.
+    if i == j:
+      objf_terms(i, j) = 0       # the same exact element is not scored
+    elif {i, j} == {2k, 2k+1} for some k:   # same-class pair
+      objf_terms(i, j) = log(p(same))
+                       = -log(1 + exp(-scores(i, j)))
+    else:
+      objf_terms(i, j) = 1 / (scores.NumRows() - 2) * log(p(different))
+                       = -1/(scores.NumRows() - 2) * log(1 + exp(scores(i, j)))
+  @param [out] objf_derivs    Element (i,j) of this matrix is the derivative
+  of objf_terms(i,j) with respect to scores(i, j).
 */
-template <typename BaseFloat>
 void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
                                   CuMatrixBase<BaseFloat> *objf_terms,
                                   CuMatrixBase<BaseFloat> *objf_derivs);

diff --git a/src/ivector/xvector-test.cc b/src/ivector/xvector-test.cc
@@ -30,17 +30,17 @@ void TestGetDeriv(const CuVector<BaseFloat> &v,
     const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, bool is_same, BaseFloat similarity_score,
     CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
-    CuVector<BaseFloat> *deriv_S_and_b);
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b);
 
 void TestComputeXvectorObjfAndDeriv(
     const CuMatrixBase<BaseFloat> &xvector_pairs,
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
-    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf,
     BaseFloat *tot_weight);
 
 bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
-  int32 xvector_dim = RandInt(4, 30),
+  int32 xvector_dim = RandInt(4, 50),
         num_rows = 2 * RandInt(2, 10); // The number of rows must be even
                                        // and greater than 2.
   CuSpMatrix<BaseFloat> S(xvector_dim);
@@ -49,14 +49,15 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
   S.Scale(1.0e-01);
   BaseFloat b = RandInt(-100, 100) / 10.0,
             tot_weight,
-            tot_objf;
+            tot_objf,
+            deriv_b;
   int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
   CuMatrix<BaseFloat> xvector_pairs(num_rows, xvector_dim, kSetZero),
                       deriv_xvector(num_rows, xvector_dim, kSetZero);
-  CuVector<BaseFloat> deriv_S_and_b(S_dim + 1, kSetZero);
+  CuVector<BaseFloat> deriv_S(S_dim, kSetZero);
   xvector_pairs.SetRandn();
   ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector,
-    &deriv_S_and_b, &tot_objf, &tot_weight);
+    &deriv_S, &deriv_b, &tot_objf, &tot_weight);
   CuVector<BaseFloat> deriv_xvector_vec(xvector_dim);
 
   // Sum over the derivatives for xvector input.
@@ -74,14 +75,12 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
       xvector_pairs_p(j, i) += perturb_delta;
       xvector_pairs_n(j, i) += -perturb_delta;
     }
-    CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
-    CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
-    BaseFloat tot_objf_p;
-    BaseFloat tot_objf_n;
-    ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
-    ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+    BaseFloat tot_objf_p,
+              tot_objf_n;
+    ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, NULL,
+      NULL, NULL, &tot_objf_p, &tot_weight);
+    ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, NULL,
+      NULL, NULL, &tot_objf_n, &tot_weight);
     BaseFloat delta = (tot_objf_p  - tot_objf_n)
       * 1.0 / (2.0 * perturb_delta);
     l2_xvector += pow(deriv_xvector_vec(i) - delta, 2);
@@ -92,43 +91,42 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
   for (int32 i = 0; i < S_dim; i++) {
     CuSpMatrix<BaseFloat> S_p(S);
     CuSpMatrix<BaseFloat> S_n(S);
-    S_p.Data()[i] += perturb_delta;
-    S_n.Data()[i] -= perturb_delta;
-    CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
-    CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
-    BaseFloat tot_objf_p;
-    BaseFloat tot_objf_n;
-    ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
-    ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+    CuSubVector<BaseFloat> S_p_vec(S_p.Data(), S_dim);
+    CuSubVector<BaseFloat> S_n_vec(S_n.Data(), S_dim);
+    S_p_vec(i) += perturb_delta;
+    S_n_vec(i) += -perturb_delta;
+    BaseFloat tot_objf_p,
+              tot_objf_n;
+    ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, NULL,
+      NULL, NULL, &tot_objf_p, &tot_weight);
+    ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, NULL,
+      NULL, NULL, &tot_objf_n, &tot_weight);
     BaseFloat delta = (tot_objf_p  - tot_objf_n)
       * 1.0 / (2.0 * perturb_delta);
-    l2_S += pow(deriv_S_and_b(i) - delta, 2);
+    l2_S += pow(deriv_S(i) - delta, 2);
   }
 
   // Compare the b derivative calculated above with a numerical
   // approximation.
   BaseFloat b_p = b + perturb_delta;
   BaseFloat b_n = b - perturb_delta;
-  CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
-  CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
   BaseFloat tot_objf_p;
   BaseFloat tot_objf_n;
-  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, &deriv_xvector_tmp,
-    &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
-  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, &deriv_xvector_tmp,
-    &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
-  BaseFloat delta = (tot_objf_p  - tot_objf_n) * 1.0 / (2.0 * perturb_delta);
-  l2_b = pow(deriv_S_and_b(S_dim) - delta, 2);
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, NULL,
+    NULL, NULL, &tot_objf_p, &tot_weight);
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, NULL,
+    NULL, NULL, &tot_objf_n, &tot_weight);
+  BaseFloat delta = (tot_objf_p  - tot_objf_n)
+                    * 1.0 / (2.0 * perturb_delta);
+  l2_b = pow(deriv_b - delta, 2);
   KALDI_ASSERT(l2_xvector < 1.0e-03);
   KALDI_ASSERT(l2_S <  1.0e-03);
   KALDI_ASSERT(l2_b < 1.0e-03);
   return true;
 }
 
 bool TestXvectorComputeObjf() {
-  int32 xvector_dim = RandInt(4, 30),
+  int32 xvector_dim = RandInt(4, 40),
         num_rows = 2 * RandInt(2, 10); // The number of rows must be even
                                        // and greater than 2.
   CuSpMatrix<BaseFloat> S(xvector_dim);
@@ -139,19 +137,21 @@ bool TestXvectorComputeObjf() {
             tot_weight,
             tot_weight_test,
             tot_objf,
-            tot_objf_test;
+            tot_objf_test,
+            deriv_b,
+            deriv_b_test;
   int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
   CuMatrix<BaseFloat> xvector_pairs(num_rows, xvector_dim, kSetZero),
                       deriv_xvector(num_rows, xvector_dim, kSetZero),
                       deriv_xvector_test(num_rows, xvector_dim, kSetZero);
-  CuVector<BaseFloat> deriv_S_and_b(S_dim + 1, kSetZero),
-                      deriv_S_and_b_test(S_dim + 1, kSetZero);
+  CuVector<BaseFloat> deriv_S(S_dim, kSetZero),
+                      deriv_S_test(S_dim, kSetZero);
   xvector_pairs.SetRandn();
 
   ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector,
-    &deriv_S_and_b, &tot_objf, &tot_weight);
+    &deriv_S, &deriv_b, &tot_objf, &tot_weight);
   TestComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector_test,
-    &deriv_S_and_b_test, &tot_objf_test, &tot_weight_test);
+    &deriv_S_test, &deriv_b_test, &tot_objf_test, &tot_weight_test);
 
   CuVector<BaseFloat> deriv_xvector_vec(xvector_dim);
   deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector, 0.0);
@@ -160,43 +160,50 @@ bool TestXvectorComputeObjf() {
 
   // Verify that the objfs are the same.
   KALDI_ASSERT(ApproxEqual(tot_objf, tot_objf_test, 0.001));
+
   // Also verify that the gradients are the same.
   for (int32 i = 0; i < deriv_xvector_vec.Dim(); i++)
-    KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i), deriv_xvector_vec_test(i), 0.001));
-  for (int32 i = 0; i < deriv_S_and_b.Dim(); i++)
-    KALDI_ASSERT(ApproxEqual(deriv_S_and_b(i), deriv_S_and_b_test(i), 0.001));
+    KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i),
+    deriv_xvector_vec_test(i), 0.001));
+
+  // Verify that the S derivatives are the same.
+  for (int32 i = 0; i < deriv_S.Dim(); i++)
+    KALDI_ASSERT(ApproxEqual(deriv_S(i), deriv_S_test(i), 0.001));
+
+  // Verify that the b derivatives are the same.
+  KALDI_ASSERT(ApproxEqual(deriv_b, deriv_b_test, 0.001));
   return true;
 }
 
 void TestComputeXvectorObjfAndDeriv(
     const CuMatrixBase<BaseFloat> &xvector_pairs,
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
-    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf,
     BaseFloat *tot_weight) {
 
   int32 N = xvector_pairs.NumRows();
   BaseFloat same_objf = 0,
             diff_objf = 0;
   BaseFloat K = 1.0 / (N - 2.0);
-  int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
-  CuMatrix<BaseFloat> tmp_deriv(N, xvector_pairs.NumCols()
-                                + S_dim + 1, kSetZero);
+  (*deriv_b) = 0;
   // Handle portion of the objf corresponding to pairs of xvectors
   // from the same classes.
   for (int32 i = 0; i < N/2; i++) {
     const CuVector<BaseFloat> &v(xvector_pairs.Row(2 * i)),
                               &w(xvector_pairs.Row(2 * i + 1));
     CuVector<BaseFloat> deriv_v,
                         deriv_w,
-                        deriv_S_and_b_part;
-    BaseFloat similarity_score = TestSimilarityScore(v, w, S, b);
+                        deriv_S_part;
+    BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+              deriv_b_part = 0;
     same_objf += Log(1 + Exp(-similarity_score));
     TestGetDeriv(v, w, S, b, true, similarity_score, &deriv_v,
-     &deriv_w, &deriv_S_and_b_part);
+     &deriv_w, &deriv_S_part, &deriv_b_part);
     deriv_xvector->Row(2 * i).AddVec(1.0, deriv_v);
     deriv_xvector->Row(2 * i + 1).AddVec(1.0, deriv_w);
-    deriv_S_and_b->AddVec(1.0, deriv_S_and_b_part);
+    deriv_S->AddVec(1.0, deriv_S_part);
+    (*deriv_b) += deriv_b_part;
   }
 
   // Handle portion of the objf corresponding to pairs of xvectors
@@ -207,14 +214,16 @@ void TestComputeXvectorObjfAndDeriv(
                                 &w(xvector_pairs.Row(j));
       CuVector<BaseFloat> deriv_v,
                           deriv_w,
-                          deriv_S_and_b_part;
-      BaseFloat similarity_score = TestSimilarityScore(v, w, S, b);
+                          deriv_S_part;
+      BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+              deriv_b_part = 0;
       diff_objf += Log(1 + Exp(similarity_score));
       TestGetDeriv(v, w, S, b, false, similarity_score, &deriv_v,
-        &deriv_w, &deriv_S_and_b_part);
+        &deriv_w, &deriv_S_part, &deriv_b_part);
       deriv_xvector->Row(i).AddVec(K, deriv_v);
       deriv_xvector->Row(j).AddVec(K, deriv_w);
-      deriv_S_and_b->AddVec(K, deriv_S_and_b_part);
+      deriv_S->AddVec(K, deriv_S_part);
+      (*deriv_b) += K * deriv_b_part;
     }
   }
   // Scale the same and different portions of the objective function
@@ -228,12 +237,12 @@ void TestGetDeriv(const CuVector<BaseFloat> &v,
     const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, bool is_same, BaseFloat similarity_score,
     CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
-    CuVector<BaseFloat> *deriv_S_and_b) {
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b) {
   int32 d = is_same ? 1 : -1,
         S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
   deriv_v->Resize(v.Dim(), kSetZero);
   deriv_w->Resize(v.Dim(), kSetZero);
-  deriv_S_and_b->Resize(S_dim + 1, kSetZero);
+  deriv_S->Resize(S_dim, kSetZero);
 
   // This scalar is common to the different derivatives.
   BaseFloat deriv_coef = d * Exp(-1 * d * similarity_score)
@@ -254,11 +263,10 @@ void TestGetDeriv(const CuVector<BaseFloat> &v,
   for (int32 i = 0; i < S.NumCols(); i++)
     deriv_S_mat(i, i) = 0.5 * deriv_S_mat(i, i);
   CuSubVector<BaseFloat> deriv_S_vec(deriv_S_mat.Data(), S_dim);
-  CuSubVector<BaseFloat> sub_deriv_S_and_b(*deriv_S_and_b, 0, S_dim);
-  sub_deriv_S_and_b.AddVec(deriv_coef, deriv_S_vec);
+  deriv_S->AddVec(deriv_coef, deriv_S_vec);
 
   // Handle derivative with respect to b.
-  (*deriv_S_and_b)(S_dim) = -deriv_coef;
+  (*deriv_b) = -deriv_coef;
 }
 
 BaseFloat TestSimilarityScore(const CuVector<BaseFloat> &v,
@@ -286,14 +294,15 @@ void UnitTestXvectorExtractor() {
 
 int main() {
   using namespace kaldi;
-  for (int32 i = 0; i < 2; i++)
+  for (int32 i = 0; i < 2; i++) {
 #if HAVE_CUDA == 1
     if (i == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
+      CuDevice::Instantiate().SelectGpuId("no");
     else
-      CuDevice::Instantiate().SelectGpuId("yes"); // -1 means no GPU
+      CuDevice::Instantiate().SelectGpuId("yes");
 #endif
     UnitTestXvectorExtractor();
+  }
   std::cout << "Xvector tests succeeded.\n";
   return 0;
 }