diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 00d4f8bebf8..41288b411c0 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2105,11 +2105,11 @@ static void _compute_xvector_objf(const Real* scores, MatrixDim scores_dim,
   int32_cuda scores_index = i + j * scores_dim.stride;
   Real K = 1.0 / (scores_dim.rows - 2.0);
   Real L = scores[scores_index];
-  if (i < scores_dim.cols && j < scores_dim.rows && i < j) {
+  if (i < scores_dim.cols && j < scores_dim.rows) {
     if (i + 1 == j && i % 2 == 0) {
       obfj_terms[scores_index] = log(1.0 + exp(-L));
       obfj_derivs[scores_index] = 1.0 / (1.0 + exp(L));
-    } else if (i != j) {
+    } else if (i < j) {
       obfj_terms[scores_index] = K * log(1.0 + exp(L));
       obfj_derivs[scores_index] = -K / (1.0 + exp(-L));
     }
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index d5e9cfb6ef8..005bb3146c0 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -206,10 +206,11 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices
   }
 }
 
-template<typename Real>
-void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
-                                  CuMatrixBase<Real> *objf_terms,
-                                  CuMatrixBase<Real> *objf_derivs) {
+void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
+                                  CuMatrixBase<BaseFloat> *objf_terms,
+                                  CuMatrixBase<BaseFloat> *objf_derivs) {
+  KALDI_ASSERT(SameDim(*objf_terms, *objf_derivs)
+               && SameDim(*objf_terms, scores));
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
@@ -226,8 +227,19 @@ void ComputeXvectorObjfFromScores(const CuMatrixBase<Real> &scores,
   } else
   #endif
   {
-    // TODO: Add the CPU version.
-    KALDI_LOG << "NOT USING CUDA";
+    int32 num_rows = scores.NumRows();
+    BaseFloat K = 1.0 / (num_rows - 2.0);
+    for (int32 i = 0; i < num_rows; i++) {
+      for (int32 j = i + 1; j < num_rows; j++) {
+        if (i + 1 == j && i % 2 == 0) {
+          (*objf_terms)(i, j) = log(1.0 + exp(-scores(i, j)));
+          (*objf_derivs)(i, j) = 1.0 / (1.0 + exp(scores(i, j)));
+        } else {
+          (*objf_terms)(i, j) = K * log(1.0 + exp(scores(i, j)));
+          (*objf_derivs)(i, j) = -K / (1.0 + exp(-scores(i, j)));
+        }
+      }
+    }
   }
 }
 
@@ -259,16 +271,6 @@ void Randomize(const CuMatrixBase<double> &src,
                const CuArray<int32> &copy_from_idx,
                CuMatrixBase<double> *tgt);
 
-template
-void ComputeXvectorObjfFromScores(const CuMatrixBase<float> &scores,
-                                  CuMatrixBase<float> *objf_terms,
-                                  CuMatrixBase<float> *objf_derivs);
-template
-void ComputeXvectorObjfFromScores(const CuMatrixBase<double> &scores,
-                                  CuMatrixBase<double> *objf_terms,
-                                  CuMatrixBase<double> *objf_derivs);
-
-
 
 } //namespace cu
 
diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h
index a30cec5d9df..4aaa4ceb29a 100644
--- a/src/cudamatrix/cu-math.h
+++ b/src/cudamatrix/cu-math.h
@@ -80,9 +80,31 @@ void Group2norm(const CuMatrixBase<Real> &src,
                 int32 group_stride);
 
 /*
-TODO: Documentation.
+  This function is used in computing the objective function and derivatives
+  in xvector training.
+  @param [in] scores   'scores' is a symmetric matrix of scores which are to
+  be interpreted as log-odds (according to the model) of pairs coming from the
+  same class, so scores(i, j) is the model's log(p(same)/p(different)) for
+  elements i and j of the original minibatch of input. We assume that the data
+  in 'scores' has been arranged in such a way that pairs of indexes of the form
+  (2k, 2k+1), e.g., (0, 1), (2, 3), (4, 5), etc, are from the same class, but
+  indexes of any other form, such as (0, 2), (1, 2), etc, are from different
+  classes.
+  @param [out] objf_terms   'objf_terms' is a matrix of the same dimension as
+  'scores' whose elements we will sum to get the objective function for this
+  minibatch. This function computes the appropriate contributions to the
+  objective function, as follows.
+    if i == j:
+      objf_terms(i, j) = 0       # the same exact element is not scored
+    elsif i/2 == j/2:            # integer division: {i, j} = {2k, 2k+1}
+      objf_terms(i, j) = log(p(same))
+                       = -log(1 + exp(-scores(i, j)))
+    else:
+      objf_terms(i, j) = 1 / (scores.NumRows() - 2) * log(p(different))
+                       = -1/(scores.NumRows() - 2) * log(1+exp(scores(i,j)))
+  @param [out] objf_derivs    Element (i,j) of this matrix is the derivative
+  of objf_terms(i,j) with respect to scores(i, j).
 */
-template <typename BaseFloat>
 void ComputeXvectorObjfFromScores(const CuMatrixBase<BaseFloat> &scores,
                                   CuMatrixBase<BaseFloat> *objf_terms,
                                   CuMatrixBase<BaseFloat> *objf_derivs);
diff --git a/src/ivector/xvector-test.cc b/src/ivector/xvector-test.cc
index 229863e820a..ae3b6d7e57b 100644
--- a/src/ivector/xvector-test.cc
+++ b/src/ivector/xvector-test.cc
@@ -30,17 +30,17 @@ void TestGetDeriv(const CuVector<BaseFloat> &v,
     const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, bool is_same, BaseFloat similarity_score,
     CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
-    CuVector<BaseFloat> *deriv_S_and_b);
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b);
 
 void TestComputeXvectorObjfAndDeriv(
     const CuMatrixBase<BaseFloat> &xvector_pairs,
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
-    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf,
     BaseFloat *tot_weight);
 
 bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
-  int32 xvector_dim = RandInt(4, 30),
+  int32 xvector_dim = RandInt(4, 50),
         num_rows = 2 * RandInt(2, 10); // The number of rows must be even
                                        // and greater than 2.
   CuSpMatrix<BaseFloat> S(xvector_dim);
@@ -49,14 +49,15 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
   S.Scale(1.0e-01);
   BaseFloat b = RandInt(-100, 100) / 10.0,
             tot_weight,
-            tot_objf;
+            tot_objf,
+            deriv_b;
   int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
   CuMatrix<BaseFloat> xvector_pairs(num_rows, xvector_dim, kSetZero),
                       deriv_xvector(num_rows, xvector_dim, kSetZero);
-  CuVector<BaseFloat> deriv_S_and_b(S_dim + 1, kSetZero);
+  CuVector<BaseFloat> deriv_S(S_dim, kSetZero);
   xvector_pairs.SetRandn();
   ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector,
-    &deriv_S_and_b, &tot_objf, &tot_weight);
+    &deriv_S, &deriv_b, &tot_objf, &tot_weight);
   CuVector<BaseFloat> deriv_xvector_vec(xvector_dim);
 
   // Sum over the derivatives for xvector input.
@@ -74,14 +75,12 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
       xvector_pairs_p(j, i) += perturb_delta;
       xvector_pairs_n(j, i) += -perturb_delta;
     }
-    CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
-    CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
-    BaseFloat tot_objf_p;
-    BaseFloat tot_objf_n;
-    ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
-    ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+    BaseFloat tot_objf_p,
+              tot_objf_n;
+    ComputeXvectorObjfAndDeriv(xvector_pairs_p, S, b, NULL,
+      NULL, NULL, &tot_objf_p, &tot_weight);
+    ComputeXvectorObjfAndDeriv(xvector_pairs_n, S, b, NULL,
+      NULL, NULL, &tot_objf_n, &tot_weight);
     BaseFloat delta = (tot_objf_p  - tot_objf_n)
       * 1.0 / (2.0 * perturb_delta);
     l2_xvector += pow(deriv_xvector_vec(i) - delta, 2);
@@ -92,35 +91,34 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
   for (int32 i = 0; i < S_dim; i++) {
     CuSpMatrix<BaseFloat> S_p(S);
     CuSpMatrix<BaseFloat> S_n(S);
-    S_p.Data()[i] += perturb_delta;
-    S_n.Data()[i] -= perturb_delta;
-    CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
-    CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
-    BaseFloat tot_objf_p;
-    BaseFloat tot_objf_n;
-    ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
-    ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, &deriv_xvector_tmp,
-      &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
+    CuSubVector<BaseFloat> S_p_vec(S_p.Data(), S_dim);
+    CuSubVector<BaseFloat> S_n_vec(S_n.Data(), S_dim);
+    S_p_vec(i) += perturb_delta;
+    S_n_vec(i) += -perturb_delta;
+    BaseFloat tot_objf_p,
+              tot_objf_n;
+    ComputeXvectorObjfAndDeriv(xvector_pairs, S_p, b, NULL,
+      NULL, NULL, &tot_objf_p, &tot_weight);
+    ComputeXvectorObjfAndDeriv(xvector_pairs, S_n, b, NULL,
+      NULL, NULL, &tot_objf_n, &tot_weight);
     BaseFloat delta = (tot_objf_p  - tot_objf_n)
       * 1.0 / (2.0 * perturb_delta);
-    l2_S += pow(deriv_S_and_b(i) - delta, 2);
+    l2_S += pow(deriv_S(i) - delta, 2);
   }
 
   // Compare the b derivative calculated above with a numerical
   // approximation.
   BaseFloat b_p = b + perturb_delta;
   BaseFloat b_n = b - perturb_delta;
-  CuMatrix<BaseFloat> deriv_xvector_tmp(num_rows, xvector_dim, kSetZero);
-  CuVector<BaseFloat> deriv_S_and_b_tmp(S_dim + 1, kSetZero);
   BaseFloat tot_objf_p;
   BaseFloat tot_objf_n;
-  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, &deriv_xvector_tmp,
-    &deriv_S_and_b_tmp, &tot_objf_p, &tot_weight);
-  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, &deriv_xvector_tmp,
-    &deriv_S_and_b_tmp, &tot_objf_n, &tot_weight);
-  BaseFloat delta = (tot_objf_p  - tot_objf_n) * 1.0 / (2.0 * perturb_delta);
-  l2_b = pow(deriv_S_and_b(S_dim) - delta, 2);
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_p, NULL,
+    NULL, NULL, &tot_objf_p, &tot_weight);
+  ComputeXvectorObjfAndDeriv(xvector_pairs, S, b_n, NULL,
+    NULL, NULL, &tot_objf_n, &tot_weight);
+  BaseFloat delta = (tot_objf_p  - tot_objf_n)
+                    * 1.0 / (2.0 * perturb_delta);
+  l2_b = pow(deriv_b - delta, 2);
   KALDI_ASSERT(l2_xvector < 1.0e-03);
   KALDI_ASSERT(l2_S <  1.0e-03);
   KALDI_ASSERT(l2_b < 1.0e-03);
@@ -128,7 +126,7 @@ bool TestXvectorExtractorDerivative(BaseFloat perturb_delta) {
 }
 
 bool TestXvectorComputeObjf() {
-  int32 xvector_dim = RandInt(4, 30),
+  int32 xvector_dim = RandInt(4, 40),
         num_rows = 2 * RandInt(2, 10); // The number of rows must be even
                                        // and greater than 2.
   CuSpMatrix<BaseFloat> S(xvector_dim);
@@ -139,19 +137,21 @@ bool TestXvectorComputeObjf() {
             tot_weight,
             tot_weight_test,
             tot_objf,
-            tot_objf_test;
+            tot_objf_test,
+            deriv_b,
+            deriv_b_test;
   int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
   CuMatrix<BaseFloat> xvector_pairs(num_rows, xvector_dim, kSetZero),
                       deriv_xvector(num_rows, xvector_dim, kSetZero),
                       deriv_xvector_test(num_rows, xvector_dim, kSetZero);
-  CuVector<BaseFloat> deriv_S_and_b(S_dim + 1, kSetZero),
-                      deriv_S_and_b_test(S_dim + 1, kSetZero);
+  CuVector<BaseFloat> deriv_S(S_dim, kSetZero),
+                      deriv_S_test(S_dim, kSetZero);
   xvector_pairs.SetRandn();
 
   ComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector,
-    &deriv_S_and_b, &tot_objf, &tot_weight);
+    &deriv_S, &deriv_b, &tot_objf, &tot_weight);
   TestComputeXvectorObjfAndDeriv(xvector_pairs, S, b, &deriv_xvector_test,
-    &deriv_S_and_b_test, &tot_objf_test, &tot_weight_test);
+    &deriv_S_test, &deriv_b_test, &tot_objf_test, &tot_weight_test);
 
   CuVector<BaseFloat> deriv_xvector_vec(xvector_dim);
   deriv_xvector_vec.AddRowSumMat(1.0, deriv_xvector, 0.0);
@@ -160,11 +160,18 @@ bool TestXvectorComputeObjf() {
 
   // Verify that the objfs are the same.
   KALDI_ASSERT(ApproxEqual(tot_objf, tot_objf_test, 0.001));
+
   // Also verify that the gradients are the same.
   for (int32 i = 0; i < deriv_xvector_vec.Dim(); i++)
-    KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i), deriv_xvector_vec_test(i), 0.001));
-  for (int32 i = 0; i < deriv_S_and_b.Dim(); i++)
-    KALDI_ASSERT(ApproxEqual(deriv_S_and_b(i), deriv_S_and_b_test(i), 0.001));
+    KALDI_ASSERT(ApproxEqual(deriv_xvector_vec(i),
+    deriv_xvector_vec_test(i), 0.001));
+
+  // Verify that the S derivatives are the same.
+  for (int32 i = 0; i < deriv_S.Dim(); i++)
+    KALDI_ASSERT(ApproxEqual(deriv_S(i), deriv_S_test(i), 0.001));
+
+  // Verify that the b derivatives are the same.
+  KALDI_ASSERT(ApproxEqual(deriv_b, deriv_b_test, 0.001));
   return true;
 }
 
@@ -172,16 +179,14 @@ void TestComputeXvectorObjfAndDeriv(
     const CuMatrixBase<BaseFloat> &xvector_pairs,
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
-    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b, BaseFloat *tot_objf,
     BaseFloat *tot_weight) {
 
   int32 N = xvector_pairs.NumRows();
   BaseFloat same_objf = 0,
             diff_objf = 0;
   BaseFloat K = 1.0 / (N - 2.0);
-  int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
-  CuMatrix<BaseFloat> tmp_deriv(N, xvector_pairs.NumCols()
-                                + S_dim + 1, kSetZero);
+  (*deriv_b) = 0;
   // Handle portion of the objf corresponding to pairs of xvectors
   // from the same classes.
   for (int32 i = 0; i < N/2; i++) {
@@ -189,14 +194,16 @@ void TestComputeXvectorObjfAndDeriv(
                               &w(xvector_pairs.Row(2 * i + 1));
     CuVector<BaseFloat> deriv_v,
                         deriv_w,
-                        deriv_S_and_b_part;
-    BaseFloat similarity_score = TestSimilarityScore(v, w, S, b);
+                        deriv_S_part;
+    BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+              deriv_b_part = 0;
     same_objf += Log(1 + Exp(-similarity_score));
     TestGetDeriv(v, w, S, b, true, similarity_score, &deriv_v,
-     &deriv_w, &deriv_S_and_b_part);
+     &deriv_w, &deriv_S_part, &deriv_b_part);
     deriv_xvector->Row(2 * i).AddVec(1.0, deriv_v);
     deriv_xvector->Row(2 * i + 1).AddVec(1.0, deriv_w);
-    deriv_S_and_b->AddVec(1.0, deriv_S_and_b_part);
+    deriv_S->AddVec(1.0, deriv_S_part);
+    (*deriv_b) += deriv_b_part;
   }
 
   // Handle portion of the objf corresponding to pairs of xvectors
@@ -207,14 +214,16 @@ void TestComputeXvectorObjfAndDeriv(
                                 &w(xvector_pairs.Row(j));
       CuVector<BaseFloat> deriv_v,
                           deriv_w,
-                          deriv_S_and_b_part;
-      BaseFloat similarity_score = TestSimilarityScore(v, w, S, b);
+                          deriv_S_part;
+      BaseFloat similarity_score = TestSimilarityScore(v, w, S, b),
+              deriv_b_part = 0;
       diff_objf += Log(1 + Exp(similarity_score));
       TestGetDeriv(v, w, S, b, false, similarity_score, &deriv_v,
-        &deriv_w, &deriv_S_and_b_part);
+        &deriv_w, &deriv_S_part, &deriv_b_part);
       deriv_xvector->Row(i).AddVec(K, deriv_v);
       deriv_xvector->Row(j).AddVec(K, deriv_w);
-      deriv_S_and_b->AddVec(K, deriv_S_and_b_part);
+      deriv_S->AddVec(K, deriv_S_part);
+      (*deriv_b) += K * deriv_b_part;
     }
   }
   // Scale the same and different portions of the objective function
@@ -228,12 +237,12 @@ void TestGetDeriv(const CuVector<BaseFloat> &v,
     const CuVector<BaseFloat> &w, const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, bool is_same, BaseFloat similarity_score,
     CuVector<BaseFloat> *deriv_v, CuVector<BaseFloat> *deriv_w,
-    CuVector<BaseFloat> *deriv_S_and_b) {
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b) {
   int32 d = is_same ? 1 : -1,
         S_dim = S.NumCols() * (S.NumCols() + 1) / 2;
   deriv_v->Resize(v.Dim(), kSetZero);
   deriv_w->Resize(v.Dim(), kSetZero);
-  deriv_S_and_b->Resize(S_dim + 1, kSetZero);
+  deriv_S->Resize(S_dim, kSetZero);
 
   // This scalar is common to the different derivatives.
   BaseFloat deriv_coef = d * Exp(-1 * d * similarity_score)
@@ -254,11 +263,10 @@ void TestGetDeriv(const CuVector<BaseFloat> &v,
   for (int32 i = 0; i < S.NumCols(); i++)
     deriv_S_mat(i, i) = 0.5 * deriv_S_mat(i, i);
   CuSubVector<BaseFloat> deriv_S_vec(deriv_S_mat.Data(), S_dim);
-  CuSubVector<BaseFloat> sub_deriv_S_and_b(*deriv_S_and_b, 0, S_dim);
-  sub_deriv_S_and_b.AddVec(deriv_coef, deriv_S_vec);
+  deriv_S->AddVec(deriv_coef, deriv_S_vec);
 
   // Handle derivative with respect to b.
-  (*deriv_S_and_b)(S_dim) = -deriv_coef;
+  (*deriv_b) = -deriv_coef;
 }
 
 BaseFloat TestSimilarityScore(const CuVector<BaseFloat> &v,
@@ -286,14 +294,15 @@ void UnitTestXvectorExtractor() {
 
 int main() {
   using namespace kaldi;
-  for (int32 i = 0; i < 2; i++)
+  for (int32 i = 0; i < 2; i++) {  // braces: device selection AND the test run both loop
 #if HAVE_CUDA == 1
     if (i == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
+      CuDevice::Instantiate().SelectGpuId("no");  // pass 0; presumably CPU-only -- confirm
     else
-      CuDevice::Instantiate().SelectGpuId("yes"); // -1 means no GPU
+      CuDevice::Instantiate().SelectGpuId("yes");  // pass 1; presumably uses a GPU -- confirm
 #endif
     UnitTestXvectorExtractor();
+  }  // end of the two-pass loop
   std::cout << "Xvector tests succeeded.\n";
   return 0;
 }
diff --git a/src/ivector/xvector.cc b/src/ivector/xvector.cc
index a6e8533b611..c06942d1cb6 100644
--- a/src/ivector/xvector.cc
+++ b/src/ivector/xvector.cc
@@ -25,52 +25,86 @@ void ComputeXvectorObjfAndDeriv(
     const CuMatrixBase<BaseFloat> &xvector_pairs,
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b, CuMatrixBase<BaseFloat> *deriv_xvector,
-    CuVector<BaseFloat> *deriv_S_and_b, BaseFloat *tot_objf,
+    CuVector<BaseFloat> *deriv_S, BaseFloat *deriv_b,
+    BaseFloat *tot_objf,
     BaseFloat *tot_weight) {
 
   int32 S_dim = S.NumCols() * (S.NumCols() + 1) / 2,
         N = xvector_pairs.NumRows(),
         xvector_dim = xvector_pairs.NumCols();
-  BaseFloat K = 1.0 / (N - 2.0);
   (*tot_objf) = 0;
 
   if (deriv_xvector == NULL)
-    KALDI_ASSERT(deriv_S_and_b == NULL);
+    KALDI_ASSERT(deriv_S == NULL && deriv_b == NULL);
   else {
     KALDI_ASSERT(deriv_xvector->NumCols() == xvector_dim);
     KALDI_ASSERT(deriv_xvector->NumRows() == N);
-    KALDI_ASSERT(deriv_S_and_b->Dim() == S_dim + 1);
+    KALDI_ASSERT(deriv_S->Dim() == S_dim);
   }
 
-  CuMatrix<BaseFloat> S_tmp(S);
-  CuMatrix<BaseFloat> P(N, xvector_dim),
+  CuMatrix<BaseFloat> S_tmp(S),
+                      P(N, xvector_dim),
                       Q(N, N),
                       R(N, N),
-                      T(N, N),
+                      scores(N, N),           // The raw scores.
                       objf_terms(N, N),
-                      objf_deriv_terms(N, N);
-
+                      objf_deriv_terms(N, N); // Derivative of the
+                                              // objf w.r.t. the scores.
   CuVector<BaseFloat> r(N);
+
   P.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0);
   r.AddDiagMatMat(1.0, xvector_pairs, kNoTrans, P, kTrans, 0.0);
   R.AddVecToRows(1.0, r);
   Q.SymAddMat2(1.0, xvector_pairs, kNoTrans, 0.0);
   Q.CopyLowerToUpper();
-  T.AddMat(1.0, Q, kNoTrans);
-  T.AddMat(-1.0, R, kTrans);
-  T.AddMat(-1.0, R, kNoTrans);
-  T.Add(b);
+  scores.AddMat(1.0, Q, kNoTrans);
+  scores.AddMat(-1.0, R, kTrans);
+  scores.AddMat(-1.0, R, kNoTrans);
+  scores.Add(b);
 
-  cu::ComputeXvectorObjfFromScores<BaseFloat>(T, &objf_terms, &objf_deriv_terms);
+  cu::ComputeXvectorObjfFromScores(scores, &objf_terms, &objf_deriv_terms);
   CuVector<BaseFloat> objf_terms_vec(N);
   objf_terms_vec.AddRowSumMat(1.0, objf_terms);
   (*tot_objf) = objf_terms_vec.Sum();
 
   if (deriv_xvector != NULL) {
-    /* TODO: Call cu-math function that handles the derivatives of S
-       and the xvectors.
-    */
-    (*deriv_S_and_b)(S_dim) = -objf_deriv_terms.Sum();
+    // Some vector and matrix quantities for computing the
+    // derivatives.
+    CuMatrix<BaseFloat> objf_deriv_terms_trans(objf_deriv_terms, kTrans),
+             S_deriv_part(N, xvector_dim),
+             S_deriv(xvector_dim, xvector_dim);
+    CuVector<BaseFloat> cvec_rows(N),
+                        cvec_cols(N);
+    cvec_rows.AddRowSumMat(1.0, objf_deriv_terms, 1.0);
+    cvec_cols.AddRowSumMat(1.0, objf_deriv_terms_trans, 1.0);
+    CuVector<BaseFloat> cvec(cvec_rows);
+    cvec.AddVec(1.0, cvec_cols, 1.0);
+
+    // Compute derivative of the objf with respect to the xvectors.
+    CuMatrix<BaseFloat> SX(N, xvector_dim);
+    SX.AddMatMat(1.0, xvector_pairs, kNoTrans, S_tmp, kNoTrans, 0.0);
+    deriv_xvector->AddDiagVecMat(-1.0, cvec_rows, xvector_pairs,
+                                 kNoTrans, 0.0);
+    deriv_xvector->AddMatMat(-1.0, objf_deriv_terms, kTrans,
+                             xvector_pairs, kNoTrans, 1.0);
+    deriv_xvector->AddDiagVecMat(2.0, cvec_cols, SX,
+                                kNoTrans, 1.0);
+    deriv_xvector->AddMatMat(2.0, objf_deriv_terms, kNoTrans,
+                             SX, kNoTrans, 1.0);
+
+    // Compute derivative of the objf with respect to the symmetric matrix
+    // S.
+    S_deriv_part.AddDiagVecMat(2.0, cvec, xvector_pairs,
+                              kNoTrans, 0.0);
+    S_deriv.AddMatMat(1.0, xvector_pairs, kTrans, S_deriv_part,
+                      kNoTrans, 1.0);
+    CuSpMatrix<BaseFloat> S_deriv_tmp(S_deriv);
+    S_deriv_tmp.ScaleDiag(0.5);
+    deriv_S->CopyFromVec(CuSubVector<BaseFloat>(S_deriv_tmp.Data(),
+                            S_dim));
+
+    // Compute derivative of objf with respect to the scalar offset b.
+    (*deriv_b) = -objf_deriv_terms.Sum();
   }
   (*tot_weight) = N;
 }
diff --git a/src/ivector/xvector.h b/src/ivector/xvector.h
index ddb05c632d7..53d0864575a 100644
--- a/src/ivector/xvector.h
+++ b/src/ivector/xvector.h
@@ -55,8 +55,10 @@ namespace kaldi {
   (2, 4), etc, are from different classes.
   @param [out] deriv_xvector  If non-NULL, the derivative of the objective
   function with respect to the xvectors is written here.
-  @param [out] deriv_S_and_b  If non-NULL, the derivative of the objective
-  function with respect to the parameters S and b are written here.
+  @param [out] deriv_S  If non-NULL, the derivative of the objective
+  function with respect to the parameter S is written here.
+  @param [out] deriv_b  If the other derivatives are non-NULL, the derivative
+  of the objective function with respect to the parameter b is written here.
   @param [out] tot_objf  The total objective function described above
   @param [out] tot_weight  The total normalizing factor for the objective
   function, equal to dvector_pairs.NumRows().
@@ -65,10 +67,10 @@ namespace kaldi {
     const CuSpMatrix<BaseFloat> &S,
     BaseFloat b,
     CuMatrixBase<BaseFloat> *deriv_xvector,
-    CuVector<BaseFloat> *deriv_S_and_b,
+    CuVector<BaseFloat> *deriv_S,
+    BaseFloat *deriv_b,
     BaseFloat *tot_objf,
     BaseFloat *tot_weight);
-
 }  // namespace kaldi
 
 #endif