Have low_rank_cholesky return the diagonal of the residual, as it is useful

ThomasColthurst · tensorflower-gardener · commit 9bc0d1e633e5 · 2023-05-03T06:58:52.000-07:00
when preconditioning.

PiperOrigin-RevId: 529077632
diff --git a/tensorflow_probability/python/math/linalg.py b/tensorflow_probability/python/math/linalg.py
@@ -428,10 +428,15 @@ def low_rank_cholesky(matrix, max_rank, trace_atol=0, trace_rtol=0, name=None):
     name: Optional name for the op.
 
   Returns:
-    A pair (LR, r) of a matrix LR such that the rank of LR is r <= max_rank
-    and if r is < max_rank, trace(matrix - LR * LR^t) < trace_atol.
-    If matrix is of shape (b1, ..., bn, m, m), then LR will be of shape
-    (b1, ..., bn, m, r) where r <= max_rank.
+    A triplet (LR, r, residual_diag), where
+    LR: a matrix such that LR * LR^t is approximately the input matrix.
+      If matrix is of shape (b1, ..., bn, m, m), then LR will be of shape
+      (b1, ..., bn, m, r) where r <= max_rank.
+    r: the rank of LR.  If r is < max_rank, then
+      trace(matrix - LR * LR^t) < trace_atol.
+    residual_diag: the diagonal entries of matrix - LR * LR^t.  This is
+      returned because, together with LR, it is useful for preconditioning
+      the input matrix.
   """
   with tf.name_scope(name or 'low_rank_cholesky'):
     dtype = dtype_util.common_dtype([matrix, trace_atol, trace_rtol],
@@ -498,14 +503,14 @@ def lr_cholesky_body(i, lr, residual_diag):
     lr = tf.zeros(matrix.shape, dtype=matrix.dtype)[..., :max_rank]
 
     mdiag = tf.linalg.diag_part(matrix)
-    i, lr, _ = tf.while_loop(
+    i, lr, residual_diag = tf.while_loop(
         cond=lr_cholesky_cond,
         body=lr_cholesky_body,
         loop_vars=(0, lr, mdiag),
         maximum_iterations=max_rank
     )
 
-    return lr, i
+    return lr, i, residual_diag
 
 
 def lu_solve(lower_upper, perm, rhs,
diff --git a/tensorflow_probability/python/math/linalg_test.py b/tensorflow_probability/python/math/linalg_test.py
@@ -429,16 +429,24 @@ def testLowRankCholesky(self):
     matrix = self._random_batch_psd(dim)
     true_diag = tf.linalg.diag_part(matrix)
 
-    pchol, r = linalg.low_rank_cholesky(matrix, max_rank=1)
+    pchol, r, residual_diag = linalg.low_rank_cholesky(matrix, max_rank=1)
     self.assertEqual(1, self.evaluate(r))
+    self.assertEqual((2, 11), residual_diag.shape)
     mat = tf.matmul(pchol, pchol, transpose_b=True)
     diag_diff_prev = self.evaluate(tf.abs(tf.linalg.diag_part(mat) - true_diag))
     diff_norm_prev = self.evaluate(
         tf.linalg.norm(mat - matrix, ord='fro', axis=[-1, -2]))
+    old_residual_trace = None
     for rank in range(2, dim + 1):
       # Specifying trace_rtol forces the full max_rank decomposition.
-      pchol, r = linalg.low_rank_cholesky(matrix, max_rank=rank, trace_rtol=-1)
+      pchol, r, residual_diag = linalg.low_rank_cholesky(
+          matrix, max_rank=rank, trace_rtol=-1)
       self.assertEqual(rank, self.evaluate(r))
+      residual_trace = tf.math.reduce_sum(residual_diag, axis=-1)
+      if old_residual_trace is not None:
+        self.assertTrue(self.evaluate(tf.reduce_all(
+            residual_trace < old_residual_trace)))
+      old_residual_trace = residual_trace
       # Compared to pivot_cholesky, low_rank_cholesky will sometimes have
       # approximate zeros like 7e-17 or -2.6e-7 where it "should" have a
       # real zero.
@@ -471,7 +479,7 @@ def testGradient(self):
     dim = 11
 
     def fn(matrix):
-      chol, _ = linalg.low_rank_cholesky(matrix, max_rank=dim // 3)
+      chol, _, _ = linalg.low_rank_cholesky(matrix, max_rank=dim // 3)
       return chol
     def grad(matrix):
       _, dmatrix = gradient.value_and_gradient(fn, matrix)
@@ -494,7 +502,7 @@ def testGradientTapeCFv2(self):
     def grad(matrix):
       with tf.GradientTape() as tape:
         tape.watch(matrix)
-        pchol, _ = linalg.low_rank_cholesky(matrix, max_rank=dim // 3)
+        pchol, _, _ = linalg.low_rank_cholesky(matrix, max_rank=dim // 3)
       dmatrix = tape.gradient(
           pchol, matrix, output_gradients=tf.ones_like(pchol) * .01)
       return dmatrix
@@ -561,7 +569,7 @@ def testOracleExamples(self, mat, oracle_pchol):
 
     mat = np.matmul(mat, mat.T)
     for rank in range(1, max_rank):
-      lr_chol, r = fns[rank](mat)
+      lr_chol, r, _ = fns[rank](mat)
       self.assertEqual(self.evaluate(r), rank)
       self.assertAllClose(
           oracle_pchol[..., :rank], lr_chol[..., :rank], atol=1e-4)