From b9adcb61fc87bde09d31c3ead2f1c2120691eae3 Mon Sep 17 00:00:00 2001
From: Lu Lu
Date: Fri, 15 Dec 2023 23:10:09 -0500
Subject: [PATCH] Adding computing gradients using forward-mode autodiff

---
 deepxde/gradients/__init__.py          |   1 +
 deepxde/gradients/gradients_forward.py | 178 +++++++++++++++++++++++++
 deepxde/gradients/gradients_reverse.py |   2 +-
 3 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 deepxde/gradients/gradients_forward.py

diff --git a/deepxde/gradients/__init__.py b/deepxde/gradients/__init__.py
index c7be7b6ac..268805b1f 100644
--- a/deepxde/gradients/__init__.py
+++ b/deepxde/gradients/__init__.py
@@ -1,3 +1,4 @@
 __all__ = ["jacobian", "hessian"]
 
+# from .gradients_forward import jacobian
 from .gradients_reverse import clear, jacobian, hessian
diff --git a/deepxde/gradients/gradients_forward.py b/deepxde/gradients/gradients_forward.py
new file mode 100644
index 000000000..b03930a84
--- /dev/null
+++ b/deepxde/gradients/gradients_forward.py
@@ -0,0 +1,178 @@
+"""Compute gradients using forward-mode autodiff."""
+
+__all__ = ["jacobian", "hessian"]
+
+from ..backend import backend_name, jax
+
+
+class Jacobian:
+    """Compute Jacobian matrix J: J[i][j] = dy_i / dx_j, where i = 0, ..., dim_y - 1
+    and j = 0, ..., dim_x - 1.
+
+    Evaluation is lazy, i.e., J[i][j] is only computed when needed.
+
+    Args:
+        ys: Output Tensor of shape (batch_size, dim_y).
+        xs: Input Tensor of shape (batch_size, dim_x).
+    """
+
+    def __init__(self, ys, xs):
+        self.ys = ys
+        self.xs = xs
+
+        if backend_name in ["tensorflow.compat.v1", "tensorflow", "pytorch", "paddle"]:
+            self.dim_y = ys.shape[1]
+            # TODO: Other backends
+            raise NotImplementedError(
+                f"Backend {backend_name} doesn't support forward-mode autodiff."
+            )
+        elif backend_name == "jax":
+            # For backend jax, a tuple of a jax array and a callable is passed as one
+            # of the arguments, since jax does not build a computational graph
+            # explicitly. The array is used to control the dimensions, and the
+            # callable is used to obtain the derivative function, which can then be
+            # used to compute the derivatives.
+            self.dim_y = ys[0].shape[1]
+        self.dim_x = xs.shape[1]
+
+        self.J = {}
+
+    def __call__(self, i=0, j=None):
+        """Returns J[`i`][`j`]. If `j` is ``None``, returns the gradient of y_i, i.e.,
+        J[i].
+        """
+        if not 0 <= i < self.dim_y:
+            raise ValueError("i={} is not valid.".format(i))
+        if j is not None and not 0 <= j < self.dim_x:
+            raise ValueError("j={} is not valid.".format(j))
+        # Computing the gradient is not supported in forward mode, unless there is
+        # only one input.
+        if j is None and self.dim_x > 1:
+            raise NotImplementedError(
+                "Forward-mode autodiff doesn't support computing gradient."
+            )
+        # Compute J[:, j]
+        if j not in self.J:
+            if backend_name == "jax":
+                # Here, we use jax.jvp to compute the gradient of a function. This
+                # differs from TensorFlow and PyTorch in that the input of the
+                # function is no longer a batch but a single point. Formally, backend
+                # jax computes gradients pointwise and then vectorizes over the batch
+                # with jax.vmap. Computationally, however, this is in fact done
+                # batchwise and efficiently. It is very important to note that,
+                # without jax.vmap, this can only deal with functions whose output is
+                # a scalar and whose input is a single point.
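+                # For example (illustrative values): with dim_x = 2 and j = 1, the
+                # tangent below is [0., 1.], so for a single point x the callable
+                # grad_fn(x) = jax.jvp(self.ys[1], (x,), (tangent,))[1] returns
+                # J_f(x) @ tangent, i.e., column j of the Jacobian of f = self.ys[1]
+                # at x; jax.vmap(grad_fn) then maps this over the batch in self.xs.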
+                tangent = jax.numpy.zeros(self.dim_x).at[j].set(1)
+                grad_fn = lambda x: jax.jvp(self.ys[1], (x,), (tangent,))[1]
+                self.J[j] = (jax.vmap(grad_fn)(self.xs), grad_fn)
+
+        if backend_name == "jax":
+            # Unlike other backends, in backend jax, a tuple of a jax array and a
+            # callable is returned, so that it is consistent with the argument, which
+            # is also a tuple. This may be useful for further computation, e.g., the
+            # Hessian.
+            return (
+                self.J[j]
+                if self.dim_y == 1
+                else (
+                    self.J[j][0][:, i : i + 1],
+                    lambda inputs: self.J[j][1](inputs)[i : i + 1],
+                )
+            )
+
+
+# TODO: Refactor duplicate code
+class Jacobians:
+    """Compute multiple Jacobians.
+
+    A new instance will be created for a new pair of (output, input). For an (output,
+    input) pair that has been computed before, the previous instance is reused rather
+    than creating a new one.
+    """
+
+    def __init__(self):
+        self.Js = {}
+
+    def __call__(self, ys, xs, i=0, j=None):
+        # For backend tensorflow and pytorch, self.Js cannot be reused across
+        # iterations. For backend pytorch, we need to reset self.Js in each iteration
+        # to avoid a memory leak.
+        #
+        # For backend tensorflow, in each iteration, self.Js is reset to {}.
+        #
+        # Example:
+        #
+        # mydict = {}
+        #
+        # @tf.function
+        # def f(x):
+        #     tf.print(mydict)  # always {}
+        #     y = 1 * x
+        #     tf.print(hash(y.ref()), hash(x.ref()))  # Doesn't change
+        #     mydict[(y.ref(), x.ref())] = 1
+        #     tf.print(mydict)
+        #
+        # for _ in range(2):
+        #     x = np.random.random((3, 4))
+        #     f(x)
+        #
+        # For backend pytorch, in each iteration, ys and xs are new tensors converted
+        # from np.ndarray, so self.Js will grow over iterations.
+        #
+        # Example:
+        #
+        # mydict = {}
+        #
+        # def f(x):
+        #     print(mydict)
+        #     y = 1 * x
+        #     print(hash(y), hash(x))
+        #     mydict[(y, x)] = 1
+        #     print(mydict)
+        #
+        # for i in range(2):
+        #     x = np.random.random((3, 4))
+        #     x = torch.from_numpy(x)
+        #     x.requires_grad_()
+        #     f(x)
+        if backend_name in ["tensorflow.compat.v1", "tensorflow"]:
+            key = (ys.ref(), xs.ref())
+        elif backend_name in ["pytorch", "paddle"]:
+            key = (ys, xs)
+        elif backend_name == "jax":
+            key = (id(ys[0]), id(xs))
+        if key not in self.Js:
+            self.Js[key] = Jacobian(ys, xs)
+        return self.Js[key](i, j)
+
+    def clear(self):
+        """Clear cached Jacobians."""
+        self.Js = {}
+
+
+# TODO: Refactor duplicate code
+def jacobian(ys, xs, i=0, j=None):
+    """Compute Jacobian matrix J: J[i][j] = dy_i / dx_j, where i = 0, ..., dim_y - 1
+    and j = 0, ..., dim_x - 1.
+
+    Use this function to compute first-order derivatives instead of ``tf.gradients()``
+    or ``torch.autograd.grad()``, because
+
+    - Evaluation is lazy, i.e., it only computes J[i][j] when needed.
+    - It remembers the gradients that have already been computed to avoid duplicate
+      computation.
+
+    Args:
+        ys: Output Tensor of shape (batch_size, dim_y).
+        xs: Input Tensor of shape (batch_size, dim_x).
+        i (int): Row index of the Jacobian, with 0 <= i < dim_y.
+        j (int or None): Column index of the Jacobian, with 0 <= j < dim_x.
+
+    Returns:
+        J[`i`][`j`] in Jacobian matrix J. If `j` is ``None``, returns the gradient of
+        y_i, i.e., J[`i`].
+    """
+    return jacobian._Jacobians(ys, xs, i=i, j=j)
+
+
+# TODO: Refactor duplicate code
+jacobian._Jacobians = Jacobians()
diff --git a/deepxde/gradients/gradients_reverse.py b/deepxde/gradients/gradients_reverse.py
index d0e0d4aa4..858026d6d 100644
--- a/deepxde/gradients/gradients_reverse.py
+++ b/deepxde/gradients/gradients_reverse.py
@@ -1,4 +1,4 @@
-"""Compute gradients using reverse mode."""
+"""Compute gradients using reverse-mode autodiff."""
 
 __all__ = ["jacobian", "hessian"]
 
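Usage sketch (not part of the patch): a minimal, illustrative example of calling the forward-mode ``jacobian`` with the jax backend. It assumes DeepXDE is configured to use jax (e.g., DDE_BACKEND=jax), imports the new module directly since the import in deepxde/gradients/__init__.py is still commented out above, and uses a hypothetical toy function and shapes.

    import jax
    import jax.numpy as jnp

    from deepxde.gradients.gradients_forward import jacobian

    # Toy "network" (hypothetical): maps one point x in R^2 to an output in R^1.
    def f(x):
        return jnp.sum(jnp.sin(x), keepdims=True)

    xs = jnp.ones((8, 2))        # batch of 8 points, dim_x = 2
    ys = (jax.vmap(f)(xs), f)    # the jax backend expects an (array, callable) pair

    # Column j = 1 of the Jacobian. Since dim_y == 1, the jax backend returns a
    # (values, grad_fn) tuple; values has shape (8, 1) and equals cos(xs[:, 1:2]).
    values, grad_fn = jacobian(ys, xs, i=0, j=1)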