LBFGS Hess approx #135
@@ -1,6 +1,7 @@
 import abc
 from collections.abc import Callable
-from typing import Any, Generic, TypeVar, Union
+from functools import partial
+from typing import Any, Generic, TypeVar, Union, Optional

 import equinox as eqx
 import jax
@@ -66,6 +67,97 @@ def _identity_pytree(pytree: PyTree[Array]) -> lx.PyTreeLinearOperator:
     )


+@jax.jit
+def _lim_mem_hess_inv_operator_fn(
+    pytree: PyTree[Array],
+    residual_par: PyTree[Array],
+    residual_grad: PyTree[Array],
+    rho: Array,
+    index_start: Array,
+):
+    """LBFGS descent linear operator: apply the inverse Hessian approximation to `pytree`
+    via the two-loop recursion.
+    """
+    history_len = rho.shape[0]
+    circ_index = (jnp.arange(history_len) + index_start) % history_len
+
+    # First loop: iterate backwards and compute alpha coefficients
+    def backward_iter(q, indx):
+        dy, dg, r = jtu.tree_map(lambda x: x[indx], (residual_par, residual_grad, rho))
+        alpha = r * tree_dot(dy, q)
+        q = (q**ω - alpha * dg**ω).ω
+        return q, alpha
+
+    # Second loop: iterate forwards and apply correction using stored alpha
+    def forward_iter(args, indx):
+        q, alpha = args
+        ai = alpha[indx]
+        r = rho[circ_index[indx]]
+        dy, dg = jtu.tree_map(
+            lambda x: x[circ_index[indx]],
+            (residual_par, residual_grad),
+        )
+        bi = r * tree_dot(dg, q)
+        q = (q**ω + (dy**ω * (ai - bi))).ω
+        return (q, alpha), None
+
+    q, alpha = jax.lax.scan(backward_iter, pytree, circ_index, reverse=True)
+    # Scale by gamma_k, the usual initial approximation H_0 = gamma_k * I, built from
+    # the most recent (s_k, y_k) pair.
+    dym, dgm = jtu.tree_map(
+        lambda x: x[index_start % history_len],
+        (residual_par, residual_grad),
+    )
+    dyg = tree_dot(dym, dgm)
+    dgg = tree_dot(dgm, dgm)
+    gamma_k = jnp.where(dgg > 1e-10, dyg / dgg, 1.0)
+    q = (gamma_k * q**ω).ω
+    (q, _), _ = jax.lax.scan(forward_iter, (q, alpha), jnp.arange(history_len), reverse=False)
+    return q
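Aside: the function above is the classic L-BFGS two-loop recursion, expressed over pytrees and `lax.scan`. A minimal standalone sketch of the same algorithm on flat arrays, for reference only (hypothetical helper, not part of this diff; the history is assumed oldest-first):

import jax.numpy as jnp

def lbfgs_two_loop(grad, s_hist, y_hist, rho):
    """Return an approximation of H^{-1} @ grad from the (s_k, y_k) history."""
    m = rho.shape[0]
    q = grad
    alphas = [None] * m
    for k in reversed(range(m)):  # backward pass, newest to oldest
        alphas[k] = rho[k] * jnp.dot(s_hist[k], q)
        q = q - alphas[k] * y_hist[k]
    # Initial Hessian approximation H_0 = gamma * I, from the most recent pair.
    gamma = jnp.dot(s_hist[-1], y_hist[-1]) / jnp.dot(y_hist[-1], y_hist[-1])
    q = gamma * q
    for k in range(m):  # forward pass, oldest to newest
        beta = rho[k] * jnp.dot(y_hist[k], q)
        q = q + s_hist[k] * (alphas[k] - beta)
    return q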
+
+
+def _lim_mem_hess_inv_operator(
+    residual_par: PyTree[Array],
+    residual_grad: PyTree[Array],
+    rho: Array,
+    index_start: Array,
+    input_shape: Optional[PyTree[jax.ShapeDtypeStruct]] = None,
+):
+    """Define a `lineax` linear operator implementing the L-BFGS inverse Hessian approximation.
+
+    This operator computes the action of the approximate inverse Hessian on a vector `pytree`
+    using the limited-memory BFGS (L-BFGS) two-loop recursion. It does not materialize the
+    matrix explicitly but returns a `lineax.FunctionLinearOperator`.
+
+    - `residual_par`: History of parameter updates `s_k = x_{k+1} - x_k`
+    - `residual_grad`: History of gradient updates `y_k = g_{k+1} - g_k`
+    - `rho`: Reciprocal dot products `rho_k = 1 / ⟨s_k, y_k⟩`
+    - `index_start`: Index of the most recent update in the circular buffer
+
+    Returns a `lineax.FunctionLinearOperator` with input and output shape matching a single
+    element of `residual_par`.
+    """
+    operator_func = partial(
+        _lim_mem_hess_inv_operator_fn,
+        residual_par=residual_par,
+        residual_grad=residual_grad,
+        rho=rho,
+        index_start=index_start,
+    )
+    input_shape = (
+        jax.eval_shape(lambda: jtu.tree_map(lambda x: x[0], residual_par))
+        if input_shape is None
+        else input_shape
+    )
+    op = lx.FunctionLinearOperator(
+        operator_func,
+        input_shape,
+        tags=lx.positive_semidefinite_tag,
+    )
+    return lx.materialise(op)

Review discussion (on the `input_shape is None` default):
- "Yes, there is no case in which that's not true! The idea was to provide it directly as a static argument, but once the function is jit-compiled that won't matter, I guess."
- "Exactly, this is inside the jitted region and the shape of the inputs is already known."

Review discussion (on `lx.materialise(op)`):
- "Incidentally, I've been meaning to track down what's going on here: this indicates that we are retracing jaxprs, which is not ideal from a compile-time perspective. In a more perfect world we can just directly use the one we already have!"
- "Where do you think that originates?"
- "Using pdb:
      (Pdb) elem.invars
      [Var(id=4544898944):float64[1]]
      (Pdb) elem_.invars
      [Var(id=4775038400):float64[1]]
  here."
- "For what it concerns the jaxpr, I followed the suggestion of filtering the tree. Does this make sense?"
- "Filtering as you've done here makes perfect sense!"
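For readers unfamiliar with `lineax`, a minimal sketch of the matrix-free pattern used above, with a stand-in matvec in place of the two-loop recursion (illustrative only, not the PR's code):

import jax
import jax.numpy as jnp
import lineax as lx

def matvec(v):
    # Stand-in for the two-loop recursion: a simple diagonal scaling.
    return 2.0 * v

in_structure = jax.eval_shape(lambda: jnp.zeros((3,)))
op = lx.FunctionLinearOperator(matvec, in_structure, tags=lx.positive_semidefinite_tag)
dense = lx.materialise(op)  # builds a concrete representation of the operator
print(op.mv(jnp.ones(3)))   # matrix-free application
print(dense.as_matrix())    # 2 * identity, shape (3, 3)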
 def _outer(tree1, tree2):
     def leaf_fn(x):
         return jtu.tree_map(lambda leaf: jnp.tensordot(x, leaf, axes=0), tree2)

@@ -74,7 +166,7 @@ def leaf_fn(x):

 _Hessian = TypeVar(
-    "_Hessian", FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv
+    "_Hessian", FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv,
 )

@@ -96,7 +188,8 @@ def __call__(
         y_eval: Y,
         f_info: Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv],
         f_eval_info: FunctionInfo.EvalGrad,
-    ) -> Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv]:
+        hess_update_state: dict,
+    ) -> tuple[Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv], dict]:
         """Called whenever we want to update the Hessian approximation. This is usually
         in the `accepted` branch of the `step` method of an
         [`optimistix.AbstractQuasiNewton`][] minimiser.
@@ -143,7 +236,8 @@ def __call__(
         y_eval: Y,
         f_info: Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv],
         f_eval_info: FunctionInfo.EvalGrad,
-    ) -> Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv]:
+        hess_update_state: dict,
+    ) -> tuple[Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv], dict]:
         f_eval = f_eval_info.f
         grad = f_eval_info.grad
         y_diff = (y_eval**ω - y**ω).ω
@@ -167,9 +261,9 @@ def __call__(
         )
         if self.use_inverse:
             # in this case `hessian` is the new inverse hessian
-            return FunctionInfo.EvalGradHessianInv(f_eval, grad, hessian)
+            return FunctionInfo.EvalGradHessianInv(f_eval, grad, hessian), {}
         else:
-            return FunctionInfo.EvalGradHessian(f_eval, grad, hessian)
+            return FunctionInfo.EvalGradHessian(f_eval, grad, hessian), {}


 class DFPUpdate(_AbstractBFGSDFPUpdate, strict=True):
@@ -292,6 +386,9 @@ class _QuasiNewtonState(
     result: RESULTS
     # Used in compat.py
     num_accepted_steps: Int[Array, ""]
+    # update state
+    hess_update_state: dict


 class AbstractQuasiNewton(
@@ -335,13 +432,31 @@ def init(
     ) -> _QuasiNewtonState:
         f = tree_full_like(f_struct, 0)
         grad = tree_full_like(y, 0)
+        hess_update_state = dict()
         if self.hessian_update.use_inverse:
-            hessian_inv = _identity_pytree(y)
+            if isinstance(self, LBFGS):
+                hess_update_state = dict(
+                    history_len=10,
+                    start_index=jnp.array(0),
+                    residual_y=jtu.tree_map(lambda y: jnp.zeros((10, *y.shape)), y),
+                    residual_grad=jtu.tree_map(lambda y: jnp.zeros((10, *y.shape)), y),
+                    rho=jnp.zeros(10),
+                )
+                hessian_inv = _lim_mem_hess_inv_operator(
+                    hess_update_state["residual_y"],
+                    hess_update_state["residual_grad"],
+                    hess_update_state["rho"],
+                    hess_update_state["start_index"],
+                )
+            else:
+                hessian_inv = _identity_pytree(y)
             f_info = FunctionInfo.EvalGradHessianInv(f, grad, hessian_inv)
         else:
             hessian = _identity_pytree(y)
             f_info = FunctionInfo.EvalGradHessian(f, grad, hessian)
         f_info_struct = eqx.filter_eval_shape(lambda: f_info)

         return _QuasiNewtonState(
             first_step=jnp.array(True),
             y_eval=y,
@@ -352,6 +467,7 @@ def init(
             terminate=jnp.array(False),
             result=RESULTS.successful,
             num_accepted_steps=jnp.array(0),
+            hess_update_state=hess_update_state,
         )

     def step(
@@ -379,11 +495,12 @@ def step(
         def accepted(descent_state):
             grad = lin_to_grad(lin_fn, state.y_eval, autodiff_mode=autodiff_mode)

-            f_eval_info = self.hessian_update(
+            f_eval_info, hess_update_state = self.hessian_update(
                 y,
                 state.y_eval,
                 state.f_info,
                 FunctionInfo.EvalGrad(f_eval, grad),
+                state.hess_update_state,
             )

             descent_state = self.descent.query(
@@ -399,12 +516,12 @@ def accepted(descent_state):
             terminate = jnp.where(
                 state.first_step, jnp.array(False), terminate
             )  # Skip termination on first step
-            return state.y_eval, f_eval_info, aux_eval, descent_state, terminate
+            return state.y_eval, f_eval_info, aux_eval, descent_state, terminate, hess_update_state

         def rejected(descent_state):
-            return y, state.f_info, state.aux, descent_state, jnp.array(False)
+            return y, state.f_info, state.aux, descent_state, jnp.array(False), state.hess_update_state

-        y, f_info, aux, descent_state, terminate = filter_cond(
+        y, f_info, aux, descent_state, terminate, hess_update_state = filter_cond(
             accept, accepted, rejected, state.descent_state
         )

@@ -438,6 +555,7 @@ def rejected(descent_state):
             terminate=terminate,
             result=result,
             num_accepted_steps=state.num_accepted_steps + jnp.where(accept, 1, 0),
+            hess_update_state=hess_update_state,
         )
         return y, state, aux

@@ -603,3 +721,114 @@ def __init__(
         Valid entries are `step_size`, `loss`, `y`. For example
         `verbose=frozenset({"step_size", "loss"})`.
         """
+
+
+class LBFGSUpdate(AbstractQuasiNewtonUpdate, strict=True):
+    """Private intermediate class for LBFGS updates."""

Review discussion (on this class being private):
- "This should be a public class, I think. We want to expose the update classes to users who may wish to build custom solvers, in which case they are required. If you have a reference for this specific implementation, it would be great to add it here! Small thing: can we move the definition of the update class up, so that it is grouped with the other update classes? We then define the abstract solver and then have the concrete ones at the bottom of the file. 🙈"
- "Absolutely 😀 I totally understand the need for linting the code!"

+
+    use_inverse = True
+
+    def no_update(self, inner, grad_diff, y_diff, f_info, start_index):
+        return f_info.hessian_inv, jnp.array(0)
+
+    def update(self, inner, grad_diff, y_diff, f_info, start_index):
+        assert isinstance(f_info, FunctionInfo.EvalGradHessianInv)
+        # update the start index
+        return _lim_mem_hess_inv_operator(
+            y_diff,
+            grad_diff,
+            inner,
+            start_index,
+        ), jnp.array(1)
+
+    def __call__(
+        self,
+        y: Y,
+        y_eval: Y,
+        f_info: Union[FunctionInfo.EvalGradHessian, FunctionInfo.EvalGradHessianInv],
+        f_eval_info: FunctionInfo.EvalGrad,
+        hess_update_state: dict,
+    ) -> tuple[FunctionInfo.EvalGradHessianInv, dict]:
+        f_eval = f_eval_info.f
+        grad = f_eval_info.grad
+        y_diff = (y_eval**ω - y**ω).ω
+        grad_diff = (grad**ω - f_info.grad**ω).ω
+
+        history_len = hess_update_state["history_len"]
+        residual_y = hess_update_state["residual_y"]
+        residual_grad = hess_update_state["residual_grad"]
+        start_index = hess_update_state["start_index"]
+        rho = hess_update_state["rho"]
+
+        # update states
+        residual_y = jtu.tree_map(lambda x, z: x.at[start_index].set(z), residual_y, y_diff)
+        residual_grad = jtu.tree_map(lambda x, z: x.at[start_index].set(z), residual_grad, grad_diff)
+        rho = rho.at[start_index].set(1.0 / tree_dot(y_diff, grad_diff))
+        rho = jnp.where(jnp.isinf(rho), 0, rho)
+
+        hessian, update = filter_cond(
+            rho[start_index] != 0,
+            self.update,
+            self.no_update,
+            rho,
+            residual_grad,
+            residual_y,
+            f_info,
+            start_index,
+        )
+        # increment circular index
+        start_index = (start_index + update) % history_len
+
+        update_state = dict(
+            history_len=history_len,
+            start_index=jnp.array(start_index),
+            residual_y=residual_y,
+            residual_grad=residual_grad,
+            rho=rho,
+        )
+
+        return FunctionInfo.EvalGradHessianInv(f_eval, grad, hessian), update_state

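The history in `hess_update_state` is a fixed-size circular buffer: each accepted step overwrites the slot at `start_index` and then advances the index modulo the history length. A minimal sketch of that bookkeeping in isolation (hypothetical names, arrays only):

import jax.numpy as jnp

history_len = 4
s_history = jnp.zeros((history_len, 3))  # e.g. a history of parameter updates s_k
start_index = jnp.array(0)

def push(history, index, new_entry):
    # Overwrite the slot at `index`, then advance the index modulo the buffer length.
    history = history.at[index].set(new_entry)
    index = (index + 1) % history_len
    return history, index

s_history, start_index = push(s_history, start_index, jnp.array([1.0, 2.0, 3.0]))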
+class LBFGS(AbstractQuasiNewton[Y, Aux, _Hessian], strict=True):
+    """L-BFGS (Limited-memory Broyden–Fletcher–Goldfarb–Shanno) minimisation algorithm.
+
+    This is a quasi-Newton optimisation algorithm that approximates the inverse Hessian
+    using a limited history of gradient and parameter updates. Unlike full BFGS, which
+    stores a dense matrix, L-BFGS maintains a memory-efficient representation suitable
+    for large-scale problems.
+
+    Supports the following `options`:
+
+    - `autodiff_mode`: whether to use forward- or reverse-mode autodifferentiation to
+      compute the gradient. Can be either `"fwd"` or `"bwd"`. Defaults to `"bwd"`,
+      which is usually more efficient. Changing this can be useful when the target
+      function does not support reverse-mode automatic differentiation.
+    """
+
+    rtol: float
+    atol: float
+    norm: Callable[[PyTree], Scalar]
+    descent: NewtonDescent
+    search: AbstractSearch
+    hessian_update: AbstractQuasiNewtonUpdate
+    use_inverse: bool
+    verbose: frozenset[str]
+
+    def __init__(
+        self,
+        rtol: float,
+        atol: float,
+        norm: Callable[[PyTree], Scalar] = max_norm,
+        verbose: frozenset[str] = frozenset(),
+        search: AbstractSearch = BacktrackingArmijo(),
+    ):
+        self.rtol = rtol
+        self.atol = atol
+        self.norm = norm
+        self.use_inverse = True
+        self.descent = NewtonDescent(linear_solver=lx.Cholesky())
+        # TODO(raderj): switch out `BacktrackingArmijo` with a better line search.
+        self.search = search
+        self.hessian_update = LBFGSUpdate()
+        self.verbose = verbose

Review discussion (on the `search` argument of `__init__`):
- "Let's not make the search an argument here, and just specify it in the body of `__init__`. (And mini-thing: make `verbose` come last, again for consistency.)"
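Assuming the new class ends up exported at the package level (e.g. as `optimistix.LBFGS`, which this diff does not show), usage would presumably mirror the existing BFGS solver. A hedged sketch:

import jax.numpy as jnp
import optimistix as optx

def rosenbrock(y, args):
    x1, x2 = y
    return (1.0 - x1) ** 2 + 100.0 * (x2 - x1**2) ** 2

# rtol/atol control the convergence tolerances, as for the other optimistix minimisers.
solver = optx.LBFGS(rtol=1e-8, atol=1e-8)
sol = optx.minimise(rosenbrock, solver, jnp.array([2.0, 2.0]))
print(sol.value)  # expected to approach [1.0, 1.0]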