32 changes: 15 additions & 17 deletions docs/src/algo/precondition.md
@@ -7,43 +7,41 @@ good preconditioner substantially improved convergence is possible.
A preconditioner `P` can be of any type as long as the following two methods are
implemented:

* `A_ldiv_B!(pgr, P, gr)` : apply `P` to a vector `gr` and store in `pgr`
* `ldiv!(pgr, P, gr)` : apply `P` to a vector `gr` and store in `pgr`
(intuitively, `pgr = P \ gr`)
* `dot(x, P, y)` : the inner product induced by `P`
(intuitively, `dot(x, P * y)`)

Precisely what these operations mean depends on how `P` is stored. Commonly, we store a matrix `P` which
approximates the Hessian in some vague sense. In this case,

* `A_ldiv_B!(pgr, P, gr) = copyto!(pgr, P \ A)`
* `dot(x, P, y) = dot(x, P * y)`
approximates the Hessian (not the inverse Hessian) in some vague sense.
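
For illustration, a minimal sketch of a custom, matrix-free preconditioner (the type name
is purely illustrative) only needs those two methods:
```jl
using LinearAlgebra, Optim

# Hypothetical diagonal preconditioner: `d` holds the (positive) diagonal of P.
struct DiagonalPreconditioner{T<:AbstractVector}
    d::T
end

# pgr = P \ gr
LinearAlgebra.ldiv!(pgr, P::DiagonalPreconditioner, gr) = (pgr .= gr ./ P.d; pgr)

# inner product induced by P, i.e. dot(x, P * y)
LinearAlgebra.dot(x, P::DiagonalPreconditioner, y) = dot(x, P.d .* y)

# usage: ConjugateGradient(P = DiagonalPreconditioner(ones(length(x0))))
```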

Finally, it is possible to update the preconditioner as the state variable `x`
changes. This is done through `precondprep!` which is passed to the
changes. This is done through `precondprep` which is passed to the
optimizers as kw-argument, e.g.,
```jl
method=ConjugateGradient(P = precond(100), precondprep! = precond(100))
method=ConjugateGradient(P = precond(100), precondprep = (P, x)->copyto!(P, precond(x)))
```
though in this case the update would always produce the same matrix.
(See `fminbox.jl` for a more natural example.)
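
For a preconditioner that actually depends on the iterate, `precondprep` should
overwrite `P` in place. A sketch using a plain `Diagonal` matrix (the rescaling
`1 .+ abs.(x)` is only a placeholder):
```jl
using LinearAlgebra, Optim

P0 = Diagonal(ones(100))                      # storage reused across iterations
update_P!(P, x) = (P.diag .= 1 .+ abs.(x); P) # placeholder update rule

method = ConjugateGradient(P = P0, precondprep = update_P!)
```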

Apart from preconditioning with matrices, `Optim.jl` provides
a type `InverseDiagonal`, which represents a diagonal matrix by
its inverse elements.
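
For example, a preconditioner equal to `2I` on a 100-dimensional problem would be
constructed from the inverse entries (a sketch, assuming the type is accessed as
`Optim.InverseDiagonal`):
```jl
using Optim

P = Optim.InverseDiagonal(fill(0.5, 100))  # stores 1 ./ diag(P), here P = 2I
method = ConjugateGradient(P = P)
```
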
!!! note
Preconditioning is also used in `Fminbox` even if the user does not provide a preconditioner. This is because the barrier term causes the problem to be ill-conditioned. The preconditioner uses the Hessian of the barrier term to improve the conditioning.

## Example
Below, we see an example where a function is minimized without and with a preconditioner
applied.
```jl
using ForwardDiff, Optim, SparseArrays
plap(U; n = length(U)) = (n - 1) * sum((0.1 .+ diff(U) .^ 2) .^ 2) - sum(U) / (n - 1)
plap1(U; n = length(U), dU = diff(U), dW = 4 .* (0.1 .+ dU .^ 2) .* dU) =
(n - 1) .* ([0.0; dW] .- [dW; 0.0]) .- ones(n) / (n - 1)
precond(x::Vector) = precond(length(x))
precond(n::Number) = spdiagm(-1 => -ones(n - 1), 0 => 2 * ones(n), 1 => -ones(n - 1)) * (n + 1)
f(X) = plap([0; X; 0])
g!(G, X) = copyto!(G, (plap1([0; X; 0]))[2:end-1])
initial_x = zeros(100)
plap(U; n = length(U)) = (n-1)*sum((0.1 .+ diff(U).^2).^2 ) - sum(U) / (n-1)
plap1(x) = ForwardDiff.gradient(plap,x)
precond(n) = spdiagm(-1 => -ones(n-1), 0 => 2ones(n), 1 => -ones(n-1)) * (n+1)
f(x) = plap([0; x; 0])
g!(G, x) = copyto!(G, (plap1([0; x; 0]))[2:end-1])

result = Optim.optimize(f, g!, initial_x, method = ConjugateGradient(P = nothing))
result = Optim.optimize(f, g!, initial_x, method = ConjugateGradient(P = precond(100)))
result = Optim.optimize(f, g!, initial_x, method = ConjugateGradient(P = precond(initial_x)))
```
The former `optimize` call converges more slowly than the latter. A plot of the
2D version of the function shows the problem.
4 changes: 3 additions & 1 deletion src/Optim.jl
@@ -55,7 +55,9 @@ import LinearAlgebra: Diagonal, diag, Hermitian, Symmetric,
I,
svd,
opnorm, # for safeguards in newton trust regions
issuccess
issuccess,
ldiv!,
dot

import SparseArrays: AbstractSparseMatrix

62 changes: 34 additions & 28 deletions src/multivariate/precon.jl
@@ -14,51 +14,57 @@
# but this is passed as an argument at the moment!
#

# Fallback
ldiv!(out, M, A) = LinearAlgebra.ldiv!(out, M, A)
dot(a, M, b) = LinearAlgebra.dot(a, M, b)
dot(a, b) = LinearAlgebra.dot(a, b)


#####################################################
# [0] Defaults and aliases for easier reading of the code
# these can also be overwritten if necessary.
# x for updating P
function _precondition!(out, method::AbstractOptimizer, x, ∇f)
_apply_precondprep(method, x)
__precondition!(out, method.P, ∇f)
end
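# E.g. with P === nothing and the default precondprep (Returns(nothing)),
# _apply_precondprep is a no-op and __precondition! reduces to copyto!(out, ∇f),
# i.e. the gradient is used unpreconditioned.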
# no updating
__precondition!(out, P::Nothing, ∇f) = copyto!(out, ∇f)
# fallback
__precondition!(out, P, ∇f) = ldiv!(out, P, ∇f)

__precondition!(out, P::AbstractMatrix, ∇f) = copyto!(out, P\∇f)

# default preconditioner update
precondprep!(P, x) = nothing
function _inverse_precondition(method::AbstractOptimizer, state::AbstractOptimizerState)
_inverse_precondition(method.P, state.s)
end
function _inverse_precondition(P, s)
real(dot(s, P, s))
end
function _inverse_precondition(P::Nothing, s)
real(dot(s, s))
end

_apply_precondprep(method::AbstractOptimizer, x) =
_apply_precondprep(method.P, method.precondprep!, x)
_apply_precondprep(::Nothing, ::Returns{Nothing}, x) = x
_apply_precondprep(P, precondprep!, x) = precondprep!(P, x)
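# Note: the default precondprep keyword is Returns(nothing), so a user-supplied P
# without a custom precondprep is simply left untouched between iterations.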

#####################################################
# [1] Empty preconditioner = Identity
#

# out = P^{-1} * A
ldiv!(out, ::Nothing, A) = copyto!(out, A)

# A' * P B
dot(A, ::Nothing, B) = dot(A, B)


#####################################################
# [2] Diagonal preconditioner
# P = Diag(d)
# Covered by base

#####################################################
# [3] Inverse Diagonal preconditioner
# here, P is stored by the entries of its inverse
# TODO: maybe implement this in Base?

mutable struct InverseDiagonal
diag
diag::Any
end
# If no precondprep was added, we just use a constant inverse
_apply_precondprep(P::InverseDiagonal, ::Returns{Nothing}, x) = P
_apply_precondprep(P::InverseDiagonal, precondprep!, x) = precondprep!(P, x)
__precondition!(out, P::InverseDiagonal, ∇f) = copyto!(out, P.diag .* ∇f)

function _inverse_precondition(P::InverseDiagonal, s)
real(dot(s, P.diag .\ s))
end
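# Note: P.diag holds the elementwise inverse of the preconditioner's diagonal,
# so applying P^{-1} is a multiplication and s' P s divides by P.diag.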
ldiv!(out::AbstractArray, P::InverseDiagonal, A::AbstractArray) = copyto!(out, A .* P.diag)
dot(A::AbstractArray, P::InverseDiagonal, B::Vector) = dot(A, B ./ P.diag)

#####################################################
# [4] Matrix Preconditioner
# the assumption here is that P is given by its inverse, which is typical
# > ldiv! is about to be moved to Base, so we need a temporary hack
# > mul! is already in Base, which defines `dot`
# nothing to do!
ldiv!(x, P::AbstractMatrix, b) = copyto!(x, P \ b)
# Works by stdlib methods
# It interprets
158 changes: 86 additions & 72 deletions src/multivariate/solvers/first_order/cg.jl
@@ -42,7 +42,7 @@
# below. The default value for alphamax is Inf. See alphamaxfunc
# for cgdescent and alphamax for linesearch_hz.

struct ConjugateGradient{Tf, T, Tprep, IL, L} <: FirstOrderOptimizer
struct ConjugateGradient{Tf,T,Tprep,IL,L} <: FirstOrderOptimizer
eta::Tf
P::T
precondprep!::Tprep
@@ -61,7 +61,7 @@ ConjugateGradient(; alphaguess = LineSearches.InitialHagerZhang(),
linesearch = LineSearches.HagerZhang(),
eta = 0.4,
P = nothing,
precondprep = (P, x) -> nothing,
precondprep = Returns(nothing),
manifold = Flat())
```
The strictly positive constant ``eta`` is used in determining
@@ -79,17 +79,16 @@ Zhang (2006).
- W. W. Hager and H. Zhang (2006) Algorithm 851: CG_DESCENT, a conjugate gradient method with guaranteed descent. ACM Transactions on Mathematical Software 32: 113-137.
- W. W. Hager and H. Zhang (2013), The Limited Memory Conjugate Gradient Method. SIAM Journal on Optimization, 23, pp. 2150-2168.
"""
function ConjugateGradient(; alphaguess = LineSearches.InitialHagerZhang(),
linesearch = LineSearches.HagerZhang(),
eta::Real = 0.4,
P::Any = nothing,
precondprep = (P, x) -> nothing,
manifold::Manifold=Flat())

ConjugateGradient(eta,
P, precondprep,
_alphaguess(alphaguess), linesearch,
manifold)
function ConjugateGradient(;
alphaguess = LineSearches.InitialHagerZhang(),
linesearch = LineSearches.HagerZhang(),
eta::Real = 0.4,
P::Any = nothing,
precondprep = Returns(nothing),
manifold::Manifold = Flat(),
)

ConjugateGradient(eta, P, precondprep, _alphaguess(alphaguess), linesearch, manifold)
end

mutable struct ConjugateGradientState{Tx,T,G} <: AbstractOptimizerState
@@ -104,10 +104,10 @@ mutable struct ConjugateGradientState{Tx,T,G} <: AbstractOptimizerState
@add_linesearch_fields()
end

function reset!(cg, cgs::ConjugateGradientState, obj, x)
function reset!(cg::ConjugateGradient, cgs::ConjugateGradientState, obj, x)
cgs.x .= x
cg.precondprep!(cg.P, x)
ldiv!(cgs.pg, cg.P, gradient(obj))
_precondition!(cgs.pg, cg, x, gradient(obj))

if cg.P !== nothing
project_tangent!(cg.manifold, cgs.pg, x)
end
@@ -137,71 +136,86 @@ function initial_state(method::ConjugateGradient, options, d, initial_x)
# Determine the initial search direction
# if we don't precondition, then this is a superfluous copy
# TODO: consider allowing a reference for pg instead of a copy
method.precondprep!(method.P, initial_x)
ldiv!(pg, method.P, gradient(d))
_precondition!(pg, method, initial_x, gradient(d))
if method.P !== nothing
project_tangent!(method.manifold, pg, initial_x)
end

ConjugateGradientState(initial_x, # Maintain current state in state.x
0 .*(initial_x), # Maintain previous state in state.x_previous
0 .*(gradient(d)), # Store previous gradient in state.g_previous
real(T)(NaN), # Store previous f in state.f_x_previous
0 .*(initial_x), # Intermediate value in CG calculation
0 .*(initial_x), # Preconditioned intermediate value in CG calculation
pg, # Maintain the preconditioned gradient in pg
-pg, # Maintain current search direction in state.s
@initial_linesearch()...)
ConjugateGradientState(
initial_x, # Maintain current state in state.x
0 .* (initial_x), # Maintain previous state in state.x_previous
0 .* (gradient(d)), # Store previous gradient in state.g_previous
real(T)(NaN), # Store previous f in state.f_x_previous
0 .* (initial_x), # Intermediate value in CG calculation
0 .* (initial_x), # Preconditioned intermediate value in CG calculation
pg, # Maintain the preconditioned gradient in pg
-pg, # Maintain current search direction in state.s
@initial_linesearch()...,
)
end

function update_state!(d, state::ConjugateGradientState, method::ConjugateGradient)
# Search direction is predetermined

# Maintain a record of the previous gradient
copyto!(state.g_previous, gradient(d))

# Determine the distance of movement along the search line
lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))

# Update current position # x = x + alpha * s
@. state.x = state.x + state.alpha * state.s
retract!(method.manifold, state.x)

# Update the function value and gradient
value_gradient!(d, state.x)
project_tangent!(method.manifold, gradient(d), state.x)

# Check sanity of function and gradient
isfinite(value(d)) || error("Non-finite f(x) while optimizing ($(value(d)))")

# Determine the next search direction using HZ's CG rule
# Calculate the beta factor (HZ2013)
# -----------------
# Comment on py: one could replace the computation of py with
# ydotpgprev = dot(y, pg)
# dot(y, py) >>> dot(y, pg) - ydotpgprev
# but I am worried about round-off here, so instead we make an
# extra copy, which is probably minimal overhead.
# -----------------
method.precondprep!(method.P, state.x)
@compat dPd = real(dot(state.s, method.P, state.s))
etak = method.eta * real(dot(state.s, state.g_previous)) / dPd # New in HZ2013
state.y .= gradient(d) .- state.g_previous
ydots = real(dot(state.y, state.s))
copyto!(state.py, state.pg) # below, store pg - pg_previous in py
ldiv!(state.pg, method.P, gradient(d))
state.py .= state.pg .- state.py
# ydots may be zero if f is not strongly convex or the line search does not satisfy Wolfe
betak = (real(dot(state.y, state.pg)) - real(dot(state.y, state.py)) * real(dot(gradient(d), state.s)) / ydots) / ydots
# betak may be undefined if ydots is zero (may due to f not strongly convex or non-Wolfe linesearch)
beta = NaNMath.max(betak, etak) # TODO: Set to zero if betak is NaN?
state.s .= beta.*state.s .- state.pg
project_tangent!(method.manifold, state.s, state.x)
lssuccess == false # break on linesearch error
# Search direction is predetermined

# Maintain a record of the previous gradient
copyto!(state.g_previous, gradient(d))

# Determine the distance of movement along the search line
lssuccess = perform_linesearch!(state, method, ManifoldObjective(method.manifold, d))

# Update current position # x = x + alpha * s
@. state.x = state.x + state.alpha * state.s
retract!(method.manifold, state.x)

# Update the function value and gradient
value_gradient!(d, state.x)
project_tangent!(method.manifold, gradient(d), state.x)

# Check sanity of function and gradient
isfinite(value(d)) || error("Non-finite f(x) while optimizing ($(value(d)))")

# Determine the next search direction using HZ's CG rule
# Calculate the beta factor (HZ2013)
# -----------------
# Comment on py: one could replace the computation of py with
# ydotpgprev = dot(y, pg)
# dot(y, py) >>> dot(y, pg) - ydotpgprev
# but I am worried about round-off here, so instead we make an
# extra copy, which is probably minimal overhead.
# -----------------
# also updates P for the preconditioning step below
dPd = _inverse_precondition(method, state)
etak = method.eta * real(dot(state.s, state.g_previous)) / dPd # New in HZ2013
state.y .= gradient(d) .- state.g_previous
ydots = real(dot(state.y, state.s))
copyto!(state.py, state.pg) # below, store pg - pg_previous in py
# P already updated in _inverse_precondition above
__precondition!(state.pg, method.P, gradient(d))

state.py .= state.pg .- state.py
# ydots may be zero if f is not strongly convex or the line search does not satisfy Wolfe
betak =
(
real(dot(state.y, state.pg)) -
real(dot(state.y, state.py)) * real(dot(gradient(d), state.s)) / ydots
) / ydots
# betak may be undefined if ydots is zero (possibly because f is not strongly convex or the line search does not satisfy the Wolfe conditions)
beta = NaNMath.max(betak, etak) # TODO: Set to zero if betak is NaN?
state.s .= beta .* state.s .- state.pg
project_tangent!(method.manifold, state.s, state.x)
lssuccess == false # break on linesearch error
end

update_g!(d, state, method::ConjugateGradient) = nothing

function trace!(tr, d, state, iteration, method::ConjugateGradient, options, curr_time=time())
common_trace!(tr, d, state, iteration, method, options, curr_time)
function trace!(
tr,
d,
state,
iteration,
method::ConjugateGradient,
options,
curr_time = time(),
)
common_trace!(tr, d, state, iteration, method, options, curr_time)
end