
Commit 8167bf4

pkofod authored and avik-pal committed
Add docs for adam and adamax (JuliaNLSolvers#1072)
* Add docs for adam and adamax
* Update make.jl
1 parent: 60bfeea

3 files changed: +25 −6 lines


docs/make.jl

Lines changed: 1 addition & 0 deletions
```diff
@@ -40,6 +40,7 @@ makedocs(
             "Particle Swarm" => "algo/particle_swarm.md",
         ],
         "Gradient Required" => [
+            "Adam and AdaMax" => "algo/adam_adamax.md",
             "Conjugate Gradient" => "algo/cg.md",
             "Gradient Descent" => "algo/gradientdescent.md",
             "(L-)BFGS" => "algo/lbfgs.md",
```

docs/src/algo/adam_adamax.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Adam and AdaMax
2+
This page contains information about Adam and AdaMax.
3+
## Constructors
4+
```julia
5+
Adam(; alpha=0.0001,
6+
beta_mean=0.9,
7+
beta_var=0.999,
8+
epsilon=1e-8)
9+
```
10+
11+
where `alpha` is the step length or learning parameter. `beta_mean` and `beta_var` are exponential decay parameters for the first and second moments estimates. Setting these closer to 0 will cause past iterates to matter less for the current steps and setting them closer to 1 means emphasizing past iterates more. `epsilon` should rarely be changed, and just exists to avoid a division by 0.
12+
13+
14+
```julia
15+
AdaMax(; alpha=0.002,
16+
beta_mean=0.9,
17+
beta_var=0.999)
18+
```
19+
where `alpha` is the step length or learning parameter. `beta_mean` and `beta_var` are exponential decay parameters for the first and second moments estimates. Setting these closer to 0 will cause past iterates to matter less for the current steps and setting them closer to 1 means emphasizing past iterates more.
20+
21+
## References
22+
Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014).
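The new page documents only the constructors. As a usage sketch (not part of this commit), the solvers are passed to Optim.jl's `optimize` like any other first-order method; the Rosenbrock objective, starting point, and iteration budget below are assumptions for illustration:

```julia
using Optim

# Classic Rosenbrock test function (illustrative; not from the commit).
rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2

x0 = zeros(2)

# Adam and AdaMax use a fixed step length, so a generous iteration budget
# is assumed here; without an explicit gradient function, Optim falls
# back to finite-difference gradients.
res_adam   = optimize(rosenbrock, x0, Adam(),
                      Optim.Options(iterations = 100_000))
res_adamax = optimize(rosenbrock, x0, AdaMax(),
                      Optim.Options(iterations = 100_000))

Optim.minimizer(res_adam)   # should approach [1.0, 1.0]
```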

src/multivariate/solvers/first_order/adam.jl

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,8 @@ function update_state!(d, state::AdamState{T}, method::Adam) where T
7373
# m̂ = m./(1-β₁^state.iter)
7474
# v̂ = v./(1-β₂^state.iter)
7575
#@. z = z - α*m̂/(sqrt(v̂+ϵ))
76-
@. z = z - α*m/(1-β₁^state.iter)/(sqrt(v./(1-β₂^state.iter)+ϵ))
77-
78-
# not quite the same because epsilon is in the sqrt
79-
# not sure where I got this from
80-
# αₜ = α * sqrt(1 - β₂^state.iter) / (1 - β₁^state.iter)
81-
# z .= z .- αₜ .* m ./ (sqrt.(v .+ ϵ) )
76+
αₜ = α * sqrt(1 - β₂^state.iter) / (1 - β₁^state.iter)
77+
@. z = z - αₜ * m / (sqrt(v) + ϵ)
8278

8379
for _i in eachindex(z)
8480
# since m and u start at 0, this can happen if the initial gradient is exactly 0
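The replacement folds the bias corrections `m̂ = m/(1-β₁^t)` and `v̂ = v/(1-β₂^t)` into a per-iteration step length, the "efficient version" noted at the end of Section 2 of Kingma & Ba's paper. The new code also applies `ϵ` outside the square root, matching the paper, whereas the removed line had it inside (the "not quite the same" of the old comment). With `ϵ = 0` the two forms agree exactly; a standalone numerical check (not from the commit; the scalar values are made up):

```julia
# Explicit bias-corrected update from the paper, with ϵ = 0:
#   m̂ = m/(1-β₁^t),  v̂ = v/(1-β₂^t),  step = α*m̂/√v̂
α, β₁, β₂, t = 0.001, 0.9, 0.999, 7
m, v = 0.3, 0.04

m̂ = m / (1 - β₁^t)
v̂ = v / (1 - β₂^t)
step_explicit = α * m̂ / sqrt(v̂)

# Folded form used in the new code:
#   αₜ = α*√(1-β₂^t)/(1-β₁^t),  step = αₜ*m/√v
αₜ = α * sqrt(1 - β₂^t) / (1 - β₁^t)
step_folded = αₜ * m / sqrt(v)

@assert step_explicit ≈ step_folded   # identical when ϵ = 0
```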
