
Commit 81a2cf0

andersonic and Jun Ru Anderson authored
[feat] remove support for non-multitensor Adam
Co-authored-by: Jun Ru Anderson <[email protected]>
1 parent 57079b0 commit 81a2cf0

3 files changed: +27 -192 lines changed
Lines changed: 2 additions & 26 deletions
@@ -1,32 +1,8 @@
 #include <torch/extension.h>
 
 // CUDA forward declaration
-void fused_adam_cuda(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
-
-void fused_adam_cuda_mt(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
-
-
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
-
-// C++ interface
-void adam(at::Tensor & p, at::Tensor & p_copy, at::Tensor & m, at::Tensor & v, at::Tensor & g, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay) {
-        CHECK_INPUT(p);
-        if (p_copy.numel() > 0) CHECK_INPUT(p_copy);
-        CHECK_INPUT(m);
-        CHECK_INPUT(v);
-        CHECK_INPUT(g);
-        int64_t num_elem = p.numel();
-        AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
-        AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
-        AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
-        AT_ASSERTM(p_copy.numel() == num_elem || p_copy.numel() == 0, "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");
-
-        fused_adam_cuda(p, p_copy, m, v, g, lr, beta1, beta2, eps, grad_scale, step, mode, bias_correction, decay);
-}
+void fused_adam_cuda(int chunk_size, at::Tensor noop_flag, std::vector<std::vector<at::Tensor>> tensor_lists, float lr, float beta1, float beta2, float eps, float grad_scale, int step, int mode, int bias_correction, float decay);
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-        m.def("adam", &adam, "Adam optimized CUDA implementation.");
-        m.def("adam_mt", &fused_adam_cuda_mt, "Multi tensor Adam optimized CUDA implementation.");
+        m.def("adam", &fused_adam_cuda, "Multi tensor Adam optimized CUDA implementation.");
 }
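
For orientation, here is a minimal Python sketch (not part of this commit) of how the single remaining adam binding is driven; it mirrors the call made in fairscale/optim/adam.py further down. It assumes the fused_adam_cuda extension has been built and a CUDA device is available, and the hyperparameter values are placeholders.

import torch
import fused_adam_cuda  # extension built from the sources in this commit

# One flat list per state, in the kernel's expected order: p, m, v, g.
params = [torch.randn(1024, device="cuda") for _ in range(3)]
exp_avgs = [torch.zeros_like(p) for p in params]
exp_avg_sqs = [torch.zeros_like(p) for p in params]
grads = [torch.randn_like(p) for p in params]

overflow_buf = torch.cuda.IntTensor([0])  # no-op/overflow flag, as in adam.py

fused_adam_cuda.adam(
    2048 * 32,          # chunk_size used by the multi-tensor apply
    overflow_buf,       # noop_flag
    [params, exp_avgs, exp_avg_sqs, grads],
    1e-3,               # lr
    0.9,                # beta1
    0.999,              # beta2
    1e-8,               # eps
    1.0,                # grad_scale
    1,                  # step
    0,                  # mode (0: eps inside the square root, per adamMode_t)
    1,                  # bias_correction
    0.0,                # weight decay
)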

fairscale/clib/fused_adam_cuda/fused_adam_cuda_kernel.cu

Lines changed: 0 additions & 117 deletions
@@ -21,43 +21,7 @@ typedef enum{
     ADAM_MODE_1 =1 // eps outside square root
 } adamMode_t;
 
-template <typename T, typename GRAD_T>
-__global__ void adam_cuda_kernel(
-        GRAD_T* __restrict__ p,
-        GRAD_T* __restrict__ p_copy, // For mixed precision training, pass NULL if not needed
-        T* __restrict__ m,
-        T* __restrict__ v,
-        const GRAD_T * __restrict__ g,
-        const float b1,
-        const float b2,
-        const float eps,
-        const float grad_scale,
-        const float step_size,
-        const size_t tsize,
-        adamMode_t mode,
-        const float decay)
-{
-        //Assuming 2D grids and 2D blocks
-        const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
-        const int threadsPerBlock = blockDim.x * blockDim.y;
-        const int threadIdInBlock = threadIdx.y * blockDim.x + threadIdx.x;
-        const int i = (blockId * threadsPerBlock + threadIdInBlock);
-        const int totThreads = gridDim.x*gridDim.y*threadsPerBlock;
 
-        for (int j = i; j < tsize; j+=totThreads) {
-                T scaled_grad = g[j]/grad_scale;
-                m[j] = b1*m[j] + (1-b1)*scaled_grad;
-                v[j] = b2*v[j] + (1-b2)*scaled_grad*scaled_grad;
-                float denom;
-                if (mode == ADAM_MODE_0)
-                        denom = sqrtf(v[j] + eps);
-                else // Mode 1
-                        denom = sqrtf(v[j]) + eps;
-                float update = (m[j]/denom) + (decay*p[j]);
-                p[j] = (GRAD_T)((float)p[j] - (step_size*update));
-                if (p_copy != NULL) p_copy[j] = (GRAD_T) p[j];
-        }
-}
 
 template <int DEPTH, typename T, typename GRAD_T>
 struct AdamFunctor
@@ -147,87 +111,6 @@ struct AdamFunctor
 };
 
 void fused_adam_cuda(
-        at::Tensor & p,
-        at::Tensor & p_copy,
-        at::Tensor & m,
-        at::Tensor & v,
-        at::Tensor & g,
-        float lr,
-        float beta1,
-        float beta2,
-        float eps,
-        float grad_scale,
-        int step,
-        int mode,
-        int bias_correction,
-        float decay)
-{
-        // using namespace at;
-
-        //Get tensor size
-        int tsize = p.numel();
-        //Determine #threads and #blocks
-        const int threadsPerBlock = 512;
-        const dim3 blocks((tsize+threadsPerBlock-1)/threadsPerBlock);
-        AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p), "parameter tensor is too large to be indexed with int32");
-        //Constants
-        float step_size = 0;
-        if (bias_correction == 1) {
-                const float bias_correction1 = 1 - std::pow(beta1, step);
-                const float bias_correction2 = 1 - std::pow(beta2, step);
-                step_size = lr * std::sqrt(bias_correction2)/bias_correction1;
-        }
-        else {
-                step_size = lr;
-        }
-        cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-        if (g.scalar_type() == at::ScalarType::Half) {
-                //all other values should be fp32 for half gradients
-                AT_ASSERTM(p.scalar_type() == at::ScalarType::Half, "expected parameter to be of half type");
-                //dispatch is done on the gradient type
-                using namespace at; // prevents "toString is undefined" errors
-                DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel",
-                        using accscalar_t = at::acc_type<scalar_t_0, true>;
-                        adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
-                                p.DATA_PTR<scalar_t_0>(),
-                                p_copy.numel() ? p_copy.DATA_PTR<scalar_t_0>() : NULL,
-                                m.DATA_PTR<accscalar_t>(),
-                                v.DATA_PTR<accscalar_t>(),
-                                g.DATA_PTR<scalar_t_0>(),
-                                beta1,
-                                beta2,
-                                eps,
-                                grad_scale,
-                                step_size,
-                                tsize,
-                                (adamMode_t) mode,
-                                decay);
-                );
-        } else {
-                using namespace at;
-                DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
-                        adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
-                                p.DATA_PTR<scalar_t_0>(),
-                                NULL, //don't output p_copy for fp32, it's wasted write
-                                m.DATA_PTR<scalar_t_0>(),
-                                v.DATA_PTR<scalar_t_0>(),
-                                g.DATA_PTR<scalar_t_0>(),
-                                beta1,
-                                beta2,
-                                eps,
-                                grad_scale,
-                                step_size,
-                                tsize,
-                                (adamMode_t) mode,
-                                decay);
-                );
-        }
-        THCudaCheck(cudaGetLastError());
-
-}
-
-void fused_adam_cuda_mt(
         int chunk_size,
         at::Tensor noop_flag,
         std::vector<std::vector<at::Tensor>> tensor_lists, // p, m, v, g, p_copy
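
For reference, this is the per-element update that the deleted adam_cuda_kernel (and its deleted host wrapper) computed, written as a plain PyTorch sketch; the function name and signature below are illustrative only, and the retained multi-tensor path performs the same math.

import torch

def adam_update(p, m, v, g, lr, beta1, beta2, eps, grad_scale, step,
                mode=0, bias_correction=1, decay=0.0):
    # Mirrors the deleted kernel: gradients are descaled, moments updated in place.
    scaled_grad = g / grad_scale
    m.mul_(beta1).add_(scaled_grad, alpha=1 - beta1)
    v.mul_(beta2).addcmul_(scaled_grad, scaled_grad, value=1 - beta2)
    # step_size as computed in the deleted fused_adam_cuda host wrapper.
    if bias_correction:
        step_size = lr * (1 - beta2 ** step) ** 0.5 / (1 - beta1 ** step)
    else:
        step_size = lr
    # mode 0: eps inside the square root; mode 1: eps outside (adamMode_t).
    denom = (v + eps).sqrt() if mode == 0 else v.sqrt() + eps
    p.sub_(step_size * (m / denom + decay * p))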

fairscale/optim/adam.py

Lines changed: 25 additions & 49 deletions
@@ -55,13 +55,10 @@ def __init__(
         weight_decay: Optional[float] = 0.0,
         max_grad_norm: Optional[float] = 0.0,
         amsgrad: Optional[bool] = False,
-        use_mt: Optional[bool] = True,
     ):
 
         self._use_multi_tensor = False
-        if use_mt:
-            self._use_multi_tensor = True
-            self._overflow_buf = torch.cuda.IntTensor([0])  # type: ignore
+        self._overflow_buf = torch.cuda.IntTensor([0])  # type: ignore
 
         if amsgrad:
             raise RuntimeError("FusedAdam does not support the AMSGrad variant.")
@@ -131,51 +128,30 @@ def step(self, closure: Optional[Callable[[], float]] = None, scale: Optional[fl
                 state["step"] += 1
                 out_p = torch.tensor([])
 
-                if self._use_multi_tensor:
-                    pl = [p.data, exp_avg, exp_avg_sq, grad]
-
-                    if p.device not in tensorlists:
-                        tensorlists[p.device] = [[], [], [], []]
-
-                    for tl, t in zip(tensorlists[p.device], pl):
-                        tl.append(t)
-
-                else:
-                    with torch.cuda.device(p.device):
-                        fused_adam_cuda.adam(
-                            p.data,
-                            out_p,
-                            exp_avg,
-                            exp_avg_sq,
-                            grad,
-                            group["lr"],
-                            beta1,
-                            beta2,
-                            group["eps"],
-                            scale,
-                            state["step"],
-                            self.eps_mode,
-                            bias_correction,
-                            group["weight_decay"],
-                        )
-
-            if self._use_multi_tensor:
-                for tensordevice, tensorlist in tensorlists.items():
-                    with torch.cuda.device(tensordevice):
-                        fused_adam_cuda.adam_mt(
-                            2048 * 32,
-                            self._overflow_buf,
-                            tensorlist,
-                            group["lr"],
-                            beta1,
-                            beta2,
-                            group["eps"],
-                            scale,
-                            state["step"],
-                            self.eps_mode,
-                            bias_correction,
-                            group["weight_decay"],
-                        )
+                pl = [p.data, exp_avg, exp_avg_sq, grad]
+
+                if p.device not in tensorlists:
+                    tensorlists[p.device] = [[], [], [], []]
+
+                for tl, t in zip(tensorlists[p.device], pl):
+                    tl.append(t)
+
+            for tensordevice, tensorlist in tensorlists.items():
+                with torch.cuda.device(tensordevice):
+                    fused_adam_cuda.adam(
+                        2048 * 32,
+                        self._overflow_buf,
+                        tensorlist,
+                        group["lr"],
+                        beta1,
+                        beta2,
+                        group["eps"],
+                        scale,
+                        state["step"],
+                        self.eps_mode,
+                        bias_correction,
+                        group["weight_decay"],
+                    )
 
         return loss
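
After this change the optimizer always takes the multi-tensor path, so the old use_mt flag is no longer accepted. A minimal usage sketch, assuming the class defined in fairscale/optim/adam.py is exported as fairscale.optim.Adam and the fused_adam_cuda extension is compiled:

import torch
from fairscale.optim import Adam  # assumed export of fairscale/optim/adam.py

model = torch.nn.Linear(1024, 1024).cuda()
# use_mt is gone from __init__; passing it now raises a TypeError.
opt = Adam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 1024, device="cuda")).sum()
loss.backward()
opt.step()
opt.zero_grad()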
