PaddlePaddle · windstamp · Feb 28, 2022 · Feb 27, 2022 · Feb 27, 2022 · Feb 27, 2022
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -23,28 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
-class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
-                                           1.0);
-    TruncatedNormal<T> truncated_normal(mean, std);
-    int64_t size = tensor->numel();
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    auto engine = framework::GetCPURandomEngine(seed);
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = truncated_normal(dist(*engine));
-    }
-  }
-};
-
 class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -124,5 +102,3 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
                              ops::TruncatedGaussianRandomOp,
                              ops::TruncatedGaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(truncated_gaussian_random,
-                       ops::CPUTruncatedGaussianRandomKernel<float>);
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu
diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"
+
+#include <limits>
+#include <random>
+#include <vector>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/generator.h"
+
+namespace phi {
+
+// CPU kernel: writes out->numel() values into `out`; each is a uniform draw
+// from [float min, 1) passed through TruncatedNormal<T>(mean, std).
+template <typename T, typename Context>
+void TruncatedGaussianRandomKernel(const Context& dev_ctx,
+                                   const ScalarArray& shape,
+                                   float mean,
+                                   float std,
+                                   int seed,
+                                   DataType dtype,
+                                   DenseTensor* out) {
+  T* data = dev_ctx.template Alloc<T>(out);
+  const int64_t size = out->numel();
+  std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
+                                         1.0);
+  TruncatedNormal<T> truncated_normal(mean, std);
+  // Cast like the former fluid kernel so negative seeds keep their old value.
+  auto engine = paddle::framework::GetCPURandomEngine(
+      static_cast<unsigned int>(seed));
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = truncated_normal(dist(*engine));
+  }
+}
+
+}  // namespace phi
+
+// Registers the phi CPU kernel for truncated_gaussian_random, float only.
+PD_REGISTER_KERNEL(truncated_gaussian_random,
+                   CPU, ALL_LAYOUT,
+                   phi::TruncatedGaussianRandomKernel,
+                   float) {}
diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include <limits>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/generator.h"
+// #include "paddle/phi/core/generator.h"
+
+namespace phi {
+
+// Thrust functor: index n -> one sample, taken from a minstd_rand that is
+// seeded with `seed` and advanced by n draws.
+template <typename T>
+struct GPUTruncatedNormal {
+  T mean, std;
+  T a_normal_cdf, b_normal_cdf;  // standard normal CDF at -2 and at +2
+  unsigned int seed;
+  T numeric_min;
+  __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
+      : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+  }
+  // n is 64-bit so an index from counting_iterator<int64_t> is not truncated.
+  __host__ __device__ T operator()(const int64_t n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    rng.discard(n);
+    T value = dist(rng);
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+  }
+};
+
+// Thrust functor: same sampling as a plain per-index draw, except that the
+// minstd_rand stream is advanced by n + offset before the sample is taken.
+template <typename T>
+struct TruncatedNormalOffset {
+  T mean, std;
+  T a_normal_cdf, b_normal_cdf;  // standard normal CDF at -2 and at +2
+  unsigned int seed;
+  T numeric_min;
+  // 64-bit so an int64_t offset handed to the constructor is kept in full.
+  int64_t offset_;
+  __host__ __device__
+  TruncatedNormalOffset(T mean, T std, T numeric_min, int seed, int64_t offset)
+      : mean(mean),
+        std(std),
+        seed(seed),
+        numeric_min(numeric_min),
+        offset_(offset) {
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+  }
+  __host__ __device__ T operator()(const int64_t n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    rng.discard(n + offset_);  // summed in 64 bits, no 32-bit wrap
+    T value = dist(rng);
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+  }
+};
+
+// GPU kernel: fills `out` with one thrust::transform over the indices
+// [0, out->numel()), using one of the two sampling functors above.
+// `shape` and `dtype` are not read in this body.
+template <typename T, typename Context>
+void TruncatedGaussianRandomKernel(const Context& dev_ctx,
+                                   const ScalarArray& shape,
+                                   float mean,
+                                   float std,
+                                   int seed,
+                                   DataType dtype,
+                                   DenseTensor* out) {
+  T* out_data = dev_ctx.template Alloc<T>(out);
+
+  // A seed of 0 is replaced by a value drawn from std::random_device.
+  const bool seed_was_zero = (seed == 0);
+  if (seed_was_zero) {
+    seed = std::random_device{}();
+  }
+
+  const thrust::counting_iterator<int64_t> first(0);
+  const int64_t numel = out->numel();
+  const thrust::device_ptr<T> dst(out_data);
+  const T lower = std::numeric_limits<T>::min();
+
+  const int device_id = dev_ctx.GetPlace().GetDeviceId();
+  auto generator = paddle::framework::GetDefaultCUDAGenerator(device_id);
+
+  // Non-zero caller seed, or GetIsInitPy() is false: sample from `seed`
+  // with no extra stream offset.
+  if (!generator->GetIsInitPy() || !seed_was_zero) {
+    thrust::transform(first, first + numel, dst,
+                      GPUTruncatedNormal<T>(mean, std, lower, seed));
+    return;
+  }
+
+  // Otherwise both seed and offset come from the default CUDA generator;
+  // the offset is scaled by the element count.
+  const auto seed_and_offset = generator->IncrementOffset(1);
+  const int64_t stream_offset = numel * seed_and_offset.second;
+  thrust::transform(first, first + numel, dst,
+                    TruncatedNormalOffset<T>(mean, std, lower,
+                                             seed_and_offset.first,
+                                             stream_offset));
+}
+
+}  // namespace phi
+
+// Registers the phi GPU kernel for truncated_gaussian_random, float only.
+PD_REGISTER_KERNEL(truncated_gaussian_random,
+                   GPU, ALL_LAYOUT,
+                   phi::TruncatedGaussianRandomKernel,
+                   float) {}