Merged
Changes from 9 commits
31 changes: 31 additions & 0 deletions paddle/phi/kernels/funcs/elementwise_functor.h
@@ -228,6 +228,17 @@ struct FMaxFunctor<dtype::float16> {
}
};

template <>
struct FMaxFunctor<dtype::bfloat16> {
inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a,
const dtype::bfloat16 b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmax(float_a, float_b);
return static_cast<dtype::bfloat16>(result);
}
};
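
bfloat16 has no native fmax overload, so the functor computes in float and narrows the result. A minimal host-side sketch of the same upcast-compute-downcast pattern, assuming the third-party ml_dtypes package purely for illustration:

```python
import numpy as np
from ml_dtypes import bfloat16  # assumed third-party dtype, illustration only

a = np.array([1.5, np.nan], dtype=bfloat16)
b = np.array([np.nan, 2.5], dtype=bfloat16)

# Same pattern as the functor: upcast to float32, take the NaN-ignoring
# maximum, then cast the result back down to bfloat16.
out = np.fmax(a.astype(np.float32), b.astype(np.float32)).astype(bfloat16)
print(out)  # [1.5 2.5]
```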

template <>
struct FMaxFunctor<int> {
inline HOSTDEVICE int operator()(const int a, const int b) const {
@@ -265,6 +276,16 @@ struct FMaxGradDx<dtype::float16> {
}
};

template <>
struct FMaxGradDx<dtype::bfloat16> {
HOSTDEVICE dtype::bfloat16 operator()(dtype::bfloat16 x,
dtype::bfloat16 y,
dtype::bfloat16 out,
dtype::bfloat16 dout) const {
return dout * static_cast<dtype::bfloat16>((x >= y) || dtype::isnan(y));
}
};

template <>
struct FMaxGradDx<int> {
HOSTDEVICE int operator()(int x, int y, int out, int dout) const {
@@ -299,6 +320,16 @@ struct FMaxGradDy<dtype::float16> {
}
};

template <>
struct FMaxGradDy<dtype::bfloat16> {
HOSTDEVICE dtype::bfloat16 operator()(dtype::bfloat16 x,
dtype::bfloat16 y,
dtype::bfloat16 out,
dtype::bfloat16 dout) const {
return dout * static_cast<dtype::bfloat16>(!((x >= y) || dtype::isnan(y)));
}
};
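
The two specializations above split the upstream gradient between the operands, so each element's dout is routed to exactly one input. A rough NumPy restatement of the rule (illustrative names, not Paddle API):

```python
import numpy as np

def fmax_grad(x, y, dout):
    # z = fmax(x, y): x receives dout where x >= y or y is NaN
    # (fmax prefers the non-NaN operand); y receives it elsewhere.
    take_x = (x >= y) | np.isnan(y)
    return dout * take_x, dout * ~take_x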

template <>
struct FMaxGradDy<int64_t> {
HOSTDEVICE int64_t operator()(int64_t x,
1 change: 1 addition & 0 deletions paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
@@ -98,6 +98,7 @@ PD_REGISTER_KERNEL(fmax_grad,
double,
int,
phi::dtype::float16,
phi::dtype::bfloat16,
int64_t) {}
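
With the bfloat16 entry registered, the backward kernel becomes dispatchable from Python. A hedged smoke test, assuming a CUDA build with bfloat16 support:

```python
import paddle

x = paddle.randn([4, 4]).astype('bfloat16')
y = paddle.randn([4, 4]).astype('bfloat16')
x.stop_gradient = False
y.stop_gradient = False

paddle.fmax(x, y).sum().backward()
print(x.grad.dtype)  # expected: paddle.bfloat16
```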

PD_REGISTER_KERNEL(fmin_grad,
1 change: 1 addition & 0 deletions paddle/phi/kernels/kps/elementwise_kernel.cu
@@ -117,6 +117,7 @@ PD_REGISTER_KERNEL(fmax,
double,
int,
float16,
bfloat16,
int64_t) {}
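
Likewise for the forward kernel; a quick check of the NaN-ignoring semantics (again assuming a CUDA build with bfloat16 support, and that to_tensor accepts a bfloat16 dtype):

```python
import paddle

a = paddle.to_tensor([1.0, float('nan'), 3.0], dtype='bfloat16')
b = paddle.to_tensor([2.0, 2.0, float('nan')], dtype='bfloat16')
# fmax ignores a NaN when the other operand is a number.
print(paddle.fmax(a, b))  # [2.0, 2.0, 3.0]
```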

PD_REGISTER_KERNEL(fmin,
28 changes: 27 additions & 1 deletion python/paddle/fluid/tests/unittests/test_fmax_op.py
@@ -15,7 +15,7 @@
import unittest

import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16

import paddle
import paddle.fluid.core as core
@@ -241,5 +241,31 @@ def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')


@unittest.skipIf(
not core.is_compiled_with_cuda()
or not core.is_bfloat16_supported(core.CUDAPlace(0)),
"core is not compiled with CUDA and not support the bfloat16",
)
class TestFmaxBF16OP(OpTest):
def setUp(self):
self.op_type = "elementwise_fmax"
self.python_api = paddle.fmax
self.dtype = np.uint16
x = np.random.randn(11, 17).astype('float32')
y = np.random.randn(11, 17).astype('float32')
out = np.fmax(x, y)
self.inputs = {
'X': convert_float_to_uint16(x),
'Y': convert_float_to_uint16(y),
}
self.outputs = {'Out': convert_float_to_uint16(out)}

def test_check_output(self):
self.check_output()

def test_check_grad(self):
self.check_grad(['X', 'Y'], 'Out')
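
For context, OpTest carries bfloat16 data as uint16 bit patterns, which is what convert_float_to_uint16 produces from the float32 inputs above. Roughly (truncating sketch; the real helper may also round):

```python
import numpy as np

def to_bf16_bits(x):
    # Keep the high 16 bits of each float32 value -- the bfloat16 bit
    # pattern -- as uint16. Truncating sketch of convert_float_to_uint16.
    return (x.astype(np.float32).view(np.uint32) >> 16).astype(np.uint16)
```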


if __name__ == "__main__":
unittest.main()