Commit 86ea8dc

Added scale op FP32/BF16 FWD/BWD kernels (#32975)
1 parent 88b43b5 commit 86ea8dc

File tree

9 files changed: +345 additions, -3 deletions


paddle/fluid/framework/data_layout_transform.cc

Lines changed: 2 additions & 2 deletions

@@ -143,7 +143,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
 void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
                                     const Tensor& in, Tensor* out,
-                                    platform::Place place) {
+                                    platform::Place place, bool always_copy) {
   PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::undef,
                     platform::errors::InvalidArgument(
                         "Input tensor format is invalid. Input tensor should "

@@ -177,7 +177,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());

-  if (in_format != out_format) {
+  if ((in_format != out_format) || always_copy) {
     void* in_data = GetDataFromTensor(in, in_type);
     std::string key =
         platform::CreateKey(*dev_ctx, in_tz, in_format, out_format, in_type);

paddle/fluid/framework/data_layout_transform.h

Lines changed: 2 additions & 1 deletion

@@ -78,7 +78,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) {

 void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout,
                                     const Tensor& in, Tensor* out,
-                                    platform::Place place);
+                                    platform::Place place,
+                                    bool always_copy = false);

 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                                const OpKernelType& expected_kernel_type,

paddle/fluid/inference/api/details/zero_copy_tensor.cc

Lines changed: 17 additions & 0 deletions

@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

@@ -161,8 +162,24 @@ void Tensor::CopyToCpu(T *data) {
   auto *t_data = tensor->data<T>();
   auto t_place = tensor->place();

+  paddle::framework::Tensor out;
+  auto mem_allocation = std::make_shared<paddle::memory::Allocation>(
+      static_cast<void *>(data), ele_num * sizeof(T),
+      paddle::platform::CPUPlace());
+  out.ResetHolder(mem_allocation);
+
   if (paddle::platform::is_cpu_place(t_place)) {
+#ifdef PADDLE_WITH_MKLDNN
+    if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN)
+      paddle::framework::innerTransDataLayoutFromMKLDNN(
+          tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls()
+                                .get_cur_paddle_data_layout(),
+          *tensor, &out, paddle::platform::CPUPlace(), true);
+    else
+      std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+#else
     std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+#endif
   } else if (place_ == PlaceType::kGPU) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     paddle::platform::DeviceContextPool &pool =
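With this change, CopyToCpu reorders oneDNN-laid-out CPU tensors into the current Paddle data layout before filling the user buffer (always_copy=true forces the copy even when the formats already match, since the destination is the caller's memory, not the tensor's). A minimal sketch of where this surfaces in the Python inference API; the model paths and input shape are hypothetical:

    import numpy as np
    from paddle.inference import Config, create_predictor

    config = Config("model/__model__", "model/params")  # hypothetical paths
    config.enable_mkldnn()  # run oneDNN kernels, producing kMKLDNN-layout tensors
    predictor = create_predictor(config)

    input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
    input_handle.copy_from_cpu(np.random.rand(1, 3, 224, 224).astype(np.float32))
    predictor.run()

    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    result = output_handle.copy_to_cpu()  # data arrives reordered out of MKLDNN layout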
paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc

Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+
+template <typename T>
+class ScaleMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+
+    bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
+
+    float scale = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
+                                            : (float)*(scale_tensor->data<T>());
+    float bias = ctx.Attr<float>("bias");
+
+    // if bias_after_scale == true
+    //    out = scale*X + bias
+    // else
+    //    out = scale*(X + bias) = scale*X + scale*bias
+
+    if (!bias_after_scale) bias *= scale;
+
+    auto x_tz = framework::vectorize<int64_t>(x->dims());
+    bool is_inplaced = x->IsSharedBufferWith(*out);
+
+    platform::ActivationMKLDNNHandler<T> handler(
+        x_tz, mkldnn::algorithm::eltwise_linear, scale, bias, x->format(),
+        dev_ctx, ctx.GetPlace(), ctx.InputName("X"), is_inplaced);
+
+    auto src_memory_p = handler.AcquireSrcMemory(x);
+    auto dst_memory_p = handler.AcquireDstMemory(out);
+    auto activation_p = handler.AcquireForwardPrimitive();
+
+    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
+    activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p},
+                                    {MKLDNN_ARG_TO, *dst_memory_p}});
+    astream.wait();
+
+    out->set_layout(framework::DataLayout::kMKLDNN);
+    out->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace,
+                   ops::ScaleMKLDNNKernel<float>,
+                   ops::ScaleMKLDNNKernel<paddle::platform::bfloat16>);
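The kernel lowers scale onto oneDNN's eltwise_linear primitive, which computes scale * x + bias in a single pass; the bias_after_scale=false case is handled by pre-folding the bias (bias *= scale), since scale * (x + bias) == scale * x + scale * bias. A minimal NumPy sketch of that folding (names here are illustrative, not Paddle API):

    import numpy as np

    def scale_op(x, scale=1.0, bias=0.0, bias_after_scale=True):
        # eltwise_linear computes scale * x + bias; when the bias belongs
        # before the scaling, fold it into an equivalent post-scale bias
        if not bias_after_scale:
            bias *= scale
        return scale * x + bias

    x = np.random.random((10, 10)).astype(np.float32)
    assert np.allclose(scale_op(x, -2.3, 0.4, bias_after_scale=False), -2.3 * (x + 0.4))
    assert np.allclose(scale_op(x, -2.3, 0.4, bias_after_scale=True), -2.3 * x + 0.4)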

paddle/fluid/operators/scale_op.cc

Lines changed: 20 additions & 0 deletions

@@ -54,6 +54,21 @@ class ScaleOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };

 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {

@@ -87,6 +102,9 @@ if bias_after_scale=True:
              "Apply bias addition after or before scaling. It is useful for "
              "numeric stability in some circumstances.")
         .SetDefault(true);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
   }
 };

@@ -112,6 +130,8 @@ class ScaleGradMaker : public framework::SingleGradOpMaker<T> {
     grad_op->SetAttr("scale", this->GetAttr("scale"));
     grad_op->SetAttr("bias", 0.0f);
     grad_op->SetAttr("bias_after_scale", true);
+    if (grad_op->HasAttr("use_mkldnn"))
+      grad_op->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn"));
   }
 };
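ScaleGradMaker maps the backward pass onto another scale op (dX = scale * dOut, the bias contributing nothing to the gradient), so forwarding use_mkldnn keeps the backward op on oneDNN as well. A small sketch of the op from the Python side; assuming a Paddle build with oneDNN, routing to the new kernel is governed by CanMKLDNNBeUsed (e.g. via the FLAGS_use_mkldnn flag in static mode):

    import paddle

    paddle.enable_static()  # use_mkldnn is an attribute of static-graph ops
    x = paddle.static.data(name='x', shape=[10, 10], dtype='float32')
    # out = -2.3 * x + 0.4; with bias_after_scale=False it would be -2.3 * (x + 0.4)
    out = paddle.scale(x, scale=-2.3, bias=0.4, bias_after_scale=True)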

paddle/fluid/operators/unity_build_rule.cmake

Lines changed: 1 addition & 0 deletions

@@ -234,6 +234,7 @@ register_unity_group(cc
   save_combine_op.cc
   save_op.cc
   scale_op.cc
+  mkldnn/scale_mkldnn_op.cc
   scatter_nd_add_op.cc
   scatter_op.cc
   seed_op.cc
Lines changed: 122 additions & 0 deletions

@@ -0,0 +1,122 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+@unittest.skipIf(core.is_compiled_with_cuda(),
+                 "core is compiled with CUDA which has no BF implementation")
+class TestScaleOpBF16(OpTest):
+    def setUp(self):
+        self.op_type = "scale"
+        self.x_fp32 = np.random.random((10, 10)).astype(np.float32)
+        self.x_bf16 = convert_float_to_uint16(self.x_fp32)
+        self.scale = -2.3
+        self.inputs = {'X': self.x_bf16}
+        self.attrs = {'scale': self.scale, 'use_mkldnn': True, 'bias': 0.4}
+        self.use_mkldnn = True
+        self.outputs = {
+            'Out': (self.x_fp32 * self.attrs['scale']) + self.attrs['bias']
+        }
+
+    def calculate_grads(self):
+        bias = 0
+        if 'bias' in self.attrs:
+            bias = self.attrs['bias']
+
+        scale = self.scale
+        if 'ScaleTensor' in self.attrs:
+            scale = self.attrs['ScaleTensor']
+
+        self.out = (self.x_fp32 * scale) + bias
+        self.dx = (self.out * scale)
+
+    def test_check_output(self):
+        self.check_output(check_dygraph=False)
+
+    def test_check_grad(self):
+        self.calculate_grads()
+        self.check_grad_with_place(
+            core.CPUPlace(), ["X"],
+            "Out",
+            check_dygraph=False,
+            user_defined_grads=[self.dx],
+            user_defined_grad_outputs=[convert_float_to_uint16(self.out)])
+
+
+class TestScaleOpBF16BiasNotAfterScale(TestScaleOpBF16):
+    def setUp(self):
+        self.op_type = "scale"
+        self.x_fp32 = np.random.random((10, 10)).astype(np.float32)
+        self.x_bf16 = convert_float_to_uint16(self.x_fp32)
+        self.scale = 1.5
+        self.inputs = {'X': self.x_bf16}
+        self.attrs = {
+            'scale': self.scale,
+            'use_mkldnn': True,
+            'bias': 0.0,
+            'bias_after_scale': False
+        }
+        self.use_mkldnn = True
+        self.outputs = {
+            'Out': (self.x_fp32 + self.attrs['bias']) * self.attrs['scale']
+        }
+
+
+class TestScaleOpBF16ScaleTensor(TestScaleOpBF16):
+    def setUp(self):
+        self.op_type = "scale"
+        self.scale = -2.3
+        self.x_fp32 = np.random.random((10, 10)).astype(np.float32)
+        self.x_bf16 = convert_float_to_uint16(self.x_fp32)
+        self.scale_tensor = np.array([self.scale]).astype(np.float32)
+        self.inputs = {
+            'X': self.x_bf16,
+            'ScaleTensor': convert_float_to_uint16(self.scale_tensor)
+        }
+        self.attrs = {'use_mkldnn': True}
+        self.outputs = {'Out': self.x_fp32 * self.scale}
+
+
+class TestScaleOpBF16ScaleTensorNotBiasAfterScale(TestScaleOpBF16):
+    def setUp(self):
+        self.op_type = "scale"
+        self.scale = 1.2
+        self.x_fp32 = np.random.random((9, 13)).astype(np.float32)
+        self.x_bf16 = convert_float_to_uint16(self.x_fp32)
+        self.scale_tensor = np.array([self.scale]).astype(np.float32)
+        self.inputs = {
+            'X': self.x_bf16,
+            'ScaleTensor': convert_float_to_uint16(self.scale_tensor)
+        }
+        self.attrs = {
+            'bias': -1.1,
+            'bias_after_scale': False,
+            'use_mkldnn': True
+        }
+        self.outputs = {'Out': (self.x_fp32 + self.attrs['bias']) * self.scale}
+
+
+if __name__ == "__main__":
+    paddle.enable_static()
+    unittest.main()
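The BF16 tests feed uint16 bit patterns produced by convert_float_to_uint16 and check the backward pass with user-defined gradients (dX = scale * dOut; the test sets dOut to the forward output, hence self.dx = self.out * scale). A rough NumPy sketch of the bf16 round trip such a helper performs, assuming plain truncation of the fp32 mantissa (the real helper may round):

    import numpy as np

    def to_bf16_bits(x):
        # bf16 keeps the sign, exponent, and top 7 mantissa bits of fp32,
        # i.e. the upper 16 bits of the 32-bit pattern (truncation assumed)
        return (x.astype(np.float32).view(np.uint32) >> 16).astype(np.uint16)

    def from_bf16_bits(b):
        # widen back to fp32 by zero-filling the lower 16 bits
        return (b.astype(np.uint32) << 16).view(np.float32)

    x = np.random.random((10, 10)).astype(np.float32)
    assert np.allclose(from_bf16_bits(to_bf16_bits(x)), x, atol=1e-2)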
