PaddlePaddle · risemeup1 · Feb 28, 2025 · Feb 26, 2025
diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu
@@ -129,19 +129,22 @@ class DataNormKernel<T, phi::GPUContext> : public framework::OpKernel<T> {
                           "The dims of Input(X) should be greater than 0."));
 
     const T *batch_size_in =
-        ctx.Input<phi::DenseTensor>("BatchSize")->data<T>();
-    const T *batch_sum_in = ctx.Input<phi::DenseTensor>("BatchSum")->data<T>();
+        ctx.Input<phi::DenseTensor>("BatchSize")->template data<T>();
+    const T *batch_sum_in =
+        ctx.Input<phi::DenseTensor>("BatchSum")->template data<T>();
     const T *batch_square_sum_in =
-        ctx.Input<phi::DenseTensor>("BatchSquareSum")->data<T>();
-    auto *x_data = x->data<T>();
+        ctx.Input<phi::DenseTensor>("BatchSquareSum")->template data<T>();
+    auto *x_data = x->template data<T>();
 
     // alloc memory
-    T *y_data =
-        ctx.Output<phi::DenseTensor>("Y")->mutable_data<T>(ctx.GetPlace());
+    T *y_data = ctx.Output<phi::DenseTensor>("Y")->template mutable_data<T>(
+        ctx.GetPlace());
     T *mean_out_data =
-        ctx.Output<phi::DenseTensor>("Means")->mutable_data<T>(ctx.GetPlace());
+        ctx.Output<phi::DenseTensor>("Means")->template mutable_data<T>(
+            ctx.GetPlace());
     T *scale_out_data =
-        ctx.Output<phi::DenseTensor>("Scales")->mutable_data<T>(ctx.GetPlace());
+        ctx.Output<phi::DenseTensor>("Scales")->template mutable_data<T>(
+            ctx.GetPlace());
 
     auto stream = ctx.template device_context<phi::GPUContext>().stream();
 
@@ -185,31 +188,32 @@ class DataNormGradKernel<T, phi::GPUContext> : public framework::OpKernel<T> {
     }
     T *d_batch_size =
         ctx.Output<phi::DenseTensor>(framework::GradVarName("BatchSize"))
-            ->mutable_data<T>(ctx.GetPlace());
+            ->template mutable_data<T>(ctx.GetPlace());
     T *d_batch_sum =
         ctx.Output<phi::DenseTensor>(framework::GradVarName("BatchSum"))
-            ->mutable_data<T>(ctx.GetPlace());
+            ->template mutable_data<T>(ctx.GetPlace());
     T *d_batch_square_sum =
         ctx.Output<phi::DenseTensor>(framework::GradVarName("BatchSquareSum"))
-            ->mutable_data<T>(ctx.GetPlace());
+            ->template mutable_data<T>(ctx.GetPlace());
 
     auto stream = ctx.template device_context<phi::GPUContext>().stream();
     if (d_x != nullptr) {
       KernelDataNormBP<<<GET_BLOCKS(C * N),
                          PADDLE_CUDA_NUM_THREADS,
                          0,
-                         stream>>>(N,
-                                   C,
-                                   d_y->data<T>(),
-                                   scales->data<T>(),
-                                   d_x->mutable_data<T>(ctx.GetPlace()));
+                         stream>>>(
+          N,
+          C,
+          d_y->template data<T>(),
+          scales->template data<T>(),
+          d_x->template mutable_data<T>(ctx.GetPlace()));
     }
 
     KernelDataNormBPStat<<<GET_BLOCKS(C), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         N,
         C,
-        x->data<T>(),
-        means->data<T>(),
+        x->template data<T>(),
+        means->template data<T>(),
         epsilon,
         d_batch_size,
         d_batch_sum,
@@ -306,11 +310,11 @@ class DataNormGradKernel<T, phi::GPUContext> : public framework::OpKernel<T> {
     }
 
     T *batch_size_data = ctx.Output<phi::DenseTensor>("BatchSize")
-                             ->mutable_data<T>(ctx.GetPlace());
+                             ->template mutable_data<T>(ctx.GetPlace());
     T *batch_sum_data = ctx.Output<phi::DenseTensor>("BatchSum")
-                            ->mutable_data<T>(ctx.GetPlace());
+                            ->template mutable_data<T>(ctx.GetPlace());
     T *batch_square_sum_data = ctx.Output<phi::DenseTensor>("BatchSquareSum")
-                                   ->mutable_data<T>(ctx.GetPlace());
+                                   ->template mutable_data<T>(ctx.GetPlace());
     KernelUpdateParam<<<GET_BLOCKS(C), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         C,
         d_batch_size,

diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h
@@ -162,7 +162,7 @@ class OverflowKernel : public framework::OpKernel<T> {
   virtual void Compute(const framework::ExecutionContext& ctx) const {
     auto* x = ctx.InputVar("X");
     auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
+    out->template mutable_data<T>(ctx.GetPlace());
     Functor functor;
     if (x->IsType<phi::DenseTensor>()) {
       auto* in = ctx.Input<phi::DenseTensor>("X");

diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h
@@ -36,10 +36,10 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) {
   for (size_t i = 0; i < slot_size; i++) {
     const auto *slot = inputs[i];
     const uint64_t *single_slot_keys =
-        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
+        reinterpret_cast<const uint64_t *>(slot->template data<int64_t>());
     all_keys[i] = single_slot_keys;
     slot_lengths[i] = slot->numel();
-    auto *output = outputs[i]->mutable_data<T>(ctx.GetPlace());
+    auto *output = outputs[i]->template mutable_data<T>(ctx.GetPlace());
     // double type is not fully supported now
     all_values[i] = reinterpret_cast<float *>(output);
   }
@@ -68,7 +68,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) {
   for (size_t i = 0; i < slot_size; i++) {
     const auto *slot = inputs[i];
     const uint64_t *single_slot_keys =
-        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
+        reinterpret_cast<const uint64_t *>(slot->template data<int64_t>());
     all_keys[i] = single_slot_keys;
     slot_lengths[i] = slot->numel();
     int cur_batch_size =
@@ -82,7 +82,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) {
                             "The batch size of all input slots should be same, "
                             "please check"));
     }
-    const float *grad_value = d_output[i]->data<float>();
+    const float *grad_value = d_output[i]->template data<float>();
     all_grad_values[i] = grad_value;
   }
 #ifdef PADDLE_WITH_HETERPS

diff --git a/test/cpp/fluid/framework/data_device_transform_test.cu b/test/cpp/fluid/framework/data_device_transform_test.cu
@@ -73,7 +73,7 @@ class TestKernel : public OpKernel<float> {
     std::cout << "input place:" << input->place() << std::endl;
     auto* output = ctx.Output<phi::DenseTensor>("output");
     output->Resize(input->dims());
-    output->mutable_data<T>(ctx.GetPlace());
+    output->template mutable_data<T>(ctx.GetPlace());
 
     phi::funcs::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
         *input,