
Commit efd4815

[XPU] fix bugs of depthwise conv test and change default quant type (#70859)
* [XPU] fix bugs of depthwise conv test and change default quant type
* fix typo
* change default quant to float for fp32
* fp32 use tf32
* fix some ci bugs
* fix more ci bugs
1 parent 4c87902 commit efd4815

18 files changed (+303, -246 lines)

paddle/phi/kernels/xpu/xpu_api_wrapper.h

Lines changed: 54 additions & 22 deletions
@@ -16,6 +16,7 @@
 
 #ifdef PADDLE_WITH_XPU
 
+#include <unordered_map>
 #include <vector>
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/backends/xpu/xpu_header.h"
@@ -41,29 +42,60 @@ enum XPUFCCalcType {
   FC_FLOAT16,
 };
 
-template <typename T>
-XPUFCCalcType FCCalcType() {
-  const char* xpu_paddle_fc_float16 = std::getenv("XPU_PADDLE_FC_FLOAT16");
-  if (xpu_paddle_fc_float16 != nullptr &&
-      (std::is_same<phi::dtype::float16, T>::value ||
-       std::is_same<XPUTypeFP16, T>::value || std::is_same<float, T>::value)) {
-    return XPUFCCalcType::FC_FLOAT16;
-  } else if (std::is_same<phi::dtype::float16, T>::value ||
-             std::is_same<XPUTypeFP16, T>::value) {
-    return XPUFCCalcType::FC_INT16;
-  } else if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
-    return XPUFCCalcType::FC_INT32;
-  } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
-    return XPUFCCalcType::FC_FLOAT;
-  } else if (std::getenv("XPU_PADDLE_FC_INT32_WITH_LL") != nullptr) {
-    return XPUFCCalcType::FC_INT32_WITH_LL;
-  } else if ((std::is_same<phi::dtype::bfloat16, T>::value ||
-              std::is_same<XPUTypeBF16, T>::value) ||
-             (std::is_same<float, T>::value &&
-              std::getenv("XPU_PADDLE_FC_TF32") != nullptr)) {
-    return XPUFCCalcType::FC_TF32;
+using XPUFCCalcTypeMap = std::vector<std::pair<const char*, XPUFCCalcType>>;
+
+inline XPUFCCalcType GetFCCalcTypeFromEnv(const XPUFCCalcTypeMap& env_map,
+                                          XPUFCCalcType default_calc_type) {
+  for (auto [env_name, calc_type] : env_map) {
+    if (std::getenv(env_name) != nullptr) {
+      return calc_type;
+    }
   }
-  return XPUFCCalcType::FC_INT16;
+  return default_calc_type;
+}
+
+template <typename T>
+inline XPUFCCalcType FCCalcType() {
+  // FLOAT32
+  XPUFCCalcTypeMap calc_type_map = {
+      {"XPU_PADDLE_FC_FLOAT", XPUFCCalcType::FC_FLOAT},
+      {"XPU_PADDLE_FC_LOCAL_INT16", XPUFCCalcType::FC_FLOAT},
+      {"XPU_PADDLE_FC_TF32", XPUFCCalcType::FC_TF32},
+      {"XPU_PADDLE_FC_INT16", XPUFCCalcType::FC_INT16},
+      {"XPU_PADDLE_FC_INT32", XPUFCCalcType::FC_INT32},
+      {"XPU_PADDLE_FC_INT32_WITH_LL", XPUFCCalcType::FC_INT32_WITH_LL},
+  };
+#ifdef PADDLE_WITH_XPU_XRE5
+  auto default_calc_type = XPUFCCalcType::FC_TF32;
+#else
+  auto default_calc_type = XPUFCCalcType::FC_INT16;
+#endif
+  return GetFCCalcTypeFromEnv(calc_type_map, default_calc_type);
+}
+
+template <>
+inline XPUFCCalcType FCCalcType<XPUTypeFP16>() {
+  XPUFCCalcTypeMap calc_type_map = {
+      {"XPU_PADDLE_FC_FLOAT16", XPUFCCalcType::FC_FLOAT16},
+      {"XPU_PADDLE_FC_INT16", XPUFCCalcType::FC_INT16},
+      {"XPU_PADDLE_FC_FLOAT", XPUFCCalcType::FC_FLOAT},
+      {"XPU_PADDLE_FC_LOCAL_INT16", XPUFCCalcType::FC_FLOAT}};
+#ifdef PADDLE_WITH_XPU_XRE5
+  auto default_calc_type = XPUFCCalcType::FC_FLOAT16;
+#else
+  auto default_calc_type = XPUFCCalcType::FC_INT16;
+#endif
+  return GetFCCalcTypeFromEnv(calc_type_map, default_calc_type);
+}
+
+template <>
+inline XPUFCCalcType FCCalcType<XPUTypeBF16>() {
+  XPUFCCalcTypeMap calc_type_map = {
+      // TF32 is the default, do not need to be listed here.
+      {"XPU_PADDLE_FC_FLOAT", XPUFCCalcType::FC_FLOAT},
+      {"XPU_PADDLE_FC_LOCAL_INT16", XPUFCCalcType::FC_FLOAT}};
+  auto default_calc_type = XPUFCCalcType::FC_TF32;
+  return GetFCCalcTypeFromEnv(calc_type_map, default_calc_type);
 }
 
 struct XpuFcInfo {
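
With the new lookup, fp32 FC/matmul ops default to FC_TF32 on XRE5 builds (FC_INT16 otherwise), but every environment variable in the table still takes precedence and is checked in the listed order each time FCCalcType() is called. Below is a minimal sketch of overriding the default from a Python script, assuming an XPU build of Paddle; the device setup, shapes, and the choice of XPU_PADDLE_FC_INT16 are illustrative, not part of this commit:

import os

import paddle

# FCCalcType<float>() reads these variables via std::getenv at dispatch time,
# so they must be set before the op runs; assigning through os.environ also
# updates the C-level environment of the current process.
os.environ["XPU_PADDLE_FC_INT16"] = "1"  # pin fp32 FC back to int16 quantization

paddle.set_device("xpu")
x = paddle.randn([64, 128], dtype="float32")
y = paddle.randn([128, 32], dtype="float32")
out = paddle.matmul(x, y)  # resolved to FC_INT16 instead of the FC_TF32 default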

test/dygraph_to_static/test_save_inference_model.py

Lines changed: 8 additions & 1 deletion
@@ -84,6 +84,11 @@ def forward(self, x):
 class TestDyToStaticSaveInferenceModel(Dy2StTestBase):
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
+        self.atol = 0
+        self.rtol = 1e-5
+        if paddle.is_compiled_with_xpu():
+            self.atol = 1e-4
+            self.rtol = 1e-4
 
     def tearDown(self):
         self.temp_dir.cleanup()
@@ -205,7 +210,9 @@ def check_save_inference_model(
             infer_model_dir, model_filename, params_filename, inputs
         )
 
-        np.testing.assert_allclose(gt_out, infer_out, rtol=1e-05)
+        np.testing.assert_allclose(
+            gt_out, infer_out, atol=self.atol, rtol=self.rtol
+        )
 
     def load_and_run_inference(
         self, model_path, model_filename, params_filename, inputs

test/ir/inference/test_xpu_matmul_weight_trans_pass.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ def sample_predictor_configs(self, program_config):
         config = self.create_inference_config(use_xpu=True)
         yield config, [
             "matmul_v2",
-        ], (1e-3, 1e-3)
+        ], (5e-3, 5e-3)
 
     def sample_program_config(self, draw):
         # 1. Generate shape and attr of matmul

test/ir/pir/fused_pass/xpu/test_conv2d_add_fuse_xpu_pass.py

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@ def sample_program(self):
         yield pir_program, False
 
     def test_check_output(self):
-        self.check_pass_correct(atol=1e-3, rtol=1e-3)
+        self.check_pass_correct(atol=2e-3, rtol=2e-3)
 
     def setUp(self):
         if core.is_compiled_with_xpu():

test/legacy_test/test_executor_and_mul.py

Lines changed: 7 additions & 4 deletions
@@ -45,10 +45,13 @@ def test_mul(self):
         )
 
         self.assertEqual((100, 100), res.shape)
-        np.testing.assert_allclose(res, np.dot(a_np, b_np), rtol=1e-05)
-        np.testing.assert_allclose(res_array[0], a_np, rtol=1e-05)
-        np.testing.assert_allclose(res_array[1], b_np, rtol=1e-05)
-        np.testing.assert_allclose(res_array[2], res, rtol=1e-05)
+        rtol = 1e-5
+        if paddle.is_compiled_with_xpu():
+            rtol = 1e-4
+        np.testing.assert_allclose(res, np.dot(a_np, b_np), rtol=rtol)
+        np.testing.assert_allclose(res_array[0], a_np, rtol=rtol)
+        np.testing.assert_allclose(res_array[1], b_np, rtol=rtol)
+        np.testing.assert_allclose(res_array[2], res, rtol=rtol)
 
 
 if __name__ == '__main__':

test/prim/process/test_prim_amp.py

Lines changed: 9 additions & 4 deletions
@@ -47,6 +47,11 @@ def setUp(self):
         paddle.seed(2022)
         self.x = paddle.randn([4, 2, 6, 6], dtype="float32")
         self.x.stop_gradient = False
+        self.atol = 1e-3
+        self.rtol = 1e-3
+        if paddle.is_compiled_with_xpu():
+            self.atol = 5e-3
+            self.rtol = 5e-3
 
     def train(self, use_prim):
         core._set_prim_all_enabled(use_prim)
@@ -75,8 +80,8 @@ def test_amp_01(self):
         np.testing.assert_allclose(
             expected,
             actual,
-            rtol=1e-3,
-            atol=1e-3,
+            rtol=self.rtol,
+            atol=self.atol,
         )
 
     def test_amp_O1_infer(self):
@@ -101,8 +106,8 @@ def test_amp_O1_infer(self):
         np.testing.assert_allclose(
             res,
             res_amp,
-            rtol=1e-3,
-            atol=1e-3,
+            rtol=self.rtol,
+            atol=self.atol,
         )
 
test/xpu/get_test_cover_info.py

Lines changed: 15 additions & 0 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import fcntl
 import inspect
 import os
@@ -362,6 +363,20 @@ def wrapper(cls):
     return wrapper
 
 
+@contextlib.contextmanager
+def xpu_matmul_quant_type_guard(dtype):
+    # only fp32 is supported now
+    assert dtype == "float"
+    env_name = "XPU_PADDLE_FC_FLOAT"
+    origin_env = os.getenv(env_name)
+    os.environ[env_name] = "1"
+    yield
+    if origin_env is not None:
+        os.environ[env_name] = origin_env
+    else:
+        del os.environ[env_name]
+
+
 def get_test_cover_info():
     xpu_version = core.get_xpu_device_version(0)
     version_str = get_version_str(xpu_version)
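
The new xpu_matmul_quant_type_guard temporarily exports XPU_PADDLE_FC_FLOAT=1 and restores (or removes) the variable on exit, so a single test can force float accumulation for fp32 matmuls without leaking the setting into other tests. A minimal usage sketch, assuming it runs from a test module under test/xpu; the matmul check itself is illustrative, not taken from this commit:

import numpy as np
import paddle

from get_test_cover_info import xpu_matmul_quant_type_guard


def check_fp32_matmul_with_float_quant():
    # Inside the guard XPU_PADDLE_FC_FLOAT=1, so FCCalcType<float>() resolves
    # to FC_FLOAT rather than the TF32/INT16 default.
    with xpu_matmul_quant_type_guard("float"):
        x = paddle.rand([8, 16], dtype="float32")
        y = paddle.rand([16, 4], dtype="float32")
        out = paddle.matmul(x, y)
        np.testing.assert_allclose(
            out.numpy(), np.dot(x.numpy(), y.numpy()), rtol=1e-5
        )
    # On exit the previous value of XPU_PADDLE_FC_FLOAT is restored.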

test/xpu/test_bmm_op_xpu.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ def set_xpu(self):
             self.__class__.op_type = self.in_type
 
         def test_check_output(self):
-            self.check_output_with_place(self.place)
+            self.check_output_with_place(self.place, atol=5e-3, rtol=1e-3)
 
         def test_check_grad_normal(self):
             self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')

test/xpu/test_conv2d_op_xpu.py

Lines changed: 4 additions & 2 deletions
@@ -256,7 +256,7 @@ def has_cuda(self):
         def test_check_output(self):
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
-                self.check_output_with_place(self.place)
+                self.check_output_with_place(self.place, atol=0.005, rtol=0.005)
 
         def test_check_grad(self):
             if hasattr(self, "no_need_check_grad") and self.no_need_check_grad:
@@ -418,7 +418,9 @@ def test_check_output(self):
             # TODO(wangzhongpu): support onednn op in dygraph mode
             if core.is_compiled_with_xpu():
                 paddle.enable_static()
-                self.check_output_with_place(place=self.place)
+                self.check_output_with_place(
+                    place=self.place, atol=0.005, rtol=0.005
+                )
 
         def test_check_grad(self):
             # TODO(wangzhongpu): support onednn op in dygraph mode

test/xpu/test_conv3d_op_xpu.py

Lines changed: 6 additions & 6 deletions
@@ -225,8 +225,8 @@ def setUp(self):
             }
 
             np.random.seed(100)
-            input = np.random.random(self.input_size).astype(self.dtype)
-            filter = np.random.random(self.filter_size).astype(self.dtype)
+            input = np.random.random(self.input_size).astype(self.dtype) - 0.5
+            filter = np.random.random(self.filter_size).astype(self.dtype) - 0.5
             output = conv3d_forward_naive(
                 input,
                 filter,
@@ -251,7 +251,7 @@ def setUp(self):
 
         def test_check_output(self):
             place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+            self.check_output_with_place(place, atol=0.005, rtol=0.005)
 
         def test_check_grad(self):
             place = paddle.XPUPlace(0)
@@ -397,8 +397,8 @@ def setUp(self):
             }
 
             np.random.seed(100)
-            input = np.random.random(self.input_size).astype(self.dtype)
-            filter = np.random.random(self.filter_size).astype(self.dtype)
+            input = np.random.random(self.input_size).astype(self.dtype) - 0.5
+            filter = np.random.random(self.filter_size).astype(self.dtype) - 0.5
             output = conv3d_forward_naive(
                 input,
                 filter,
@@ -426,7 +426,7 @@ def setUp(self):
 
         def test_check_output(self):
             place = paddle.XPUPlace(0)
-            self.check_output_with_place(place)
+            self.check_output_with_place(place, atol=0.005, rtol=0.005)
 
         def test_check_grad(self):
             place = paddle.XPUPlace(0)
