Commit 103b30e

Merge branch 'develop' into is_same_shape

2 parents d16e02d + 4e3d222
2,032 files changed: +72,149 −34,416 lines

.gitignore

Lines changed: 8 additions & 2 deletions
@@ -5,7 +5,7 @@ paddle/fluid/API_PR.spec
 paddle/fluid/eager/api/generated/*
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
-paddle/fluid/operators/ops_extra_info.h
+paddle/fluid/operators/ops_extra_info.cc
 paddle/phi/api/backward/backward_api.h
 paddle/phi/api/backward/sparse_bw_api.h
 paddle/phi/api/include/api.h
@@ -38,6 +38,7 @@ build_doc/
 CMakeSettings.json
 Makefile
 .test_env/
+.cache/
 third_party/

 *~
@@ -65,9 +66,14 @@ paddle/infrt/dialect/pd/common/pd_ops_info.h
 paddle/infrt/tests/dialect/Output
 paddle/infrt/tests/lit.cfg.py
 paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
-paddle/fluid/pybind/eager_final_state_op_function.cc
+paddle/fluid/pybind/eager_op_function.cc

 # these files (directories) are generated before build system generation
 paddle/fluid/operators/generated_op.cc
 paddle/phi/ops/compat/generated_sig.cc
 paddle/phi/api/yaml/parsed_apis/
+python/paddle/utils/code_gen/
+paddle/fluid/pybind/tmp_eager_op_function_impl.h
+paddle/fluid/pybind/eager_op_function_impl.h
+paddle/fluid/pybind/eager_op_function_impl.h
+paddle/fluid/pybind/op_function_impl.h

.pre-commit-config.yaml

Lines changed: 7 additions & 0 deletions
@@ -64,6 +64,13 @@ repos:
             (?x)^(
                 paddle/utils/.*
             )$
+-   repo: local
+    hooks:
+    -   id: auto-generate-cmakelists
+        name: auto-generate-cmakelists
+        entry: bash ./tools/gen_ut_cmakelists.hook
+        language: system
+        files: testslist.csv$
 -   repo: https://github.com/cheshirekow/cmake-format-precommit
     rev: v0.6.13
     hooks:
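The new local hook runs tools/gen_ut_cmakelists.hook (via bash) whenever a staged testslist.csv changes; pre-commit passes the matched file paths as arguments to the entry command. The generator itself is not shown in this diff, so the Python sketch below is only a hypothetical illustration of the pattern such a hook implements — mapping each CSV row to a test target in a sibling CMakeLists.txt. The column names (name, timeout) and the emitted py_test()/set_tests_properties() commands are assumptions for the sketch, not Paddle's actual schema.

# Hypothetical sketch (not the actual tools/gen_ut_cmakelists.hook):
# regenerate a CMakeLists.txt next to each testslist.csv passed in argv,
# emitting one test target per CSV row.
import csv
import sys
from pathlib import Path

def generate_cmakelists(csv_path: str) -> None:
    with open(csv_path, newline="") as f:
        rows = list(csv.DictReader(f))
    out = Path(csv_path).with_name("CMakeLists.txt")
    with open(out, "w") as f:
        for row in rows:
            # One test target per row; column layout is assumed.
            f.write(f"py_test({row['name']} SRCS {row['name']}.py)\n")
            f.write(f"set_tests_properties({row['name']} "
                    f"PROPERTIES TIMEOUT {row.get('timeout', 120)})\n")

if __name__ == "__main__":
    for path in sys.argv[1:]:  # pre-commit passes the matched files
        generate_cmakelists(path)

Because files: testslist.csv$ scopes the hook, only directories whose test list actually changed get their CMakeLists.txt regenerated.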

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -364,6 +364,18 @@ if(WIN32)
   endif()
 endif()

+if(NOT WITH_TESTING AND WITH_MULTINODE_TESTING)
+  message(
+    WARNING
+      "Disable WITH_MULTINODE_TESTING when compiling without TESTING. Force WITH_MULTINODE_TESTING=OFF."
+  )
+  set(WITH_MULTINODE_TESTING
+      OFF
+      CACHE STRING
+            "Disable WITH_MULTINODE_TESTING when compiling without TESTING"
+            FORCE)
+endif()
+
 if(NOT WITH_GPU AND WITH_NCCL)
   message(
     WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.")

README.md

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ Now our developers can acquire Tesla V100 online computing resources for free. I

 - **High-Performance Inference Engines for Comprehensive Deployment Environments**

-    PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.
+    PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/master/guides/introduction/index_intro.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.


 - **Industry-Oriented Models and Libraries with Open Source Repositories**

README_cn.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型
4949

5050
- **支持多端多平台的高性能推理部署工具**
5151

52-
飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。
52+
飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://www.paddlepaddle.org.cn/inference/product_introduction/inference_intro.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。
5353

5454

5555
- **面向产业应用,开源开放覆盖多领域的工业级模型库。**

cmake/external/brpc.cmake

Lines changed: 4 additions & 5 deletions
@@ -45,9 +45,8 @@ set(prefix_path
 ExternalProject_Add(
   extern_brpc
   ${EXTERNAL_PROJECT_LOG_ARGS}
-  # TODO(gongwb): change to de newst repo when they changed
-  GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
-  GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e"
+  GIT_REPOSITORY "https://github.com/apache/incubator-brpc"
+  GIT_TAG 1.2.0
   PREFIX ${BRPC_PREFIX_DIR}
   UPDATE_COMMAND ""
   CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -60,8 +59,8 @@ ExternalProject_Add(
              -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
              -DCMAKE_PREFIX_PATH=${prefix_path}
              -DWITH_GLOG=ON
-             -DIOBUF_WITH_HUGE_BLOCK=ON
-             -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
+             -DBUILD_BRPC_TOOLS=ON
+             -DBUILD_SHARED_LIBS=ON
              ${EXTERNAL_OPTIONAL_ARGS}
   LIST_SEPARATOR |
   CMAKE_CACHE_ARGS

cmake/external/xpu.cmake

Lines changed: 4 additions & 4 deletions
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220810")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220831")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   set(XPU_XDNN_BASE_URL_WITHOUT_DATE
       "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220810")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220831")
 else()
   set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
@@ -52,7 +52,7 @@ elseif(WITH_BDCENTOS)
 elseif(WITH_UBUNTU)
   set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
   set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
-  set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64")
   # ubuntu and centos: use output by XDNN API team
   set(XPU_XDNN_URL
       "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"
@@ -68,7 +68,7 @@ elseif(WITH_CENTOS)
 else()
   set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
   set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
-  set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64")
   # default: use output by XDNN API team
   set(XPU_XDNN_URL
       "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz"

paddle/fluid/distributed/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,4 @@
+add_subdirectory(auto_parallel)
 add_subdirectory(collective)
 add_subdirectory(store)
 if(WITH_PYTHON)
@@ -47,4 +48,3 @@ add_subdirectory(ps)
 add_subdirectory(test)
 add_subdirectory(index_dataset)
 add_subdirectory(fleet_executor)
-add_subdirectory(auto_parallel)
paddle/fluid/distributed/auto_parallel/CMakeLists.txt

Lines changed: 7 additions & 21 deletions

@@ -1,37 +1,23 @@
+proto_library(auto_parallel_proto SRCS auto_parallel.proto)
+
 cc_library(
   device_mesh
   SRCS device_mesh.cc
-  DEPS auto_parallel_proto)
-cc_test(
-  device_mesh_test
-  SRCS device_mesh_test.cc
-  DEPS device_mesh)
+  DEPS auto_parallel_proto phi_enforce)

 cc_library(
   process_mesh
   SRCS process_mesh.cc
-  DEPS auto_parallel_proto)
-cc_test(
-  process_mesh_test
-  SRCS process_mesh_test.cc
-  DEPS process_mesh)
+  DEPS auto_parallel_proto phi_enforce)

 cc_library(
   dist_attr
   SRCS dist_attr.cc
-  DEPS process_mesh auto_parallel_proto proto_desc)
-cc_test(
-  dist_attr_test
-  SRCS dist_attr_test.cc
-  DEPS dist_attr)
+  DEPS process_mesh auto_parallel_proto proto_desc phi_enforce)

 cc_library(
   dist_mapper
   SRCS dist_mapper.cc
-  DEPS device_mesh auto_parallel_proto)
-cc_test(
-  dist_mapper_test
-  SRCS dist_mapper_test.cc
-  DEPS dist_mapper)
+  DEPS device_mesh auto_parallel_proto phi_enforce)

-proto_library(auto_parallel_proto SRCS auto_parallel.proto)
+cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper)

paddle/fluid/distributed/auto_parallel/auto_parallel.proto

Lines changed: 53 additions & 0 deletions
@@ -30,6 +30,59 @@ message ProcessMeshProto {

 }

+// This distributed attribute describes how to distribute the corresponding tensor,
+// and stores any other information needed by auto parallel.
+message TensorDistAttrProto {
+  // The process mesh where a tensor is distributed.
+  optional ProcessMeshProto process_mesh = 1;
+
+  // The length of dims_mapping is the same as the length of the tensor shape.
+  // The i-th dimension of the tensor will be sharded by the dims_mapping[i]-th dimension
+  // of the above process mesh. If dims_mapping[i] is -1, the i-th dimension of the tensor
+  // will not be sharded. For example, given a tensor shape [2, 6, 12], a process mesh
+  // shape [2, 3] and a dims_mapping [-1, 1, 0], each sharded tensor will have a shape [2, 2, 6].
+  repeated int64 dims_mapping = 2;
+
+  // The batch dimension of the corresponding tensor.
+  optional int64 batch_dim = 3;
+
+  // If dynamic_dims[i] is True, the i-th dimension of the corresponding tensor
+  // changes dynamically. Otherwise, the i-th dimension of the tensor is statically determined.
+  repeated bool dynamic_dims = 4;
+}
+
+// This distributed attribute describes how to distribute the corresponding operator,
+// and stores any other information needed by auto parallel.
+message OperatorDistAttrProto {
+  message TensorDistAttrMappingEntryProto {
+    optional string name = 1;
+    optional TensorDistAttrProto tensor_dist_attr = 2;
+  }
+  // The key of this map is the input tensor name and the value is the distributed attribute
+  // of the input tensor required by this corresponding operator.
+  // The distributed attribute of the actual tensor may not be the same as that within
+  // the distributed attribute of the operator.
+  repeated TensorDistAttrMappingEntryProto input_dist_attrs = 1;
+
+  // The key of this map is the output tensor name and the value is the distributed attribute
+  // of the output tensor required by this corresponding operator.
+  // The distributed attribute of the actual tensor may not be the same as that within
+  // the distributed attribute of the operator.
+  repeated TensorDistAttrMappingEntryProto output_dist_attrs = 2;
+
+  // The process mesh where an op is distributed.
+  optional ProcessMeshProto process_mesh = 3;
+
+  // An operator ideally has one distributed operator, which may have multiple distributed implementations.
+  // This field is usually the same as the operator type. However, some operators such as the element-wise
+  // operators may share the same distributed operator; this field covers that scenario.
+  optional string impl_type = 4;
+
+  // This field tells which distributed implementation of this corresponding operator
+  // will be selected for the actual computation.
+  optional int64 impl_idx = 5;
+}
+
 // This proto describes the capability of one device such as the computation and memory.
 message DeviceCapabilityProto {
   optional double single_precision_flops = 1;
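To make the dims_mapping semantics above concrete, here is a short Python sketch. It is not part of the commit and not a Paddle API (compute_local_shape is a made-up name); it derives the per-process shard shape from the global tensor shape, the process-mesh shape, and dims_mapping, and checks the worked example from the proto comment.

# Illustrative sketch (not Paddle API): compute the local shard shape of a
# tensor following the semantics documented in TensorDistAttrProto.
def compute_local_shape(global_shape, mesh_shape, dims_mapping):
    assert len(dims_mapping) == len(global_shape)
    local_shape = []
    for size, mesh_dim in zip(global_shape, dims_mapping):
        if mesh_dim == -1:
            local_shape.append(size)       # dimension is not sharded
        else:
            n = mesh_shape[mesh_dim]       # dimension is sharded n ways
            assert size % n == 0, "uneven sharding not handled in this sketch"
            local_shape.append(size // n)
    return local_shape

# The worked example from the proto comment: a [2, 6, 12] tensor on a
# [2, 3] process mesh with dims_mapping [-1, 1, 0] yields [2, 2, 6] shards.
assert compute_local_shape([2, 6, 12], [2, 3], [-1, 1, 0]) == [2, 2, 6]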
