
Commit bff34fb

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into tmp_typing_all
2 parents cb781c5 + 94846a0 commit bff34fb

213 files changed (+4305, -6718 lines)


cmake/cinn.cmake

Lines changed: 0 additions & 6 deletions
@@ -177,12 +177,6 @@ target_link_libraries(cinnapi op_dialect pir phi)
 add_dependencies(cinnapi op_dialect pir phi)
 
 add_dependencies(cinnapi python)
-if(LINUX)
-  target_link_libraries(cinnapi "-Wl,--unresolved-symbols=ignore-all")
-elseif(APPLE)
-  target_link_libraries(cinnapi "-Wl,-undefined,dynamic_lookup")
-endif()
-
 if(WITH_MKL)
   target_link_libraries(cinnapi cinn_mklml)
   add_dependencies(cinnapi cinn_mklml)

cmake/cinn/core.cmake

Lines changed: 4 additions & 1 deletion
@@ -19,7 +19,10 @@ function(cinn_cc_library TARGET_NAME)
   endif()
 
   if(cinn_cc_library_DEPS)
-    # Don't need link libwarpctc.so
+    if("${cinn_cc_library_DEPS};" MATCHES "python;")
+      list(REMOVE_ITEM cinn_cc_library_DEPS python)
+      add_dependencies(${TARGET_NAME} python)
+    endif()
     target_link_libraries(${TARGET_NAME} ${cinn_cc_library_DEPS})
     add_dependencies(${TARGET_NAME} ${cinn_cc_library_DEPS})
   endif()
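
With this hunk, a library that lists `python` in its DEPS no longer links against the Python target; it only gains a build-order dependency on it. A minimal sketch of a caller, using a hypothetical target and source name:

# Hypothetical caller of cinn_cc_library(): the new branch strips `python`
# from the link list and re-adds it via add_dependencies(), so the target
# builds after `python` but does not link it.
cinn_cc_library(my_cinn_helper SRCS my_cinn_helper.cc DEPS python phi)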

cmake/external/flashattn.cmake

Lines changed: 5 additions & 1 deletion
@@ -103,6 +103,10 @@ else()
       CACHE FILEPATH "flash-attn Library" FORCE)
 endif()
 
+if(NOT DEFINED FA_JOB_POOLS_COMPILE)
+  set(FA_JOB_POOLS_COMPILE 4)
+endif()
+
 if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"
    OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"
    OR WIN32)
@@ -172,7 +176,7 @@ else()
       -DCMAKE_POSITION_INDEPENDENT_CODE=ON
       -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
       -DCMAKE_JOB_POOL_COMPILE:STRING=compile
-      -DCMAKE_JOB_POOLS:STRING=compile=4
+      -DCMAKE_JOB_POOLS:STRING=compile=${FA_JOB_POOLS_COMPILE}
       -DNVCC_ARCH_BIN=${FA_NVCC_ARCH_BIN}
       ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS
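
The flash-attn compile job pool is now configurable: it still defaults to 4, but the if(NOT DEFINED ...) guard lets a value set earlier win, for example -DFA_JOB_POOLS_COMPILE=8 on the CMake command line. A sketch of an equivalent in-script override, assuming the machine has memory for eight parallel nvcc jobs:

# Pre-seed the cache before the external project is configured; the guard
# above then skips the default of 4.
set(FA_JOB_POOLS_COMPILE 8 CACHE STRING "flash-attn parallel compile jobs")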

cmake/external/xpu.cmake

Lines changed: 17 additions & 10 deletions
@@ -30,9 +30,9 @@ if(NOT DEFINED XPU_XRE_BASE_VERSION)
   set(XPU_XRE_BASE_VERSION "4.32.0.1")
 endif()
 if(NOT DEFINED XPU_XHPC_BASE_DATE)
-  set(XPU_XHPC_BASE_DATE "20240626")
+  set(XPU_XHPC_BASE_DATE "20240704")
 endif()
-set(XPU_XCCL_BASE_VERSION "1.2.1.2")
+set(XPU_XCCL_BASE_VERSION "1.2.3")
 if(NOT DEFINED XPU_XFT_BASE_VERSION)
   set(XPU_XFT_BASE_VERSION "20230602")
 endif()
@@ -43,7 +43,7 @@ set(XPU_XRE_BASE_URL
 )
 
 set(XPU_XCCL_BASE_URL
-    "https://klx-sdk-release-public.su.bcebos.com/xccl/release/${XPU_XCCL_BASE_VERSION}"
+    "https://klx-sdk-release-public.su.bcebos.com/xccl/release/${XPU_XCCL_BASE_VERSION}.1"
 )
 
 if(NOT XPU_XFT_BASE_URL)
@@ -60,21 +60,28 @@ if(WITH_XPTI)
 endif()
 
 if(WITH_XPU_XRE5)
-  set(XPU_XRE_BASE_VERSION "5.0.3.1")
+  set(XPU_XRE_BASE_VERSION "5.0.11.1")
   set(XPU_XRE_BASE_URL
       "https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/${XPU_XRE_BASE_VERSION}"
   )
+  set(XPU_XCCL_BASE_URL
+      "https://klx-sdk-release-public.su.bcebos.com/xccl/release/${XPU_XCCL_BASE_VERSION}.2"
+  )
 endif()
 
 if(WITH_XCCL_RDMA)
-  set(XPU_XCCL_PREFIX "xccl_rdma")
+  set(XPU_XCCL_PREFIX "xccl_rdma-")
 else()
-  set(XPU_XCCL_PREFIX "xccl_socket")
+  set(XPU_XCCL_PREFIX "xccl_socket-")
+  # NOTE(lijin23): socket has not been supported for XPU3, so the xccl output name was changed.
+  if(WITH_XPU_XRE5)
+    set(XPU_XCCL_PREFIX "xccl_")
+  endif()
 endif()
 
 if(WITH_AARCH64)
   set(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
-  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-kylin_aarch64")
+  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}kylin_aarch64")
   set(XPU_XFT_DIR_NAME "") # TODO: xft has no kylin output at now.
 elseif(WITH_SUNWAY)
   set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64")
@@ -87,11 +94,11 @@ elseif(WITH_BDCENTOS)
   else()
     set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
   endif()
-  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}bdcentos_x86_64")
   set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82")
 elseif(WITH_CENTOS)
   set(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
-  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64")
+  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}bdcentos_x86_64")
   set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82")
 else()
   # Ubuntu as default
@@ -102,7 +109,7 @@ else()
     set(XPU_XRE_DIR_NAME "xre-ubuntu_1604_x86_64")
     set(XPU_XHPC_DIR_NAME "xhpc-ubuntu1604_x86_64")
   endif()
-  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64")
+  set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}ubuntu_x86_64")
   set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64")
 endif()
 
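
The separator now lives in the XCCL prefix itself rather than at each concatenation site, which lets the XRE5 branch drop it entirely. Illustratively, on a bdcentos_x86_64 host the branches above resolve roughly as follows (assumed values, mirroring the code):

# WITH_XCCL_RDMA         -> xccl_rdma-bdcentos_x86_64
# default (socket)       -> xccl_socket-bdcentos_x86_64
# WITH_XPU_XRE5          -> xccl_bdcentos_x86_64
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}bdcentos_x86_64")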

cmake/generic.cmake

Lines changed: 5 additions & 0 deletions
@@ -709,6 +709,11 @@ function(nv_test TARGET_NAME)
     get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}
                           ${os_dependency_modules} paddle_gtest_main phi)
+    if(WIN32)
+      target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
+    else()
+      target_link_libraries(${TARGET_NAME} python)
+    endif()
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
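
Every nv_test target now links Python as well: ${PYTHON_LIBRARIES} on Windows, the in-tree python target elsewhere. A hypothetical test declaration therefore needs no Python entry in its own DEPS:

# Hypothetical test target; Python linkage is supplied by nv_test() itself.
nv_test(my_kernel_test SRCS my_kernel_test.cu DEPS phi)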

paddle/cinn/ast_gen_ius/tensor_group.cc

Lines changed: 0 additions & 22 deletions
@@ -215,27 +215,5 @@ absl::flat_hash_map<std::string, ir::Tensor> TensorGroup::AllocateBuffers() {
   return name_to_tensor_;
 }
 
-void StageMapShareMemory(const poly::StageMap& stages) {
-  absl::flat_hash_map<std::string, ir::_Tensor_*> tensor_map;
-  for (auto& stage : stages) {
-    tensor_map[stage.second->tensor()->name] = stage.second->tensor();
-  }
-  for (auto& stage : stages) {
-    if (!stage.second->tensor()->buffer.defined() &&
-        !stage.second->meta.tensors_to_share_buffer_with.empty()) {
-      for (auto& str : stage.second->meta.tensors_to_share_buffer_with) {
-        if (tensor_map[str]->buffer.defined()) {
-          auto edited_shape = tensor_map[str]->buffer->shape;
-          stage.second->tensor()->Bind(tensor_map[str]->buffer);
-          tensor_map[str]->buffer->shape = edited_shape;
-          VLOG(3) << "Stage Tensor " << stage.second->tensor()->name
-                  << " bind buffer to " << tensor_map[str]->name << " , "
-                  << tensor_map[str]->buffer->name;
-        }
-      }
-    }
-  }
-}
-
 }  // namespace ast_gen_ius
 }  // namespace cinn

paddle/cinn/backends/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ gather_srcs(
   codegen_c.cc
   codegen_c_x86.cc
   codegen_cuda_host.cc
+  codegen_invoke_module.cc
   extern_func_emitter.cc
   extern_func_emitter_builtin.cc
   function_prototype.cc

paddle/cinn/backends/codegen_cuda_host.cc

Lines changed: 0 additions & 66 deletions
@@ -194,72 +194,6 @@ llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher(
   return function;
 }
 
-llvm::Value* CodeGenCUDA_Host::LowerHostFunc(const ir::_LoweredFunc_* func) {
-  // Create the function
-  // @{
-  auto* function_type = GenFunctionTypeFromCinnFunction(func, true);
-  f_ = llvm::Function::Create(
-      function_type, llvm::Function::ExternalLinkage, func->name, m_);
-  f_->setCallingConv(llvm::CallingConv::C);
-  f_->setHasUWTable();
-
-  std::vector<llvm::Value*> ll_function_args;
-  std::transform(f_->arg_begin(),
-                 f_->arg_end(),
-                 std::back_inserter(ll_function_args),
-                 [](auto& arg) { return std::addressof(arg); });
-  // @}
-
-  // Set local scope table
-  PADDLE_ENFORCE_EQ(ll_function_args.size(),
-                    func->args.size(),
-                    phi::errors::InvalidArgument(
-                        "The number of arguments is not equal to the number of "
-                        "function arguments"));
-  for (int i = 0; i < ll_function_args.size(); ++i) {
-    SetVar(func->args[i].name(), ll_function_args[i]);
-  }
-  llvm::BasicBlock* entry = llvm::BasicBlock::Create(
-      /*Context=*/b_->getContext(),
-      /*Name=*/"entry",
-      /*Parent=*/f_,
-      /*InsertBefore=*/nullptr);
-  b_->SetInsertPoint(entry);
-  CodeGenLLVM::Visit(&func->body);
-
-  // Reset local scope table
-  for (const ir::Argument& func_arg : func->args) {
-    symbol_table_->Erase(func_arg.name());
-  }
-  RetVoid();
-
-  return f_;
-}
-
-llvm::Value* CodeGenCUDA_Host::LowerParseArgsValueCall(
-    const ir::Call* call_ir) {
-  auto ret_type = CinnTypeToLLVMType(Int(64), m_);
-  std::vector<llvm::Type*> args_type;
-  PADDLE_ENFORCE_EQ(
-      call_ir->read_args.size(),
-      2,
-      phi::errors::InvalidArgument(
-          "The number of arguments of ParseArgsValue should be 2"));
-  CHECK(call_ir->read_args[0].is_var() &&
-        call_ir->read_args[0].as_var()->type().is_cpp_handle());
-  CHECK(call_ir->read_args[1].type().is_int(32));
-  args_type.push_back(CinnTypeToLLVMType(type_of<void*>(), m_));
-  args_type.push_back(CinnTypeToLLVMType(type_of<int32_t>(), m_));
-
-  auto func_type = llvm::FunctionType::get(ret_type, args_type, false);
-  auto call_func = m_->getOrInsertFunction(call_ir->name, func_type);
-
-  std::vector<llvm::Value*> call_args;
-  call_args.push_back(std::addressof(*f_->arg_begin()));
-  call_args.push_back(b_->getInt32(call_ir->read_args[1].as_int32()));
-  return b_->CreateCall(call_func, call_args);
-}
-
 llvm::Value* CodeGenCUDA_Host::LowerCUDAKernelCall(const ir::Call* call_ir) {
   std::vector<llvm::Value*> ll_function_args;
   std::transform(f_->arg_begin(),

paddle/cinn/backends/codegen_cuda_host.h

Lines changed: 9 additions & 20 deletions
@@ -13,15 +13,9 @@
 // limitations under the License.
 
 #pragma once
-
-#include <absl/container/flat_hash_map.h>
-
 #include <memory>
-#include <string>
-#include <tuple>
-#include <vector>
 
-#include "paddle/cinn/backends/llvm/codegen_llvm.h"
+#include "paddle/cinn/backends/codegen_invoke_module.h"
 #include "paddle/cinn/runtime/intrinsic.h"
 
 PD_DECLARE_bool(cinn_bucket_compile);
@@ -30,30 +24,29 @@ namespace cinn {
 namespace backends {
 
 /**
- * CodeGenCUDA takes a CINN Module with host functions and output a LLVM module.
+ * CodeGenCUDA_Host takes a CINN Module with CUDA host functions and output a
+ * LLVM module.
  */
-class CodeGenCUDA_Host : public CodeGenLLVM {
+class CodeGenCUDA_Host : public CodeGenHost {
  public:
   explicit CodeGenCUDA_Host(llvm::Module *m,
                             llvm::IRBuilder<> *b,
                             const std::shared_ptr<SymbolTable> &vars = nullptr)
-      : CodeGenLLVM(m, b, vars) {}
+      : CodeGenHost(m, b, vars) {}
 
-  using CodeGenLLVM::Visit;
+  // TODO(Hongqing-work): remove this after we clear some old codes.
   llvm::Value *Visit(const ir::_LoweredFunc_ *func) override {
     if (FLAGS_cinn_bucket_compile) {
-      return LowerHostFunc(func);
+      return CodeGenHost::Visit(func);
    }
     return LowerGPUKernelLauncher(func);
   }
 
   llvm::Value *Visit(const ir::Call *op) override {
-    if (op->name == runtime::intrinsic::get_value_in_cuda_kernel_args) {
-      return LowerParseArgsValueCall(op);
-    } else if (op->name == runtime::intrinsic::call_cuda_kernel) {
+    if (op->name == runtime::intrinsic::call_cuda_kernel) {
       return LowerCUDAKernelCall(op);
     } else {
-      return CodeGenLLVM::Visit(op);
+      return CodeGenHost::Visit(op);
     }
   }
 
@@ -73,10 +66,6 @@ class CodeGenCUDA_Host : public CodeGenLLVM {
   */
   llvm::Value *LowerGPUKernelLauncher(const ir::_LoweredFunc_ *func);
 
-  llvm::Value *LowerHostFunc(const ir::_LoweredFunc_ *func);
-
-  llvm::Value *LowerParseArgsValueCall(const ir::Call *call_ir);
-
   llvm::Value *LowerCUDAKernelCall(const ir::Call *op);
 };
 
