-
Notifications
You must be signed in to change notification settings - Fork 27
Open
Description
When I try the samples in the llama2 examples, pos_cli reports an error:
root@iZt4n09kz1g7mi4j4b1vckZ:~# pos_cli --start --target daemon
POS Log >>>>>>>>>> PhOS Workspace <<<<<<<<<<
_____ _ _ ____ _____
| __ \| | (_) / __ \ / ____|
| |__) | |__ ___ ___ _ __ ___ _| | | | (___
| ___/| '_ \ / _ \ / _ \ '_ \| \ \/ / | | |\___ \
| | | | | | (_) | __/ | | | |> <| |__| |____) |
|_| |_| |_|\___/ \___|_| |_|_/_/\_\\____/|_____/
POS Log PhoenixOS workspace created, welcome!
+00:00:00.286012 INFO: waiting for RPC requests...
Cache Optimization: Enabled!
Async Optimization: Enabled!
Handler Optimization: Enabled!
xpu remote address: localhost
create shm buffer
Segmentation fault (core dumped)
POS Warn failed execution of command cricket-rpc-server 2>&1: exit_code(139)
POS Warn failed to start posd
To make debugging easier, I replaced the llama2 model with gpt2, and the Python process reports the same error as with llama2:
+00:00:11.613831 ERROR: image is not an ELF! in cpu-client-driver.c:466
When I use GDB to trace the call stack of the Python process, I find that PhOS fails with the following call stack:
root@iZt4n09kz1g7mi4j4b1vckZ:~# gdb cricket-rpc-server core-cricket-rpc-ser-635504-1733110269
......
# Some irrelevant information is omitted here
warning: Unexpected size of section `.reg-xstate/635511' in core file.
Using host libthread_db library "/usr/lib/x86_64-linux-gnu/libthread_db.so.1".
Core was generated by `cricket-rpc-server'.
Program terminated with signal SIGSEGV, Segmentation fault.
warning: Unexpected size of section `.reg-xstate/635511' in core file.
#0 0x00007f19407346f5 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
[Current thread is 1 (Thread 0x7f18b8fde000 (LWP 635511))]
(gdb) bt
#0 0x00007f19407346f5 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f19584a7e25 in xdr_string () from /usr/lib/x86_64-linux-gnu/libtirpc.so.3
#2 0x000055c329ff541f in xdr_str_result ()
#3 0x000055c32a042752 in dispatch(int, __rpc_xdr*, __rpc_xdr*) ()
#4 0x000055c32a044be8 in svc_run::{lambda(int)#1}::operator()(int) const ()
#5 0x000055c32a045898 in void std::__invoke_impl<void, svc_run::{lambda(int)#1}, int>(std::__invoke_other, svc_run::{lambda(int)#1}&&, int&&) ()
#6 0x000055c32a045816 in std::__invoke_result<svc_run::{lambda(int)#1}, int>::type std::__invoke<svc_run::{lambda(int)#1}, int>(std::__invoke_result&&, (svc_run::{lambda(int)#1}&&)...) ()
#7 0x000055c32a045785 in void std::thread::_Invoker<std::tuple<svc_run::{lambda(int)#1}, int> >::_M_invoke<0ul, 1ul>(std::_Index_tuple<0ul, 1ul>) ()
#8 0x000055c32a045740 in std::thread::_Invoker<std::tuple<svc_run::{lambda(int)#1}, int> >::operator()() ()
#9 0x000055c32a045724 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<svc_run::{lambda(int)#1}, int> > >::_M_run() ()
#10 0x00007f19408a9793 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#11 0x00007f195816d609 in start_thread () from /usr/lib/x86_64-linux-gnu/libpthread.so.0
#12 0x00007f19406cb133 in clone () from /usr/lib/x86_64-linux-gnu/libc.so.6
(gdb)
Moreover, I also checked the code at cpu-client-driver.c:466; the code responsible for this error is in cuModuleLoadData:
// Client-side replacement for the CUDA driver call cuModuleLoadData: validates
// `image`, extracts kernel parameter info from it locally, then forwards the
// image bytes to the server through rpc_cumoduleloaddata_1.
//
// Parameters:
//   module - out; if non-NULL, receives the pointer value returned by the RPC,
//            cast to CUmodule.
//   image  - in; must be non-NULL and start with the ELF magic bytes.
//
// Returns:
//   CUDA_ERROR_INVALID_IMAGE - image is NULL, does not start with the ELF
//                              magic, or elf2_parameter_info() fails on it.
//   CUDA_ERROR_UNKNOWN       - the RPC call itself did not return RPC_SUCCESS.
//   result.err               - otherwise, the error code carried in the RPC reply.
//
// NOTE(review): every early `return` below exits without calling
// cpu_time_end(), so the timing started by cpu_time_start() is never closed on
// error paths - confirm whether that is intended.
CUresult cuModuleLoadData(CUmodule* module, const void* image)
{
// Id passed to cpu_time_start/cpu_time_end; presumably the RPC procedure
// number of rpc_cumoduleloaddata - TODO confirm.
int proc = 1026;
cpu_time_start(totals, proc);
enum clnt_stat retval;
ptr_result result;
mem_data mem;
if (image == NULL) {
LOGE(LOG_ERROR, "image is NULL!");
return CUDA_ERROR_INVALID_IMAGE;
}
// Only images that begin with the ELF magic are accepted. This is the check
// that emits "image is not an ELF!".
// NOTE(review): per the CUDA driver API documentation, cuModuleLoadData also
// accepts PTX text and fatbinary images; any such image is rejected here.
Elf64_Ehdr *ehdr = (Elf64_Ehdr*)image;
if (ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
ehdr->e_ident[EI_MAG3] != ELFMAG3) {
LOGE(LOG_ERROR, "image is not an ELF!");
return CUDA_ERROR_INVALID_IMAGE;
}
// TODO: [POS] how many bytes should we copy?
// LOGE(LOG_WARNING,
// "!!! e_shoff: %u, end of sh: %u, "
// "e_phoff: %u, end of ph: %u\n",
// ehdr->e_shoff,
// ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize,
// ehdr->e_phoff + ehdr->e_phnum * ehdr->e_phentsize
// );
// mem.mem_data_len = ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize;
// The image size is taken to be the end of the program header table
// (e_phoff + e_phnum * e_phentsize). The caller supplies no length, so this
// is a guess from the ELF header alone (see the TODO above).
// NOTE(review): if any file content lies beyond the program header table,
// it is not included in the bytes sent - confirm against the cubin layout.
mem.mem_data_len = ehdr->e_phoff + ehdr->e_phnum * ehdr->e_phentsize;
// Drops the const qualifier from `image`; the buffer is passed by pointer,
// not copied, at this point.
mem.mem_data_val = (uint8_t*)image;
// NOTE(review): "%zx" expects a size_t argument; confirm the type of
// mem_data_len matches.
LOGE(LOG_DEBUG, "image_size = %#0zx", mem.mem_data_len);
// Parse kernel information out of the image on the client side before the
// image is sent to the server.
if (elf2_parameter_info(mem.mem_data_val, mem.mem_data_len) != 0) {
LOGE(LOG_ERROR, "could not get kernel infos from memory");
return CUDA_ERROR_INVALID_IMAGE;
}
retval = rpc_cumoduleloaddata_1(mem, &result, clnt);
// NOTE(review): `result` is read here before `retval` is checked, and it is
// declared above without an initializer; if the RPC failed without filling
// it in, this logs indeterminate values.
LOGE(LOG_DEBUG, "[rpc] %s(%p) = %d, result %p\n", __FUNCTION__, image, result.err, (void*)result.ptr_result_u.ptr);
if (retval != RPC_SUCCESS) {
fprintf(stderr, "[rpc] %s failed.", __FUNCTION__);
return CUDA_ERROR_UNKNOWN;
}
// The module handle given to the caller is the pointer value from the RPC reply.
if (module != NULL) {
*module = (CUmodule)result.ptr_result_u.ptr;
}
cpu_time_end(totals, proc);
return result.err;
}So can Phos support GPU lambdas and How can I fix this problem?
Metadata
Metadata
Assignees
Labels
No labels