Skip to content

Does PhOS support lambdas? #13

@182yzh

Description

@182yzh

When I try the samples in the llama2 examples, pos_cli reports an error:

root@iZt4n09kz1g7mi4j4b1vckZ:~# pos_cli --start --target daemon
 POS Log  >>>>>>>>>> PhOS Workspace <<<<<<<<<<
 _____  _                      _       ____   _____
|  __ \| |                    (_)     / __ \ / ____|
| |__) | |__   ___   ___ _ __  ___  _| |  | | (___
|  ___/| '_ \ / _ \ / _ \ '_ \| \ \/ / |  | |\___ \
| |    | | | | (_) |  __/ | | | |>  <| |__| |____) |
|_|    |_| |_|\___/ \___|_| |_|_/_/\_\\____/|_____/

 POS Log  PhoenixOS workspace created, welcome!
+00:00:00.286012 INFO:  waiting for RPC requests...
Cache Optimization: Enabled!
Async Optimization: Enabled!
Handler Optimization: Enabled!
xpu remote address: localhost
create shm buffer
Segmentation fault (core dumped)
 POS Warn  failed execution of command cricket-rpc-server 2>&1: exit_code(139)
 POS Warn  failed to start posd

For convenience while debugging, I replaced the llama2 model with gpt2, and the Python process reports the same error as with llama2:

+00:00:11.613831 ERROR: image is not an ELF!    in cpu-client-driver.c:466

I then used GDB to trace the call stacks for the Python run, and found that PhOS fails with the following call stack:

root@iZt4n09kz1g7mi4j4b1vckZ:~# gdb cricket-rpc-server core-cricket-rpc-ser-635504-1733110269
......
# Some irrelevant information is omitted here

warning: Unexpected size of section `.reg-xstate/635511' in core file.
Using host libthread_db library "/usr/lib/x86_64-linux-gnu/libthread_db.so.1".
Core was generated by `cricket-rpc-server'.
Program terminated with signal SIGSEGV, Segmentation fault.

warning: Unexpected size of section `.reg-xstate/635511' in core file.
#0  0x00007f19407346f5 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
[Current thread is 1 (Thread 0x7f18b8fde000 (LWP 635511))]
(gdb) bt
#0  0x00007f19407346f5 in ?? () from /usr/lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f19584a7e25 in xdr_string () from /usr/lib/x86_64-linux-gnu/libtirpc.so.3
#2  0x000055c329ff541f in xdr_str_result ()
#3  0x000055c32a042752 in dispatch(int, __rpc_xdr*, __rpc_xdr*) ()
#4  0x000055c32a044be8 in svc_run::{lambda(int)#1}::operator()(int) const ()
#5  0x000055c32a045898 in void std::__invoke_impl<void, svc_run::{lambda(int)#1}, int>(std::__invoke_other, svc_run::{lambda(int)#1}&&, int&&) ()
#6  0x000055c32a045816 in std::__invoke_result<svc_run::{lambda(int)#1}, int>::type std::__invoke<svc_run::{lambda(int)#1}, int>(std::__invoke_result&&, (svc_run::{lambda(int)#1}&&)...) ()
#7  0x000055c32a045785 in void std::thread::_Invoker<std::tuple<svc_run::{lambda(int)#1}, int> >::_M_invoke<0ul, 1ul>(std::_Index_tuple<0ul, 1ul>) ()
#8  0x000055c32a045740 in std::thread::_Invoker<std::tuple<svc_run::{lambda(int)#1}, int> >::operator()() ()
#9  0x000055c32a045724 in std::thread::_State_impl<std::thread::_Invoker<std::tuple<svc_run::{lambda(int)#1}, int> > >::_M_run() ()
#10 0x00007f19408a9793 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#11 0x00007f195816d609 in start_thread () from /usr/lib/x86_64-linux-gnu/libpthread.so.0
#12 0x00007f19406cb133 in clone () from /usr/lib/x86_64-linux-gnu/libc.so.6
(gdb)

Moreover, I also checked the code at cpu-client-driver.c:466; the code responsible for this error is in cuModuleLoadData:

/**
 * Client-side stub for cuModuleLoadData: checks the image locally, then
 * forwards it to the server via rpc_cumoduleloaddata_1().
 *
 * Only images that start with the ELF magic bytes are accepted; anything
 * else is rejected before any RPC is made.
 *
 * @param module  Receives the pointer value returned by the server, cast to
 *                CUmodule. May be NULL, in which case nothing is stored.
 * @param image   Start of the module image in memory. Must not be NULL.
 *
 * @return CUDA_ERROR_INVALID_IMAGE if `image` is NULL, does not start with
 *         the ELF magic, or elf2_parameter_info() fails on it;
 *         CUDA_ERROR_UNKNOWN if the RPC itself does not return RPC_SUCCESS;
 *         otherwise the `err` field of the server's reply.
 *
 * NOTE(review): cpu_time_end() is only reached on the success path; the
 * early returns leave the timer started by cpu_time_start() open. Confirm
 * whether that is intentional.
 */
CUresult cuModuleLoadData(CUmodule* module, const void* image)
{
    int proc = 1026;  // id handed to cpu_time_start()/cpu_time_end()
    cpu_time_start(totals, proc);
    enum clnt_stat retval;
    ptr_result result;
    mem_data mem;

    if (image == NULL) {
        LOGE(LOG_ERROR, "image is NULL!");
        return CUDA_ERROR_INVALID_IMAGE;
    }
    // Read-only view of the image as an ELF header; keep the const qualifier.
    const Elf64_Ehdr *ehdr = (const Elf64_Ehdr*)image;

    if (ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
        ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
        ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
        ehdr->e_ident[EI_MAG3] != ELFMAG3) {
        LOGE(LOG_ERROR, "image is not an ELF!");
        return CUDA_ERROR_INVALID_IMAGE;
    }

    // TODO: [POS] how many bytes should we copy?
    // LOGE(LOG_WARNING, 
    //     "!!! e_shoff: %u, end of sh: %u, "
    //     "e_phoff: %u, end of ph: %u\n",
    //     ehdr->e_shoff,
    //     ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize,
    //     ehdr->e_phoff + ehdr->e_phnum * ehdr->e_phentsize
    // );
    // mem.mem_data_len = ehdr->e_shoff + ehdr->e_shnum * ehdr->e_shentsize;

    // The number of bytes sent is taken to be the end of the program header
    // table (see the TODO above: the section-header variant is disabled).
    mem.mem_data_len = ehdr->e_phoff + ehdr->e_phnum * ehdr->e_phentsize;
    mem.mem_data_val = (uint8_t*)image;

    // Cast so the argument always matches %zx regardless of the declared
    // type of mem_data_len.
    LOGE(LOG_DEBUG, "image_size = %#0zx", (size_t)mem.mem_data_len);

    if (elf2_parameter_info(mem.mem_data_val, mem.mem_data_len) != 0) {
        LOGE(LOG_ERROR, "could not get kernel infos from memory");
        return CUDA_ERROR_INVALID_IMAGE;
    }

    retval = rpc_cumoduleloaddata_1(mem, &result, clnt);
    if (retval != RPC_SUCCESS) {
        // `result` is not initialized locally, so it must not be read when
        // the call failed.
        fprintf(stderr, "[rpc] %s failed.", __FUNCTION__);
        return CUDA_ERROR_UNKNOWN;
    }
    // Log the reply only once the RPC is known to have succeeded.
    LOGE(LOG_DEBUG, "[rpc] %s(%p) = %d, result %p\n", __FUNCTION__, image, result.err, (void*)result.ptr_result_u.ptr);
    if (module != NULL) {
        *module = (CUmodule)result.ptr_result_u.ptr;
    }
    cpu_time_end(totals, proc);
    return result.err;
}

So, does PhOS support GPU lambdas, and how can I fix this problem?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions