-
-
Notifications
You must be signed in to change notification settings - Fork 606
Closed
Labels
Description
While running Open MPI:
page fault outside application, addr: 0x0000000001b7e000
[registers]
RIP: 0x000000000047e83a <memcpy_repmov_ssse3+234>
RFL: 0x0000000000010246 CS: 0x0000000000000008 SS: 0x0000000000000010
RAX: 0x0000000000400020 RBX: 0x00002000002ffd80 RCX: 0x00000000003be060 RDX: 0xffff8000053c3040
RSI: 0xffff800005405000 RDI: 0xffff800005806000 RBP: 0x00002000002ff7c0 R8: 0x0000000000000005
R9: 0x0000000000000000 R10: 0x0000000000000001 R11: 0x0000000000000020 R12: 0x0000000000000017
R13: 0x0000000000000000 R14: 0x0000000000000001 R15: 0x0000000000000020 RSP: 0x00002000002ff750
Aborted
[backtrace]
0x000000000022bc39 <abort(char const*, ...)+270>
0x00000000003c4f8b <???+3952523>
0x00000000003c5128 <mmu::vm_fault(unsigned long, exception_frame*)+350>
0x000000000048945e <page_fault+310>
0x0000000000488346 <???+4752198>
0x000000000058be50 <realloc+34>
0x00001000060d01c1 <???+101515713>
0x00001000060d0075 <???+101515381>
0x000010000605be8b <???+101039755>
0x000010000605bb7e <mca_base_framework_components_open+104>
0x00001000060cfd1d <???+101514525>
0x000010000606b06d <mca_base_framework_open+224>
0x00001000060308fe <opal_init+267>
0x0000100005806593 <???+92300691>
0x000010000580586d <???+92297325>
0x000000000061e3d7 <osv::application::run_main(std::string, int, char**)+713>
0x000000000061e608 <osv::application::run_main()+296>
0x000000000061e0db <osv::application::main()+105>
0x000000000061dce9 <???+6413545>
0x000000000061dd0e <???+6413582>
0x00000000006804a8 <???+6816936>
0x0000000000682b81 <???+6826881>
0x000000000044a675 <std::function<void ()>::operator()() const+49>
0x00000000005b9b87 <sched::thread::main()+27>
0x00000000005b5b36 <thread_main_c+38>
0x00000000004892c5 <???+4756165>
GDB:
(gdb) bt
#0 processor::cli_hlt () at arch/x64/processor.hh:248
#1 0x0000000000209613 in arch::halt_no_interrupts () at arch/x64/arch.hh:48
#2 0x0000000000498a72 in osv::halt () at arch/x64/power.cc:24
#3 0x000000000022bc60 in abort (fmt=0x99bc8c "Aborted\n") at runtime.cc:130
#4 0x000000000022bb2b in abort () at runtime.cc:96
#5 0x00000000003c4f8c in mmu::vm_sigsegv (addr=28827648, ef=0xffff8000050ec078) at core/mmu.cc:1316
#6 0x00000000003c5129 in mmu::vm_fault (addr=28827648, ef=0xffff8000050ec078) at core/mmu.cc:1338
#7 0x000000000048945f in page_fault (ef=0xffff8000050ec078) at arch/x64/mmu.cc:38
#8 <signal handler called>
#9 0x000000000047e83a in repmovsb (n=<optimized out>, src=<optimized out>, dest=<optimized out>) at arch/x64/string.cc:102
#10 memcpy_repmov_ssse3 (dest=0xffff8000057c4040, src=0xffff8000053c3040, n=4194336) at arch/x64/string.cc:290
#11 0x000000000058b87d in std_realloc (object=0xffff8000053c3040, size=4194336) at core/mempool.cc:1585
#12 0x000000000058be51 in realloc (obj=0xffff8000053c3040, size=4194336) at core/mempool.cc:1728
#13 0x00001000060d01c2 in opal_memory_linux_ptmalloc2_open () at ../../../../../opal/mca/memory/linux/memory_linux_ptmalloc2.c:81
#14 0x00001000060d0076 in linux_open () at ../../../../../opal/mca/memory/linux/memory_linux_component.c:207
#15 0x000010000605be8c in open_components (framework=0x10000631c7e0 <opal_memory_base_framework>) at ../../../../opal/mca/base/mca_base_components_open.c:173
#16 0x000010000605bb7f in mca_base_framework_components_open (framework=0x10000631c7e0 <opal_memory_base_framework>, flags=MCA_BASE_OPEN_DEFAULT) at ../../../../opal/mca/base/mca_base_components_open.c:66
#17 0x00001000060cfd1e in opal_memory_base_open (flags=MCA_BASE_OPEN_DEFAULT) at ../../../../opal/mca/memory/base/memory_base_open.c:76
#18 0x000010000606b06e in mca_base_framework_open (framework=0x10000631c7e0 <opal_memory_base_framework>, flags=MCA_BASE_OPEN_DEFAULT) at ../../../../opal/mca/base/mca_base_framework.c:158
#19 0x00001000060308ff in opal_init (pargc=0x2000002ff99c, pargv=0x2000002ff990) at ../../opal/runtime/opal_init.c:407
#20 0x0000100005806594 in orterun (argc=18, argv=0x2000002ffcc0) at ../../../../orte/tools/orterun/orterun.c:714
#21 0x000010000580586e in main (argc=18, argv=0x2000002ffcc0) at ../../../../orte/tools/orterun/main.c:18
#22 0x000000000061e3d8 in osv::application::run_main (this=0xffffa00004080318, path="/usr/bin/mpirun", argc=18, argv=0xffffa00004081500) at core/app.cc:293
#23 0x000000000061e609 in osv::application::run_main (this=0xffffa00004080318) at core/app.cc:310
#24 0x000000000061e0dc in osv::application::main (this=0xffffa00004080318) at core/app.cc:238
#25 0x000000000061dcea in osv::application::__lambda5::operator() (__closure=0x0, app=0xffffa00004080318) at core/app.cc:188
#26 0x000000000061dd0f in osv::application::__lambda5::_FUN (app=0xffffa00004080318) at core/app.cc:190
#27 0x00000000006804a9 in pthread_private::pthread::pthread(void* (*)(void*), void*, sigset_t, pthread_private::thread_attr const*)::{lambda()#1}::operator()() const () at libc/pthread.cc:101
#28 0x0000000000682b82 in std::_Function_handler<void(), pthread_private::pthread::pthread(void* (*)(void*), void*, sigset_t, const pthread_private::thread_attr*)::__lambda6>::_M_invoke(const std::_Any_data &) (__functor=...) at /usr/include/c++/4.8.2/functional:2071
#29 0x000000000044a676 in std::function<void ()>::operator()() const (this=0xffff8000050e7080) at /usr/include/c++/4.8.2/functional:2471
#30 0x00000000005b9b88 in sched::thread::main (this=0xffff8000050e7050) at core/sched.cc:1171
#31 0x00000000005b5b37 in sched::thread_main_c (t=0xffff8000050e7050) at arch/x64/arch-switch.hh:164
#32 0x00000000004892c6 in thread_main () at arch/x64/entry.S:113
The code from frame 13 (0x00001000060d01c2 in opal_memory_linux_ptmalloc2_open () at ../../../../../opal/mca/memory/linux/memory_linux_ptmalloc2.c:81):
// Setup some memory hooks first, then
p = malloc(1024 * 1024 * 4);
if (NULL == p) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
p = realloc(p, 1024 * 1024 * 4 + 32);
Unless the memory hooks are the problem, this is safe.
The first 4MB of 0xffff8000053c3040 and 0xffff8000057c4040 are readable (x/1048576 0xffff8000053c3040)
memcpy_repmov_ssse3 src:
x/1048576 0xffff8000053c3040
0xffff8000053c3040: 0x00000000 0x00000000 0x00000000 0x00000000
...
0xffff8000057c3030: 0x00000000 0x00000000 0x00000000 0x00000000
dest:
0xffff8000057c4040: 0x00000000 0x00000000 0x00000000 0x00000000
...
0xffff800005bc4030: 0x00000000 0x00000000 0x00000000 0x00000000
And the bytes up to 4 MB + 32 B are also readable:
(gdb) x/32 0xffff8000057c3030
(gdb) x/32 0xffff800005bc4030
I suspect the memcpy_repmov_ssse3 call: it should not try to copy 4194336 (== 1024 * 1024 * 4 + 32) bytes from src, because the original allocation was only 4 MB (1024 * 1024 * 4 bytes).
But I'm still confused about two things: why did this fail on that particular run, given that the bytes after 0xffff8000057c3030 are readable, and where does the address in "page fault outside application, addr: 0x0000000001b7e000" come from?