-
Notifications
You must be signed in to change notification settings - Fork 5.9k
Closed
Description
This happens on the trainer, after the training has run for a while.
In my setup, I changed the distributed fit-a-line example to run for 1000 passes; with that change the failure happens frequently (2 out of 3 tries).
Commands:
GLOG_logtostderr=1 GLOG_v=3 PSERVERS=172.17.0.5:6174 SERVER_ENDPOINT=172.17.0.5:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
GLOG_logtostderr=1 GLOG_v=3 PSERVERS=172.17.0.5:6174 SERVER_ENDPOINT=172.17.0.5:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
GLOG_logtostderr=1 GLOG_v=0 PSERVERS=172.17.0.5:6174 SERVER_ENDPOINT=172.17.0.5:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
notest_dist_fit_a_line.py is taken from here
I0119 21:26:14.525514 16639 send_op.cc:44] sending fc_0.w_0@GRAD
I0119 21:26:14.525590 16639 send_op.cc:44] sending fc_0.b_0@GRAD
E0119 21:26:14.529606 16639 grpc_client.cc:119] proc param error:name:[fc_0.w_0@GRAD] ep:[172.17.0.5:6174] grpc error:Connect Failed
Traceback (most recent call last):
File "notest_dist_fit_a_line.py", line 70, in <module>
fetch_list=[avg_cost])
File "/root/.local/lib/python2.7/site-packages/paddle/v2/fluid/executor.py", line 177, in run
self.executor.run(program.desc, scope, 0, True, True)
paddle.v2.fluid.core.EnforceNotMet: at [/home/helin/repo/Paddle/paddle/operators/send_op.cc:47]
PaddlePaddle Call Stacks:
0 0x7faab725cf17p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 727
1 0x7faab7aefaacp paddle::operators::SendOp::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 2988
2 0x7faab7310107p paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 1463
3 0x7faab7275893p void pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}, void, paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}&&, void (*)(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call) + 579
4 0x7faab72734e4p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 1236
5 0x4cad00p PyEval_EvalFrameEx + 28048
6 0x4c2705p PyEval_EvalCodeEx + 597
7 0x4ca088p PyEval_EvalFrameEx + 24856
8 0x4c2705p PyEval_EvalCodeEx + 597
9 0x4c24a9p PyEval_EvalCode + 25
10 0x4f19efp
11 0x4ec372p PyRun_FileExFlags + 130
12 0x4eaaf1p PyRun_SimpleFileExFlags + 401
13 0x49e208p Py_Main + 1736
14 0x7fab4c825830p __libc_start_main + 240
15 0x49da59p _start + 41
Metadata
Labels
No labels