Skip to content

Commit 1052c73

Browse files
committed
UCP/DEVICE: Add retry loops in perftest and gtest
1 parent 1e1d55c commit 1052c73

File tree

2 files changed

+20
-5
lines changed

2 files changed

+20
-5
lines changed

src/tools/perf/cuda/ucp_cuda_kernel.cu

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,8 +152,15 @@ private:
152152
params.num_elements = count;
153153
params.elements = elems;
154154

155-
ucs_status_t status = ucp_device_mem_list_create(perf.ucp.ep, &params,
156-
&m_params.mem_list);
155+
ucs_status_t status;
156+
do {
157+
status = ucp_device_mem_list_create(perf.ucp.ep, &params,
158+
&m_params.mem_list);
159+
if (status == UCS_ERR_NOT_CONNECTED) {
160+
ucp_worker_progress(perf.ucp.worker);
161+
}
162+
} while (status == UCS_ERR_NOT_CONNECTED);
163+
157164
if (status != UCS_OK) {
158165
throw std::runtime_error("Failed to create memory list");
159166
}

test/gtest/ucp/test_ucp_device.cc

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -144,9 +144,17 @@ test_ucp_device::mem_list::mem_list(entity &sender, entity &receiver,
144144
params.num_elements = count;
145145
params.elements = elems.data();
146146

147-
// Create memory list
148-
ASSERT_UCS_OK(
149-
ucp_device_mem_list_create(sender.ep(), &params, &m_mem_list_h));
147+
// Create memory list (with retry on connection)
148+
ucs_status_t status;
149+
do {
150+
status = ucp_device_mem_list_create(sender.ep(), &params, &m_mem_list_h);
151+
if (status != UCS_ERR_NOT_CONNECTED) {
152+
break;
153+
}
154+
sender.progress();
155+
receiver.progress();
156+
} while (status == UCS_ERR_NOT_CONNECTED);
157+
ASSERT_UCS_OK(status);
150158
}
151159

152160
test_ucp_device::mem_list::~mem_list()

0 commit comments

Comments
 (0)