Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions src/tools/perf/cuda/ucp_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,17 @@ private:
params.num_elements = count;
params.elements = elems;

ucs_status_t status = ucp_device_mem_list_create(perf.ucp.ep, &params,
&m_params.mem_list);
ucs_status_t status;
const ucs_time_t deadline = ucs_get_time() + ucs_time_from_sec(5.0);
do {
ucp_worker_progress(perf.ucp.worker);
status = ucp_device_mem_list_create(perf.ucp.ep, &params,
&m_params.mem_list);
if (ucs_get_time() >= deadline) {
throw std::runtime_error("Timeout waiting for connection");
}
} while (status == UCS_ERR_NOT_CONNECTED);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add timeout?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added


if (status != UCS_OK) {
throw std::runtime_error("Failed to create memory list");
}
Expand Down
9 changes: 0 additions & 9 deletions src/ucp/core/ucp_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -549,15 +549,6 @@ ucp_device_mem_list_create(ucp_ep_h ep,
ucp_ep_config_t *ep_config;
uct_allocated_memory_t mem;

if (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) {
/*
* Do not log error here because UCS_ERR_NOT_CONNECTED is expected
* during connection establishment. Applications are expected to retry
* with progress.
*/
return UCS_ERR_NOT_CONNECTED;
}

/* Parameter sanity checks and extraction */
status = ucp_device_mem_list_params_check(ep->worker->context, params,
&rkey_cfg_index, &local_sys_dev,
Expand Down
49 changes: 27 additions & 22 deletions test/gtest/ucp/test_ucp_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ class test_ucp_device : public ucp_test {
MODE_LAST_ELEM_COUNTER
};

mem_list(entity &sender, entity &receiver, size_t size, unsigned count,
mem_list(const test_ucp_device &test, entity &sender, entity &receiver,
size_t size, unsigned count,
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_CUDA,
mem_list_mode_t mode = MODE_DATA_ONLY);
~mem_list();
Expand All @@ -54,6 +55,7 @@ class test_ucp_device : public ucp_test {
void dst_pattern_check(unsigned index, uint64_t seed) const;

private:
const test_ucp_device &m_test;
entity &m_receiver;
std::vector<std::unique_ptr<mapped_buffer>> m_src, m_dst;
std::vector<ucs::handle<ucp_rkey_h>> m_rkeys;
Expand Down Expand Up @@ -85,19 +87,14 @@ void test_ucp_device::init()
if (!is_loopback()) {
receiver().connect(&sender(), get_ep_params());
}

ucp_device_mem_list_handle_h handle;
while (ucp_device_mem_list_create(sender().ep(), NULL, &handle) ==
UCS_ERR_NOT_CONNECTED) {
progress();
}
}

test_ucp_device::mem_list::mem_list(entity &sender, entity &receiver,
test_ucp_device::mem_list::mem_list(const test_ucp_device &test,
entity &sender, entity &receiver,
size_t size, unsigned count,
ucs_memory_type_t mem_type,
mem_list_mode_t mode) :
m_receiver(receiver)
m_test(test), m_receiver(receiver)
{
bool has_counter = (mode != MODE_DATA_ONLY);
size_t data_count = (has_counter) ? count - 1 : count;
Expand Down Expand Up @@ -150,9 +147,17 @@ test_ucp_device::mem_list::mem_list(entity &sender, entity &receiver,
params.num_elements = count;
params.elements = elems.data();

// Create memory list
ASSERT_UCS_OK(
ucp_device_mem_list_create(sender.ep(), &params, &m_mem_list_h));
// Create memory list (with retry on connection)
ucs_status_t status = UCS_ERR_NOT_CONNECTED;
m_test.wait_for_cond(
[&]() {
m_test.progress();
status = ucp_device_mem_list_create(sender.ep(), &params, &m_mem_list_h);
return status != UCS_ERR_NOT_CONNECTED;
},
[]() {}, 5.0);

ASSERT_UCS_OK(status);
}

test_ucp_device::mem_list::~mem_list()
Expand Down Expand Up @@ -236,7 +241,7 @@ uint64_t test_ucp_device::counter_read(const mapped_buffer &buffer)

UCS_TEST_P(test_ucp_device, create_success)
{
mem_list list(sender(), receiver(), 4 * UCS_MBYTE, 4);
mem_list list(*this, sender(), receiver(), 4 * UCS_MBYTE, 4);
EXPECT_NE(nullptr, list.handle());
}

Expand Down Expand Up @@ -337,7 +342,7 @@ UCS_TEST_P(test_ucp_device, create_fail)
UCS_TEST_P(test_ucp_device, get_mem_list_length)
{
constexpr unsigned num_elements = 8;
mem_list list(sender(), receiver(), 1 * UCS_KBYTE, num_elements);
mem_list list(*this, sender(), receiver(), 1 * UCS_KBYTE, num_elements);
EXPECT_EQ(num_elements, ucp_device_get_mem_list_length(list.handle()));
}

Expand Down Expand Up @@ -548,7 +553,7 @@ class test_ucp_device_xfer : public test_ucp_device_kernel {
UCS_TEST_P(test_ucp_device_xfer, put_single)
{
static constexpr size_t size = 32 * UCS_KBYTE;
mem_list list(sender(), receiver(), size, 6);
mem_list list(*this, sender(), receiver(), size, 6);

// Perform the transfer
static constexpr unsigned mem_list_index = 3;
Expand Down Expand Up @@ -577,7 +582,7 @@ UCS_TEST_SKIP_COND_P(test_ucp_device_xfer, put_single_stress_test,

static constexpr size_t size = 8;
static constexpr unsigned mem_list_index = 0;
mem_list list(sender(), receiver(), size, 1);
mem_list list(*this, sender(), receiver(), size, 1);

// Perform the transfer
auto params = init_params();
Expand All @@ -601,8 +606,8 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi)
{
static constexpr size_t size = 32 * UCS_KBYTE;
unsigned count = get_multi_elem_count();
mem_list list(sender(), receiver(), size, count + 1, UCS_MEMORY_TYPE_CUDA,
mem_list::MODE_LAST_ELEM_COUNTER);
mem_list list(*this, sender(), receiver(), size, count + 1,
UCS_MEMORY_TYPE_CUDA, mem_list::MODE_LAST_ELEM_COUNTER);

const unsigned counter_index = count;
list.dst_counter_init(counter_index);
Expand Down Expand Up @@ -631,7 +636,7 @@ UCS_TEST_SKIP_COND_P(test_ucp_device_xfer, put_multi_stress_test,

static constexpr size_t size = 8;
unsigned count = get_multi_elem_count();
mem_list list(sender(), receiver(), size, count + 1);
mem_list list(*this, sender(), receiver(), size, count + 1);

const unsigned counter_index = count;
list.dst_counter_init(counter_index);
Expand All @@ -657,8 +662,8 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi_partial)
{
static constexpr size_t size = 32 * UCS_KBYTE;
unsigned total_count = get_multi_elem_count() * 2;
mem_list list(sender(), receiver(), size, total_count + 1, UCS_MEMORY_TYPE_CUDA,
mem_list::MODE_LAST_ELEM_COUNTER);
mem_list list(*this, sender(), receiver(), size, total_count + 1,
UCS_MEMORY_TYPE_CUDA, mem_list::MODE_LAST_ELEM_COUNTER);

const unsigned counter_index = total_count;
list.dst_counter_init(counter_index);
Expand Down Expand Up @@ -708,7 +713,7 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi_partial)
UCS_TEST_P(test_ucp_device_xfer, counter)
{
const size_t size = counter_size();
mem_list list(sender(), receiver(), size, 1, UCS_MEMORY_TYPE_CUDA,
mem_list list(*this, sender(), receiver(), size, 1, UCS_MEMORY_TYPE_CUDA,
mem_list::MODE_COUNTER_ONLY);

static constexpr unsigned mem_list_index = 0;
Expand Down
Loading