Skip to content

Commit 00dcd40

Browse files
committed
Merge branch 'master' into device-remove-ep-connected-check
2 parents 138897b + 6969a83 commit 00dcd40

File tree

8 files changed

+105
-21
lines changed

8 files changed

+105
-21
lines changed

buildlib/pr/build_job.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ jobs:
4545
CONTAINER: rhel90
4646
rhel100:
4747
CONTAINER: rhel100
48+
rocky89:
49+
CONTAINER: rocky89
50+
rocky96:
51+
CONTAINER: rocky96
4852
fedora41:
4953
CONTAINER: fedora41
5054
centos7:
@@ -66,6 +70,10 @@ jobs:
6670
CONTAINER: ubuntu2404_aarch64
6771
rhel100_aarch64:
6872
CONTAINER: rhel100_aarch64
73+
rocky89_aarch64:
74+
CONTAINER: rocky89_aarch64
75+
rocky96_aarch64:
76+
CONTAINER: rocky96_aarch64
6977
timeoutInMinutes: 340
7078

7179
steps:

buildlib/pr/main.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,18 @@ resources:
4242
- container: rhel100_aarch64
4343
image: rdmz-harbor.rdmz.labs.mlnx/hpcx/aarch64/rhel10.0/builder:mofed-25.07-0.9.7.0
4444
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
45+
- container: rocky89
46+
image: rdmz-harbor.rdmz.labs.mlnx/hpcx/x86_64/rocky8.9/builder:mofed-24.10-3.2.5.0
47+
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
48+
- container: rocky96
49+
image: rdmz-harbor.rdmz.labs.mlnx/hpcx/x86_64/rocky9.6/builder:mofed-24.10-3.2.5.0
50+
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
51+
- container: rocky89_aarch64
52+
image: rdmz-harbor.rdmz.labs.mlnx/hpcx/aarch64/rocky8.9/builder:mofed-24.10-3.2.5.0
53+
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
54+
- container: rocky96_aarch64
55+
image: rdmz-harbor.rdmz.labs.mlnx/hpcx/aarch64/rocky9.6/builder:mofed-24.10-3.2.5.0
56+
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)
4557
- container: ubuntu2004
4658
image: rdmz-harbor.rdmz.labs.mlnx/ucx/x86_64/ubuntu20.04/builder:mofed-5.0-1.0.0.0
4759
options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES)

src/ucp/api/device/ucp_device_types.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,22 +63,22 @@ typedef struct ucp_device_mem_list_handle {
6363
/**
6464
* Array of local addresses for the device transfer operations.
6565
*/
66-
void **local_addrs;
66+
void **local_addrs;
6767

6868
/**
6969
* Array of remote addresses for the device transfer operations.
7070
*/
71-
uint64_t *remote_addrs;
72-
71+
uint64_t *remote_addrs;
72+
7373
/**
7474
* Array of lengths of the local buffers in bytes.
7575
*/
76-
size_t *lengths;
76+
size_t *lengths;
7777

7878
/**
7979
* Array of UCT memory element objects.
8080
*/
81-
void *uct_mem_elements;
81+
void *uct_mem_elements;
8282

8383
/**
8484
* The local address, remote address, and length arrays are allocated contiguously.

src/ucp/api/device/ucp_host.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ typedef struct ucp_device_mem_list_params {
152152
*
153153
* @param [in] ep Remote endpoint handle.
154154
* @param [in] params Parameters used to create the handle.
155-
* @param [out] handle Created descriptor list handle.
155+
* @param [out] handle Created descriptors list handle.
156156
*
157157
* @return Error code as defined by @ref ucs_status_t.
158158
* @retval UCS_ERR_NOT_CONNECTED if the endpoint is not connected yet.
@@ -164,6 +164,18 @@ ucp_device_mem_list_create(ucp_ep_h ep,
164164
ucp_device_mem_list_handle_h *handle);
165165

166166

167+
/**
168+
* @ingroup UCP_DEVICE
169+
* @brief Return the number of elements in the descriptors mem list handle.
170+
*
171+
* @param [in] handle Descriptors list handle.
172+
*
173+
* @return Descriptors mem list length.
174+
*/
175+
uint32_t
176+
ucp_device_get_mem_list_length(const ucp_device_mem_list_handle_h handle);
177+
178+
167179
/**
168180
* @ingroup UCP_DEVICE
169181
* @brief Release function for a descriptor list handle.

src/ucp/core/ucp_device.c

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,17 @@
2121
#include "ucp_mm.inl"
2222

2323

24+
/*
 * Per-handle record stored as the value type of the ucp_device_handle_allocs
 * hash, keyed by the mem list handle.
 */
typedef struct {
25+
/* Allocation descriptor copied in when the handle is inserted into the hash */
uct_allocated_memory_t mem;
26+
/* Element count recorded together with the allocation at insert time */
uint32_t mem_list_length;
27+
} ucp_device_handle_info_t;
28+
2429
KHASH_TYPE(ucp_device_handle_allocs, ucp_device_mem_list_handle_h,
25-
uct_allocated_memory_t);
30+
ucp_device_handle_info_t);
2631
#define ucp_device_handle_hash_key(_handle) \
2732
kh_int64_hash_func((uintptr_t)(_handle))
2833
KHASH_IMPL(ucp_device_handle_allocs, ucp_device_mem_list_handle_h,
29-
uct_allocated_memory_t, 1, ucp_device_handle_hash_key,
34+
ucp_device_handle_info_t, 1, ucp_device_handle_hash_key,
3035
kh_int64_hash_equal);
3136

3237
/* Hash to track handle allocator, used at release time */
@@ -50,11 +55,16 @@ void ucp_device_cleanup(void)
5055
}
5156

5257
static ucs_status_t
53-
ucp_device_mem_handle_hash_insert(uct_allocated_memory_t *mem_handle)
58+
ucp_device_mem_handle_hash_insert(const uct_allocated_memory_t *mem_handle,
59+
uint32_t mem_list_length)
5460
{
5561
ucs_status_t status;
5662
khiter_t iter;
5763
int ret;
64+
ucp_device_handle_info_t info;
65+
66+
info.mem = *mem_handle;
67+
info.mem_list_length = mem_list_length;
5868

5969
ucs_spin_lock(&ucp_device_handle_hash_lock);
6070
iter = kh_put(ucp_device_handle_allocs, &ucp_device_handle_hash,
@@ -66,7 +76,7 @@ ucp_device_mem_handle_hash_insert(uct_allocated_memory_t *mem_handle)
6676
ucs_error("handle=%p already found in hash", mem_handle->address);
6777
status = UCS_ERR_ALREADY_EXISTS;
6878
} else {
69-
kh_value(&ucp_device_handle_hash, iter) = *mem_handle;
79+
kh_value(&ucp_device_handle_hash, iter) = info;
7080
status = UCS_OK;
7181
}
7282

@@ -84,7 +94,7 @@ ucp_device_mem_handle_hash_remove(ucp_device_mem_list_handle_h handle)
8494
iter = kh_get(ucp_device_handle_allocs, &ucp_device_handle_hash, handle);
8595
ucs_assertv_always((iter != kh_end(&ucp_device_handle_hash)), "handle=%p",
8696
handle);
87-
mem = kh_value(&ucp_device_handle_hash, iter);
97+
mem = kh_value(&ucp_device_handle_hash, iter).mem;
8898
kh_del(ucp_device_handle_allocs, &ucp_device_handle_hash, iter);
8999
ucs_spin_unlock(&ucp_device_handle_hash_lock);
90100
return mem;
@@ -586,7 +596,7 @@ ucp_device_mem_list_create(ucp_ep_h ep,
586596
}
587597

588598
/* Track memory allocator for later release */
589-
status = ucp_device_mem_handle_hash_insert(&mem);
599+
status = ucp_device_mem_handle_hash_insert(&mem, params->num_elements);
590600
if (status != UCS_OK) {
591601
uct_mem_free(&mem);
592602
} else {
@@ -596,6 +606,24 @@ ucp_device_mem_list_create(ucp_ep_h ep,
596606
return status;
597607
}
598608

609+
/*
 * Return the mem_list_length value recorded in the handle hash for the given
 * handle.
 *
 * The lookup runs under ucp_device_handle_hash_lock. A handle that is not
 * present in the hash trips ucs_assertv_always while the lock is still held
 * (NOTE(review): presumably fatal, so the lock is never released on that
 * path - confirm).
 */
uint32_t
610+
ucp_device_get_mem_list_length(const ucp_device_mem_list_handle_h handle)
611+
{
612+
khiter_t iter;
613+
uint32_t length;
614+
615+
ucs_assert(handle != NULL);
616+
617+
ucs_spin_lock(&ucp_device_handle_hash_lock);
618+
iter = kh_get(ucp_device_handle_allocs, &ucp_device_handle_hash, handle);
619+
ucs_assertv_always((iter != kh_end(&ucp_device_handle_hash)), "handle=%p",
620+
handle);
621+
/* Copy the value out while the lock is held; only the local copy is returned */
length = kh_value(&ucp_device_handle_hash, iter).mem_list_length;
622+
ucs_spin_unlock(&ucp_device_handle_hash_lock);
623+
624+
return length;
625+
}
626+
599627
void ucp_device_mem_list_release(ucp_device_mem_list_handle_h handle)
600628
{
601629
uct_allocated_memory_t mem = ucp_device_mem_handle_hash_remove(handle);

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,23 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_wqe_prepare_put_or_atomic(
165165
doca_gpu_dev_verbs_store_wqe_seg(dseg_ptr, (uint64_t*)&(dseg));
166166
}
167167

168+
/*
 * Acquire a spin lock held in an int: busy-wait until *lock is changed from 0
 * to 1, then issue an acquire-ordered PTX instruction at gpu scope.
 */
UCS_F_DEVICE void uct_rc_mlx5_gda_lock(int *lock) {
169+
/* Retry until the compare-and-swap sees 0 and replaces it with 1 */
while (atomicCAS(lock, 0, 1) != 0)
170+
;
171+
#ifdef DOCA_GPUNETIO_VERBS_HAS_FENCE_ACQUIRE_RELEASE_PTX
172+
asm volatile("fence.acquire.gpu;");
173+
#else
174+
/*
 * Fallback when the fence macro is not defined: perform an acquire load
 * instead. dummy is never initialized; only its address is used as the load
 * target and the loaded value (val) is discarded.
 */
uint32_t dummy;
175+
uint32_t UCS_V_UNUSED val;
176+
asm volatile("ld.acquire.gpu.b32 %0, [%1];" : "=r"(val) : "l"(&dummy));
177+
#endif
178+
}
179+
180+
/*
 * Release the spin lock by storing 0 to *lock with release ordering through a
 * device-scope atomic reference.
 */
UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock) {
181+
cuda::atomic_ref<int, cuda::thread_scope_device> lock_aref(*lock);
182+
lock_aref.store(0, cuda::std::memory_order_release);
183+
}
184+
168185
UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
169186
uint64_t wqe_base, unsigned count,
170187
uint64_t flags)
@@ -184,13 +201,11 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
184201
return;
185202
}
186203

187-
doca_gpu_dev_verbs_lock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
188-
&ep->sq_lock);
204+
uct_rc_mlx5_gda_lock(&ep->sq_lock);
189205
uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
190206
uct_rc_mlx5_gda_update_dbr(ep, ep->sq_ready_index);
191207
uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
192-
doca_gpu_dev_verbs_unlock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
193-
&ep->sq_lock);
208+
uct_rc_mlx5_gda_unlock(&ep->sq_lock);
194209
}
195210

196211
UCS_F_DEVICE bool

src/uct/ib/mlx5/gga/gga_mlx5.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -852,9 +852,11 @@ uct_gga_mlx5_query_tl_devices(uct_md_h md,
852852
return UCS_ERR_NO_DEVICE;
853853
}
854854

855-
ucs_assertv(mlx5_md->super.cap_flags & UCT_MD_FLAG_EXPORTED_MKEY,
856-
"md %p: cap_flags=0x%" PRIx64 " do not have EXPORTED_MKEY flag",
857-
mlx5_md, mlx5_md->super.cap_flags);
855+
if (!(mlx5_md->super.cap_flags & UCT_MD_FLAG_EXPORTED_MKEY)) {
856+
ucs_debug("md %p: cap_flags=0x%" PRIx64 " does not have EXPORTED_MKEY "
857+
"flag", mlx5_md, mlx5_md->super.cap_flags);
858+
return UCS_ERR_NO_DEVICE;
859+
}
858860

859861
ucs_assertv(ucs_test_all_flags(mlx5_md->flags, UCT_GGA_MLX5_MD_CAPS),
860862
"md %p: flags=0x%x do not have mandatory capabilities 0x%x",

test/gtest/ucp/test_ucp_device.cc

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,13 @@ UCS_TEST_P(test_ucp_device, create_fail)
334334
EXPECT_EQ(nullptr, handle);
335335
}
336336

337+
/*
 * Checks that ucp_device_get_mem_list_length() reports the same element count
 * that was passed when the mem_list fixture object was constructed.
 */
UCS_TEST_P(test_ucp_device, get_mem_list_length)
338+
{
339+
constexpr unsigned num_elements = 8;
340+
/* NOTE(review): third argument is presumably the per-element size - confirm */
mem_list list(sender(), receiver(), 1 * UCS_KBYTE, num_elements);
341+
EXPECT_EQ(num_elements, ucp_device_get_mem_list_length(list.handle()));
342+
}
343+
337344
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, rc_gda, "rc,rc_gda")
338345

339346

@@ -563,7 +570,7 @@ UCS_TEST_P(test_ucp_device_xfer, put_single)
563570

564571
/* TODO: Enable these tests in CI */
565572
UCS_TEST_SKIP_COND_P(test_ucp_device_xfer, put_single_stress_test,
566-
RUNNING_ON_VALGRIND || true)
573+
RUNNING_ON_VALGRIND)
567574
{
568575
#ifdef __SANITIZE_ADDRESS__
569576
UCS_TEST_SKIP_R("Skipping stress test under ASAN");
@@ -617,7 +624,7 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi)
617624
}
618625

619626
UCS_TEST_SKIP_COND_P(test_ucp_device_xfer, put_multi_stress_test,
620-
RUNNING_ON_VALGRIND || true)
627+
RUNNING_ON_VALGRIND)
621628
{
622629
#ifdef __SANITIZE_ADDRESS__
623630
UCS_TEST_SKIP_R("Skipping stress test under ASAN");

0 commit comments

Comments
 (0)