Skip to content

Commit 134ed65

Browse files
vchoi-hdfgroup authored and jhendersonHDF committed
Implement selection vector I/O with collective chunk filling (HDFGroup#3826)
* Changes for ECP-344: Implement selection vector I/O with collective chunk filling. Also fix a bug in H5FD__mpio_write_vector() to account for fixed size optimization when computing max address.

* Fixes based on PR review comments:
  For H5Dchunk.c: fix H5MM_xfree()
  For H5FDmpio.c:
  1) Revert the fix to H5FD__mpio_write_vector()
  2) Apply the patch from Neil on the proper length of s_sizes reported by H5FD__mpio_vector_build_types()

* Put back the logic of dividing up the work among all the mpi ranks similar to the original H5D__chunk_collective_fill() routine.

* Add a test to verify the fix for the illegal reference problem in H5FD__mpio_write_vector().
1 parent 55e126d commit 134ed65

File tree

4 files changed

+434
-189
lines changed

4 files changed

+434
-189
lines changed

src/H5Dchunk.c

Lines changed: 101 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -5536,11 +5536,9 @@ H5D__chunk_update_old_edge_chunks(H5D_t *dset, hsize_t old_dim[])
55365536
/*-------------------------------------------------------------------------
55375537
* Function: H5D__chunk_collective_fill
55385538
*
5539-
* Purpose: Use MPIO collective write to fill the chunks (if number of
5540-
* chunks to fill is greater than the number of MPI procs;
5541-
* otherwise use independent I/O).
5539+
* Purpose: Use MPIO selection vector I/O for writing fill chunks
55425540
*
5543-
* Return: Non-negative on success/Negative on failure
5541+
* Return: Non-negative on success/Negative on failure
55445542
*
55455543
*-------------------------------------------------------------------------
55465544
*/
@@ -5554,19 +5552,24 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
55545552
int mpi_code; /* MPI return code */
55555553
size_t num_blocks; /* Number of blocks between processes. */
55565554
size_t leftover_blocks; /* Number of leftover blocks to handle */
5557-
int blocks, leftover; /* converted to int for MPI */
5558-
MPI_Aint *chunk_disp_array = NULL;
5559-
MPI_Aint *block_disps = NULL;
5560-
int *block_lens = NULL;
5561-
MPI_Datatype mem_type = MPI_BYTE, file_type = MPI_BYTE;
5562-
H5FD_mpio_xfer_t prev_xfer_mode; /* Previous data xfer mode */
5563-
bool have_xfer_mode = false; /* Whether the previous xffer mode has been retrieved */
5564-
bool need_sort = false;
5565-
size_t i; /* Local index variable */
5555+
int blocks; /* converted to int for MPI */
5556+
int leftover; /* converted to int for MPI */
5557+
H5FD_mpio_xfer_t prev_xfer_mode; /* Previous data xfer mode */
5558+
bool have_xfer_mode = false; /* Whether the previous xffer mode has been retrieved */
5559+
size_t i; /* Local index variable */
5560+
haddr_t *io_addrs = NULL;
5561+
size_t *io_sizes = NULL;
5562+
const void **io_wbufs = NULL;
5563+
H5FD_mem_t io_types[2];
5564+
bool all_same_block_len = true;
5565+
bool need_sort = false;
5566+
size_t io_2sizes[2];
55665567
herr_t ret_value = SUCCEED; /* Return value */
55675568

55685569
FUNC_ENTER_PACKAGE
55695570

5571+
assert(chunk_fill_info->num_chunks != 0);
5572+
55705573
/*
55715574
* If a separate fill buffer is provided for partial chunks, ensure
55725575
* that the "don't filter partial edge chunks" flag is set.
@@ -5589,6 +5592,7 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
55895592
/* Distribute evenly the number of blocks between processes. */
55905593
if (mpi_size == 0)
55915594
HGOTO_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "Resulted in division by zero");
5595+
55925596
num_blocks =
55935597
(size_t)(chunk_fill_info->num_chunks / (size_t)mpi_size); /* value should be the same on all procs */
55945598

@@ -5602,157 +5606,97 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
56025606
H5_CHECKED_ASSIGN(leftover, int, leftover_blocks, size_t);
56035607

56045608
/* Check if we have any chunks to write on this rank */
5605-
if (num_blocks > 0 || (leftover && leftover > mpi_rank)) {
5606-
MPI_Aint partial_fill_buf_disp = 0;
5607-
bool all_same_block_len = true;
5608-
5609-
/* Allocate buffers */
5610-
if (NULL == (chunk_disp_array = (MPI_Aint *)H5MM_malloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
5611-
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file displacement buffer");
5612-
5613-
if (partial_chunk_fill_buf) {
5614-
MPI_Aint fill_buf_addr;
5615-
MPI_Aint partial_fill_buf_addr;
5616-
5617-
/* Calculate the displacement between the fill buffer and partial chunk fill buffer */
5618-
if (MPI_SUCCESS != (mpi_code = MPI_Get_address(fill_buf, &fill_buf_addr)))
5619-
HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
5620-
if (MPI_SUCCESS != (mpi_code = MPI_Get_address(partial_chunk_fill_buf, &partial_fill_buf_addr)))
5621-
HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
5622-
5623-
#if H5_CHECK_MPI_VERSION(3, 1)
5624-
partial_fill_buf_disp = MPI_Aint_diff(partial_fill_buf_addr, fill_buf_addr);
5625-
#else
5626-
partial_fill_buf_disp = partial_fill_buf_addr - fill_buf_addr;
5627-
#endif
5609+
if (num_blocks > 0 || leftover > mpi_rank) {
56285610

5629-
/*
5630-
* Allocate all-zero block displacements array. If a block's displacement
5631-
* is left as zero, that block will be written to from the regular fill
5632-
* buffer. If a block represents an unfiltered partial edge chunk, its
5633-
* displacement will be set so that the block is written to from the
5634-
* unfiltered fill buffer.
5635-
*/
5636-
if (NULL == (block_disps = (MPI_Aint *)H5MM_calloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
5637-
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate block displacements buffer");
5638-
}
5611+
if (NULL == (io_addrs = H5MM_malloc((size_t)(blocks + 1) * sizeof(*io_addrs))))
5612+
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
5613+
"couldn't allocate space for I/O addresses vector");
56395614

5640-
/*
5641-
* Perform initial scan of chunk info list to:
5642-
* - make sure that chunk addresses are monotonically non-decreasing
5643-
* - check if all blocks have the same length
5644-
*/
5645-
for (i = 1; i < chunk_fill_info->num_chunks; i++) {
5646-
if (chunk_fill_info->chunk_info[i].addr < chunk_fill_info->chunk_info[i - 1].addr)
5647-
need_sort = true;
5648-
5649-
if (chunk_fill_info->chunk_info[i].chunk_size != chunk_fill_info->chunk_info[i - 1].chunk_size)
5650-
all_same_block_len = false;
5651-
}
5615+
if (NULL == (io_wbufs = H5MM_malloc((size_t)(blocks + 1) * sizeof(*io_wbufs))))
5616+
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for I/O buffers vector");
5617+
}
56525618

5653-
if (need_sort)
5654-
qsort(chunk_fill_info->chunk_info, chunk_fill_info->num_chunks,
5655-
sizeof(struct chunk_coll_fill_info), H5D__chunk_cmp_coll_fill_info);
5619+
/*
5620+
* Perform initial scan of chunk info list to:
5621+
* - make sure that chunk addresses are monotonically non-decreasing
5622+
* - check if all blocks have the same length
5623+
*/
5624+
for (i = 1; i < chunk_fill_info->num_chunks; i++) {
5625+
if (chunk_fill_info->chunk_info[i].addr < chunk_fill_info->chunk_info[i - 1].addr)
5626+
need_sort = true;
56565627

5657-
/* Allocate buffer for block lengths if necessary */
5658-
if (!all_same_block_len)
5659-
if (NULL == (block_lens = (int *)H5MM_malloc((size_t)(blocks + 1) * sizeof(int))))
5660-
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk lengths buffer");
5628+
if (chunk_fill_info->chunk_info[i].chunk_size != chunk_fill_info->chunk_info[i - 1].chunk_size)
5629+
all_same_block_len = false;
5630+
}
56615631

5662-
for (i = 0; i < (size_t)blocks; i++) {
5663-
size_t idx = i + (size_t)(mpi_rank * blocks);
5632+
/*
5633+
* Note that we sort all of the chunks here, and not just a subset
5634+
* corresponding to this rank. We do this since we have found MPI I/O to work
5635+
* better when each rank writes blocks that are contiguous in the file,
5636+
* and by sorting the full list we maximize the chance of that happening.
5637+
*/
5638+
if (need_sort)
5639+
qsort(chunk_fill_info->chunk_info, chunk_fill_info->num_chunks, sizeof(struct chunk_coll_fill_info),
5640+
H5D__chunk_cmp_coll_fill_info);
56645641

5665-
/* store the chunk address as an MPI_Aint */
5666-
chunk_disp_array[i] = (MPI_Aint)(chunk_fill_info->chunk_info[idx].addr);
5642+
/*
5643+
* If all the chunks have the same length, use the compressed feature
5644+
* to store the size.
5645+
* Otherwise, allocate the array of sizes for storing chunk sizes.
5646+
*/
5647+
if (all_same_block_len) {
5648+
io_2sizes[0] = chunk_fill_info->chunk_info[0].chunk_size;
5649+
io_2sizes[1] = 0;
5650+
}
5651+
else {
5652+
if (NULL == (io_sizes = H5MM_malloc((size_t)(blocks + 1) * sizeof(*io_sizes))))
5653+
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for I/O sizes vector");
5654+
}
56675655

5668-
if (!all_same_block_len)
5669-
H5_CHECKED_ASSIGN(block_lens[i], int, chunk_fill_info->chunk_info[idx].chunk_size, size_t);
5656+
/*
5657+
* Since the type of all chunks is raw data, use the compressed feature
5658+
* to store the chunk type.
5659+
*/
5660+
io_types[0] = H5FD_MEM_DRAW;
5661+
io_types[1] = H5FD_MEM_NOLIST;
56705662

5671-
if (chunk_fill_info->chunk_info[idx].unfiltered_partial_chunk) {
5672-
assert(partial_chunk_fill_buf);
5673-
block_disps[i] = partial_fill_buf_disp;
5674-
}
5675-
} /* end for */
5663+
/*
5664+
* For the chunks corresponding to this rank, fill in the
5665+
* address, size and buf pointer for each chunk.
5666+
*/
5667+
for (i = 0; i < (size_t)blocks; i++) {
5668+
size_t idx = i + (size_t)(mpi_rank * blocks);
56765669

5677-
/* Calculate if there are any leftover blocks after evenly
5678-
* distributing. If there are, then round-robin the distribution
5679-
* to processes 0 -> leftover.
5680-
*/
5681-
if (leftover && leftover > mpi_rank) {
5682-
chunk_disp_array[blocks] =
5683-
(MPI_Aint)chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].addr;
5684-
5685-
if (!all_same_block_len)
5686-
H5_CHECKED_ASSIGN(block_lens[blocks], int,
5687-
chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].chunk_size,
5688-
size_t);
5689-
5690-
if (chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].unfiltered_partial_chunk) {
5691-
assert(partial_chunk_fill_buf);
5692-
block_disps[blocks] = partial_fill_buf_disp;
5693-
}
5670+
io_addrs[i] = chunk_fill_info->chunk_info[idx].addr;
56945671

5695-
blocks++;
5696-
}
5672+
if (!all_same_block_len)
5673+
io_sizes[i] = chunk_fill_info->chunk_info[idx].chunk_size;
56975674

5698-
/* Create file and memory types for the write operation */
5699-
if (all_same_block_len) {
5700-
int block_len;
5675+
if (chunk_fill_info->chunk_info[idx].unfiltered_partial_chunk)
5676+
io_wbufs[i] = partial_chunk_fill_buf;
5677+
else
5678+
io_wbufs[i] = fill_buf;
5679+
}
57015680

5702-
H5_CHECKED_ASSIGN(block_len, int, chunk_fill_info->chunk_info[0].chunk_size, size_t);
5681+
/*
5682+
* For the leftover chunk corresponding to this rank, fill in the
5683+
* address, size and buf pointer for the chunk.
5684+
*/
5685+
if (leftover > mpi_rank) {
5686+
io_addrs[blocks] = chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].addr;
57035687

5704-
mpi_code =
5705-
MPI_Type_create_hindexed_block(blocks, block_len, chunk_disp_array, MPI_BYTE, &file_type);
5706-
if (mpi_code != MPI_SUCCESS)
5707-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed_block failed", mpi_code)
5688+
if (!all_same_block_len)
5689+
io_sizes[blocks] = chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].chunk_size;
57085690

5709-
if (partial_chunk_fill_buf) {
5710-
/*
5711-
* If filters are disabled for partial edge chunks, those chunks could
5712-
* potentially have the same block length as the other chunks, but still
5713-
* need to be written to using the unfiltered fill buffer. Use an hindexed
5714-
* block type rather than an hvector.
5715-
*/
5716-
mpi_code =
5717-
MPI_Type_create_hindexed_block(blocks, block_len, block_disps, MPI_BYTE, &mem_type);
5718-
if (mpi_code != MPI_SUCCESS)
5719-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed_block failed", mpi_code)
5720-
}
5721-
else {
5722-
mpi_code = MPI_Type_create_hvector(blocks, block_len, 0, MPI_BYTE, &mem_type);
5723-
if (mpi_code != MPI_SUCCESS)
5724-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
5725-
}
5726-
}
5727-
else {
5728-
/*
5729-
* Currently, different block lengths implies that there are partial
5730-
* edge chunks and the "don't filter partial edge chunks" flag is set.
5731-
*/
5691+
if (chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].unfiltered_partial_chunk) {
57325692
assert(partial_chunk_fill_buf);
5733-
assert(block_lens);
5734-
assert(block_disps);
5735-
5736-
mpi_code = MPI_Type_create_hindexed(blocks, block_lens, chunk_disp_array, MPI_BYTE, &file_type);
5737-
if (mpi_code != MPI_SUCCESS)
5738-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
5739-
5740-
mpi_code = MPI_Type_create_hindexed(blocks, block_lens, block_disps, MPI_BYTE, &mem_type);
5741-
if (mpi_code != MPI_SUCCESS)
5742-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
5693+
io_wbufs[blocks] = partial_chunk_fill_buf;
57435694
}
5695+
else
5696+
io_wbufs[blocks] = fill_buf;
57445697

5745-
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type)))
5746-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
5747-
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type)))
5748-
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
5749-
} /* end if */
5750-
5751-
/* Set MPI-IO VFD properties */
5752-
5753-
/* Set MPI datatypes for operation */
5754-
if (H5CX_set_mpi_coll_datatypes(mem_type, file_type) < 0)
5755-
HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set MPI-I/O properties");
5698+
blocks++;
5699+
}
57565700

57575701
/* Get current transfer mode */
57585702
if (H5CX_get_io_xfer_mode(&prev_xfer_mode) < 0)
@@ -5763,31 +5707,24 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
57635707
if (H5CX_set_io_xfer_mode(H5FD_MPIO_COLLECTIVE) < 0)
57645708
HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode");
57655709

5766-
/* Low-level write (collective) */
5767-
if (H5F_shared_block_write(H5F_SHARED(dset->oloc.file), H5FD_MEM_DRAW, (haddr_t)0,
5768-
(blocks) ? (size_t)1 : (size_t)0, fill_buf) < 0)
5769-
HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file");
5770-
57715710
/* Barrier so processes don't race ahead */
57725711
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(mpi_comm)))
57735712
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
57745713

5714+
/* Perform the selection vector I/O for the chunks */
5715+
if (H5F_shared_vector_write(H5F_SHARED(dset->oloc.file), (uint32_t)blocks, io_types, io_addrs,
5716+
all_same_block_len ? io_2sizes : io_sizes, io_wbufs) < 0)
5717+
HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "vector write call failed");
5718+
57755719
done:
57765720
if (have_xfer_mode)
5777-
/* Set transfer mode */
5721+
/* Restore transfer mode */
57785722
if (H5CX_set_io_xfer_mode(prev_xfer_mode) < 0)
57795723
HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode");
57805724

5781-
/* free things */
5782-
if (MPI_BYTE != file_type)
5783-
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
5784-
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
5785-
if (MPI_BYTE != mem_type)
5786-
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
5787-
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
5788-
H5MM_xfree(chunk_disp_array);
5789-
H5MM_xfree(block_disps);
5790-
H5MM_xfree(block_lens);
5725+
H5MM_xfree(io_addrs);
5726+
H5MM_xfree(io_wbufs);
5727+
H5MM_xfree(io_sizes);
57915728

57925729
FUNC_LEAVE_NOAPI(ret_value)
57935730
} /* end H5D__chunk_collective_fill() */
@@ -5805,6 +5742,7 @@ H5D__chunk_cmp_coll_fill_info(const void *_entry1, const void *_entry2)
58055742

58065743
FUNC_LEAVE_NOAPI(H5_addr_cmp(entry1->addr, entry2->addr))
58075744
} /* end H5D__chunk_cmp_coll_fill_info() */
5745+
58085746
#endif /* H5_HAVE_PARALLEL */
58095747

58105748
/*-------------------------------------------------------------------------

0 commit comments

Comments
 (0)