@@ -5536,11 +5536,9 @@ H5D__chunk_update_old_edge_chunks(H5D_t *dset, hsize_t old_dim[])
 /*-------------------------------------------------------------------------
  * Function:    H5D__chunk_collective_fill
  *
- * Purpose:     Use MPIO collective write to fill the chunks (if number of
- *              chunks to fill is greater than the number of MPI procs;
- *              otherwise use independent I/O).
+ * Purpose:     Use MPIO selection vector I/O for writing fill chunks
 *
- * Return:      Non-negative on success/Negative on failure
+ * Return:      Non-negative on success/Negative on failure
 *
 *-------------------------------------------------------------------------
 */
@@ -5554,19 +5552,24 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
     int              mpi_code;                /* MPI return code */
     size_t           num_blocks;              /* Number of blocks between processes. */
     size_t           leftover_blocks;         /* Number of leftover blocks to handle */
-    int              blocks, leftover;        /* converted to int for MPI */
-    MPI_Aint        *chunk_disp_array = NULL;
-    MPI_Aint        *block_disps      = NULL;
-    int             *block_lens       = NULL;
-    MPI_Datatype     mem_type = MPI_BYTE, file_type = MPI_BYTE;
-    H5FD_mpio_xfer_t prev_xfer_mode;          /* Previous data xfer mode */
-    bool             have_xfer_mode = false;  /* Whether the previous xfer mode has been retrieved */
-    bool             need_sort = false;
-    size_t           i;                       /* Local index variable */
+    int              blocks;                  /* converted to int for MPI */
+    int              leftover;                /* converted to int for MPI */
+    H5FD_mpio_xfer_t prev_xfer_mode;          /* Previous data xfer mode */
+    bool             have_xfer_mode = false;  /* Whether the previous xfer mode has been retrieved */
+    size_t           i;                       /* Local index variable */
+    haddr_t         *io_addrs = NULL;
+    size_t          *io_sizes = NULL;
+    const void     **io_wbufs = NULL;
+    H5FD_mem_t       io_types[2];
+    bool             all_same_block_len = true;
+    bool             need_sort          = false;
+    size_t           io_2sizes[2];
     herr_t           ret_value = SUCCEED;     /* Return value */
 
     FUNC_ENTER_PACKAGE
 
+    assert(chunk_fill_info->num_chunks != 0);
+
     /*
      * If a separate fill buffer is provided for partial chunks, ensure
      * that the "don't filter partial edge chunks" flag is set.
@@ -5589,6 +5592,7 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
     /* Distribute evenly the number of blocks between processes. */
     if (mpi_size == 0)
         HGOTO_ERROR(H5E_DATASET, H5E_BADVALUE, FAIL, "Resulted in division by zero");
+
     num_blocks =
         (size_t)(chunk_fill_info->num_chunks / (size_t)mpi_size); /* value should be the same on all procs */
@@ -5602,157 +5606,97 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
     H5_CHECKED_ASSIGN(leftover, int, leftover_blocks, size_t);
 
     /* Check if we have any chunks to write on this rank */
-    if (num_blocks > 0 || (leftover && leftover > mpi_rank)) {
-        MPI_Aint partial_fill_buf_disp = 0;
-        bool     all_same_block_len    = true;
-
-        /* Allocate buffers */
-        if (NULL == (chunk_disp_array = (MPI_Aint *)H5MM_malloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
-            HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk file displacement buffer");
-
-        if (partial_chunk_fill_buf) {
-            MPI_Aint fill_buf_addr;
-            MPI_Aint partial_fill_buf_addr;
-
-            /* Calculate the displacement between the fill buffer and partial chunk fill buffer */
-            if (MPI_SUCCESS != (mpi_code = MPI_Get_address(fill_buf, &fill_buf_addr)))
-                HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
-            if (MPI_SUCCESS != (mpi_code = MPI_Get_address(partial_chunk_fill_buf, &partial_fill_buf_addr)))
-                HMPI_GOTO_ERROR(FAIL, "MPI_Get_address failed", mpi_code)
-
-#if H5_CHECK_MPI_VERSION(3, 1)
-            partial_fill_buf_disp = MPI_Aint_diff(partial_fill_buf_addr, fill_buf_addr);
-#else
-            partial_fill_buf_disp = partial_fill_buf_addr - fill_buf_addr;
-#endif
+    if (num_blocks > 0 || leftover > mpi_rank) {
 
-            /*
-             * Allocate all-zero block displacements array. If a block's displacement
-             * is left as zero, that block will be written to from the regular fill
-             * buffer. If a block represents an unfiltered partial edge chunk, its
-             * displacement will be set so that the block is written to from the
-             * unfiltered fill buffer.
-             */
-            if (NULL == (block_disps = (MPI_Aint *)H5MM_calloc((size_t)(blocks + 1) * sizeof(MPI_Aint))))
-                HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate block displacements buffer");
-        }
+        if (NULL == (io_addrs = H5MM_malloc((size_t)(blocks + 1) * sizeof(*io_addrs))))
+            HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
+                        "couldn't allocate space for I/O addresses vector");
 
-        /*
-         * Perform initial scan of chunk info list to:
-         *  - make sure that chunk addresses are monotonically non-decreasing
-         *  - check if all blocks have the same length
-         */
-        for (i = 1; i < chunk_fill_info->num_chunks; i++) {
-            if (chunk_fill_info->chunk_info[i].addr < chunk_fill_info->chunk_info[i - 1].addr)
-                need_sort = true;
-
-            if (chunk_fill_info->chunk_info[i].chunk_size != chunk_fill_info->chunk_info[i - 1].chunk_size)
-                all_same_block_len = false;
-        }
+        if (NULL == (io_wbufs = H5MM_malloc((size_t)(blocks + 1) * sizeof(*io_wbufs))))
+            HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for I/O buffers vector");
+    }
 
-        if (need_sort)
-            qsort(chunk_fill_info->chunk_info, chunk_fill_info->num_chunks,
-                  sizeof(struct chunk_coll_fill_info), H5D__chunk_cmp_coll_fill_info);
+    /*
+     * Perform initial scan of chunk info list to:
+     *  - make sure that chunk addresses are monotonically non-decreasing
+     *  - check if all blocks have the same length
+     */
+    for (i = 1; i < chunk_fill_info->num_chunks; i++) {
+        if (chunk_fill_info->chunk_info[i].addr < chunk_fill_info->chunk_info[i - 1].addr)
+            need_sort = true;
 
-        /* Allocate buffer for block lengths if necessary */
-        if (!all_same_block_len)
-            if (NULL == (block_lens = (int *)H5MM_malloc((size_t)(blocks + 1) * sizeof(int))))
-                HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate chunk lengths buffer");
+        if (chunk_fill_info->chunk_info[i].chunk_size != chunk_fill_info->chunk_info[i - 1].chunk_size)
+            all_same_block_len = false;
+    }
 
-        for (i = 0; i < (size_t)blocks; i++) {
-            size_t idx = i + (size_t)(mpi_rank * blocks);
+    /*
+     * Note that we sort all of the chunks here, and not just a subset
+     * corresponding to this rank. We do this since we have found MPI I/O to work
+     * better when each rank writes blocks that are contiguous in the file,
+     * and by sorting the full list we maximize the chance of that happening.
+     */
+    if (need_sort)
+        qsort(chunk_fill_info->chunk_info, chunk_fill_info->num_chunks, sizeof(struct chunk_coll_fill_info),
+              H5D__chunk_cmp_coll_fill_info);
 
-            /* store the chunk address as an MPI_Aint */
-            chunk_disp_array[i] = (MPI_Aint)(chunk_fill_info->chunk_info[idx].addr);
+    /*
+     * If all the chunks have the same length, use the compressed feature
+     * to store the size.
+     * Otherwise, allocate the array of sizes for storing chunk sizes.
+     */
+    if (all_same_block_len) {
+        io_2sizes[0] = chunk_fill_info->chunk_info[0].chunk_size;
+        io_2sizes[1] = 0;
+    }
+    else {
+        if (NULL == (io_sizes = H5MM_malloc((size_t)(blocks + 1) * sizeof(*io_sizes))))
+            HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "couldn't allocate space for I/O sizes vector");
+    }
 
-            if (!all_same_block_len)
-                H5_CHECKED_ASSIGN(block_lens[i], int, chunk_fill_info->chunk_info[idx].chunk_size, size_t);
+    /*
+     * Since the type of all chunks is raw data, use the compressed feature
+     * to store the chunk type.
+     */
+    io_types[0] = H5FD_MEM_DRAW;
+    io_types[1] = H5FD_MEM_NOLIST;
 
-            if (chunk_fill_info->chunk_info[idx].unfiltered_partial_chunk) {
-                assert(partial_chunk_fill_buf);
-                block_disps[i] = partial_fill_buf_disp;
-            }
-        } /* end for */
+    /*
+     * For the chunks corresponding to this rank, fill in the
+     * address, size and buf pointer for each chunk.
+     */
+    for (i = 0; i < (size_t)blocks; i++) {
+        size_t idx = i + (size_t)(mpi_rank * blocks);
 
-        /* Calculate if there are any leftover blocks after evenly
-         * distributing. If there are, then round-robin the distribution
-         * to processes 0 -> leftover.
-         */
-        if (leftover && leftover > mpi_rank) {
-            chunk_disp_array[blocks] =
-                (MPI_Aint)chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].addr;
-
-            if (!all_same_block_len)
-                H5_CHECKED_ASSIGN(block_lens[blocks], int,
-                                  chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].chunk_size,
-                                  size_t);
-
-            if (chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].unfiltered_partial_chunk) {
-                assert(partial_chunk_fill_buf);
-                block_disps[blocks] = partial_fill_buf_disp;
-            }
+        io_addrs[i] = chunk_fill_info->chunk_info[idx].addr;
 
-            blocks++;
-        }
+        if (!all_same_block_len)
+            io_sizes[i] = chunk_fill_info->chunk_info[idx].chunk_size;
 
-        /* Create file and memory types for the write operation */
-        if (all_same_block_len) {
-            int block_len;
+        if (chunk_fill_info->chunk_info[idx].unfiltered_partial_chunk)
+            io_wbufs[i] = partial_chunk_fill_buf;
+        else
+            io_wbufs[i] = fill_buf;
+    }
 
-            H5_CHECKED_ASSIGN(block_len, int, chunk_fill_info->chunk_info[0].chunk_size, size_t);
+    /*
+     * For the leftover chunk corresponding to this rank, fill in the
+     * address, size and buf pointer for the chunk.
+     */
+    if (leftover > mpi_rank) {
+        io_addrs[blocks] = chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].addr;
 
-            mpi_code =
-                MPI_Type_create_hindexed_block(blocks, block_len, chunk_disp_array, MPI_BYTE, &file_type);
-            if (mpi_code != MPI_SUCCESS)
-                HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed_block failed", mpi_code)
+        if (!all_same_block_len)
+            io_sizes[blocks] = chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].chunk_size;
 
-            if (partial_chunk_fill_buf) {
-                /*
-                 * If filters are disabled for partial edge chunks, those chunks could
-                 * potentially have the same block length as the other chunks, but still
-                 * need to be written to using the unfiltered fill buffer. Use an hindexed
-                 * block type rather than an hvector.
-                 */
-                mpi_code =
-                    MPI_Type_create_hindexed_block(blocks, block_len, block_disps, MPI_BYTE, &mem_type);
-                if (mpi_code != MPI_SUCCESS)
-                    HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed_block failed", mpi_code)
-            }
-            else {
-                mpi_code = MPI_Type_create_hvector(blocks, block_len, 0, MPI_BYTE, &mem_type);
-                if (mpi_code != MPI_SUCCESS)
-                    HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hvector failed", mpi_code)
-            }
-        }
-        else {
-            /*
-             * Currently, different block lengths implies that there are partial
-             * edge chunks and the "don't filter partial edge chunks" flag is set.
-             */
+        if (chunk_fill_info->chunk_info[(blocks * mpi_size) + mpi_rank].unfiltered_partial_chunk) {
             assert(partial_chunk_fill_buf);
-            assert(block_lens);
-            assert(block_disps);
-
-            mpi_code = MPI_Type_create_hindexed(blocks, block_lens, chunk_disp_array, MPI_BYTE, &file_type);
-            if (mpi_code != MPI_SUCCESS)
-                HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
-
-            mpi_code = MPI_Type_create_hindexed(blocks, block_lens, block_disps, MPI_BYTE, &mem_type);
-            if (mpi_code != MPI_SUCCESS)
-                HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
+            io_wbufs[blocks] = partial_chunk_fill_buf;
         }
+        else
+            io_wbufs[blocks] = fill_buf;
 
-        if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&file_type)))
-            HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-        if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(&mem_type)))
-            HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
-    } /* end if */
-
-    /* Set MPI-IO VFD properties */
-
-    /* Set MPI datatypes for operation */
-    if (H5CX_set_mpi_coll_datatypes(mem_type, file_type) < 0)
-        HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set MPI-I/O properties");
+        blocks++;
+    }
 
     /* Get current transfer mode */
     if (H5CX_get_io_xfer_mode(&prev_xfer_mode) < 0)
@@ -5763,31 +5707,24 @@ H5D__chunk_collective_fill(const H5D_t *dset, H5D_chunk_coll_fill_info_t *chunk_
     if (H5CX_set_io_xfer_mode(H5FD_MPIO_COLLECTIVE) < 0)
         HGOTO_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode");
 
-    /* Low-level write (collective) */
-    if (H5F_shared_block_write(H5F_SHARED(dset->oloc.file), H5FD_MEM_DRAW, (haddr_t)0,
-                               (blocks) ? (size_t)1 : (size_t)0, fill_buf) < 0)
-        HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "unable to write raw data to file");
-
     /* Barrier so processes don't race ahead */
     if (MPI_SUCCESS != (mpi_code = MPI_Barrier(mpi_comm)))
         HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
 
+    /* Perform the selection vector I/O for the chunks */
+    if (H5F_shared_vector_write(H5F_SHARED(dset->oloc.file), (uint32_t)blocks, io_types, io_addrs,
+                                all_same_block_len ? io_2sizes : io_sizes, io_wbufs) < 0)
+        HGOTO_ERROR(H5E_DATASET, H5E_WRITEERROR, FAIL, "vector write call failed");
+
 done:
     if (have_xfer_mode)
-        /* Set transfer mode */
+        /* Restore transfer mode */
         if (H5CX_set_io_xfer_mode(prev_xfer_mode) < 0)
             HDONE_ERROR(H5E_DATASET, H5E_CANTSET, FAIL, "can't set transfer mode");
 
-    /* free things */
-    if (MPI_BYTE != file_type)
-        if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&file_type)))
-            HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
-    if (MPI_BYTE != mem_type)
-        if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&mem_type)))
-            HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code)
-    H5MM_xfree(chunk_disp_array);
-    H5MM_xfree(block_disps);
-    H5MM_xfree(block_lens);
+    H5MM_xfree(io_addrs);
+    H5MM_xfree(io_wbufs);
+    H5MM_xfree(io_sizes);
 
     FUNC_LEAVE_NOAPI(ret_value)
 } /* end H5D__chunk_collective_fill() */
@@ -5805,6 +5742,7 @@ H5D__chunk_cmp_coll_fill_info(const void *_entry1, const void *_entry2)
 
     FUNC_LEAVE_NOAPI(H5_addr_cmp(entry1->addr, entry2->addr))
 } /* end H5D__chunk_cmp_coll_fill_info() */
+
 #endif /* H5_HAVE_PARALLEL */
 
 /*-------------------------------------------------------------------------
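For reference, the distribution logic in the hunk above gives every rank num_chunks / mpi_size chunks from its own contiguous slice of the (sorted) chunk list, and the first num_chunks % mpi_size ranks each pick up one extra chunk at index (blocks * mpi_size) + mpi_rank. A minimal standalone sketch of that mapping (plain C; map_chunks_to_rank is a hypothetical helper, not an HDF5 function):

#include <stdio.h>

/* Sketch of the block distribution used above: every rank gets
 * num_chunks / mpi_size chunks from its contiguous slice of the sorted
 * list, and the first (num_chunks % mpi_size) ranks each pick up one
 * leftover chunk at index (blocks * mpi_size) + rank. */
static void map_chunks_to_rank(size_t num_chunks, int mpi_rank, int mpi_size)
{
    size_t blocks   = num_chunks / (size_t)mpi_size;
    size_t leftover = num_chunks % (size_t)mpi_size;

    /* Contiguous slice of the chunk list owned by this rank */
    for (size_t i = 0; i < blocks; i++)
        printf("rank %d writes chunk %zu\n", mpi_rank, i + (size_t)mpi_rank * blocks);

    /* Round-robin leftover chunk, handled by ranks 0 .. leftover-1 */
    if ((size_t)mpi_rank < leftover)
        printf("rank %d also writes leftover chunk %zu\n", mpi_rank,
               blocks * (size_t)mpi_size + (size_t)mpi_rank);
}

int main(void)
{
    /* 7 chunks over 3 ranks: 2 each, plus 1 leftover for rank 0 */
    for (int rank = 0; rank < 3; rank++)
        map_chunks_to_rank(7, rank, 3);
    return 0;
}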
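The two-element io_2sizes and io_types arrays rely on the vector I/O convention the comments above call the "compressed feature": a trailing 0 size (or H5FD_MEM_NOLIST type) means every remaining entry repeats the last real value, so a uniform vector never needs a full-length array. A sketch of how such a terminated vector expands (effective_size is a hypothetical helper, not an HDF5 function):

#include <stdio.h>
#include <stddef.h>

/* Expand a "compressed" sizes vector: a 0 entry terminates the array,
 * and every element past it repeats the last real value. */
static size_t effective_size(const size_t *sizes, size_t i)
{
    size_t j = 0, last = 0;

    /* Walk forward until the terminator or position i, remembering the
     * last real value seen. */
    while (j <= i && sizes[j] != 0)
        last = sizes[j++];
    return last;
}

int main(void)
{
    size_t io_2sizes[2] = {512, 0}; /* all vector entries are 512 bytes */

    for (size_t i = 0; i < 4; i++)
        printf("entry %zu -> %zu bytes\n", i, effective_size(io_2sizes, i));
    return 0;
}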