Skip to content

Commit 2e69c78

Browse files
committed
[Feature] Add collect_results for Ascend NPU
1 parent 488fddc commit 2e69c78

File tree

2 files changed

+66
-10
lines changed

2 files changed

+66
-10
lines changed

mmengine/dist/__init__.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
from .dist import (all_gather_object, all_reduce, all_gather, all_reduce_dict,
33
collect_results, gather, broadcast, gather_object,
44
sync_random_seed, broadcast_object_list,
5-
collect_results_cpu, collect_results_gpu, all_reduce_params)
5+
collect_results_cpu, collect_results_gpu,
6+
collect_results_npu, all_reduce_params)
67
from .utils import (get_dist_info, init_dist, init_local_group, get_backend,
78
get_world_size, get_rank, get_local_size, get_local_rank,
89
is_main_process, master_only, barrier, get_local_group,
@@ -11,11 +12,12 @@
1112

1213
__all__ = [
1314
'all_gather_object', 'all_reduce', 'all_gather', 'all_reduce_dict',
14-
'collect_results', 'collect_results_cpu', 'collect_results_gpu', 'gather',
15-
'broadcast', 'gather_object', 'sync_random_seed', 'broadcast_object_list',
16-
'get_dist_info', 'init_dist', 'init_local_group', 'get_backend',
17-
'get_world_size', 'get_rank', 'get_local_size', 'get_local_group',
18-
'get_local_rank', 'is_main_process', 'master_only', 'barrier',
19-
'is_distributed', 'get_default_group', 'all_reduce_params',
20-
'get_data_device', 'get_comm_device', 'cast_data_device', 'infer_launcher'
15+
'collect_results', 'collect_results_cpu', 'collect_results_gpu',
16+
'collect_results_npu', 'gather', 'broadcast', 'gather_object',
17+
'sync_random_seed', 'broadcast_object_list', 'get_dist_info', 'init_dist',
18+
'init_local_group', 'get_backend', 'get_world_size', 'get_rank',
19+
'get_local_size', 'get_local_group', 'get_local_rank', 'is_main_process',
20+
'master_only', 'barrier', 'is_distributed', 'get_default_group',
21+
'all_reduce_params', 'get_data_device', 'get_comm_device',
22+
'cast_data_device', 'infer_launcher'
2123
]

mmengine/dist/dist.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -898,10 +898,11 @@ def collect_results(results: list,
898898
object.
899899
size (int): Size of the results, commonly equal to length of
900900
the results.
901-
device (str): Device name. Optional values are 'cpu' and 'gpu'.
901+
device (str): Device name. Optional values are 'cpu', 'gpu' or 'npu'.
902902
tmpdir (str | None): Temporary directory for collected results to
903903
store. If set to None, it will create a temporal directory for it.
904-
``tmpdir`` should be None when device is 'gpu'. Defaults to None.
904+
``tmpdir`` should be None when device is 'gpu' or 'npu'.
905+
Defaults to None.
905906
906907
Returns:
907908
list or None: The collected results.
@@ -927,6 +928,9 @@ def collect_results(results: list,
927928
if device == 'gpu':
928929
assert tmpdir is None, 'tmpdir should be None when device is "gpu"'
929930
return collect_results_gpu(results, size)
931+
elif device == 'npu':
932+
assert tmpdir is None, 'tmpdir should be None when device is "npu"'
933+
return collect_results_npu(results, size)
930934
else:
931935
return collect_results_cpu(results, size, tmpdir)
932936

@@ -1068,6 +1072,56 @@ def collect_results_gpu(result_part: list, size: int) -> Optional[list]:
10681072
return None
10691073

10701074

1075+
def collect_results_npu(result_part: list, size: int) -> Optional[list]:
1076+
"""Collect results under npu mode.
1077+
1078+
On npu mode, this function will encode results to npu tensors and use npu
1079+
communication for results collection.
1080+
1081+
Args:
1082+
result_part (list[object]): Result list containing result parts
1083+
to be collected. Each item of ``result_part`` should be a picklable
1084+
object.
1085+
size (int): Size of the results, commonly equal to length of
1086+
the results.
1087+
1088+
Returns:
1089+
list or None: The collected results.
1090+
1091+
Examples:
1092+
>>> # distributed environment
1093+
>>> # We have 2 process groups, 2 ranks.
1094+
>>> import mmengine.dist as dist
1095+
>>> if dist.get_rank() == 0:
1096+
data = ['foo', {1: 2}]
1097+
else:
1098+
data = [24, {'a': 'b'}]
1099+
>>> size = 4
1100+
>>> output = dist.collect_results_npu(data, size)
1101+
>>> output
1102+
['foo', 24, {1: 2}, {'a': 'b'}] # rank 0
1103+
None # rank 1
1104+
"""
1105+
rank, world_size = get_dist_info()
1106+
if world_size == 1:
1107+
return result_part[:size]
1108+
1109+
# gather all result part. Note that NCCL does not support gather so use
1110+
# all_gather_object instead.
1111+
part_list = all_gather_object(result_part)
1112+
1113+
if rank == 0:
1114+
# sort the results
1115+
ordered_results = []
1116+
for res in zip(*part_list):
1117+
ordered_results.extend(list(res))
1118+
# the dataloader may pad some samples
1119+
ordered_results = ordered_results[:size]
1120+
return ordered_results
1121+
else:
1122+
return None
1123+
1124+
10711125
def _all_reduce_coalesced(tensors: List[torch.Tensor],
10721126
bucket_size_mb: int = -1,
10731127
op: str = 'sum',

0 commit comments

Comments
 (0)