[data] fix repartitioning empty datasets (ray-project#54107)

raulchen · minerharry · commit 7c5ed3025cb4 · 2025-06-27T21:12:35.000+10:00
Fix the following error when repartitioning an empty dataset:
```
    first_block_schema = reduce_metadata_schema[0].schema
IndexError: list index out of range
```

Signed-off-by: Hao Chen <chenh1024@gmail.com>
diff --git a/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py b/python/ray/data/_internal/planner/exchange/split_repartition_task_scheduler.py
@@ -117,15 +117,21 @@ def execute(
             )
 
             num_empty_blocks = output_num_blocks - len(reduce_block_refs)
-            first_block_schema = reduce_metadata_schema[0].schema
-            if first_block_schema is None:
-                raise ValueError(
-                    "Cannot split partition on blocks with unknown block format."
-                )
-            elif isinstance(first_block_schema, pa.Schema):
+            if len(reduce_metadata_schema) > 0:
+                first_block_schema = reduce_metadata_schema[0].schema
+                if isinstance(first_block_schema, pa.Schema):
+                    builder = ArrowBlockBuilder()
+                elif isinstance(first_block_schema, PandasBlockSchema):
+                    builder = PandasBlockBuilder()
+                else:
+                    raise ValueError(
+                        "Cannot split partition on blocks with unknown block schema:"
+                        f" {first_block_schema}."
+                    )
+            else:
+                # If the result is empty, default to Arrow format for the empty blocks.
                 builder = ArrowBlockBuilder()
-            elif isinstance(first_block_schema, PandasBlockSchema):
-                builder = PandasBlockBuilder()
+
             empty_block = builder.build()
             empty_meta_with_schema = BlockMetadataWithSchema.from_block(
                 empty_block
diff --git a/python/ray/data/tests/test_repartition_e2e.py b/python/ray/data/tests/test_repartition_e2e.py
@@ -206,6 +206,22 @@ def test_repartition_invalid_inputs(
         )
 
 
+@pytest.mark.parametrize("shuffle", [True, False])
+def test_repartition_empty_datasets(ray_start_regular_shared_2_cpus, shuffle):
+    # Repartitioning an empty dataset should yield num_partitions empty blocks.
+    num_partitions = 5
+    ds_empty = ray.data.range(100).filter(lambda row: False)
+    ds_repartitioned = ds_empty.repartition(num_partitions, shuffle=shuffle)
+
+    ref_bundles = list(ds_repartitioned.iter_internal_ref_bundles())
+    assert len(ref_bundles) == num_partitions
+    for ref_bundle in ref_bundles:
+        assert len(ref_bundle.blocks) == 1
+        metadata = ref_bundle.blocks[0][1]
+        assert metadata.num_rows == 0
+        assert metadata.size_bytes == 0
+
+
 if __name__ == "__main__":
     import sys