
Commit b6d8afa

tomvdw (The TensorFlow Datasets Authors) authored and committed
More efficient determination of shard lengths and total size
PiperOrigin-RevId: 812896990
1 parent e8f59cd

File tree

4 files changed: +71 -18 lines changed


tensorflow_datasets/core/file_adapters.py

Lines changed: 26 additions & 0 deletions

@@ -19,6 +19,7 @@
 
 import abc
 from collections.abc import Iterable, Iterator
+import concurrent.futures
 import enum
 import itertools
 import os
@@ -187,6 +188,31 @@ def num_examples(cls, filename: epath.PathLike) -> int:
       n += 1
     return n
 
+  @classmethod
+  def shard_lengths_and_sizes(
+      cls,
+      filename_template: naming.ShardedFileTemplate,
+      num_shards: int | None = None,
+  ) -> list[tuple[int, int]]:
+    """Returns the number of examples in each shard."""
+    if num_shards is not None:
+      shards = filename_template.sharded_filepaths(num_shards=num_shards)
+    else:
+      shards = filename_template.data_dir.glob(filename_template.glob_pattern())
+    shards = sorted([os.fspath(s) for s in shards])
+
+    def _get_length_and_size(shard: tuple[int, str]) -> tuple[int, int, int]:
+      index, shard = shard
+      length = cls.num_examples(shard)
+      size = epath.Path(shard).stat().length
+      return index, length, size
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
+      results = executor.map(_get_length_and_size, enumerate(shards))
+    # Sort results by the index and remove the index from the tuple.
+    sorted_results = sorted(results, key=lambda x: x[0])
+    return [(length, size) for _, length, size in sorted_results]
+
 
 class TfRecordFileAdapter(FileAdapter):
   """File adapter for TFRecord file format."""

tensorflow_datasets/core/file_adapters_test.py

Lines changed: 40 additions & 0 deletions

@@ -19,10 +19,12 @@
 import pathlib
 from typing import Type, TypeAlias
 
+from etils import epath
 import pytest
 from tensorflow_datasets import testing
 from tensorflow_datasets.core import dataset_builder
 from tensorflow_datasets.core import file_adapters
+from tensorflow_datasets.core import naming
 
 
 FileFormat: TypeAlias = file_adapters.FileFormat
@@ -138,3 +140,41 @@ def test_prase_file_format(format_enum_value, file_format):
 def test_convert_path_to_file_format(path, file_format, expected_path):
   converted_path = file_adapters.convert_path_to_file_format(path, file_format)
   assert os.fspath(converted_path) == expected_path
+
+
+@pytest.mark.parametrize(
+    'adapter_cls',
+    (
+        (file_adapters.TfRecordFileAdapter),
+        (file_adapters.ArrayRecordFileAdapter),
+    ),
+)
+def test_shard_lengths(
+    tmp_path: pathlib.Path, adapter_cls: file_adapters.FileAdapter
+):
+  file_template = naming.ShardedFileTemplate(
+      data_dir=tmp_path,
+      dataset_name='data',
+      filetype_suffix=adapter_cls.FILE_SUFFIX,
+      split='train',
+  )
+  tmp_path_1 = file_template.sharded_filepath(shard_index=0, num_shards=2)
+  tmp_path_2 = file_template.sharded_filepath(shard_index=1, num_shards=2)
+  adapter_cls.write_examples(
+      tmp_path_1, [(0, b'0'), (1, b'1'), (2, b'2222'), (3, b'33333')]
+  )
+  adapter_cls.write_examples(tmp_path_2, [(3, b'3'), (4, b'4'), (5, b'555')])
+  size_1 = epath.Path(tmp_path_1).stat().length
+  size_2 = epath.Path(tmp_path_2).stat().length
+  expected_shard_lengths = [(4, size_1), (3, size_2)]
+
+  # First test without passing the number of shards explicitly.
+  actual_no_num_shards = adapter_cls.shard_lengths_and_sizes(file_template)
+  assert actual_no_num_shards == expected_shard_lengths, 'no num_shards passed'
+
+  # Now test with passing the number of shards explicitly.
+  actual_with_num_shards = adapter_cls.shard_lengths_and_sizes(
+      file_template,
+      num_shards=2,
+  )
+  assert actual_with_num_shards == expected_shard_lengths, 'num_shards passed'
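
The pattern the test exercises, tagging each work item with its index so results can be put back in input order after parallel measurement, also works on plain local files. A self-contained sketch, assuming newline-delimited records as a stand-in for the adapters' record formats:

import concurrent.futures
import os


def lengths_and_sizes(paths: list[str]) -> list[tuple[int, int]]:
  """Returns (record_count, byte_size) per path, measured in parallel."""

  def _measure(item: tuple[int, str]) -> tuple[int, int, int]:
    index, path = item
    with open(path, 'rb') as f:
      length = sum(1 for _ in f)  # One record per line in this sketch.
    size = os.stat(path).st_size
    return index, length, size

  with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    results = executor.map(_measure, enumerate(paths))
  # Re-sort by index and drop it, mirroring shard_lengths_and_sizes above.
  ordered = sorted(results, key=lambda r: r[0])
  return [(length, size) for _, length, size in ordered]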

tensorflow_datasets/core/utils/file_utils.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@
 from __future__ import annotations
 
 import collections
-from collections.abc import Iterator, Sequence
+from collections.abc import Iterable, Iterator, Sequence
 import contextlib
 import dataclasses
 import functools

tensorflow_datasets/core/writer.py

Lines changed: 4 additions & 17 deletions

@@ -18,7 +18,6 @@
 from __future__ import annotations
 
 from collections.abc import Iterable, Iterator, Sequence
-import concurrent.futures
 import dataclasses
 import functools
 import itertools
@@ -822,23 +821,11 @@ def finalize(self) -> tuple[list[int], int]:
       in each shard, and size of the files (in bytes).
     """
     logging.info("Finalizing writer for %s", self._filename_template.split)
-    # We don't know the number of shards, the length of each shard, nor the
-    # total size, so we compute them here.
-    shards = self._filename_template.data_dir.glob(
-        self._filename_template.glob_pattern()
+    shard_lengths_and_sizes = self._file_adapter.shard_lengths_and_sizes(
+        self._filename_template, num_shards=self._num_shards
     )
-
-    def _get_length_and_size(shard: epath.Path) -> tuple[epath.Path, int, int]:
-      length = self._file_adapter.num_examples(shard)
-      size = shard.stat().length
-      return shard, length, size
-
-    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
-      shard_sizes = executor.map(_get_length_and_size, shards)
-
-    shard_sizes = sorted(shard_sizes, key=lambda x: x[0])
-    shard_lengths: list[int] = [x[1] for x in shard_sizes]
-    total_size_bytes: int = sum([x[2] for x in shard_sizes])
+    shard_lengths = [length for length, _ in shard_lengths_and_sizes]
+    total_size_bytes = sum(size for _, size in shard_lengths_and_sizes)
 
     logging.info(
         "Found %d shards with a total size of %d bytes.",
