Skip to content

Commit daef051

Browse files
committed
fix: resolve mypy errors and minor logic and name changes
- Add type annotations and fix missing arguments / return statements flagged by mypy.
- Make minor logic and naming changes in list_of_ranges.py.
- Make a very minor change to fix a mypy error in test_user_agent.py.
1 parent efeb194 commit daef051

File tree

4 files changed

+29
-35
lines changed

4 files changed

+29
-35
lines changed

s3torchconnector/src/s3torchconnector/dcp/s3_file_system.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,11 @@ def __init__(
354354
e.g. S3ReaderConstructor.sequential() or S3ReaderConstructor.range_based()
355355
"""
356356
super().__init__(path)
357-
self.fs = S3FileSystem(region, s3client_config=s3client_config, reader_constructor=reader_constructor) # type: ignore
357+
self.fs: S3FileSystem = S3FileSystem( # type: ignore[assignment]
358+
region,
359+
s3client_config=s3client_config,
360+
reader_constructor=reader_constructor,
361+
)
358362
self.path = self.fs.init_path(path)
359363
self.sync_files = False
360364

@@ -376,7 +380,7 @@ def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
376380
# Inject ranges if using DCP list-of-ranges reader constructor
377381
if isinstance(self.fs._reader_constructor, DCPListOfRangesConstructor):
378382
# Calculate ranges per file
379-
per_file_ranges = {}
383+
per_file_ranges: Dict[str, List[RangeRequest]] = {}
380384
for read_item in plan.items:
381385
item_md = self.storage_data[read_item.storage_index]
382386
path = item_md.relative_path
@@ -391,6 +395,9 @@ def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
391395

392396
# Sort items in plan based on their offset in checkpoints shards
393397
plan.items.sort(key=lambda item: self.storage_data[item.storage_index].offset)
398+
logger.info(
399+
f"Sorted {len(plan.items)} items in load plan based on offset in checkpoint shards"
400+
)
394401
return plan
395402

396403

s3torchconnector/src/s3torchconnector/s3reader/list_of_ranges.py

Lines changed: 18 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
# // SPDX-License-Identifier: BSD
33

4-
import os
54
import logging
65
from dataclasses import dataclass
76
from typing import List, Optional, Callable, Union, Dict
@@ -53,8 +52,7 @@ def __init__(
5352
# Calculate range groups using coalescing logic
5453
self._range_groups = self._calculate_range_groups(ranges, max_gap_size)
5554

56-
# Pre-create all readers and prefetch immediately
57-
# TODO - judge if this is beneficial or not.
55+
# Pre-create all readers
5856
self._group_readers: Dict[int, SequentialS3Reader] = {}
5957
for i, group in enumerate(self._range_groups):
6058
reader = SequentialS3Reader(
@@ -65,16 +63,11 @@ def __init__(
6563
start_offset=group.start,
6664
end_offset=group.end,
6765
)
66+
# TODO - judge if this is beneficial or not.
6867
reader.prefetch() # Batch prefetch all ranges
6968
self._group_readers[i] = reader
7069

71-
# Pre-calculate request-to-reader mapping
72-
self._request_to_reader: Dict[int, int] = {}
73-
for i, group in enumerate(self._range_groups):
74-
for request in group.requests:
75-
self._request_to_reader[request.start] = i
76-
77-
self._current_position = 0
70+
self._position: int = 0
7871

7972
@property
8073
def bucket(self) -> str:
@@ -92,6 +85,7 @@ def _calculate_range_groups(
9285
if not ranges:
9386
return []
9487

88+
# TODO: could be pre-sorted in prepare_local_plan for dcp.load
9589
sorted_ranges = sorted(ranges, key=lambda r: r.start)
9690
groups = []
9791
current_group = [sorted_ranges[0]]
@@ -117,48 +111,41 @@ def _create_range_group(self, ranges: List[RangeRequest]) -> RangeGroup:
117111
group_end = max(r.end for r in ranges)
118112
return RangeGroup(start=group_start, end=group_end, requests=ranges)
119113

120-
def get_reader_for_request(
121-
self, request_start: int
122-
) -> Optional[SequentialS3Reader]:
123-
"""O(1) lookup using pre-calculated mapping."""
124-
reader_idx = self._request_to_reader.get(request_start)
125-
return self._group_readers.get(reader_idx) if reader_idx is not None else None
126-
127114
def _find_reader_for_offset(self, offset: int) -> Optional[SequentialS3Reader]:
128115
"""Find reader that contains the given offset."""
129-
# TODO: improve logic using binary search
130-
for reader in self._group_readers.values():
131-
if reader._start_offset <= offset < reader._end_offset:
132-
return reader
133-
elif reader._start_offset > offset:
134-
break # Early termination since readers are ordered
116+
for i, group in enumerate(self._range_groups):
117+
if group.start <= offset < group.end:
118+
self._current_reader_index = i
119+
return self._group_readers[i]
120+
if group.start > offset: # TODO handle this case properly by raising errors
121+
break
135122
return None
136123

137124
def seek(self, offset: int, whence: int = SEEK_SET, /) -> int:
138-
self._current_position = offset
125+
self._position = offset
139126
reader = self._find_reader_for_offset(offset)
140127
if not reader:
141-
return self._current_position
142-
reader.seek(offset, whence)
128+
return self._position
129+
return reader.seek(offset, whence)
143130

144131
def read(self, size: Optional[int] = None) -> bytes:
145-
reader = self._find_reader_for_offset(self._current_position)
132+
reader = self._find_reader_for_offset(self._position)
146133
if not reader:
147134
return b""
148135
data = reader.read(size)
149-
self._current_position += len(data)
136+
self._position += len(data)
150137
return data
151138

152139
def readinto(self, buf) -> int:
153-
reader = self._find_reader_for_offset(self._current_position)
140+
reader = self._find_reader_for_offset(self._position)
154141
if not reader:
155142
return 0
156143
bytes_read = reader.readinto(buf)
157-
self._current_position += bytes_read
144+
self._position += bytes_read
158145
return bytes_read
159146

160147
def tell(self) -> int:
161-
return self._current_position
148+
return self._position
162149

163150
def close(self) -> None:
164151
for reader in self._group_readers.values():

s3torchconnector/src/s3torchconnector/s3reader/sequential.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def prefetch(self) -> None:
107107
if self._start_offset is not None or self._end_offset is not None:
108108
self._stream = self._get_stream(self._start_offset, self._end_offset)
109109
else:
110-
self._stream = self._get_stream()
110+
self._stream = self._get_stream(None, None)
111111

112112
def readinto(self, buf) -> int:
113113
"""Read up to len(buf) bytes into a pre-allocated, writable bytes-like object buf.

s3torchconnector/tst/unit/test_user_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,5 @@ def test_default_user_agent_creation():
3939

4040
@pytest.mark.parametrize("invalid_comment", [0, "string"])
4141
def test_invalid_comments_argument(invalid_comment):
42-
with pytest.raises(ValueError, match="Argument comments must be a List\[str\]"):
42+
with pytest.raises(ValueError, match=r"Argument comments must be a List\[str\]"):
4343
UserAgent(invalid_comment)

0 commit comments

Comments
 (0)