11# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
22# // SPDX-License-Identifier: BSD
33
4- import os
54import logging
65from dataclasses import dataclass
76from typing import List , Optional , Callable , Union , Dict
@@ -53,8 +52,7 @@ def __init__(
5352 # Calculate range groups using coalescing logic
5453 self ._range_groups = self ._calculate_range_groups (ranges , max_gap_size )
5554
56- # Pre-create all readers and prefetch immediately
57- # TODO - judge if this is beneficial or not.
55+ # Pre-create all readers
5856 self ._group_readers : Dict [int , SequentialS3Reader ] = {}
5957 for i , group in enumerate (self ._range_groups ):
6058 reader = SequentialS3Reader (
@@ -65,16 +63,11 @@ def __init__(
6563 start_offset = group .start ,
6664 end_offset = group .end ,
6765 )
66+ # TODO - judge if this is beneficial or not.
6867 reader .prefetch () # Batch prefetch all ranges
6968 self ._group_readers [i ] = reader
7069
71- # Pre-calculate request-to-reader mapping
72- self ._request_to_reader : Dict [int , int ] = {}
73- for i , group in enumerate (self ._range_groups ):
74- for request in group .requests :
75- self ._request_to_reader [request .start ] = i
76-
77- self ._current_position = 0
70+ self ._position : int = 0
7871
7972 @property
8073 def bucket (self ) -> str :
@@ -92,6 +85,7 @@ def _calculate_range_groups(
9285 if not ranges :
9386 return []
9487
88+ # TODO: could be pre-sorted in prepare_local_plan for dcp.load
9589 sorted_ranges = sorted (ranges , key = lambda r : r .start )
9690 groups = []
9791 current_group = [sorted_ranges [0 ]]
@@ -117,48 +111,41 @@ def _create_range_group(self, ranges: List[RangeRequest]) -> RangeGroup:
117111 group_end = max (r .end for r in ranges )
118112 return RangeGroup (start = group_start , end = group_end , requests = ranges )
119113
120- def get_reader_for_request (
121- self , request_start : int
122- ) -> Optional [SequentialS3Reader ]:
123- """O(1) lookup using pre-calculated mapping."""
124- reader_idx = self ._request_to_reader .get (request_start )
125- return self ._group_readers .get (reader_idx ) if reader_idx is not None else None
126-
127114 def _find_reader_for_offset (self , offset : int ) -> Optional [SequentialS3Reader ]:
128115 """Find reader that contains the given offset."""
129- # TODO: improve logic using binary search
130- for reader in self . _group_readers . values () :
131- if reader . _start_offset <= offset < reader . _end_offset :
132- return reader
133- elif reader . _start_offset > offset :
134- break # Early termination since readers are ordered
116+ for i , group in enumerate ( self . _range_groups ):
117+ if group . start <= offset < group . end :
118+ self . _current_reader_index = i
119+ return self . _group_readers [ i ]
120+ if group . start > offset : # TODO handle this case properly by raising errors
121+ break
135122 return None
136123
137124 def seek (self , offset : int , whence : int = SEEK_SET , / ) -> int :
138- self ._current_position = offset
125+ self ._position = offset
139126 reader = self ._find_reader_for_offset (offset )
140127 if not reader :
141- return self ._current_position
142- reader .seek (offset , whence )
128+ return self ._position
129+ return reader .seek (offset , whence )
143130
144131 def read (self , size : Optional [int ] = None ) -> bytes :
145- reader = self ._find_reader_for_offset (self ._current_position )
132+ reader = self ._find_reader_for_offset (self ._position )
146133 if not reader :
147134 return b""
148135 data = reader .read (size )
149- self ._current_position += len (data )
136+ self ._position += len (data )
150137 return data
151138
152139 def readinto (self , buf ) -> int :
153- reader = self ._find_reader_for_offset (self ._current_position )
140+ reader = self ._find_reader_for_offset (self ._position )
154141 if not reader :
155142 return 0
156143 bytes_read = reader .readinto (buf )
157- self ._current_position += bytes_read
144+ self ._position += bytes_read
158145 return bytes_read
159146
160147 def tell (self ) -> int :
161- return self ._current_position
148+ return self ._position
162149
163150 def close (self ) -> None :
164151 for reader in self ._group_readers .values ():
0 commit comments