@@ -233,8 +233,12 @@ def __init__(self, *operands, qualifiers=None, options=None):
233233 num_modes_in = tuple (len (m ) for m in modes_in )
234234 self .qualifiers_in = utils .check_tensor_qualifiers (qualifiers , cutn .tensor_qualifiers_dtype , num_inputs )
235235
236- self .contraction , modes_out , extents_out , strides_out = utils .create_output_tensor (
237- self .output_class , self .package , self .output , self .size_dict , self .device_id , self .data_type )
236+ # Create the output in the context of the current stream to work around a performance issue with CuPy's memory pool.
237+ stream = None
238+ self .logger .debug ("Beginning output tensor creation..." )
239+ self .contraction , self .contraction_output_event , modes_out , extents_out , strides_out = utils .create_output_tensor (
240+ self .output_class , self .package , self .output , self .size_dict , self .device_id , stream , self .data_type )
241+ self .logger .debug ("The output tensor has been created." )
238242
239243 # Create/set handle.
240244 if options .handle is not None :
@@ -631,7 +635,13 @@ def autotune(self, *, iterations=3, stream=None):
631635
632636 # Check if we still hold an output tensor; if not, create a new one.
633637 if self .contraction is None :
638+ self .logger .debug ("Beginning output (empty) tensor creation..." )
634639 self .contraction = utils .create_empty_tensor (self .output_class , self .extents_out , self .data_type , self .device_id , stream_ctx )
640+ self .logger .debug ("The output (empty) tensor has been created." )
641+ elif self .contraction_output_event is not None :
642+ stream .wait_event (self .contraction_output_event )
643+ self .contraction_output_event = None
644+ self .logger .debug ("Established ordering with output tensor creation event." )
635645
636646 timing = bool (self .logger and self .logger .handlers )
637647 self .logger .info (f"Starting autotuning..." )
@@ -716,7 +726,13 @@ def contract(self, *, slices=None, stream=None):
716726
717727 # Check if we still hold an output tensor; if not, create a new one.
718728 if self .contraction is None :
729+ self .logger .debug ("Beginning output (empty) tensor creation..." )
719730 self .contraction = utils .create_empty_tensor (self .output_class , self .extents_out , self .data_type , self .device_id , stream_ctx )
731+ self .logger .debug ("The output (empty) tensor has been created." )
732+ elif self .contraction_output_event is not None :
733+ stream .wait_event (self .contraction_output_event )
734+ self .contraction_output_event = None
735+ self .logger .debug ("Established ordering with output tensor creation event." )
720736
721737 # Create a slice group for contraction.
722738 slice_group = None
0 commit comments