@@ -35,6 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
35
35
private int serviceUnavailableRetryCount ;
36
36
private bool isReadRequest ;
37
37
private bool canUseMultipleWriteLocations ;
38
+ private bool isMultiMasterWriteRequest ;
38
39
private Uri locationEndpoint ;
39
40
private RetryContext retryContext ;
40
41
private DocumentServiceRequest documentServiceRequest ;
@@ -57,6 +58,7 @@ public ClientRetryPolicy(
57
58
this . sessionTokenRetryCount = 0 ;
58
59
this . serviceUnavailableRetryCount = 0 ;
59
60
this . canUseMultipleWriteLocations = false ;
61
+ this . isMultiMasterWriteRequest = false ;
60
62
this . isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled ;
61
63
}
62
64
@@ -97,6 +99,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
97
99
98
100
if ( exception is DocumentClientException clientException )
99
101
{
102
+ // Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
103
+ // get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
104
+ // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
105
+ // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
106
+ // might not be a throttling scenario at all and the status code in that case would be different than 429.
107
+ if ( this . ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite (
108
+ clientException . StatusCode ,
109
+ clientException . GetSubStatus ( ) ) )
110
+ {
111
+ DefaultTrace . TraceError (
112
+ "Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}." ,
113
+ StatusCodes . TooManyRequests , SubStatusCodes . SystemResourceUnavailable ) ;
114
+
115
+ return this . TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable (
116
+ shouldMarkEndpointUnavailableForPkRange : true ) ;
117
+ }
118
+
100
119
ShouldRetryResult shouldRetryResult = await this . ShouldRetryInternalAsync (
101
120
clientException ? . StatusCode ,
102
121
clientException ? . GetSubStatus ( ) ) ;
@@ -143,6 +162,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
143
162
return shouldRetryResult ;
144
163
}
145
164
165
+ // Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
166
+ // get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
167
+ // for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
168
+ // reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
169
+ // might not be a throttling scenario at all and the status code in that case would be different than 429.
170
+ if ( this . ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite (
171
+ cosmosResponseMessage . StatusCode ,
172
+ cosmosResponseMessage ? . Headers . SubStatusCode ) )
173
+ {
174
+ DefaultTrace . TraceError (
175
+ "Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}." ,
176
+ StatusCodes . TooManyRequests , SubStatusCodes . SystemResourceUnavailable ) ;
177
+
178
+ return this . TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable (
179
+ shouldMarkEndpointUnavailableForPkRange : true ) ;
180
+ }
181
+
146
182
return await this . throttlingRetry . ShouldRetryAsync ( cosmosResponseMessage , cancellationToken ) ;
147
183
}
148
184
@@ -156,6 +192,8 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
156
192
this . isReadRequest = request . IsReadOnlyRequest ;
157
193
this . canUseMultipleWriteLocations = this . globalEndpointManager . CanUseMultipleWriteLocations ( request ) ;
158
194
this . documentServiceRequest = request ;
195
+ this . isMultiMasterWriteRequest = ! this . isReadRequest
196
+ && ( this . globalEndpointManager ? . CanSupportMultipleWriteLocations ( request ) ?? false ) ;
159
197
160
198
// clear previous location-based routing directive
161
199
request . RequestContext . ClearRouteToLocation ( ) ;
@@ -274,16 +312,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
274
312
// Received 503 due to client connect timeout or Gateway
275
313
if ( statusCode == HttpStatusCode . ServiceUnavailable )
276
314
{
277
- DefaultTrace . TraceWarning ( "ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}" ,
278
- this . documentServiceRequest ? . RequestContext ? . LocationEndpointToRoute ? . ToString ( ) ?? string . Empty ,
279
- this . documentServiceRequest ? . ResourceAddress ?? string . Empty ) ;
280
-
281
- // Mark the partition as unavailable.
282
- // Let the ClientRetry logic decide if the request should be retried
283
- this . partitionKeyRangeLocationCache . TryMarkEndpointUnavailableForPartitionKeyRange (
284
- this . documentServiceRequest ) ;
285
-
286
- return this . ShouldRetryOnServiceUnavailable ( ) ;
315
+ return this . TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable (
316
+ shouldMarkEndpointUnavailableForPkRange : true ) ;
287
317
}
288
318
289
319
return null ;
@@ -406,6 +436,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
406
436
}
407
437
}
408
438
439
/// <summary>
/// Handles a response that is being treated as ServiceUnavailable (503). Traces the failed location and
/// the resource address of the current request, optionally marks the endpoint for the request's partition
/// key range as unavailable, and then hands the retry decision to <see cref="ShouldRetryOnServiceUnavailable"/>.
/// </summary>
/// <param name="shouldMarkEndpointUnavailableForPkRange">When <c>true</c>, the endpoint for the current
/// partition key range is marked as unavailable before the retry decision is made; when <c>false</c>,
/// that step is skipped.</param>
/// <returns>The <see cref="ShouldRetryResult"/> returned by <see cref="ShouldRetryOnServiceUnavailable"/>.</returns>
private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
    bool shouldMarkEndpointUnavailableForPkRange)
{
    // The request may not have been captured yet, so every dereference below is null-tolerant
    // and falls back to an empty string for tracing purposes.
    DocumentServiceRequest request = this.documentServiceRequest;
    string failedLocation = request?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty;
    string resourceAddress = request?.ResourceAddress ?? string.Empty;

    DefaultTrace.TraceWarning(
        "ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
        failedLocation,
        resourceAddress);

    if (shouldMarkEndpointUnavailableForPkRange)
    {
        // Only the partition is flagged here; whether the request is actually retried
        // is decided by ShouldRetryOnServiceUnavailable below.
        this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(request);
    }

    return this.ShouldRetryOnServiceUnavailable();
}
465
+
409
466
/// <summary>
410
467
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
411
468
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
@@ -449,6 +506,24 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
449
506
return ShouldRetryResult . RetryAfter ( TimeSpan . Zero ) ;
450
507
}
451
508
509
/// <summary>
/// Decides whether the endpoint should be marked as unavailable because the response is a
/// 429 (TooManyRequests) carrying sub status code 3092 (system resource unavailable).
/// The answer is only ever <c>true</c> when the current request is a write request targeted
/// at a multi master account.
/// </summary>
/// <param name="statusCode">The nullable <see cref="HttpStatusCode"/> of the response.</param>
/// <param name="subStatusCode">The nullable <see cref="SubStatusCodes"/> value of the response.</param>
/// <returns><c>true</c> if the endpoint should be marked as unavailable; otherwise <c>false</c>.</returns>
private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
    HttpStatusCode? statusCode,
    SubStatusCodes? subStatusCode)
{
    // Not a multi master write, or no status code to inspect: nothing to do.
    if (!this.isMultiMasterWriteRequest || !statusCode.HasValue)
    {
        return false;
    }

    // Compared as integers because the status code and the TooManyRequests constant
    // come from different enum types.
    bool isTooManyRequests = (int)statusCode.Value == (int)StatusCodes.TooManyRequests;

    return isTooManyRequests
        && subStatusCode == SubStatusCodes.SystemResourceUnavailable;
}
526
+
452
527
private sealed class RetryContext
453
528
{
454
529
public int RetryLocationIndex { get ; set ; }
0 commit comments