Skip to content

Commit df8974c

Browse files
authored
Merge branch 'master' into users/sourabhjain/opentelemetryopsname
2 parents 5493b52 + 7839885 commit df8974c

File tree

5 files changed

+185
-13
lines changed

5 files changed

+185
-13
lines changed

Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs

Lines changed: 85 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
3535
private int serviceUnavailableRetryCount;
3636
private bool isReadRequest;
3737
private bool canUseMultipleWriteLocations;
38+
private bool isMultiMasterWriteRequest;
3839
private Uri locationEndpoint;
3940
private RetryContext retryContext;
4041
private DocumentServiceRequest documentServiceRequest;
@@ -57,6 +58,7 @@ public ClientRetryPolicy(
5758
this.sessionTokenRetryCount = 0;
5859
this.serviceUnavailableRetryCount = 0;
5960
this.canUseMultipleWriteLocations = false;
61+
this.isMultiMasterWriteRequest = false;
6062
this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled;
6163
}
6264

@@ -97,6 +99,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
9799

98100
if (exception is DocumentClientException clientException)
99101
{
102+
// Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
103+
// get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
104+
// for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
105+
// reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
106+
// might not be a throttling scenario at all and the status code in that case would be different than 429.
107+
if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
108+
clientException.StatusCode,
109+
clientException.GetSubStatus()))
110+
{
111+
DefaultTrace.TraceError(
112+
"Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
113+
StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);
114+
115+
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
116+
shouldMarkEndpointUnavailableForPkRange: true);
117+
}
118+
100119
ShouldRetryResult shouldRetryResult = await this.ShouldRetryInternalAsync(
101120
clientException?.StatusCode,
102121
clientException?.GetSubStatus());
@@ -143,6 +162,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
143162
return shouldRetryResult;
144163
}
145164

165+
// Today, the only scenario where we would treat a throttling (429) exception as service unavailable is when we
166+
// get 429 (TooManyRequests) with sub status code 3092 (System Resource Not Available). Note that this is applicable
167+
// for write requests targeted to a multiple master account. In such case, the 429/3092 will be treated as 503. The
168+
// reason to keep the code out of the throttling retry policy is that in the near future, the 3092 sub status code
169+
// might not be a throttling scenario at all and the status code in that case would be different than 429.
170+
if (this.ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
171+
cosmosResponseMessage.StatusCode,
172+
cosmosResponseMessage?.Headers.SubStatusCode))
173+
{
174+
DefaultTrace.TraceError(
175+
"Operation will NOT be retried on local region. Treating SystemResourceUnavailable (429/3092) as ServiceUnavailable (503). Status code: {0}, sub status code: {1}.",
176+
StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);
177+
178+
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
179+
shouldMarkEndpointUnavailableForPkRange: true);
180+
}
181+
146182
return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
147183
}
148184

@@ -156,6 +192,8 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
156192
this.isReadRequest = request.IsReadOnlyRequest;
157193
this.canUseMultipleWriteLocations = this.globalEndpointManager.CanUseMultipleWriteLocations(request);
158194
this.documentServiceRequest = request;
195+
this.isMultiMasterWriteRequest = !this.isReadRequest
196+
&& (this.globalEndpointManager?.CanSupportMultipleWriteLocations(request) ?? false);
159197

160198
// clear previous location-based routing directive
161199
request.RequestContext.ClearRouteToLocation();
@@ -274,16 +312,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
274312
// Received 503 due to client connect timeout or Gateway
275313
if (statusCode == HttpStatusCode.ServiceUnavailable)
276314
{
277-
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
278-
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
279-
this.documentServiceRequest?.ResourceAddress ?? string.Empty);
280-
281-
// Mark the partition as unavailable.
282-
// Let the ClientRetry logic decide if the request should be retried
283-
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
284-
this.documentServiceRequest);
285-
286-
return this.ShouldRetryOnServiceUnavailable();
315+
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
316+
shouldMarkEndpointUnavailableForPkRange: true);
287317
}
288318

289319
return null;
@@ -406,6 +436,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
406436
}
407437
}
408438

439+
/// <summary>
440+
/// Attempts to mark the endpoint associated with the current partition key range as unavailable and determines if
441+
/// a retry should be performed due to a ServiceUnavailable (503) response. This method is invoked when a 503
442+
/// Service Unavailable response is received, indicating that the service might be temporarily unavailable.
443+
/// It optionally marks the partition key range as unavailable, which will influence future routing decisions.
444+
/// </summary>
445+
/// <param name="shouldMarkEndpointUnavailableForPkRange">A boolean flag indicating whether the endpoint for the
446+
/// current partition key range should be marked as unavailable.</param>
447+
/// <returns>An instance of <see cref="ShouldRetryResult"/> indicating whether the operation should be retried.</returns>
448+
private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
449+
bool shouldMarkEndpointUnavailableForPkRange)
450+
{
451+
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
452+
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
453+
this.documentServiceRequest?.ResourceAddress ?? string.Empty);
454+
455+
if (shouldMarkEndpointUnavailableForPkRange)
456+
{
457+
// Mark the partition as unavailable.
458+
// Let the ClientRetry logic decide if the request should be retried
459+
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
460+
this.documentServiceRequest);
461+
}
462+
463+
return this.ShouldRetryOnServiceUnavailable();
464+
}
465+
409466
/// <summary>
410467
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
411468
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
@@ -449,6 +506,24 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
449506
return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
450507
}
451508

509+
/// <summary>
510+
/// Returns a boolean flag indicating if the endpoint should be marked as unavailable
511+
/// due to a 429 response with a sub status code of 3092 (system resource unavailable).
512+
/// This is applicable for write requests targeted for multi master accounts.
513+
/// </summary>
514+
/// <param name="statusCode">An instance of <see cref="HttpStatusCode"/> containing the status code.</param>
515+
/// <param name="subStatusCode">An instance of <see cref="SubStatusCodes"/> containing the sub status code.</param>
516+
/// <returns>A boolean flag indicating is the endpoint should be marked as unavailable.</returns>
517+
private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
518+
HttpStatusCode? statusCode,
519+
SubStatusCodes? subStatusCode)
520+
{
521+
return this.isMultiMasterWriteRequest
522+
&& statusCode.HasValue
523+
&& (int)statusCode.Value == (int)StatusCodes.TooManyRequests
524+
&& subStatusCode == SubStatusCodes.SystemResourceUnavailable;
525+
}
526+
452527
private sealed class RetryContext
453528
{
454529
public int RetryLocationIndex { get; set; }

Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,20 @@ public virtual async Task RefreshLocationAsync(bool forceRefresh = false)
534534

535535
await this.RefreshDatabaseAccountInternalAsync(forceRefresh: forceRefresh);
536536
}
537+
538+
/// <summary>
539+
/// Determines whether the current configuration and state of the service allow for supporting multiple write locations.
540+
/// This method returns True is the AvailableWriteLocations in LocationCache is more than 1. Otherwise, it returns False.
541+
/// </summary>
542+
/// <param name="request">The document service request for which the write location support is being evaluated.</param>
543+
/// <returns>A boolean flag indicating if the available write locations are more than one.</returns>
544+
public bool CanSupportMultipleWriteLocations(DocumentServiceRequest request)
545+
{
546+
return this.locationCache.CanUseMultipleWriteLocations()
547+
&& this.locationCache.GetAvailableWriteLocations()?.Count > 1
548+
&& (request.ResourceType == ResourceType.Document ||
549+
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == OperationType.Execute));
550+
}
537551

538552
#pragma warning disable VSTHRD100 // Avoid async void methods
539553
private async void StartLocationBackgroundRefreshLoop()

Microsoft.Azure.Cosmos/src/Routing/IGlobalEndpointManager.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,7 @@ internal interface IGlobalEndpointManager : IDisposable
3636
ReadOnlyDictionary<string, Uri> GetAvailableWriteEndpointsByLocation();
3737

3838
ReadOnlyDictionary<string, Uri> GetAvailableReadEndpointsByLocation();
39+
40+
bool CanSupportMultipleWriteLocations(DocumentServiceRequest request);
3941
}
4042
}

Microsoft.Azure.Cosmos/src/Routing/LocationCache.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,11 @@ public ReadOnlyCollection<string> GetAvailableReadLocations()
263263
{
264264
return this.locationInfo.AvailableReadLocations;
265265
}
266+
267+
public ReadOnlyCollection<string> GetAvailableWriteLocations()
268+
{
269+
return this.locationInfo.AvailableWriteLocations;
270+
}
266271

267272
/// <summary>
268273
/// Resolves request to service endpoint.
@@ -532,8 +537,8 @@ public bool CanUseMultipleWriteLocations(DocumentServiceRequest request)
532537
return this.CanUseMultipleWriteLocations() &&
533538
(request.ResourceType == ResourceType.Document ||
534539
(request.ResourceType == ResourceType.StoredProcedure && request.OperationType == Documents.OperationType.ExecuteJavaScript));
535-
}
536-
540+
}
541+
537542
private void ClearStaleEndpointUnavailabilityInfo()
538543
{
539544
if (this.locationUnavailablityInfoByEndpoint.Any())
@@ -768,7 +773,7 @@ private ReadOnlyDictionary<string, Uri> GetEndpointByLocation(IEnumerable<Accoun
768773
return new ReadOnlyDictionary<string, Uri>(endpointsByLocation);
769774
}
770775

771-
private bool CanUseMultipleWriteLocations()
776+
internal bool CanUseMultipleWriteLocations()
772777
{
773778
return this.useMultipleWriteLocations && this.enableMultipleWriteLocations;
774779
}

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,82 @@ public void MultimasterMetadataWriteRetryTest()
8686
retryPolicy.OnBeforeSendRequest(request);
8787
Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);
8888
}
89+
90+
/// <summary>
91+
/// Test to validate that when 429.3092 is thrown from the service, write requests on
92+
/// a multi master account should be converted to 503 and retried to the next region.
93+
/// </summary>
94+
[TestMethod]
95+
[DataRow(true, DisplayName = "Validate retry policy with multi master write account.")]
96+
[DataRow(false, DisplayName = "Validate retry policy with single master write account.")]
97+
public async Task ShouldRetryAsync_WhenRequestThrottledWithResourceNotAvailable_ShouldThrow503OnMultiMasterWriteAndRetryOnNextRegion(
98+
bool isMultiMasterAccount)
99+
{
100+
// Arrange.
101+
const bool enableEndpointDiscovery = true;
102+
using GlobalEndpointManager endpointManager = this.Initialize(
103+
useMultipleWriteLocations: isMultiMasterAccount,
104+
enableEndpointDiscovery: enableEndpointDiscovery,
105+
isPreferredLocationsListEmpty: false,
106+
multimasterMetadataWriteRetryTest: true);
107+
108+
await endpointManager.RefreshLocationAsync();
109+
110+
ClientRetryPolicy retryPolicy = new (
111+
endpointManager,
112+
this.partitionKeyRangeLocationCache,
113+
new RetryOptions(),
114+
enableEndpointDiscovery,
115+
false);
116+
117+
// Creates a sample write request.
118+
DocumentServiceRequest request = this.CreateRequest(
119+
isReadRequest: false,
120+
isMasterResourceType: false);
121+
122+
// On first attempt should get (default/non hub) location.
123+
retryPolicy.OnBeforeSendRequest(request);
124+
Assert.AreEqual(request.RequestContext.LocationEndpointToRoute, ClientRetryPolicyTests.Location1Endpoint);
125+
126+
// Creation of 429.3092 Error.
127+
HttpStatusCode throttleException = HttpStatusCode.TooManyRequests;
128+
SubStatusCodes resourceNotAvailable = SubStatusCodes.SystemResourceUnavailable;
129+
130+
Exception innerException = new ();
131+
Mock<INameValueCollection> nameValueCollection = new ();
132+
DocumentClientException documentClientException = new (
133+
message: "SystemResourceUnavailable: 429 with 3092 occurred.",
134+
innerException: innerException,
135+
statusCode: throttleException,
136+
substatusCode: resourceNotAvailable,
137+
requestUri: request.RequestContext.LocationEndpointToRoute,
138+
responseHeaders: nameValueCollection.Object);
139+
140+
// Act.
141+
Task<ShouldRetryResult> shouldRetry = retryPolicy.ShouldRetryAsync(
142+
documentClientException,
143+
new CancellationToken());
144+
145+
// Assert.
146+
Assert.IsTrue(shouldRetry.Result.ShouldRetry);
147+
retryPolicy.OnBeforeSendRequest(request);
148+
149+
if (isMultiMasterAccount)
150+
{
151+
Assert.AreEqual(
152+
expected: ClientRetryPolicyTests.Location2Endpoint,
153+
actual: request.RequestContext.LocationEndpointToRoute,
154+
message: "The request should be routed to the next region, since the accound is a multi master write account and the request" +
155+
"failed with 429.309 which got converted into 503 internally. This should trigger another retry attempt to the next region.");
156+
}
157+
else
158+
{
159+
Assert.AreEqual(
160+
expected: ClientRetryPolicyTests.Location1Endpoint,
161+
actual: request.RequestContext.LocationEndpointToRoute,
162+
message: "Since this is asingle master account, the write request should not be retried on the next region.");
163+
}
164+
}
89165

90166
/// <summary>
91167
/// Tests to see if different 503 substatus codes are handeled correctly

0 commit comments

Comments
 (0)