Skip to content

Commit cd21ae6

Browse files
Metadata requests: Fixes bug where certain metadata requests are not retried with a client cold start with only query requests (#5108)
# Pull Request Template ## Description ### Context Currently, there is a bug in the SDK where upon a cold start of the SDK and rare edge cases involving online/offline-ing regions, where only query requests are made, the SDK will not retry certain status code responses from metadata requests causing the entire request to fail. The correct behavior would be for the SDK to do cross region retries on these metadata requests. This pulls request includes several updates to enhance error handling and retry logic in the Cosmos DB SDK. The changes mainly focus on extending support for additional server error types and improving retry policies for various scenarios. ### Improvements to retry logic: * [`ClientRetryPolicy.cs`](diffhunk://#diff-2b056512ca285b1d95e025e31f60345059fa92d958becc38f90a6fb54ce1bbb4R331-R341): Enhanced retry logic to handle `InternalServerError` , `DatabaseAccountNotFound`, and `LeaseNotFound` status codes. * [`MetadataRequestThrottleRetryPolicy.cs`](diffhunk://#diff-a5ed5985909c3dcb6e4ce186cdd662d590dac5297ea14e68560c7d1eca307be4L26-R28): Refactored retry policy to handle additional status codes and renamed methods and constants to reflect the broader scope of endpoint unavailability. ### FaultInjection enhancements to error handling testing: * [`FaultInjectionRuleBuilder.cs`](diffhunk://#diff-d827164a4a6a0d8737e6598f8132c915ef48a1fc01daaa6422706f770dada5d5L152-R156): Added support for additional server error types such as `DatabaseAccountNotFound`, `ServiceUnavailable`, `InternalServerError`, and `LeaseNotFound` in the for metadata requests. * [`FaultInjectionServerErrorType.cs`](diffhunk://#diff-0c89faa9a48c428a7a98662d995474e34295618ac60e677ad9762fd048f33601L75-R82): Updated the `FaultInjectionServerErrorType` enum to include `LeaseNotFound` and corrected the status code for `DatabaseAccountNotFound`. * [`FaultInjectionServerErrorResultInternal.cs`](diffhunk://#diff-1ae8256c6d505a8f3b0a350978a0cc9a08f6234f7328f28f1db14302ee691d72L473-R473): Added handling for `LeaseNotFound` and updated the status code for `DatabaseAccountNotFound` in the `GetInjectedServerError` method. * ### Testing updates: * [`CosmosItemIntegrationTests.cs`](diffhunk://#diff-16d429adf686a32936696d2014afab3fc8faf91f10c880850fb8b30f8b96bb33R153-R214): Added a new test method `MetadataEndpointUnavailableCrossRegionalRetryTest` to validate the retry logic for various server error types. * [`ClientRetryPolicyTests.cs`](diffhunk://#diff-d3fdfdc5d4f4d8af2c2cc463d928285680e7695861422ef2e3330d1a956807e1L167-R176): Extended existing tests to cover additional status codes and substatus codes. ## Type of change Please delete options that are not relevant. - [] Bug fix (non-breaking change which fixes an issue) ## Closing issues To automatically close an issue: closes #4710 --------- Co-authored-by: Kiran Kumar Kolli <[email protected]>
1 parent 65000b5 commit cd21ae6

File tree

10 files changed

+226
-66
lines changed

10 files changed

+226
-66
lines changed

Microsoft.Azure.Cosmos/FaultInjection/src/FaultInjectionRuleBuilder.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,11 @@ private void ValidateGatewayConnection()
149149
{
150150
if (serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.TooManyRequests
151151
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.ResponseDelay
152-
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.SendDelay)
152+
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.SendDelay
153+
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.DatabaseAccountNotFound
154+
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.ServiceUnavailable
155+
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.InternalServerError
156+
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.LeaseNotFound)
153157
{
154158
throw new ArgumentException($"{serverErrorResult?.GetServerErrorType()} is not supported for metadata requests.");
155159
}

Microsoft.Azure.Cosmos/FaultInjection/src/FaultInjectionServerErrorType.cs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,13 @@ public enum FaultInjectionServerErrorType
7272
ServiceUnavailable,
7373

7474
/// <summary>
75-
/// 404:1008 Database account not found from gateway
75+
/// 403:1008 Database account not found from gateway
7676
/// </summary>
7777
DatabaseAccountNotFound,
78+
79+
/// <summary>
80+
/// 410:1022 Lease not Found
81+
/// </summary>
82+
LeaseNotFound,
7883
}
7984
}

Microsoft.Azure.Cosmos/FaultInjection/src/implementation/FaultInjectionServerErrorResultInternal.cs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ public HttpResponseMessage GetInjectedServerError(DocumentServiceRequest dsr, st
461461

462462
httpResponse.Headers.Add(
463463
WFConstants.BackendHeaders.SubStatus,
464-
((int)SubStatusCodes.RUBudgetExceeded).ToString(CultureInfo.InvariantCulture));
464+
((int)SubStatusCodes.Unknown).ToString(CultureInfo.InvariantCulture));
465465
httpResponse.Headers.Add(WFConstants.BackendHeaders.LocalLSN, lsn);
466466

467467
return httpResponse;
@@ -470,7 +470,7 @@ public HttpResponseMessage GetInjectedServerError(DocumentServiceRequest dsr, st
470470

471471
httpResponse = new HttpResponseMessage
472472
{
473-
StatusCode = HttpStatusCode.NotFound,
473+
StatusCode = HttpStatusCode.Forbidden,
474474
Content = new FauntInjectionHttpContent(
475475
new MemoryStream(
476476
FaultInjectionResponseEncoding.GetBytes($"Fault Injection Server Error: DatabaseAccountNotFound, rule: {ruleId}"))),
@@ -488,6 +488,28 @@ public HttpResponseMessage GetInjectedServerError(DocumentServiceRequest dsr, st
488488

489489
return httpResponse;
490490

491+
case FaultInjectionServerErrorType.LeaseNotFound:
492+
493+
httpResponse = new HttpResponseMessage
494+
{
495+
StatusCode = HttpStatusCode.Gone,
496+
Content = new FauntInjectionHttpContent(
497+
new MemoryStream(
498+
FaultInjectionResponseEncoding.GetBytes($"Fault Injection Server Error: LeaseNotFound, rule: {ruleId}"))),
499+
};
500+
501+
foreach (string header in headers.AllKeys())
502+
{
503+
httpResponse.Headers.Add(header, headers.Get(header));
504+
}
505+
506+
httpResponse.Headers.Add(
507+
WFConstants.BackendHeaders.SubStatus,
508+
((int)SubStatusCodes.LeaseNotFound).ToString(CultureInfo.InvariantCulture));
509+
httpResponse.Headers.Add(WFConstants.BackendHeaders.LocalLSN, lsn);
510+
511+
return httpResponse;
512+
491513
default:
492514
throw new ArgumentException($"Server error type {this.serverErrorType} is not supported");
493515
}

Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -308,11 +308,12 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
308308
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
309309
this.documentServiceRequest?.ResourceAddress ?? string.Empty);
310310

311+
//Retry policy will retry on the next preffered region as the original requert region is not accepting requests
311312
return await this.ShouldRetryOnEndpointFailureAsync(
312313
isReadRequest: this.isReadRequest,
313314
markBothReadAndWriteAsUnavailable: false,
314315
forceRefresh: false,
315-
retryOnPreferredLocations: false);
316+
retryOnPreferredLocations: true);
316317
}
317318

318319
if (statusCode == HttpStatusCode.NotFound
@@ -328,6 +329,13 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
328329
isSystemResourceUnavailableForWrite: false);
329330
}
330331

332+
// Recieved 500 status code or lease not found
333+
if ((statusCode == HttpStatusCode.InternalServerError && this.isReadRequest)
334+
|| (statusCode == HttpStatusCode.Gone && subStatusCode == SubStatusCodes.LeaseNotFound))
335+
{
336+
return this.ShouldRetryOnUnavailableEndpointStatusCodes();
337+
}
338+
331339
return null;
332340
}
333341

@@ -467,14 +475,15 @@ private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceU
467475

468476
this.TryMarkEndpointUnavailableForPkRange(isSystemResourceUnavailableForWrite);
469477

470-
return this.ShouldRetryOnServiceUnavailable();
478+
return this.ShouldRetryOnUnavailableEndpointStatusCodes();
471479
}
472480

473481
/// <summary>
474482
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
475483
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
484+
/// Other status codes include InternalServerError (500.0) and LeaseNotFound (410.1022).
476485
/// </summary>
477-
private ShouldRetryResult ShouldRetryOnServiceUnavailable()
486+
private ShouldRetryResult ShouldRetryOnUnavailableEndpointStatusCodes()
478487
{
479488
if (this.serviceUnavailableRetryCount++ >= ClientRetryPolicy.MaxServiceUnavailableRetryCount)
480489
{

Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs

Lines changed: 74 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
2323
private const int DefaultMaxWaitTimeInSeconds = 60;
2424

2525
/// <summary>
26-
/// A constant integer defining the default maximum retry count on service unavailable.
26+
/// A constant integer defining the default maximum retry count on unavailable endpoint.
2727
/// </summary>
28-
private const int DefaultMaxServiceUnavailableRetryCount = 1;
28+
private const int DefaultMaxUnavailableEndpointRetryCount = 1;
2929

3030
/// <summary>
3131
/// An instance of <see cref="IGlobalEndpointManager"/>.
@@ -38,9 +38,9 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
3838
private readonly IDocumentClientRetryPolicy throttlingRetryPolicy;
3939

4040
/// <summary>
41-
/// An integer defining the maximum retry count on service unavailable.
41+
/// An integer defining the maximum retry count on unavailable endpoint.
4242
/// </summary>
43-
private readonly int maxServiceUnavailableRetryCount;
43+
private readonly int maxUnavailableEndpointRetryCount;
4444

4545
/// <summary>
4646
/// An instance of <see cref="Uri"/> containing the location endpoint where the partition key
@@ -49,9 +49,9 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
4949
private MetadataRetryContext retryContext;
5050

5151
/// <summary>
52-
/// An integer capturing the current retry count on service unavailable.
52+
/// An integer capturing the current retry count on unavailable endpoint.
5353
/// </summary>
54-
private int serviceUnavailableRetryCount;
54+
private int unavailableEndpointRetryCount;
5555

5656
/// <summary>
5757
/// The constructor to initialize an instance of <see cref="MetadataRequestThrottleRetryPolicy"/>.
@@ -66,8 +66,8 @@ public MetadataRequestThrottleRetryPolicy(
6666
int maxRetryWaitTimeInSeconds = DefaultMaxWaitTimeInSeconds)
6767
{
6868
this.globalEndpointManager = endpointManager;
69-
this.maxServiceUnavailableRetryCount = Math.Max(
70-
MetadataRequestThrottleRetryPolicy.DefaultMaxServiceUnavailableRetryCount,
69+
this.maxUnavailableEndpointRetryCount = Math.Max(
70+
MetadataRequestThrottleRetryPolicy.DefaultMaxUnavailableEndpointRetryCount,
7171
this.globalEndpointManager.PreferredLocationCount);
7272

7373
this.throttlingRetryPolicy = new ResourceThrottleRetryPolicy(
@@ -91,11 +91,43 @@ public Task<ShouldRetryResult> ShouldRetryAsync(
9191
Exception exception,
9292
CancellationToken cancellationToken)
9393
{
94-
if (exception is CosmosException cosmosException
95-
&& cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable
96-
&& cosmosException.Headers.SubStatusCode == SubStatusCodes.TransportGenerated503)
94+
if (exception is CosmosException cosmosException)
9795
{
98-
if (this.IncrementRetryIndexOnServiceUnavailableForMetadataRead())
96+
return this.ShouldRetryInternalAsync(
97+
cosmosException.StatusCode,
98+
(SubStatusCodes)cosmosException.SubStatusCode,
99+
exception,
100+
cancellationToken);
101+
}
102+
103+
if (exception is DocumentClientException clientException)
104+
{
105+
return this.ShouldRetryInternalAsync(
106+
clientException.StatusCode,
107+
clientException.GetSubStatus(),
108+
exception, cancellationToken);
109+
}
110+
111+
return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
112+
}
113+
114+
private Task<ShouldRetryResult> ShouldRetryInternalAsync(
115+
HttpStatusCode? statusCode,
116+
SubStatusCodes subStatus,
117+
Exception exception,
118+
CancellationToken cancellationToken)
119+
{
120+
if (statusCode == null)
121+
{
122+
return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
123+
}
124+
125+
if (statusCode == HttpStatusCode.ServiceUnavailable
126+
|| statusCode == HttpStatusCode.InternalServerError
127+
|| (statusCode == HttpStatusCode.Gone && subStatus == SubStatusCodes.LeaseNotFound)
128+
|| (statusCode == HttpStatusCode.Forbidden && subStatus == SubStatusCodes.DatabaseAccountNotFound))
129+
{
130+
if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
99131
{
100132
return Task.FromResult(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
101133
}
@@ -114,16 +146,36 @@ public Task<ShouldRetryResult> ShouldRetryAsync(
114146
ResponseMessage cosmosResponseMessage,
115147
CancellationToken cancellationToken)
116148
{
117-
if (cosmosResponseMessage?.StatusCode == HttpStatusCode.ServiceUnavailable
118-
&& cosmosResponseMessage?.Headers?.SubStatusCode == SubStatusCodes.TransportGenerated503)
149+
return this.ShouldRetryInternalAsync(
150+
cosmosResponseMessage.StatusCode,
151+
(SubStatusCodes)Convert.ToInt32(cosmosResponseMessage.Headers[WFConstants.BackendHeaders.SubStatus]),
152+
cosmosResponseMessage,
153+
cancellationToken);
154+
}
155+
156+
private Task<ShouldRetryResult> ShouldRetryInternalAsync(
157+
HttpStatusCode? statusCode,
158+
SubStatusCodes subStatus,
159+
ResponseMessage responseMessage,
160+
CancellationToken cancellationToken)
161+
{
162+
if (statusCode == null)
163+
{
164+
return this.throttlingRetryPolicy.ShouldRetryAsync(responseMessage, cancellationToken);
165+
}
166+
167+
if (statusCode == HttpStatusCode.ServiceUnavailable
168+
|| statusCode == HttpStatusCode.InternalServerError
169+
|| (statusCode == HttpStatusCode.Gone && subStatus == SubStatusCodes.LeaseNotFound)
170+
|| (statusCode == HttpStatusCode.Forbidden && subStatus == SubStatusCodes.DatabaseAccountNotFound))
119171
{
120-
if (this.IncrementRetryIndexOnServiceUnavailableForMetadataRead())
172+
if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
121173
{
122174
return Task.FromResult(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
123175
}
124176
}
125177

126-
return this.throttlingRetryPolicy.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
178+
return this.throttlingRetryPolicy.ShouldRetryAsync(responseMessage, cancellationToken);
127179
}
128180

129181
/// <summary>
@@ -146,23 +198,23 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
146198
}
147199

148200
/// <summary>
149-
/// Increments the location index when a service unavailable exception ocurrs, for any future read requests.
201+
/// Increments the location index when a unavailable endpoint exception ocurrs, for any future read requests.
150202
/// </summary>
151203
/// <returns>A boolean flag indicating if the operation was successful.</returns>
152-
private bool IncrementRetryIndexOnServiceUnavailableForMetadataRead()
204+
private bool IncrementRetryIndexOnUnavailableEndpointForMetadataRead()
153205
{
154-
if (this.serviceUnavailableRetryCount++ >= this.maxServiceUnavailableRetryCount)
206+
if (this.unavailableEndpointRetryCount++ >= this.maxUnavailableEndpointRetryCount)
155207
{
156-
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Retry count: {0} has exceeded the maximum permitted retry count on service unavailable: {1}.", this.serviceUnavailableRetryCount, this.maxServiceUnavailableRetryCount);
208+
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Retry count: {0} has exceeded the maximum permitted retry count on unavailable endpoint: {1}.", this.unavailableEndpointRetryCount, this.maxUnavailableEndpointRetryCount);
157209
return false;
158210
}
159211

160212
// Retrying on second PreferredLocations.
161213
// RetryCount is used as zero-based index.
162-
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Incrementing the metadata retry location index to: {0}.", this.serviceUnavailableRetryCount);
214+
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Incrementing the metadata retry location index to: {0}.", this.unavailableEndpointRetryCount);
163215
this.retryContext = new MetadataRetryContext()
164216
{
165-
RetryLocationIndex = this.serviceUnavailableRetryCount,
217+
RetryLocationIndex = this.unavailableEndpointRetryCount,
166218
RetryRequestOnPreferredLocations = true,
167219
};
168220

Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
using System.Text.Json.Serialization;
99
using System.Threading;
1010
using System.Threading.Tasks;
11+
using Microsoft.Azure.Cosmos.Diagnostics;
1112
using Microsoft.Azure.Cosmos.FaultInjection;
1213
using Microsoft.VisualStudio.TestTools.UnitTesting;
1314
using static Microsoft.Azure.Cosmos.Routing.GlobalPartitionEndpointManagerCore;
@@ -149,6 +150,85 @@ public async Task ReadMany2UnreachablePartitionsTest()
149150
}
150151
}
151152

153+
[TestMethod]
154+
[TestCategory("MultiRegion")]
155+
[DataRow(FaultInjectionServerErrorType.ServiceUnavailable)]
156+
[DataRow(FaultInjectionServerErrorType.InternalServerError)]
157+
[DataRow(FaultInjectionServerErrorType.DatabaseAccountNotFound)]
158+
[DataRow(FaultInjectionServerErrorType.LeaseNotFound)]
159+
public async Task MetadataEndpointUnavailableCrossRegionalRetryTest(FaultInjectionServerErrorType serverErrorType)
160+
{
161+
FaultInjectionRule collReadBad = new FaultInjectionRuleBuilder(
162+
id: "collread",
163+
condition: new FaultInjectionConditionBuilder()
164+
.WithOperationType(FaultInjectionOperationType.MetadataContainer)
165+
.WithRegion(region1)
166+
.Build(),
167+
result: new FaultInjectionServerErrorResultBuilder(serverErrorType)
168+
.Build())
169+
.Build();
170+
171+
FaultInjectionRule pkRangeBad = new FaultInjectionRuleBuilder(
172+
id: "pkrange",
173+
condition: new FaultInjectionConditionBuilder()
174+
.WithOperationType(FaultInjectionOperationType.MetadataPartitionKeyRange)
175+
.WithRegion(region1)
176+
.Build(),
177+
result: new FaultInjectionServerErrorResultBuilder(serverErrorType)
178+
.Build())
179+
.Build();
180+
181+
collReadBad.Disable();
182+
pkRangeBad.Disable();
183+
184+
FaultInjector faultInjector = new FaultInjector(new List<FaultInjectionRule> { pkRangeBad, collReadBad });
185+
186+
CosmosClientOptions cosmosClientOptions = new CosmosClientOptions()
187+
{
188+
ConsistencyLevel = ConsistencyLevel.Session,
189+
ConnectionMode = ConnectionMode.Direct,
190+
Serializer = this.cosmosSystemTextJsonSerializer,
191+
FaultInjector = faultInjector,
192+
ApplicationPreferredRegions = new List<string> { region1, region2, region3 }
193+
};
194+
195+
using (CosmosClient fiClient = new CosmosClient(
196+
connectionString: this.connectionString,
197+
clientOptions: cosmosClientOptions))
198+
{
199+
Database fidb = fiClient.GetDatabase(MultiRegionSetupHelpers.dbName);
200+
Container fic = fidb.GetContainer(MultiRegionSetupHelpers.containerName);
201+
202+
pkRangeBad.Enable();
203+
collReadBad.Enable();
204+
205+
try
206+
{
207+
FeedIterator<CosmosIntegrationTestObject> frTest = fic.GetItemQueryIterator<CosmosIntegrationTestObject>("SELECT * FROM c");
208+
while (frTest.HasMoreResults)
209+
{
210+
FeedResponse<CosmosIntegrationTestObject> feedres = await frTest.ReadNextAsync();
211+
212+
Assert.AreEqual(HttpStatusCode.OK, feedres.StatusCode);
213+
}
214+
}
215+
catch (CosmosException ex)
216+
{
217+
Assert.Fail(ex.Message);
218+
}
219+
finally
220+
{
221+
//Cross regional retry needs to ocur (could trigger for other metadata call to try on secondary region so rule would not trigger)
222+
Assert.IsTrue(pkRangeBad.GetHitCount() + collReadBad.GetHitCount() >= 1);
223+
224+
pkRangeBad.Disable();
225+
collReadBad.Disable();
226+
227+
fiClient.Dispose();
228+
}
229+
}
230+
}
231+
152232
[TestMethod]
153233
[TestCategory("MultiRegion")]
154234
public async Task AddressRefreshTimeoutTest()

0 commit comments

Comments
 (0)