Skip to content

Commit f869cce

Browse files
[Internal] Per Partition Automatic Failover: Removes Environment Variable to Set PPAF at the SDK Layer and Adds Support for Internal Client Options (#5284)
# Pull Request Template ## Description - **Remove Dependency on Environment Variable:** Today, PPAF enablement in the .NET SDK is driven entirely by the Get Account metadata response. However, we still kept the environment variable `AZURE_COSMOS_PARTITION_LEVEL_FAILOVER_ENABLED` in a deprecated state to toggle the behavior if the Cosmos account doesn't have the flag enabled. As a part of this task, we will remove the environment variable completely from the .NET SDK ecosystem. - **Create a new Environment Variable to externalize the circuit breaker timeout counter reset window:** The goal is to create a new environment variable `AZURE_COSMOS_PPCB_TIMEOUT_COUNTER_RESET_WINDOW_IN_MINUTES` to externalize the PPCB timeout counter reset window. The default value for this would be `5` minutes. - **Add a New Internal Client Option to disable PPAF:** Add a new **internal** Cosmos client option to disable PPAF explicitly. Once set, this will be used to disable PPAF irrespective of the account settings. ## Type of change Please delete options that are not relevant. - [] New feature (non-breaking change which adds functionality) ## Closing issues To automatically close an issue: closes #5277
1 parent faaa3a7 commit f869cce

File tree

8 files changed

+297
-49
lines changed

8 files changed

+297
-49
lines changed

Microsoft.Azure.Cosmos/src/ConnectionPolicy.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,12 @@ public bool EnablePartitionLevelCircuitBreaker
338338
set;
339339
}
340340

341+
internal bool DisablePartitionLevelFailoverClientLevelOverride
342+
{
343+
get;
344+
set;
345+
}
346+
341347
/// <summary>
342348
/// Gets or sets the certificate validation callback.
343349
/// </summary>

Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -770,7 +770,15 @@ bool EnableRemoteRegionPreferredForSessionRetry
770770
/// If compute gateway chooses to enable PPAF, then the .NET SDK will enable PPCB by default, which will improve the read availability and latency. This would mean
771771
/// when PPAF is enabled, the SDK will automatically enable PPCB as well.
772772
/// </summary>
773-
internal bool EnablePartitionLevelCircuitBreaker { get; set; } = ConfigurationManager.IsPartitionLevelCircuitBreakerEnabled(defaultValue: false);
773+
internal bool EnablePartitionLevelCircuitBreaker { get; set; } = ConfigurationManager.IsPartitionLevelCircuitBreakerEnabled(defaultValue: false);
774+
775+
/// <summary>
776+
/// Flag to explicitly disable partition level failover at the client level. Normally, the SDK will enable partition level failover based on the account settings.
777+
/// When this flag is set to 'true', the account setting is not applied. It is intended to be used internally by the compute gateway, which disables partition level failover by default.
778+
///
779+
/// The default value for this parameter is 'false'.
780+
/// </summary>
781+
internal bool DisablePartitionLevelFailover { get; set; } = false;
774782

775783
/// <summary>
776784
/// Quorum Read allowed with eventual consistency account or consistent prefix account.
@@ -1029,7 +1037,8 @@ internal virtual ConnectionPolicy GetConnectionPolicy(int clientId)
10291037
MaxRequestsPerTcpConnection = this.MaxRequestsPerTcpConnection,
10301038
MaxTcpConnectionsPerEndpoint = this.MaxTcpConnectionsPerEndpoint,
10311039
EnableEndpointDiscovery = !this.LimitToEndpoint,
1032-
EnablePartitionLevelCircuitBreaker = this.EnablePartitionLevelCircuitBreaker,
1040+
EnablePartitionLevelCircuitBreaker = this.EnablePartitionLevelCircuitBreaker,
1041+
DisablePartitionLevelFailoverClientLevelOverride = this.DisablePartitionLevelFailover,
10331042
PortReuseMode = this.portReuseMode,
10341043
EnableTcpConnectionEndpointRediscovery = this.EnableTcpConnectionEndpointRediscovery,
10351044
EnableAdvancedReplicaSelectionForTcp = this.EnableAdvancedReplicaSelectionForTcp,

Microsoft.Azure.Cosmos/src/DocumentClient.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1056,13 +1056,12 @@ private async Task<bool> GetInitializationTaskAsync(IStoreClientFactory storeCli
10561056
this.EnsureValidOverwrite(this.desiredConsistencyLevel.Value);
10571057
}
10581058

1059-
bool isPPafEnabled = ConfigurationManager.IsPartitionLevelFailoverEnabled(defaultValue: false);
1060-
if (this.accountServiceConfiguration != null && this.accountServiceConfiguration.AccountProperties.EnablePartitionLevelFailover.HasValue)
1059+
if (!this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride
1060+
&& this.accountServiceConfiguration != null && this.accountServiceConfiguration.AccountProperties.EnablePartitionLevelFailover.HasValue)
10611061
{
1062-
isPPafEnabled = this.accountServiceConfiguration.AccountProperties.EnablePartitionLevelFailover.Value;
1062+
this.ConnectionPolicy.EnablePartitionLevelFailover = this.accountServiceConfiguration.AccountProperties.EnablePartitionLevelFailover.Value;
10631063
}
10641064

1065-
this.ConnectionPolicy.EnablePartitionLevelFailover = isPPafEnabled;
10661065
this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker |= this.ConnectionPolicy.EnablePartitionLevelFailover;
10671066
this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures());
10681067
this.InitializePartitionLevelFailoverWithDefaultHedging();

Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -602,13 +602,11 @@ public virtual void InitializeAccountPropertiesAndStartBackgroundRefresh(Account
602602
return;
603603
}
604604

605-
bool isPPafEnabled = ConfigurationManager.IsPartitionLevelFailoverEnabled(defaultValue: false);
606-
if (databaseAccount.EnablePartitionLevelFailover.HasValue)
605+
if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride && databaseAccount.EnablePartitionLevelFailover.HasValue)
607606
{
608-
isPPafEnabled = databaseAccount.EnablePartitionLevelFailover.Value;
607+
this.connectionPolicy.EnablePartitionLevelFailover = databaseAccount.EnablePartitionLevelFailover.Value;
609608
}
610609

611-
this.connectionPolicy.EnablePartitionLevelFailover = isPPafEnabled;
612610
GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(databaseAccount);
613611

614612
this.locationCache.OnDatabaseAccountRead(databaseAccount);

Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,7 @@ public PartitionKeyRangeFailoverInfo(
608608
this.ConsecutiveWriteRequestFailureCount = 0;
609609
this.ReadRequestFailureCounterThreshold = ConfigurationManager.GetCircuitBreakerConsecutiveFailureCountForReads(10);
610610
this.WriteRequestFailureCounterThreshold = ConfigurationManager.GetCircuitBreakerConsecutiveFailureCountForWrites(5);
611-
this.TimeoutCounterResetWindowInMinutes = TimeSpan.FromMinutes(1);
611+
this.TimeoutCounterResetWindowInMinutes = TimeSpan.FromMinutes(ConfigurationManager.GetCircuitBreakerTimeoutCounterResetWindowInMinutes(5));
612612
this.FirstRequestFailureTime = DateTime.UtcNow;
613613
this.LastRequestFailureTime = DateTime.UtcNow;
614614
}

Microsoft.Azure.Cosmos/src/Util/ConfigurationManager.cs

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,13 @@ internal static class ConfigurationManager
6060
/// A read-only string containing the environment variable name for capturing the consecutive failure count for writes, before triggering per partition
6161
/// circuit breaker flow. The default value for this interval is 5 consecutive requests within 1 min window.
6262
/// </summary>
63-
internal static readonly string CircuitBreakerConsecutiveFailureCountForWrites = "AZURE_COSMOS_PPCB_CONSECUTIVE_FAILURE_COUNT_FOR_WRITES";
63+
internal static readonly string CircuitBreakerConsecutiveFailureCountForWrites = "AZURE_COSMOS_PPCB_CONSECUTIVE_FAILURE_COUNT_FOR_WRITES";
64+
65+
/// <summary>
66+
/// A read-only string containing the environment variable name for overriding the timeout counter reset window, in minutes, used by the per partition
67+
/// circuit breaker flow. The default value for this window is 5 minutes.
68+
/// </summary>
69+
internal static readonly string CircuitBreakerTimeoutCounterResetWindowInMinutes = "AZURE_COSMOS_PPCB_TIMEOUT_COUNTER_RESET_WINDOW_IN_MINUTES";
6470

6571
/// <summary>
6672
/// Environment variable name for overriding optimistic direct execution of queries.
@@ -159,23 +165,6 @@ public static bool IsReplicaAddressValidationEnabled(
159165
defaultValue: true);
160166
}
161167

162-
/// <summary>
163-
/// Gets the boolean value of the partition level failover environment variable. Note that, partition level failover
164-
/// is disabled by default for both preview and GA releases. The user can set the respective environment variable
165-
/// 'AZURE_COSMOS_PARTITION_LEVEL_FAILOVER_ENABLED' to override the value for both preview and GA. The method will
166-
/// eventually be removed, once partition level failover is enabled by default for both preview and GA.
167-
/// </summary>
168-
/// <param name="defaultValue">A boolean field containing the default value for partition level failover.</param>
169-
/// <returns>A boolean flag indicating if partition level failover is enabled.</returns>
170-
public static bool IsPartitionLevelFailoverEnabled(
171-
bool defaultValue)
172-
{
173-
return ConfigurationManager
174-
.GetEnvironmentVariable(
175-
variable: ConfigurationManager.PartitionLevelFailoverEnabled,
176-
defaultValue: defaultValue);
177-
}
178-
179168
/// <summary>
180169
/// Gets the boolean value indicating whether the thin client mode is enabled based on the environment variable override.
181170
/// </summary>
@@ -284,7 +273,23 @@ public static int GetCircuitBreakerConsecutiveFailureCountForWrites(
284273
.GetEnvironmentVariable(
285274
variable: ConfigurationManager.CircuitBreakerConsecutiveFailureCountForWrites,
286275
defaultValue: defaultValue);
287-
}
276+
}
277+
278+
/// <summary>
279+
/// Gets the timeout counter reset window, in minutes, used by the per partition circuit breaker flow.
280+
/// The default value is supplied by the caller; the per partition circuit breaker passes 5 minutes.
281+
/// The user can set the respective environment variable 'AZURE_COSMOS_PPCB_TIMEOUT_COUNTER_RESET_WINDOW_IN_MINUTES' to override the value.
282+
/// </summary>
283+
/// <param name="defaultValue">A double containing the default value for the timeout counter reset window, in minutes.</param>
284+
/// <returns>A double representing the timeout counter reset window in minutes.</returns>
285+
public static double GetCircuitBreakerTimeoutCounterResetWindowInMinutes(
286+
double defaultValue)
287+
{
288+
return ConfigurationManager
289+
.GetEnvironmentVariable(
290+
variable: ConfigurationManager.CircuitBreakerTimeoutCounterResetWindowInMinutes,
291+
defaultValue: defaultValue);
292+
}
288293

289294
/// <summary>
290295
/// Gets the boolean value indicating whether optimistic direct execution is enabled based on the environment variable override.

0 commit comments

Comments
 (0)