Azure
diff --git a/‎Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs‎
Lines changed: 66 additions & 26 deletions b/‎Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs‎
Lines changed: 66 additions & 26 deletions
diff --git a/‎Microsoft.Azure.Cosmos/src/ConnectionPolicy.cs‎
Lines changed: 6 additions & 0 deletions b/‎Microsoft.Azure.Cosmos/src/ConnectionPolicy.cs‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs‎
Lines changed: 14 additions & 1 deletion b/‎Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎Microsoft.Azure.Cosmos/src/Diagnostics/UserAgentFeatureFlags.cs‎
Lines changed: 2 additions & 0 deletions b/‎Microsoft.Azure.Cosmos/src/Diagnostics/UserAgentFeatureFlags.cs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Microsoft.Azure.Cosmos/src/DocumentClient.cs‎
Lines changed: 5 additions & 2 deletions b/‎Microsoft.Azure.Cosmos/src/DocumentClient.cs‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎Microsoft.Azure.Cosmos/src/RetryPolicy.cs‎
Lines changed: 3 additions & 3 deletions b/‎Microsoft.Azure.Cosmos/src/RetryPolicy.cs‎
Lines changed: 3 additions & 3 deletions
@@ -28,7 +28,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy
         private readonly GlobalEndpointManager globalEndpointManager;
         private readonly GlobalPartitionEndpointManager partitionKeyRangeLocationCache;
         private readonly bool enableEndpointDiscovery;
-        private readonly bool isPertitionLevelFailoverEnabled;
+        private readonly bool isPartitionLevelFailoverEnabled;
         private int failoverRetryCount;
 
         private int sessionTokenRetryCount;
@@ -45,7 +45,7 @@ public ClientRetryPolicy(
             GlobalPartitionEndpointManager partitionKeyRangeLocationCache,
             RetryOptions retryOptions,
             bool enableEndpointDiscovery,
-            bool isPertitionLevelFailoverEnabled)
+            bool isPartitionLevelFailoverEnabled)
         {
             this.throttlingRetry = new ResourceThrottleRetryPolicy(
                 retryOptions.MaxRetryAttemptsOnThrottledRequests,
@@ -59,7 +59,7 @@ public ClientRetryPolicy(
             this.serviceUnavailableRetryCount = 0;
             this.canUseMultipleWriteLocations = false;
             this.isMultiMasterWriteRequest = false;
-            this.isPertitionLevelFailoverEnabled = isPertitionLevelFailoverEnabled;
+            this.isPartitionLevelFailoverEnabled = isPartitionLevelFailoverEnabled;
         }
 
         /// <summary> 
@@ -80,13 +80,9 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
                     this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
                     this.documentServiceRequest?.ResourceAddress ?? string.Empty);
 
-                if (this.isPertitionLevelFailoverEnabled)
-                {
-                    // In the event of the routing gateway having outage on region A, mark the partition as unavailable assuming that the
-                    // partition has been failed over to region B, when per partition automatic failover is enabled.
-                    this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
-                         this.documentServiceRequest);
-                }
+                // In the event of the routing gateway having an outage in region A, mark the partition as unavailable, assuming that the
+                // partition has been failed over to region B, when the request is eligible for per partition automatic failover or circuit breaker.
+                this.TryMarkEndpointUnavailableForPkRange(isSystemResourceUnavailableForWrite: false);
 
                 // Mark both read and write requests because it gateway exception.
                 // This means all requests going to the region will fail.
@@ -113,7 +109,7 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
                         StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);
 
                     return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
-                        shouldMarkEndpointUnavailableForPkRange: true);
+                        isSystemResourceUnavailableForWrite: true);
                 }
 
                 ShouldRetryResult shouldRetryResult = await this.ShouldRetryInternalAsync(
@@ -176,7 +172,7 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
                     StatusCodes.TooManyRequests, SubStatusCodes.SystemResourceUnavailable);
 
                 return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
-                    shouldMarkEndpointUnavailableForPkRange: true);
+                    isSystemResourceUnavailableForWrite: true);
             }
 
             return await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
@@ -236,8 +232,7 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
                     this.documentServiceRequest?.ResourceAddress ?? string.Empty);
 
                 // Mark the partition key range as unavailable to retry future request on a new region.
-                this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
-                     this.documentServiceRequest);
+                this.TryMarkEndpointUnavailableForPkRange(isSystemResourceUnavailableForWrite: false);
             }
 
             // Received 403.3 on write region, initiate the endpoint rediscovery
@@ -313,7 +308,7 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
             if (statusCode == HttpStatusCode.ServiceUnavailable)
             {
                 return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
-                    shouldMarkEndpointUnavailableForPkRange: true);
+                    isSystemResourceUnavailableForWrite: false);
             }
 
             return null;
@@ -442,23 +437,18 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
         /// Service Unavailable response is received, indicating that the service might be temporarily unavailable.
         /// It optionally marks the partition key range as unavailable, which will influence future routing decisions.
         /// </summary>
-        /// <param name="shouldMarkEndpointUnavailableForPkRange">A boolean flag indicating whether the endpoint for the
-        /// current partition key range should be marked as unavailable.</param>
+        /// <param name="isSystemResourceUnavailableForWrite">A boolean flag indicating whether the failure happened due to system
+        /// resource unavailability on a write request. When true, the endpoint for the current partition key range is marked as
+        /// unavailable without evaluating the failover or circuit breaker eligibility checks.</param>
         /// <returns>An instance of <see cref="ShouldRetryResult"/> indicating whether the operation should be retried.</returns>
         private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
-            bool shouldMarkEndpointUnavailableForPkRange)
+            bool isSystemResourceUnavailableForWrite)
         {
             DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
                 this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
                 this.documentServiceRequest?.ResourceAddress ?? string.Empty);
 
-            if (shouldMarkEndpointUnavailableForPkRange)
-            {
-                // Mark the partition as unavailable.
-                // Let the ClientRetry logic decide if the request should be retried
-                this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
-                     this.documentServiceRequest);
-            }
+            this.TryMarkEndpointUnavailableForPkRange(isSystemResourceUnavailableForWrite);
 
             return this.ShouldRetryOnServiceUnavailable();
         }
@@ -477,7 +467,7 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
 
             if (!this.canUseMultipleWriteLocations
                     && !this.isReadRequest
-                    && !this.isPertitionLevelFailoverEnabled)
+                    && !this.isPartitionLevelFailoverEnabled)
             {
                 // Write requests on single master cannot be retried if partition level failover is disabled.
                 // This means there are no other regions available to serve the writes.
@@ -506,6 +496,30 @@ private ShouldRetryResult ShouldRetryOnServiceUnavailable()
             return ShouldRetryResult.RetryAfter(TimeSpan.Zero);
         }
 
+        /// <summary>
+        /// Marks the endpoint of the current partition key range as unavailable, which will influence future routing decisions, when a request
+        /// is present and either the flag below is set or the request passes the automatic failover or circuit breaker eligibility checks.
+        /// </summary>
+        /// <param name="isSystemResourceUnavailableForWrite">A boolean flag indicating if the system resource was unavailable. If true,
+        /// the endpoint will be marked unavailable for the pk-range of a multi master write request, bypassing the failover and circuit breaker eligibility checks.</param>
+        /// <returns>The result of the mark attempt on the partition key range location cache, or false when no attempt was made.</returns>
+        private bool TryMarkEndpointUnavailableForPkRange(
+            bool isSystemResourceUnavailableForWrite)
+        {
+            if (this.documentServiceRequest != null
+                && (isSystemResourceUnavailableForWrite
+                || this.IsRequestEligibleForPerPartitionAutomaticFailover()
+                || this.IsRequestEligibleForPartitionLevelCircuitBreaker()))
+            {
+                // The checks above short-circuit: the circuit breaker check is evaluated only when the two conditions before it are false.
+                // Mark the partition as unavailable and let the ClientRetry logic decide if the request should be retried.
+                return this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
+                    request: this.documentServiceRequest);
+            }
+
+            return false;
+        }
+
         /// <summary>
         /// Returns a boolean flag indicating if the endpoint should be marked as unavailable
         /// due to a 429 response with a sub status code of 3092 (system resource unavailable).
@@ -524,6 +538,32 @@ private bool ShouldMarkEndpointUnavailableOnSystemResourceUnavailableForWrite(
                 && subStatusCode == SubStatusCodes.SystemResourceUnavailable;
         }
 
+        /// <summary>
+        /// Determines if the current request is eligible for per-partition automatic failover by asking the partition key range location cache.
+        /// A request is eligible if it is a write request, partition level failover is enabled,
+        /// and the global endpoint manager cannot use multiple write locations for the request.
+        /// </summary>
+        /// <returns>True if the request is eligible for per-partition automatic failover, otherwise false.</returns>
+        private bool IsRequestEligibleForPerPartitionAutomaticFailover()
+        {
+            return this.partitionKeyRangeLocationCache.IsRequestEligibleForPerPartitionAutomaticFailover(
+                this.documentServiceRequest);
+        }
+
+        /// <summary>
+        /// Determines if the current request is eligible for partition-level circuit breaker. The failure counter call runs only when the eligibility check passes.
+        /// This method checks if the request is a read-only request or a multi master write request, if partition-level circuit breaker is enabled,
+        /// and if the partition key range location cache indicates that the partition can fail over based on the number of request failures.
+        /// </summary>
+        /// <returns>
+        /// True if the request is eligible for partition-level circuit breaker and the partition can fail over, otherwise false.
+        /// </returns>
+        private bool IsRequestEligibleForPartitionLevelCircuitBreaker()
+        {
+            return this.partitionKeyRangeLocationCache.IsRequestEligibleForPartitionLevelCircuitBreaker(this.documentServiceRequest)
+                        && this.partitionKeyRangeLocationCache.IncrementRequestFailureCounterAndCheckIfPartitionCanFailover(this.documentServiceRequest);
+        }
+
         private sealed class RetryContext
         {
             public int RetryLocationIndex { get; set; }
 
@@ -332,6 +332,12 @@ public bool EnablePartitionLevelFailover
             set;
         }
 
+        public bool EnablePartitionLevelCircuitBreaker
+        {
+            get;
+            set;
+        }
+
         /// <summary>
         /// Gets or sets the certificate validation callback.
         /// </summary>
 
@@ -729,7 +729,14 @@ public Func<HttpClient> HttpClientFactory
         /// <summary>
         /// Enable partition key level failover
         /// </summary>
-        internal bool EnablePartitionLevelFailover { get; set; } = ConfigurationManager.IsPartitionLevelFailoverEnabled(defaultValue: false);
+        internal bool EnablePartitionLevelFailover { get; set; } = ConfigurationManager.IsPartitionLevelFailoverEnabled(defaultValue: false);
+
+        /// <summary>
+        /// Enable partition level circuit breaker (aka PPCB). For the compute gateway use case, per partition automatic failover (PPAF) is disabled by default, and so is PPCB.
+        /// If the compute gateway chooses to enable PPAF, then the .NET SDK will enable PPCB by default, which will improve the read availability and latency. This means that
+        /// when PPAF is enabled, the SDK will automatically enable PPCB as well.
+        /// </summary>
+        internal bool EnablePartitionLevelCircuitBreaker { get; set; } = ConfigurationManager.IsPartitionLevelCircuitBreakerEnabled(defaultValue: false);
 
         /// <summary>
         /// Quorum Read allowed with eventual consistency account or consistent prefix account.
@@ -983,6 +990,7 @@ internal virtual ConnectionPolicy GetConnectionPolicy(int clientId)
                 MaxTcpConnectionsPerEndpoint = this.MaxTcpConnectionsPerEndpoint,
                 EnableEndpointDiscovery = !this.LimitToEndpoint,
                 EnablePartitionLevelFailover = this.EnablePartitionLevelFailover,
+                EnablePartitionLevelCircuitBreaker = this.EnablePartitionLevelFailover || this.EnablePartitionLevelCircuitBreaker,
                 PortReuseMode = this.portReuseMode,
                 EnableTcpConnectionEndpointRediscovery = this.EnableTcpConnectionEndpointRediscovery,
                 EnableAdvancedReplicaSelectionForTcp = this.EnableAdvancedReplicaSelectionForTcp,
@@ -1221,6 +1229,11 @@ internal string GetUserAgentSuffix()
                 featureFlag += (int)UserAgentFeatureFlags.PerPartitionAutomaticFailover;
             }
 
+            if (this.EnablePartitionLevelFailover || this.EnablePartitionLevelCircuitBreaker)
+            {
+                featureFlag += (int)UserAgentFeatureFlags.PerPartitionCircuitBreaker;
+            }
+
             if (featureFlag == 0)
             {
                 return this.ApplicationName;
 
@@ -16,5 +16,7 @@ namespace Microsoft.Azure.Cosmos
     internal enum UserAgentFeatureFlags
     {
         PerPartitionAutomaticFailover = 1,
+
+        PerPartitionCircuitBreaker = 2,
     }
 }
@@ -939,8 +939,11 @@ internal virtual void Initialize(Uri serviceEndpoint,
 #endif
 
             this.GlobalEndpointManager = new GlobalEndpointManager(this, this.ConnectionPolicy);
-            this.PartitionKeyRangeLocation = this.ConnectionPolicy.EnablePartitionLevelFailover
-                ? new GlobalPartitionEndpointManagerCore(this.GlobalEndpointManager)
+            this.PartitionKeyRangeLocation = this.ConnectionPolicy.EnablePartitionLevelFailover || this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker
+                ? new GlobalPartitionEndpointManagerCore(
+                    this.GlobalEndpointManager,
+                    this.ConnectionPolicy.EnablePartitionLevelFailover,
+                    this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker)
                 : GlobalPartitionEndpointManagerNoOp.Instance;
 
             this.httpClient = CosmosHttpClientCore.CreateWithConnectionPolicy(
 
@@ -13,7 +13,7 @@ internal sealed class RetryPolicy : IRetryPolicyFactory
         private readonly GlobalPartitionEndpointManager partitionKeyRangeLocationCache;
         private readonly GlobalEndpointManager globalEndpointManager;
         private readonly bool enableEndpointDiscovery;
-        private readonly bool isPertitionLevelFailoverEnabled;
+        private readonly bool isPartitionLevelFailoverEnabled;
         private readonly RetryOptions retryOptions;
 
         /// <summary>
@@ -25,7 +25,7 @@ public RetryPolicy(
             GlobalPartitionEndpointManager partitionKeyRangeLocationCache)
         {
             this.enableEndpointDiscovery = connectionPolicy.EnableEndpointDiscovery;
-            this.isPertitionLevelFailoverEnabled = connectionPolicy.EnablePartitionLevelFailover;
+            this.isPartitionLevelFailoverEnabled = connectionPolicy.EnablePartitionLevelFailover;
             this.globalEndpointManager = globalEndpointManager;
             this.retryOptions = connectionPolicy.RetryOptions;
             this.partitionKeyRangeLocationCache = partitionKeyRangeLocationCache;
@@ -41,7 +41,7 @@ public IDocumentClientRetryPolicy GetRequestPolicy()
                 this.partitionKeyRangeLocationCache,
                 this.retryOptions,
                 this.enableEndpointDiscovery,
-                this.isPertitionLevelFailoverEnabled);
+                this.isPartitionLevelFailoverEnabled);
 
             return clientRetryPolicy;
         }
Original file line number	Diff line number	Diff line change
`@@ -16,5 +16,7 @@ namespace Microsoft.Azure.Cosmos`
`16`	`16`	`internal enum UserAgentFeatureFlags`
`17`	`17`	`{`
`18`	`18`	`PerPartitionAutomaticFailover = 1,`
	`19`	`+`
	`20`	`+ PerPartitionCircuitBreaker = 2,`
`19`	`21`	`}`
`20`	`22`	`}`