@@ -43,7 +43,8 @@ import (
4343
4444const (
4545 CheckAndRecoverGenericProblemRecoveryName string = "CheckAndRecoverGenericProblem"
46- RestartDirectReplicasRecoveryName string = "RestartDirectReplicas"
46+ RestartArbitraryDirectReplicaRecoveryName string = "RestartArbitraryDirectReplica"
47+ RestartAllDirectReplicasRecoveryName string = "RestartAllDirectReplicas"
4748 RecoverDeadPrimaryRecoveryName string = "RecoverDeadPrimary"
4849 RecoverPrimaryTabletDeletedRecoveryName string = "RecoverPrimaryTabletDeleted"
4950 RecoverPrimaryHasPrimaryRecoveryName string = "RecoverPrimaryHasPrimary"
@@ -104,7 +105,8 @@ type recoveryFunction int
104105const (
105106 noRecoveryFunc recoveryFunction = iota
106107 recoverGenericProblemFunc
107- restartDirectReplicasFunc
108+ restartArbitraryDirectReplicaFunc
109+ restartAllDirectReplicasFunc
108110 recoverDeadPrimaryFunc
109111 recoverPrimaryTabletDeletedFunc
110112 recoverPrimaryHasPrimaryFunc
@@ -351,8 +353,16 @@ func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.Repl
351353 return false , nil , nil
352354}
353355
// restartArbitraryDirectReplica is the recovery entry point that delegates to
// restartDirectReplicas with maxReplicas set to 1. It takes the same
// (ctx, analysisEntry, logger) arguments as the other recovery functions and
// returns restartDirectReplicas' (bool, *TopologyRecovery, error) result unchanged.
//
// NOTE(review): in restartDirectReplicas the maxReplicas cutoff is compared
// against the index into the shuffled tablet list, so a value of 1 appears to
// bound the number of tablets visited rather than the number of direct replicas
// actually restarted — confirm that the single visited tablet is guaranteed to
// be a direct replica of the primary, otherwise no replica may be restarted.
356+ func restartArbitraryDirectReplica (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
357+ return restartDirectReplicas (ctx , analysisEntry , 1 , logger )
358+ }
359+
// restartAllDirectReplicas is the recovery entry point that delegates to
// restartDirectReplicas with maxReplicas set to 0. The limit check in
// restartDirectReplicas only applies when maxReplicas > 0, so passing 0 means
// the tablet loop is not cut short. The (bool, *TopologyRecovery, error) result
// of restartDirectReplicas is returned unchanged.
360+ func restartAllDirectReplicas (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
361+ return restartDirectReplicas (ctx , analysisEntry , 0 , logger )
362+ }
363+
354364// restartDirectReplicas restarts replication on direct replicas of an unreachable primary
355- func restartDirectReplicas (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
365+ func restartDirectReplicas (ctx context.Context , analysisEntry * inst.ReplicationAnalysis , maxReplicas int , logger * log.PrefixedLogger ) (bool , * TopologyRecovery , error ) {
356366 topologyRecovery , err := AttemptRecoveryRegistration (analysisEntry )
357367 if topologyRecovery == nil {
358368 message := fmt .Sprintf ("found an active or recent recovery on %+v. Will not issue another restartDirectReplicas." , analysisEntry .AnalyzedInstanceAlias )
@@ -399,8 +409,14 @@ func restartDirectReplicas(ctx context.Context, analysisEntry *inst.ReplicationA
399409 eg , _ := errgroup .WithContext (ctx )
400410 var restartExpected int
401411 var restartPerformed atomic.Int64
402- // Iterate through all tablets and find direct replicas of the primary
403- for _ , tabletInfo := range tablets {
412+ // Iterate through the tablets in random order and find direct replicas of the primary.
413+ // The shuffle is intentional: when maxReplicas is non-zero the loop stops early, so a
414+ // random order avoids always biasing towards the same replicas.
415+ for i , tabletIndex := range rand .Perm (len (tablets )) {
416+ if maxReplicas > 0 && i >= maxReplicas {
417+ break
418+ }
419+ tabletInfo := tablets [tabletIndex ]
404420 tablet := tabletInfo .Tablet
405421 tabletAlias := topoproto .TabletAliasString (tablet .Alias )
406422
@@ -524,7 +540,9 @@ func getCheckAndRecoverFunctionCode(analysisEntry *inst.ReplicationAnalysis) (re
524540 case inst .DeadPrimaryAndReplicas :
525541 recoveryFunc = recoverGenericProblemFunc
526542 case inst .UnreachablePrimary :
527- recoveryFunc = restartDirectReplicasFunc
543+ recoveryFunc = restartArbitraryDirectReplicaFunc
544+ case inst .UnreachablePrimaryWithBrokenReplicas :
545+ recoveryFunc = restartAllDirectReplicasFunc
528546 case inst .UnreachablePrimaryWithLaggingReplicas :
529547 recoveryFunc = recoverGenericProblemFunc
530548 case inst .AllPrimaryReplicasNotReplicating :
@@ -549,7 +567,9 @@ func hasActionableRecovery(recoveryFunctionCode recoveryFunction) bool {
549567 return false
550568 case recoverGenericProblemFunc :
551569 return false
552- case restartDirectReplicasFunc :
570+ case restartArbitraryDirectReplicaFunc :
571+ return true
572+ case restartAllDirectReplicasFunc :
553573 return true
554574 case recoverDeadPrimaryFunc :
555575 return true
@@ -581,8 +601,10 @@ func getCheckAndRecoverFunction(recoveryFunctionCode recoveryFunction) (
581601 return nil
582602 case recoverGenericProblemFunc :
583603 return checkAndRecoverGenericProblem
584- case restartDirectReplicasFunc :
585- return restartDirectReplicas
604+ case restartArbitraryDirectReplicaFunc :
605+ return restartArbitraryDirectReplica
606+ case restartAllDirectReplicasFunc :
607+ return restartAllDirectReplicas
586608 case recoverDeadPrimaryFunc :
587609 return recoverDeadPrimary
588610 case recoverPrimaryTabletDeletedFunc :
@@ -612,8 +634,10 @@ func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string {
612634 return ""
613635 case recoverGenericProblemFunc :
614636 return CheckAndRecoverGenericProblemRecoveryName
615- case restartDirectReplicasFunc :
616- return RestartDirectReplicasRecoveryName
637+ case restartArbitraryDirectReplicaFunc :
638+ return RestartArbitraryDirectReplicaRecoveryName
639+ case restartAllDirectReplicasFunc :
640+ return RestartAllDirectReplicasRecoveryName
617641 case recoverDeadPrimaryFunc :
618642 return RecoverDeadPrimaryRecoveryName
619643 case recoverPrimaryTabletDeletedFunc :
0 commit comments