
Commit cc64092

Vault 6773/raft rejoin nonvoter (#16324)
* raft: Ensure init before setting suffrage

  As reported in https://hashicorp.atlassian.net/browse/VAULT-6773: the /sys/storage/raft/join endpoint is intended to be unauthenticated; we rely on the seal to manage trust. This made it possible to use repeated join requests to switch nodes from voter to non-voter. The reproducer stands up a 3-node cluster in which vault_2 is the leader and vault_3 and vault_4 are followers with non-voter status set to false, then sends two requests to the raft join endpoint asking vault_3 and vault_4 to rejoin the cluster with non_voter:true.

  This commit fixes the issue by delaying the call to SetDesiredSuffrage until after the initialization check, preventing unauthenticated mangling of voter status. Tested locally using https://github.com/hashicorp/vault-tools/blob/main/users/ncabatoff/cluster/raft.sh and the reproducer outlined in VAULT-6773.

* raft: Return join err on failure

  This is necessary to correctly distinguish errors returned from the Join workflow. Previously, errors were being masked as timeouts.

* raft: Default autopilot parameters in teststorage

  Change some defaults so we don't have to pass in parameters or set them in the originating tests. These storage types are only used in two places: 1) Raft HA testing and 2) seal migration testing. Both consumers have been tested and pass with this change.

* changelog: Unauthn voter status change bugfix
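For context, a rough sketch of the kind of unauthenticated request the reproducer sends. This is illustrative only: the addresses and ports are assumptions about a local dev cluster, not taken from the commit; the /sys/storage/raft/join endpoint and its non_voter parameter are the real API surface described above.

```go
// Illustrative reproducer sketch, not part of this commit. Assumes a local
// dev cluster with a follower's API listener on 127.0.0.1:8230 and the
// leader on 127.0.0.1:8210; adjust to your setup.
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// /sys/storage/raft/join is intentionally unauthenticated (trust is
	// managed by the seal). Before this fix, an already-joined follower
	// would still honor non_voter here, silently demoting itself.
	body, _ := json.Marshal(map[string]interface{}{
		"leader_api_addr": "http://127.0.0.1:8210", // assumed leader address
		"non_voter":       true,
	})
	resp, err := http.Post(
		"http://127.0.0.1:8230/v1/sys/storage/raft/join", // assumed follower address
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("join response:", resp.Status)
}
```

Sent once each to vault_3 and vault_4, this is the two-request sequence described above. After the fix, an initialized node returns from the join before SetDesiredSuffrage is ever called.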
1 parent 228a26c · commit cc64092

File tree: 4 files changed, +27 −14 lines

changelog/16324.txt

Lines changed: 3 additions & 0 deletions

````diff
@@ -0,0 +1,3 @@
+```release-note:bug
+storage/raft (enterprise): Prevent unauthenticated voter status change with rejoin
+```
````

helper/testhelpers/teststorage/teststorage.go

Lines changed: 5 additions & 3 deletions

```diff
@@ -137,9 +137,11 @@ func RaftHAFactory(f PhysicalBackendBundler) func(t testing.T, coreIdx int, logg
 
 		nodeID := fmt.Sprintf("core-%d", coreIdx)
 		backendConf := map[string]string{
-			"path":                   raftDir,
-			"node_id":                nodeID,
-			"performance_multiplier": "8",
+			"path":                         raftDir,
+			"node_id":                      nodeID,
+			"performance_multiplier":       "8",
+			"autopilot_reconcile_interval": "300ms",
+			"autopilot_update_interval":    "100ms",
		}
 
		// Create and set the HA Backend
```

helper/testhelpers/teststorage/teststorage_reusable.go

Lines changed: 5 additions & 4 deletions

```diff
@@ -18,7 +18,6 @@ import (
 // seal migration, wherein a given physical backend must be re-used as several
 // test clusters are sequentially created, tested, and discarded.
 type ReusableStorage struct {
-
 	// IsRaft specifies whether the storage is using a raft backend.
 	IsRaft bool
 
@@ -169,9 +168,11 @@ func makeRaftDir(t testing.T) string {
 func makeReusableRaftBackend(t testing.T, coreIdx int, logger hclog.Logger, raftDir string, addressProvider raftlib.ServerAddressProvider, ha bool) *vault.PhysicalBackendBundle {
 	nodeID := fmt.Sprintf("core-%d", coreIdx)
 	conf := map[string]string{
-		"path":                   raftDir,
-		"node_id":                nodeID,
-		"performance_multiplier": "8",
+		"path":                         raftDir,
+		"node_id":                      nodeID,
+		"performance_multiplier":       "8",
+		"autopilot_reconcile_interval": "300ms",
+		"autopilot_update_interval":    "100ms",
 	}
 
 	backend, err := raft.NewRaftBackend(conf, logger)
```
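A side note on the new test defaults in both files above: the autopilot settings are duration strings. A minimal sanity-check sketch, assuming the backend accepts standard Go duration syntax (the exact parsing inside raft.NewRaftBackend isn't shown in this diff):

```go
// Quick check that the interval strings used in the test configs are valid
// Go duration literals. Illustrative only, not part of this commit.
package main

import (
	"fmt"
	"time"
)

func main() {
	conf := map[string]string{
		"autopilot_reconcile_interval": "300ms",
		"autopilot_update_interval":    "100ms",
	}
	for k, v := range conf {
		d, err := time.ParseDuration(v)
		if err != nil {
			panic(fmt.Sprintf("%s: %v", k, err))
		}
		fmt.Printf("%s parses to %s\n", k, d)
	}
}
```

Tightening these intervals makes autopilot reconcile quickly in tests, which is why the originating tests no longer need to pass the parameters themselves.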

vault/raft.go

Lines changed: 14 additions & 7 deletions

```diff
@@ -835,11 +835,6 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 		return false, errors.New("raft backend not in use")
 	}
 
-	if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
-		c.logger.Error("failed to set desired suffrage for this node", "error", err)
-		return false, nil
-	}
-
 	init, err := c.InitializedLocally(ctx)
 	if err != nil {
 		return false, fmt.Errorf("failed to check if core is initialized: %w", err)
@@ -963,10 +958,21 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 		if err != nil {
 			return fmt.Errorf("failed to check if core is initialized: %w", err)
 		}
-		if init && !isRaftHAOnly {
+
+		// InitializedLocally will return non-nil before HA backends are
+		// initialized. c.Initialized(ctx) checks InitializedLocally first, so
+		// we can't use that generically for both cases. Instead check
+		// raftBackend.Initialized() directly for the HA-Only case.
+		if (!isRaftHAOnly && init) || (isRaftHAOnly && raftBackend.Initialized()) {
 			c.logger.Info("returning from raft join as the node is initialized")
 			return nil
 		}
+
+		if err := raftBackend.SetDesiredSuffrage(nonVoter); err != nil {
+			c.logger.Error("failed to set desired suffrage for this node", "error", err)
+			return nil
+		}
+
 		challengeCh := make(chan *raftInformation)
 		var expandedJoinInfos []*raft.LeaderJoinInfo
 		for _, leaderInfo := range leaderInfos {
@@ -1001,6 +1007,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 
 		select {
 		case <-ctx.Done():
+			err = ctx.Err()
 		case raftInfo := <-challengeCh:
 			if raftInfo != nil {
 				err = answerChallenge(ctx, raftInfo)
@@ -1009,7 +1016,7 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 				}
 			}
 		}
-		return fmt.Errorf("timed out on raft join: %w", ctx.Err())
+		return err
 	}
 
 	switch retryFailures {
```
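The last two hunks implement the "Return join err on failure" fix: previously every failure out of this select was wrapped as "timed out on raft join", masking real errors from the challenge-answer workflow. A minimal, self-contained sketch of the corrected pattern follows; the names here are hypothetical, not Vault's.

```go
// Illustrative sketch of the select/error pattern from the last two hunks:
// propagate the real cause instead of wrapping every failure as a timeout.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// doJoin mimics the challenge-answer step; the name is hypothetical.
func doJoin(ctx context.Context, resultCh <-chan error) error {
	var err error
	select {
	case <-ctx.Done():
		// Record the context error explicitly; pre-fix, this branch fell
		// through and the caller wrapped whatever happened as a timeout.
		err = ctx.Err()
	case err = <-resultCh:
		// A real error (or nil) from the join workflow.
	}
	return err // the caller now sees the actual cause
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	resultCh := make(chan error, 1)
	resultCh <- errors.New("challenge rejected by leader") // a join-workflow error

	fmt.Println(doJoin(ctx, resultCh)) // prints the join error, not a timeout
}
```

Here the join-workflow error reaches the caller intact; with the pre-fix code it would have been reported as a timeout regardless of the real cause.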
