Skip to content

Commit 58c1330

Browse files
committed
Test out a more sophisticated approach to the readiness probe
1 parent dd35208 commit 58c1330

File tree

3 files changed

+110
-4
lines changed

3 files changed

+110
-4
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/bin/bash
2+
3+
# Valkey Node Readiness Check for Kubernetes
4+
# Returns 0 (ready) if any of these conditions are met:
5+
# 1. Cluster state is "ok"
6+
# 2. Node has zero slots allocated
7+
# 3. Valkey's uptime_in_seconds has reached TIMEOUT_SECONDS (default: 300)
8+
9+
set -x
10+
11+
# shellcheck source=./utils.sh
12+
. /scripts/utils.sh
13+
14+
# Configuration
15+
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-300}"
16+
17+
# check_timeout: succeed once Valkey's reported uptime reaches TIMEOUT_SECONDS.
#
# Reads the uptime_in_seconds field from the `INFO server` reply obtained
# through valkey_cli (helper sourced from /scripts/utils.sh; its argument
# handling is not visible in this script).
#
# Returns:
#   0 - uptime >= TIMEOUT_SECONDS (a "passed" message is written to stdout)
#   1 - uptime is below the threshold, or it could not be read as an integer
#       (a warning is written to stderr in the latter case)
check_timeout() {
    local info_output
    info_output=$(valkey_cli 127.0.0.1 6379 -t 1 -c INFO server 2>/dev/null || echo "")

    # Extract uptime_in_seconds from INFO output; drop any carriage return so
    # the value can be compared as a plain integer.
    local uptime
    uptime=$(echo "$info_output" | grep "^uptime_in_seconds:" | cut -d: -f2 | tr -d '\r')

    # Reject empty or non-numeric values up front: feeding them to `[ -ge ]`
    # would only produce an "integer expression expected" error.
    case "$uptime" in
        '' | *[!0-9]*)
            echo "Warning: Could not retrieve Valkey uptime" >&2
            return 1
            ;;
    esac

    # Both operands are quoted so an unusual TIMEOUT_SECONDS value cannot be
    # word-split into extra arguments to `[`.
    if [ "$uptime" -ge "$TIMEOUT_SECONDS" ]; then
        echo "Readiness check passed: Valkey uptime of ${uptime}s exceeds timeout of ${TIMEOUT_SECONDS}s"
        return 0
    fi
    return 1
}
37+
38+
# check_cluster_state: succeed when the CLUSTER INFO reply contains
# "cluster_state:ok".
#
# The reply is fetched through valkey_cli (helper sourced from
# /scripts/utils.sh); a failed call is treated as an empty reply.
#
# Returns:
#   0 - the reply contains cluster_state:ok (a "passed" message is printed)
#   1 - otherwise
check_cluster_state() {
    local reply
    reply=$(valkey_cli 127.0.0.1 6379 -t 1 -c CLUSTER INFO 2>/dev/null || echo "")

    # Substring match anywhere in the reply.
    case "$reply" in
        *cluster_state:ok*)
            echo "Readiness check passed: cluster state is ok"
            return 0
            ;;
    esac

    return 1
}
49+
50+
# check_slots: succeed when this node has no slots listed in CLUSTER NODES.
#
# The node's own entry is the line containing "myself". Each entry has eight
# fixed whitespace-separated fields (id, address, flags, primary, ping-sent,
# pong-recv, config-epoch, link-state); slot numbers, slot ranges and
# bracketed migration entries follow from the 9th field onward.
#
# Only those trailing fields are inspected. Matching digits anywhere on the
# line would always succeed, because the node id, address and epoch fields
# contain digits, and the zero-slots condition could then never be met.
#
# Returns:
#   0 - no slot entry found for this node (a "passed" message is printed)
#   1 - the node has at least one slot entry, or its line could not be found
#       (a warning is written to stderr in the latter case)
check_slots() {
    local nodes_info
    nodes_info=$(valkey_cli 127.0.0.1 6379 -t 1 -c CLUSTER NODES 2>/dev/null || echo "")

    # Find the current node (marked with "myself")
    local myself_line
    myself_line=$(echo "$nodes_info" | grep "myself" || echo "")

    if [ -z "$myself_line" ]; then
        echo "Warning: Could not find current node in cluster nodes output" >&2
        return 1
    fi

    # Split the (first) matching line into fields, dropping any carriage
    # return so the last field is clean.
    local -a fields
    read -r -a fields <<< "${myself_line//$'\r'/}"

    # A slot entry starts with a digit, optionally preceded by "[".
    local slot_re='^\[?[0-9]'
    local i
    # Array index 8 is the 9th field, the first position a slot can occupy.
    for ((i = 8; i < ${#fields[@]}; i++)); do
        if [[ "${fields[i]}" =~ $slot_re ]]; then
            return 1
        fi
    done

    echo "Readiness check passed: node has zero slots allocated"
    return 0
}
73+
74+
# main: run the readiness checks in order and exit with the probe result.
#
# The checks run in this order: timeout elapsed, cluster state ok, zero slots
# allocated. The script exits 0 as soon as one of them succeeds; later checks
# are not run. If none succeeds, a failure message is printed and the script
# exits 1.
main() {
    local check
    for check in check_timeout check_cluster_state check_slots; do
        if "$check"; then
            exit 0
        fi
    done

    # None of the conditions met
    echo "Readiness check failed: waiting for cluster state ok, zero slots, or timeout"
    exit 1
}
95+
96+
main

internal/controller/valkeycluster_controller_configmap.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,18 @@ func (r *ValkeyClusterReconciler) upsertConfigMap(ctx context.Context, valkeyClu
4040
logger.Error(err, "failed to read utils.sh")
4141
return "", err
4242
}
43+
readiness, err := scripts.ReadFile("scripts/readiness.sh")
44+
if err != nil {
45+
logger.Error(err, "failed to read readiness.sh")
46+
return "", err
47+
}
4348
ls := labelsForValkeyCluster(valkeyCluster.Name)
4449
cmData := map[string]string{
4550
"pre_stop.sh": string(preStop),
4651
"post_start.sh": string(postStart),
4752
"meet.sh": string(meet),
4853
"utils.sh": string(utils),
54+
"readiness.sh": string(readiness),
4955
}
5056
valkeyConfContent, err := getValkeyConfigContent(valkeyCluster)
5157
if err != nil {

internal/controller/valkeycluster_controller_statefulset.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,15 @@ func (r *ValkeyClusterReconciler) statefulSet(name string, size int32, valkeyClu
169169
},
170170
ReadinessProbe: &corev1.Probe{
171171
ProbeHandler: corev1.ProbeHandler{
172-
TCPSocket: &corev1.TCPSocketAction{
173-
Port: intstr.FromInt(VALKEY_PORT),
172+
Exec: &corev1.ExecAction{
173+
Command: []string{"/bin/bash", "/scripts/readiness.sh"},
174174
},
175175
},
176176
InitialDelaySeconds: valkeyCluster.Spec.InitialDelaySeconds,
177+
TimeoutSeconds: 5,
178+
PeriodSeconds: 10,
179+
SuccessThreshold: 1,
180+
FailureThreshold: 30,
177181
},
178182
LivenessProbe: &corev1.Probe{
179183
ProbeHandler: corev1.ProbeHandler{
@@ -533,8 +537,8 @@ func (r *ValkeyClusterReconciler) compareActualToDesiredStatefulSet(ctx context.
533537
log.Info(fmt.Sprintf("StatefulSet %s Env is different: %s", stsName, cmp.Diff(actual.Spec.Template.Spec.Containers[0].Env, desired.Spec.Template.Spec.Containers[0].Env)))
534538
diff = true
535539
}
536-
if !cmp.Equal(actual.Spec.Template.Spec.Containers[0].ReadinessProbe.InitialDelaySeconds, desired.Spec.Template.Spec.Containers[0].ReadinessProbe.InitialDelaySeconds) {
537-
log.Info(fmt.Sprintf("StatefulSet %s ReadinessProbe.InitialDelaySeconds is different: %s", stsName, cmp.Diff(actual.Spec.Template.Spec.Containers[0].ReadinessProbe.InitialDelaySeconds, desired.Spec.Template.Spec.Containers[0].ReadinessProbe.InitialDelaySeconds)))
540+
if !cmp.Equal(actual.Spec.Template.Spec.Containers[0].ReadinessProbe, desired.Spec.Template.Spec.Containers[0].ReadinessProbe) {
541+
log.Info(fmt.Sprintf("StatefulSet %s ReadinessProbe is different: %s", stsName, cmp.Diff(actual.Spec.Template.Spec.Containers[0].ReadinessProbe, desired.Spec.Template.Spec.Containers[0].ReadinessProbe)))
538542
diff = true
539543
}
540544

0 commit comments

Comments
 (0)