@@ -34,7 +34,7 @@ const (
34
34
labelOperationStatus = "operation_status"
35
35
snapshotController = "snapshot_controller"
36
36
operationLatencyMetricName = "operation_total_seconds"
37
- operationLatencyMetricHelpMsg = "Total number of seconds spent by the snapshot controller for an operation to complete end to end"
37
+ operationLatencyMetricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end"
38
38
)
39
39
40
40
type OperationState string
@@ -45,8 +45,8 @@ const (
45
45
InterimFailure OperationState = "InterimFailure"
46
46
47
47
// TerminatingFailure is an operation state which means the controller has encountered
48
- // an error which it's not able to resume the operation and has marked a
49
- // permanent failure of the operation.
48
+ // an error which is not recoverable and the controller has marked a permanent
49
+ // failure of the operation.
50
50
TerminatingFailure OperationState = "TerminatingFailure"
51
51
52
52
// Success states that the controller has successfully executed the operation.
@@ -61,12 +61,16 @@ type MetricsManager interface {
61
61
// and serve HTTP requests received on addr/pattern
62
62
// if the "pattern" is empty (i.e., ""), no endpoint will be started. An error
63
63
// will be returned if there is any.
64
- StartServing (pattern , addr string , logger promhttp.Logger ) error
64
+ StartServing (pattern , addr string , logger promhttp.Logger , wg * sync. WaitGroup ) ( * http. Server , error )
65
65
66
66
// OperationStart takes in an operation and caches its start time.
67
67
// If the operation already exists, it is a no-op.
68
68
OperationStart (op Operation )
69
69
70
+ // DropOperation removes an operation from cache.
71
+ // If the operation does not exist, it is a no-op.
72
+ DropOperation (op Operation )
73
+
70
74
// RecordMetrics records a metric point. Note that it will be a no-op if an
71
75
// operation has not been marked "Started" previously via invoking "OperationStart".
72
76
// op - the Operation which the metric is associated with.
@@ -89,11 +93,23 @@ type Operation struct {
89
93
ResourceID types.UID
90
94
}
91
95
96
+ type operationTs struct {
97
+ // startTime is the timestamp when an operation has been picked up by the
98
+ // controller for processing
99
+ startTime time.Time
100
+
101
+ // lastFailTime is the timestamp of the last interim failure on an operation
102
+ // if this field is not set, i.e., lastFailTime.IsZero() returns true, it means
103
+ // the operation has never failed previously
104
+ lastFailTime time.Time
105
+ }
106
+
92
107
type operationMetricsManager struct {
93
108
// cache is a concurrent-safe map which stores start timestamps for all
94
109
// ongoing operations.
95
110
// key is an Operation
96
- // value is the Operation's start time
111
+ // value is the operationTs which records the start time and last failing time
112
+ // of the operation
97
113
cache sync.Map
98
114
99
115
// registry is a wrapper around Prometheus Registry
@@ -112,7 +128,14 @@ func NewMetricsManager() MetricsManager {
112
128
}
113
129
114
130
func (opMgr * operationMetricsManager ) OperationStart (op Operation ) {
115
- opMgr .cache .LoadOrStore (op , time .Now ())
131
+ opTs := operationTs {
132
+ startTime : time .Now (),
133
+ }
134
+ opMgr .cache .LoadOrStore (op , opTs )
135
+ }
136
+
137
+ func (opMgr * operationMetricsManager ) DropOperation (op Operation ) {
138
+ opMgr .cache .Delete (op )
116
139
}
117
140
118
141
func (opMgr * operationMetricsManager ) RecordMetrics (op Operation , state OperationState ) {
@@ -122,24 +145,34 @@ func (opMgr *operationMetricsManager) RecordMetrics(op Operation, state Operatio
122
145
// the operation has not been cached, return directly
123
146
return
124
147
}
125
- ts , ok := obj .(time. Time )
148
+ ts , ok := obj .(operationTs )
126
149
if ! ok {
127
150
// the cached item is not an operationTs, should NEVER happen, clean and return
128
151
klog .Errorf ("Invalid cache entry for key %v" , op )
129
152
opMgr .cache .Delete (op )
130
153
return
131
154
}
132
- duration := time .Since (ts ).Seconds ()
133
- opMgr .opLatencyMetrics .WithLabelValues (op .Driver , op .Name , string (state )).Observe (duration )
155
+ duration := time .Since (ts .startTime ).Seconds ()
134
156
switch state {
135
157
case Success , TerminatingFailure :
136
158
opMgr .cache .Delete (op )
137
159
case InterimFailure :
138
- // do nothing
160
+ if ! ts .lastFailTime .IsZero () {
161
+ // override duration
162
+ duration = time .Since (ts .lastFailTime ).Seconds ()
163
+ }
164
+ // override lastFailTime
165
+ newTs := operationTs {
166
+ startTime : ts .startTime ,
167
+ lastFailTime : time .Now (),
168
+ }
169
+ opMgr .cache .Store (op , newTs )
139
170
default :
140
171
// log error
141
172
klog .Errorf ("Not supported operation state %s" , state )
173
+ return
142
174
}
175
+ opMgr .opLatencyMetrics .WithLabelValues (op .Driver , op .Name , string (state )).Observe (duration )
143
176
}
144
177
145
178
func (opMgr * operationMetricsManager ) init () {
@@ -156,11 +189,11 @@ func (opMgr *operationMetricsManager) init() {
156
189
opMgr .registry .MustRegister (opMgr .opLatencyMetrics )
157
190
}
158
191
159
- func (opMgr * operationMetricsManager ) StartServing (pattern , addr string , logger promhttp.Logger ) error {
192
+ func (opMgr * operationMetricsManager ) StartServing (pattern , addr string , logger promhttp.Logger , wg * sync. WaitGroup ) ( * http. Server , error ) {
160
193
if addr == "" {
161
- return fmt .Errorf ("metrics endpoint will not be started as endpoint address is not specified" )
194
+ return nil , fmt .Errorf ("metrics endpoint will not be started as endpoint address is not specified" )
162
195
}
163
-
196
+ srv := & http. Server { Addr : addr }
164
197
http .Handle (pattern , k8smetrics .HandlerFor (
165
198
opMgr .registry ,
166
199
k8smetrics.HandlerOpts {
@@ -170,10 +203,10 @@ func (opMgr *operationMetricsManager) StartServing(pattern, addr string, logger
170
203
171
204
// start serving the endpoint
172
205
go func () {
173
- err := http . ListenAndServe ( addr , nil )
174
- if err != nil {
206
+ defer wg . Done ( )
207
+ if err := srv . ListenAndServe (); err != http . ErrServerClosed {
175
208
klog .Fatalf ("failed to start endpoint at:%s/%s, error: %v" , addr , pattern , err )
176
209
}
177
210
}()
178
- return nil
211
+ return srv , nil
179
212
}
0 commit comments