@@ -31,6 +31,9 @@ const (
31
31
// DefaultSnapshotInterval is the default interval for creating snapshots (in seconds)
32
32
DefaultSnapshotInterval = 24 * time .Hour
33
33
34
+ // DefaultGarbageCollectionInterval defines how often garbage collection should happen per cache
35
+ DefaultGarbageCollectionInterval = 5 * time .Minute
36
+
34
37
// DefaultMaxFilesToList defines the default maximum number of files to list in a single operation, set to 1000.
35
38
DefaultMaxFilesToList int64 = 1000
36
39
)
@@ -54,6 +57,9 @@ type Config struct {
54
57
// SnapshotInterval is the interval for creating snapshots (in seconds)
55
58
SnapshotInterval time.Duration
56
59
60
+ // GarbageCollectionInterval defines the duration between automatic GC operation per cache
61
+ GarbageCollectionInterval time.Duration
62
+
57
63
// Addresses is a list of node addresses that this node will advertise to clients
58
64
Addresses []string
59
65
}
@@ -84,6 +90,7 @@ type Service struct {
84
90
errScalingCounter stats.Counter
85
91
errWrongNodeCounter stats.Counter
86
92
errInternalCounter stats.Counter
93
+ gcDuration stats.Histogram
87
94
}
88
95
}
89
96
@@ -102,6 +109,9 @@ type Cache interface {
102
109
// LoadSnapshot reads the cache contents from the provided reader
103
110
LoadSnapshot (r io.Reader ) error
104
111
112
+ // RunGarbageCollection is designed to do GC while the cache is online
113
+ RunGarbageCollection ()
114
+
105
115
// Close releases any resources associated with the cache and ensures proper cleanup. Returns an error if the operation fails.
106
116
Close () error
107
117
}
@@ -130,11 +140,12 @@ func NewService(
130
140
if config .TotalHashRanges == 0 {
131
141
config .TotalHashRanges = DefaultTotalHashRanges
132
142
}
133
-
134
143
if config .SnapshotInterval == 0 {
135
144
config .SnapshotInterval = DefaultSnapshotInterval
136
145
}
137
-
146
+ if config .GarbageCollectionInterval == 0 {
147
+ config .GarbageCollectionInterval = DefaultGarbageCollectionInterval
148
+ }
138
149
if config .MaxFilesToList == 0 {
139
150
config .MaxFilesToList = DefaultMaxFilesToList
140
151
}
@@ -150,7 +161,9 @@ func NewService(
150
161
logger : log .Withn (
151
162
logger .NewIntField ("nodeId" , int64 (config .NodeID )),
152
163
logger .NewIntField ("totalHashRanges" , int64 (config .TotalHashRanges )),
153
- logger .NewIntField ("snapshotInterval" , int64 (config .SnapshotInterval .Seconds ())),
164
+ logger .NewDurationField ("snapshotInterval" , config .SnapshotInterval ),
165
+ logger .NewDurationField ("garbageCollectionInterval" , config .GarbageCollectionInterval ),
166
+ logger .NewIntField ("maxFilesToList" , config .MaxFilesToList ),
154
167
),
155
168
}
156
169
@@ -160,6 +173,7 @@ func NewService(
160
173
service .metrics .errInternalCounter = stat .NewTaggedStat ("keydb_err_internal_count" , stats .CountType , statsTags )
161
174
service .metrics .getKeysCounters = make (map [uint32 ]stats.Counter )
162
175
service .metrics .putKeysCounter = make (map [uint32 ]stats.Counter )
176
+ service .metrics .gcDuration = stat .NewTaggedStat ("keydb_gc_duration_seconds" , stats .HistogramType , statsTags )
163
177
164
178
// Initialize caches for all hash ranges this node handles
165
179
if err := service .initCaches (ctx ); err != nil {
@@ -169,6 +183,8 @@ func NewService(
169
183
// Start background snapshot creation
170
184
service .waitGroup .Add (1 )
171
185
go service .snapshotLoop (ctx )
186
+ service .waitGroup .Add (1 )
187
+ go service .garbageCollection (ctx )
172
188
173
189
return service , nil
174
190
}
@@ -202,6 +218,7 @@ func (s *Service) snapshotLoop(ctx context.Context) {
202
218
}
203
219
if s .now ().Sub (s .lastSnapshotTime ) < s .config .SnapshotInterval { // TODO write a test for this
204
220
// we created a snapshot already recently due to a scaling operation
221
+ s .logger .Debugn ("Skipping snapshot due to scaling operation" )
205
222
return
206
223
}
207
224
if err := s .createSnapshots (ctx ); err != nil {
@@ -212,6 +229,46 @@ func (s *Service) snapshotLoop(ctx context.Context) {
212
229
}
213
230
}
214
231
232
+ func (s * Service ) garbageCollection (ctx context.Context ) {
233
+ defer s .waitGroup .Done ()
234
+
235
+ ticker := time .NewTicker (s .config .GarbageCollectionInterval )
236
+ defer ticker .Stop ()
237
+
238
+ for {
239
+ select {
240
+ case <- ctx .Done ():
241
+ return
242
+ case <- ticker .C :
243
+ func () {
244
+ s .mu .Lock () // TODO this might affect scaling operations
245
+ defer s .mu .Unlock ()
246
+
247
+ if s .scaling {
248
+ s .logger .Warnn ("Skipping garbage collection while scaling" )
249
+ return
250
+ }
251
+
252
+ start := time .Now ()
253
+ s .logger .Infon ("Running garbage collection" , logger .NewIntField ("noOfCaches" , int64 (len (s .caches ))))
254
+ defer func () {
255
+ elapsed := time .Since (start )
256
+ s .logger .Infon ("Garbage collection finished" ,
257
+ logger .NewIntField ("noOfCaches" , int64 (len (s .caches ))),
258
+ logger .NewDurationField ("duration" , elapsed ),
259
+ )
260
+ s .metrics .gcDuration .Observe (elapsed .Seconds ())
261
+ }()
262
+
263
+ for _ , cache := range s .caches {
264
+ // For now let's just run one GC at a time to avoid overwhelming the CPU
265
+ cache .RunGarbageCollection ()
266
+ }
267
+ }()
268
+ }
269
+ }
270
+ }
271
+
215
272
// initCaches initializes the caches for all hash ranges this node handles
216
273
func (s * Service ) initCaches (ctx context.Context ) error {
217
274
// Get hash ranges for this node
@@ -542,6 +599,7 @@ func (s *Service) ScaleComplete(_ context.Context, _ *pb.ScaleCompleteRequest) (
542
599
}
543
600
544
601
// CreateSnapshot implements the CreateSnapshot RPC method
602
+ // TODO should we trigger a Garbage Collection before taking a snapshot?
545
603
// TODO this can be optimized a lot! For example we could snapshot on local disk every 10s and work only on the head
546
604
// and tail of the file (i.e. remove expired from head and append new entries).
547
605
// Then once a minute we can upload the whole file to S3.
0 commit comments