chore: garbage collection

fracasula · fracasula · commit 64bb247e3261 · 2025-07-03T17:57:13.000+02:00
diff --git a/cmd/node/config.go b/cmd/node/config.go
@@ -9,4 +9,9 @@ var defaultHistogramBuckets = []float64{
 	300 /* 5 mins */, 600 /* 10 mins */, 1800, /* 30 mins */
 }
 
-var customBuckets = map[string][]float64{}
+var customBuckets = map[string][]float64{
+	"keydb_gc_duration_seconds": {
+		// Bucket boundaries in seconds: 2ms, 5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s, 20s, 30s, 1m, 5m, 10m, 30m
+		0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 60, 300, 600, 1800,
+	},
+}
diff --git a/cmd/node/main.go b/cmd/node/main.go
@@ -101,6 +101,7 @@ func run(ctx context.Context, cancel func(), conf *config.Config, stat stats.Sta
 		logger.NewIntField("clusterSize", int64(nodeConfig.ClusterSize)),
 		logger.NewIntField("totalHashRanges", int64(nodeConfig.TotalHashRanges)),
 		logger.NewDurationField("snapshotInterval", nodeConfig.SnapshotInterval),
+		logger.NewDurationField("gcInterval", nodeConfig.GarbageCollectionInterval),
 		logger.NewStringField("nodeAddresses", fmt.Sprintf("%+v", nodeConfig.Addresses)),
 		logger.NewIntField("noOfAddresses", int64(len(nodeConfig.Addresses))),
 	)
diff --git a/internal/cache/badger/badger.go b/internal/cache/badger/badger.go
@@ -24,6 +24,7 @@ var ErrSnapshotInProgress = errors.New("snapshotting already in progress")
 type Cache struct {
 	cache            *badger.DB
 	compress         bool
+	discardRatio     float64
 	snapshotSince    uint64
 	snapshotting     bool
 	snapshottingLock sync.Mutex
@@ -83,8 +84,9 @@ func New(path string, conf *config.Config, log logger.Logger) (*Cache, error) {
 		return nil, err
 	}
 	return &Cache{
-		cache:    db,
-		compress: compress,
+		cache:        db,
+		compress:     compress,
+		discardRatio: conf.GetFloat64("BadgerDB.Dedup.DiscardRatio", 0.7),
 	}, nil
 }
 
@@ -233,6 +235,14 @@ func (c *Cache) LoadSnapshot(r io.Reader) error {
 	return c.cache.Load(r, 16)
 }
 
+func (c *Cache) RunGarbageCollection() {
+again: // see https://dgraph.io/docs/badger/get-started/#garbage-collection
+	err := c.cache.RunValueLogGC(c.discardRatio)
+	if err == nil {
+		goto again
+	}
+}
+
 func (c *Cache) Close() error {
 	return c.cache.Close()
 }
diff --git a/internal/cache/cachettl/cachettl.go b/internal/cache/cachettl/cachettl.go
@@ -59,6 +59,10 @@ func (c *Cache) LoadSnapshot(r io.Reader) error {
 	return cachettl.LoadSnapshot(r, c.cache)
 }
 
+// RunGarbageCollection is a no-op: there is no need to implement it since we won't be using cachettl.
+func (c *Cache) RunGarbageCollection() {
+}
+
 // Close releases any resources held by the cache and performs any necessary cleanup.
 func (c *Cache) Close() error { // TODO implementation
 	return nil
diff --git a/node/node.go b/node/node.go
@@ -31,6 +31,9 @@ const (
 	// DefaultSnapshotInterval is the default interval for creating snapshots (in seconds)
 	DefaultSnapshotInterval = 24 * time.Hour
 
+	// DefaultGarbageCollectionInterval defines how often garbage collection should happen per cache
+	DefaultGarbageCollectionInterval = 5 * time.Minute
+
 	// DefaultMaxFilesToList defines the default maximum number of files to list in a single operation, set to 1000.
 	DefaultMaxFilesToList int64 = 1000
 )
@@ -54,6 +57,9 @@ type Config struct {
 	// SnapshotInterval is the interval for creating snapshots (in seconds)
 	SnapshotInterval time.Duration
 
+	// GarbageCollectionInterval defines the duration between automatic GC operations on each cache
+	GarbageCollectionInterval time.Duration
+
 	// Addresses is a list of node addresses that this node will advertise to clients
 	Addresses []string
 }
@@ -84,6 +90,7 @@ type Service struct {
 		errScalingCounter   stats.Counter
 		errWrongNodeCounter stats.Counter
 		errInternalCounter  stats.Counter
+		gcDuration          stats.Histogram
 	}
 }
 
@@ -102,6 +109,9 @@ type Cache interface {
 	// LoadSnapshot reads the cache contents from the provided reader
 	LoadSnapshot(r io.Reader) error
 
+	// RunGarbageCollection is designed to do GC while the cache is online
+	RunGarbageCollection()
+
 	// Close releases any resources associated with the cache and ensures proper cleanup. Returns an error if the operation fails.
 	Close() error
 }
@@ -130,11 +140,12 @@ func NewService(
 	if config.TotalHashRanges == 0 {
 		config.TotalHashRanges = DefaultTotalHashRanges
 	}
-
 	if config.SnapshotInterval == 0 {
 		config.SnapshotInterval = DefaultSnapshotInterval
 	}
-
+	if config.GarbageCollectionInterval == 0 {
+		config.GarbageCollectionInterval = DefaultGarbageCollectionInterval
+	}
 	if config.MaxFilesToList == 0 {
 		config.MaxFilesToList = DefaultMaxFilesToList
 	}
@@ -150,7 +161,9 @@ func NewService(
 		logger: log.Withn(
 			logger.NewIntField("nodeId", int64(config.NodeID)),
 			logger.NewIntField("totalHashRanges", int64(config.TotalHashRanges)),
-			logger.NewIntField("snapshotInterval", int64(config.SnapshotInterval.Seconds())),
+			logger.NewDurationField("snapshotInterval", config.SnapshotInterval),
+			logger.NewDurationField("garbageCollectionInterval", config.GarbageCollectionInterval),
+			logger.NewIntField("maxFilesToList", config.MaxFilesToList),
 		),
 	}
 
@@ -160,6 +173,7 @@ func NewService(
 	service.metrics.errInternalCounter = stat.NewTaggedStat("keydb_err_internal_count", stats.CountType, statsTags)
 	service.metrics.getKeysCounters = make(map[uint32]stats.Counter)
 	service.metrics.putKeysCounter = make(map[uint32]stats.Counter)
+	service.metrics.gcDuration = stat.NewTaggedStat("keydb_gc_duration_seconds", stats.HistogramType, statsTags)
 
 	// Initialize caches for all hash ranges this node handles
 	if err := service.initCaches(ctx); err != nil {
@@ -169,6 +183,8 @@ func NewService(
 	// Start background snapshot creation
 	service.waitGroup.Add(1)
 	go service.snapshotLoop(ctx)
+	service.waitGroup.Add(1)
+	go service.garbageCollection(ctx)
 
 	return service, nil
 }
@@ -202,6 +218,7 @@ func (s *Service) snapshotLoop(ctx context.Context) {
 				}
 				if s.now().Sub(s.lastSnapshotTime) < s.config.SnapshotInterval { // TODO write a test for this
 					// we created a snapshot already recently due to a scaling operation
+					s.logger.Debugn("Skipping snapshot due to scaling operation")
 					return
 				}
 				if err := s.createSnapshots(ctx); err != nil {
@@ -212,6 +229,46 @@ func (s *Service) snapshotLoop(ctx context.Context) {
 	}
 }
 
+// garbageCollection calls RunGarbageCollection on every entry of s.caches once per GarbageCollectionInterval
+// until ctx is cancelled. s.mu is held for the whole pass; a tick that arrives while scaling is skipped.
+func (s *Service) garbageCollection(ctx context.Context) {
+	defer s.waitGroup.Done()
+	ticker := time.NewTicker(s.config.GarbageCollectionInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			func() {
+				s.mu.Lock() // TODO this might affect scaling operations
+				defer s.mu.Unlock()
+				if s.scaling {
+					s.logger.Warnn("Skipping garbage collection while scaling")
+					return
+				}
+
+				start := time.Now()
+				s.logger.Infon("Running garbage collection", logger.NewIntField("noOfCaches", int64(len(s.caches))))
+				defer func() { // runs on every exit path below, so an interrupted pass is timed too
+					elapsed := time.Since(start)
+					s.logger.Infon("Garbage collection finished",
+						logger.NewIntField("noOfCaches", int64(len(s.caches))),
+						logger.NewDurationField("duration", elapsed),
+					)
+					s.metrics.gcDuration.Observe(elapsed.Seconds())
+				}()
+				for _, cache := range s.caches { // one GC at a time to avoid overwhelming the CPU
+					if ctx.Err() != nil {
+						return // cancelled mid-pass: skip the remaining caches so waitGroup.Done is reached promptly
+					}
+					cache.RunGarbageCollection()
+				}
+			}()
+		}
+	}
+}
+
 // initCaches initializes the caches for all hash ranges this node handles
 func (s *Service) initCaches(ctx context.Context) error {
 	// Get hash ranges for this node
@@ -542,6 +599,7 @@ func (s *Service) ScaleComplete(_ context.Context, _ *pb.ScaleCompleteRequest) (
 }
 
 // CreateSnapshot implements the CreateSnapshot RPC method
+// TODO should we trigger a Garbage Collection before taking a snapshot?
 // TODO this can be optimized a lot! For example we could snapshot on local disk every 10s and work only on the head
 // and tail of the file (i.e. remove expired from head and append new entries).
 // Then once a minute we can upload the whole file to S3.

Original file line number	Diff line number	Diff line change
`@@ -101,6 +101,7 @@ func run(ctx context.Context, cancel func(), conf *config.Config, stat stats.Sta`
`101`	`101`	`logger.NewIntField("clusterSize", int64(nodeConfig.ClusterSize)),`
`102`	`102`	`logger.NewIntField("totalHashRanges", int64(nodeConfig.TotalHashRanges)),`
`103`	`103`	`logger.NewDurationField("snapshotInterval", nodeConfig.SnapshotInterval),`
	`104`	`+ logger.NewDurationField("gcInterval", nodeConfig.GarbageCollectionInterval),`
`104`	`105`	`logger.NewStringField("nodeAddresses", fmt.Sprintf("%+v", nodeConfig.Addresses)),`
`105`	`106`	`logger.NewIntField("noOfAddresses", int64(len(nodeConfig.Addresses))),`
`106`	`107`	`)`