@@ -8,16 +8,15 @@ import (
8
8
"sync/atomic"
9
9
"time"
10
10
11
- "github.com/rudderlabs/rudder-go-kit/stats"
12
- obskit "github.com/rudderlabs/rudder-observability-kit/go/labels"
13
-
14
11
"google.golang.org/grpc"
15
12
"google.golang.org/grpc/credentials/insecure"
16
13
17
14
"github.com/rudderlabs/keydb/internal/hash"
18
15
pb "github.com/rudderlabs/keydb/proto"
19
16
"github.com/rudderlabs/rudder-go-kit/logger"
17
+ "github.com/rudderlabs/rudder-go-kit/stats"
20
18
kitsync "github.com/rudderlabs/rudder-go-kit/sync"
19
+ obskit "github.com/rudderlabs/rudder-observability-kit/go/labels"
21
20
)
22
21
23
22
var (
@@ -474,282 +473,6 @@ func (c *Client) put(ctx context.Context, keys []string, ttl time.Duration) erro
474
473
return nil
475
474
}
476
475
477
- // GetNodeInfo returns information about a node
478
- func (c * Client ) GetNodeInfo (ctx context.Context , nodeID uint32 ) (* pb.GetNodeInfoResponse , error ) {
479
- c .mu .RLock ()
480
- defer c .mu .RUnlock ()
481
-
482
- // Get the client for this node
483
- client , ok := c .clients [int (nodeID )]
484
- if ! ok {
485
- // this should never happen unless clusterSize is updated and the c.clients map isn't
486
- // or if there is a bug in the hashing function
487
- return nil , fmt .Errorf ("no client for node %d" , nodeID )
488
- }
489
-
490
- // Create the request
491
- req := & pb.GetNodeInfoRequest {NodeId : nodeID }
492
-
493
- // Send the request with retries
494
- var err error
495
- var resp * pb.GetNodeInfoResponse
496
- for i := 0 ; i <= c .config .RetryCount ; i ++ {
497
- resp , err = client .GetNodeInfo (ctx , req )
498
- if err == nil {
499
- break
500
- }
501
-
502
- // If this is the last retry, return the error
503
- if i == c .config .RetryCount {
504
- return nil , fmt .Errorf ("failed to get node info from node %d: %w" , nodeID , err )
505
- }
506
-
507
- // Wait before retrying
508
- select {
509
- case <- ctx .Done ():
510
- return nil , ctx .Err ()
511
- case <- time .After (c .config .RetryDelay ):
512
- }
513
- }
514
-
515
- if c .clusterSize != resp .ClusterSize {
516
- if err = c .updateClusterSize (resp .NodesAddresses ); err != nil {
517
- return nil , fmt .Errorf ("failed to update cluster size: %w" , err )
518
- }
519
- }
520
-
521
- return resp , nil
522
- }
523
-
524
- // CreateSnapshots forces the creation of snapshots on a node
525
- // This method is meant to be used by an Operator process only!
526
- func (c * Client ) CreateSnapshots (ctx context.Context ) error {
527
- c .mu .RLock ()
528
- defer c .mu .RUnlock ()
529
-
530
- group , ctx := kitsync .NewEagerGroup (ctx , len (c .clients ))
531
- for nodeID , client := range c .clients {
532
- group .Go (func () error {
533
- req := & pb.CreateSnapshotsRequest {}
534
-
535
- var err error
536
- var resp * pb.CreateSnapshotsResponse
537
- for i := 0 ; i <= c .config .RetryCount ; i ++ {
538
- resp , err = client .CreateSnapshots (ctx , req )
539
- if err == nil {
540
- break
541
- }
542
-
543
- // If this is the last retry, return the error
544
- if i == c .config .RetryCount {
545
- return fmt .Errorf ("failed to create snapshot on node %d: %w" , nodeID , err )
546
- }
547
-
548
- // Wait before retrying
549
- select {
550
- case <- ctx .Done ():
551
- return ctx .Err ()
552
- case <- time .After (c .config .RetryDelay ):
553
- }
554
- }
555
-
556
- if ! resp .Success {
557
- return fmt .Errorf ("failed to create snapshot on node %d: %w" , nodeID , err )
558
- }
559
-
560
- return nil
561
- })
562
- }
563
-
564
- return group .Wait ()
565
- }
566
-
567
- // LoadSnapshots forces all nodes to load snapshots from cloud storage
568
- // This method is meant to be used by an Operator process only!
569
- func (c * Client ) LoadSnapshots (ctx context.Context ) error {
570
- c .mu .RLock ()
571
- defer c .mu .RUnlock ()
572
-
573
- group , ctx := kitsync .NewEagerGroup (ctx , len (c .clients ))
574
- for nodeID , client := range c .clients {
575
- group .Go (func () error {
576
- req := & pb.LoadSnapshotsRequest {}
577
-
578
- var err error
579
- var resp * pb.LoadSnapshotsResponse
580
- for i := 0 ; i <= c .config .RetryCount ; i ++ {
581
- resp , err = client .LoadSnapshots (ctx , req )
582
- if err == nil && resp != nil && resp .Success {
583
- break
584
- }
585
-
586
- // If this is the last retry, return the error
587
- if i == c .config .RetryCount {
588
- errMsg := "unknown error"
589
- if err != nil {
590
- errMsg = err .Error ()
591
- } else if resp != nil {
592
- errMsg = resp .ErrorMessage
593
- }
594
- return fmt .Errorf ("failed to load snapshots on node %d: %s" , nodeID , errMsg )
595
- }
596
-
597
- // Wait before retrying
598
- select {
599
- case <- ctx .Done ():
600
- return ctx .Err ()
601
- case <- time .After (c .config .RetryDelay ):
602
- }
603
- }
604
-
605
- return nil
606
- })
607
- }
608
-
609
- return group .Wait ()
610
- }
611
-
612
- // Scale changes the number of nodes in the cluster
613
- // This method is meant to be used by an Operator process only!
614
- func (c * Client ) Scale (ctx context.Context , addresses ... string ) error {
615
- c .mu .Lock ()
616
- defer c .mu .Unlock ()
617
-
618
- newClusterSize := uint32 (len (addresses ))
619
- if newClusterSize == c .clusterSize {
620
- return nil // No change needed
621
- }
622
-
623
- // Handle case when newClusterSize is bigger
624
- if newClusterSize > c .clusterSize {
625
- // Establish new connections to the new nodes
626
- for i := int (c .clusterSize ); i < int (newClusterSize ); i ++ {
627
- addr := addresses [i ]
628
- conn , err := grpc .NewClient (addr ,
629
- grpc .WithTransportCredentials (insecure .NewCredentials ()),
630
- grpc .WithContextDialer (func (ctx context.Context , addr string ) (net.Conn , error ) {
631
- var dialer net.Dialer
632
- return dialer .DialContext (ctx , "tcp" , addr )
633
- }),
634
- )
635
- if err != nil {
636
- // Close any new connections we've made so far
637
- for j := int (c .clusterSize ); j < i ; j ++ {
638
- if conn , ok := c .connections [j ]; ok {
639
- _ = conn .Close ()
640
- delete (c .connections , j )
641
- delete (c .clients , j )
642
- }
643
- }
644
- return fmt .Errorf ("failed to connect to node %d at %s: %w" , i , addr , err )
645
- }
646
-
647
- c .connections [i ] = conn
648
- c .clients [i ] = pb .NewNodeServiceClient (conn )
649
- }
650
- } else if newClusterSize < c .clusterSize {
651
- // Handle case when newClusterSize is smaller
652
- // Close unnecessary connections
653
- for i := int (newClusterSize ); i < int (c .clusterSize ); i ++ {
654
- if conn , ok := c .connections [i ]; ok {
655
- _ = conn .Close () // Ignore errors during close
656
- delete (c .connections , i )
657
- delete (c .clients , i )
658
- }
659
- }
660
- }
661
-
662
- // Send ScaleRequest to all nodes
663
- group , ctx := kitsync .NewEagerGroup (ctx , len (c .clients ))
664
- for nodeID , client := range c .clients {
665
- group .Go (func () error {
666
- req := & pb.ScaleRequest {
667
- NewClusterSize : newClusterSize ,
668
- NodesAddresses : addresses ,
669
- }
670
-
671
- var err error
672
- var resp * pb.ScaleResponse
673
- for i := 0 ; i <= c .config .RetryCount ; i ++ {
674
- resp , err = client .Scale (ctx , req )
675
- if err == nil && resp != nil && resp .Success {
676
- break
677
- }
678
-
679
- // If this is the last retry, save the error
680
- if i == c .config .RetryCount {
681
- errMsg := "unknown error"
682
- if err != nil {
683
- errMsg = err .Error ()
684
- } else if resp != nil {
685
- errMsg = resp .ErrorMessage
686
- }
687
- return fmt .Errorf ("failed to scale node %d: %s" , nodeID , errMsg )
688
- }
689
-
690
- // Wait before retrying
691
- select {
692
- case <- ctx .Done ():
693
- return ctx .Err ()
694
- case <- time .After (c .config .RetryDelay ):
695
- }
696
- }
697
-
698
- return nil
699
- })
700
- }
701
-
702
- err := group .Wait ()
703
- if err != nil {
704
- return err
705
- }
706
-
707
- c .config .Addresses = addresses
708
- c .clusterSize = newClusterSize
709
-
710
- return nil
711
- }
712
-
713
- // ScaleComplete notifies a node that the scaling operation is complete
714
- // This method is meant to be used by an Operator process only!
715
- func (c * Client ) ScaleComplete (ctx context.Context ) error {
716
- c .mu .RLock ()
717
- defer c .mu .RUnlock ()
718
-
719
- group , ctx := kitsync .NewEagerGroup (ctx , len (c .clients ))
720
- for nodeID , client := range c .clients {
721
- group .Go (func () error {
722
- req := & pb.ScaleCompleteRequest {}
723
-
724
- // Send the request with retries
725
- var err error
726
- var resp * pb.ScaleCompleteResponse
727
- for i := 0 ; i <= c .config .RetryCount ; i ++ {
728
- resp , err = client .ScaleComplete (ctx , req )
729
- if err == nil && resp != nil && resp .Success {
730
- break
731
- }
732
-
733
- // If this is the last retry, return the error
734
- if i == c .config .RetryCount {
735
- return fmt .Errorf ("failed to complete scale operation on node %d: %w" , nodeID , err )
736
- }
737
-
738
- // Wait before retrying
739
- select {
740
- case <- ctx .Done ():
741
- return ctx .Err ()
742
- case <- time .After (c .config .RetryDelay ):
743
- }
744
- }
745
-
746
- return nil
747
- })
748
- }
749
-
750
- return group .Wait ()
751
- }
752
-
753
476
// updateClusterSize updates the cluster size in a race-condition safe manner.
754
477
// It takes a new cluster size and the current keys being processed.
755
478
// It returns a slice of keys that need to be fetched again.
0 commit comments