-
Notifications
You must be signed in to change notification settings - Fork 845
fix: Deal with deleted experiments when restoring from cache #5726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 59 commits
5192a92
0edf6d6
86c6f69
e53f83a
7a4aa22
7a1762e
bc16c71
df7081a
8aa59d5
f5d2809
999080a
7036911
15f8b0e
c077884
a5cc725
128a936
08faff9
9770e85
2bc7b18
5f4a4d0
5a5fcca
63723a4
2a21835
71a8184
049c8df
caa8fb3
365f8b3
c4601ea
17d0b10
7fea3cd
3bbf6f5
9d7e4fa
fdd4a5a
1469702
942d56f
b0191af
dce7eee
eaf7780
351a0f8
b392dbb
150df7c
373c9d6
0c307dd
96f0162
636bf0d
083d67e
1f0c8a6
c49adc1
d6cddbc
006d270
e00a6bc
afc43ce
e57a4aa
81f4276
667944c
b1415e7
8b54833
ae0f70d
a544528
76b0f1f
adb2ee8
5946b88
ba6223f
d6ceeb2
a634b49
e7ef7eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,31 +26,38 @@ import ( | |
"github.com/seldonio/seldon-core/operator/v2/pkg/utils" | ||
) | ||
|
||
func (s *SchedulerClient) StartExperiment(ctx context.Context, experiment *v1alpha1.Experiment) (error, bool) { | ||
func (s *SchedulerClient) StartExperiment(ctx context.Context, experiment *v1alpha1.Experiment, grpcClient scheduler.SchedulerClient) (bool, error) { | ||
sakoush marked this conversation as resolved.
Show resolved
Hide resolved
|
||
logger := s.logger.WithName("StartExperiment") | ||
conn, err := s.getConnection(experiment.Namespace) | ||
if err != nil { | ||
return err, true | ||
var err error | ||
if grpcClient == nil { | ||
conn, err := s.getConnection(experiment.Namespace) | ||
if err != nil { | ||
return true, err | ||
} | ||
grpcClient = scheduler.NewSchedulerClient(conn) | ||
} | ||
grcpClient := scheduler.NewSchedulerClient(conn) | ||
|
||
req := &scheduler.StartExperimentRequest{ | ||
Experiment: experiment.AsSchedulerExperimentRequest(), | ||
} | ||
logger.Info("Start", "experiment name", experiment.Name) | ||
_, err = grcpClient.StartExperiment( | ||
_, err = grpcClient.StartExperiment( | ||
ctx, | ||
req, | ||
grpc_retry.WithMax(SchedulerConnectMaxRetries), | ||
grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), | ||
) | ||
return err, s.checkErrorRetryable(experiment.Kind, experiment.Name, err) | ||
return s.checkErrorRetryable(experiment.Kind, experiment.Name, err), err | ||
} | ||
|
||
func (s *SchedulerClient) StopExperiment(ctx context.Context, experiment *v1alpha1.Experiment) (error, bool) { | ||
func (s *SchedulerClient) StopExperiment(ctx context.Context, experiment *v1alpha1.Experiment, conn *grpc.ClientConn) (bool, error) { | ||
sakoush marked this conversation as resolved.
Show resolved
Hide resolved
|
||
logger := s.logger.WithName("StopExperiment") | ||
conn, err := s.getConnection(experiment.Namespace) | ||
if err != nil { | ||
return err, true | ||
var err error | ||
if conn == nil { | ||
conn, err = s.getConnection(experiment.Namespace) | ||
if err != nil { | ||
return true, err | ||
} | ||
} | ||
grcpClient := scheduler.NewSchedulerClient(conn) | ||
req := &scheduler.StopExperimentRequest{ | ||
|
@@ -63,18 +70,33 @@ func (s *SchedulerClient) StopExperiment(ctx context.Context, experiment *v1alph | |
grpc_retry.WithMax(SchedulerConnectMaxRetries), | ||
grpc_retry.WithBackoff(grpc_retry.BackoffExponential(SchedulerConnectBackoffScalar)), | ||
) | ||
return err, s.checkErrorRetryable(experiment.Kind, experiment.Name, err) | ||
return s.checkErrorRetryable(experiment.Kind, experiment.Name, err), err | ||
} | ||
|
||
// namespace is only used to re-sync experiments (handleLoadedExperiments / handlePendingDeleteExperiments) when the scheduler reports none | ||
func (s *SchedulerClient) SubscribeExperimentEvents(ctx context.Context, conn *grpc.ClientConn, namespace string) error { | ||
func (s *SchedulerClient) SubscribeExperimentEvents(ctx context.Context, grcpClient scheduler.SchedulerClient, namespace string) error { | ||
logger := s.logger.WithName("SubscribeExperimentEvents") | ||
grcpClient := scheduler.NewSchedulerClient(conn) | ||
|
||
stream, err := grcpClient.SubscribeExperimentStatus(ctx, &scheduler.ExperimentSubscriptionRequest{SubscriberName: "seldon manager"}, grpc_retry.WithMax(1)) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// get experiments from the scheduler | ||
// if there are no experiments in the scheduler state then we need to create them | ||
// this is likely because of a restart of the scheduler that migrated the state | ||
// to v2 (where we delete the experiments from the scheduler state) | ||
numExperimentsFromScheduler, err := getNumExperimentsFromScheduler(ctx, grcpClient) | ||
if err != nil { | ||
return err | ||
} | ||
// if there are no experiments in the scheduler state then we need to create them if they exist in k8s | ||
// also remove finalizers from experiments that are being deleted | ||
if numExperimentsFromScheduler == 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A general comment: I can see how state inconsistencies may also be introduced by someone deleting an Experiment from k8s (with manual removal of the finalizer) while the scheduler is down. When the scheduler comes back up, it will still have that experiment in its local db (and will start it), even though it no longer exists in k8s. Now, there is an argument that this is what you get if you delete finalizers manually, and that doing so should be avoided at all costs (however, one may know people who do things like that...). |
||
handleLoadedExperiments(ctx, namespace, s, grcpClient) | ||
handlePendingDeleteExperiments(ctx, namespace, s) | ||
} | ||
|
||
for { | ||
event, err := stream.Recv() | ||
if err != nil { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This interface change is to facilitate testing.