Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ ifndef HAS_SETUP_ENVTEST
endif
echo "setup-envtest has already installed"


check: generate fmt vet lint

fmt:
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/controller/experiments/v1beta1/experiment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ type ExperimentStatus struct {
// List of trial names which have been early stopped.
EarlyStoppedTrialList []string `json:"earlyStoppedTrialList,omitempty"`

// List of trial names which have been metrics unavailable
MetricsUnavailableTrialList []string `json:"metricsUnavailableTrialList,omitempty"`

// Trials is the total number of trials owned by the experiment.
Trials int32 `json:"trials,omitempty"`

Expand All @@ -120,6 +123,9 @@ type ExperimentStatus struct {

// How many trials are currently early stopped.
TrialsEarlyStopped int32 `json:"trialsEarlyStopped,omitempty"`

// How many trials are currently metrics unavailable.
TrialMetricsUnavailable int32 `json:"trialMetricsUnavailable,omitempty"`
}

// OptimalTrial is the metrics and assignments of the best trial.
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 7 additions & 6 deletions pkg/apis/controller/trials/v1beta1/trial_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,13 @@ type TrialCondition struct {
type TrialConditionType string

const (
TrialCreated TrialConditionType = "Created"
TrialRunning TrialConditionType = "Running"
TrialSucceeded TrialConditionType = "Succeeded"
TrialKilled TrialConditionType = "Killed"
TrialFailed TrialConditionType = "Failed"
TrialEarlyStopped TrialConditionType = "EarlyStopped"
TrialCreated TrialConditionType = "Created"
TrialRunning TrialConditionType = "Running"
TrialSucceeded TrialConditionType = "Succeeded"
TrialKilled TrialConditionType = "Killed"
TrialFailed TrialConditionType = "Failed"
TrialMetricsUnavailable TrialConditionType = "MetricsUnavailable"
TrialEarlyStopped TrialConditionType = "EarlyStopped"
)

// +genclient
Expand Down
16 changes: 10 additions & 6 deletions pkg/apis/controller/trials/v1beta1/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,11 @@ func (trial *Trial) IsKilled() bool {

// IsMetricsUnavailable returns true if Trial metrics are not available
func (trial *Trial) IsMetricsUnavailable() bool {
cond := getCondition(trial, TrialSucceeded)
if cond != nil && cond.Status == v1.ConditionFalse {
return true
}
return false
return hasCondition(trial, TrialMetricsUnavailable)
}

func (trial *Trial) IsCompleted() bool {
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped()
return trial.IsSucceeded() || trial.IsFailed() || trial.IsKilled() || trial.IsEarlyStopped() || trial.IsMetricsUnavailable()
}

func (trial *Trial) IsEarlyStopped() bool {
Expand Down Expand Up @@ -158,3 +154,11 @@ func (trial *Trial) MarkTrialStatusKilled(reason, message string) {
}
trial.setCondition(TrialKilled, v1.ConditionTrue, reason, message)
}

func (trial *Trial) MarkTrialStatusMetricsUnavailable(reason, message string) {
currentCond := getCondition(trial, TrialRunning)
if currentCond != nil {
trial.setCondition(TrialRunning, v1.ConditionFalse, currentCond.Reason, currentCond.Message)
}
trial.setCondition(TrialMetricsUnavailable, v1.ConditionTrue, reason, message)
}
269 changes: 137 additions & 132 deletions pkg/apis/manager/v1beta1/api.pb.go

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions pkg/apis/manager/v1beta1/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -226,16 +226,17 @@ message ParameterAssignment {
* Current Trial status. It contains Trial's latest condition, start time, completion time, observation.
*/
message TrialStatus {
// Trial can be in one of 6 conditions.
// Trial can be in one of 8 conditions.
// TODO (andreyvelich): Remove unused conditions.
enum TrialConditionType {
CREATED = 0;
RUNNING = 1;
SUCCEEDED = 2;
KILLED = 3;
FAILED = 4;
EARLYSTOPPED = 5;
UNKNOWN = 6;
METRICSUNAVAILABLE = 5;
EARLYSTOPPED = 6;
UNKNOWN = 7;
}
string start_time = 1; // Trial start time in RFC3339 format
string completion_time = 2; // Trial completion time in RFC3339 format
Expand Down
7 changes: 4 additions & 3 deletions pkg/apis/manager/v1beta1/gen-doc/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,7 @@ Types of value for HyperParameter.
<a name="api-v1-beta1-TrialStatus-TrialConditionType"></a>

### TrialStatus.TrialConditionType
Trial can be in one of 6 conditions.
Trial can be in one of 8 conditions.
TODO (andreyvelich): Remove unused conditions.

| Name | Number | Description |
Expand All @@ -781,8 +781,9 @@ TODO (andreyvelich): Remove unused conditions.
| SUCCEEDED | 2 | |
| KILLED | 3 | |
| FAILED | 4 | |
| EARLYSTOPPED | 5 | |
| UNKNOWN | 6 | |
| METRICSUNAVAILABLE | 5 | |
| EARLYSTOPPED | 6 | |
| UNKNOWN | 7 | |



Expand Down
12 changes: 9 additions & 3 deletions pkg/apis/manager/v1beta1/gen-doc/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1777,7 +1777,7 @@ <h3 id="api.v1.beta1.ParameterType">ParameterType</h3>
</table>

<h3 id="api.v1.beta1.TrialStatus.TrialConditionType">TrialStatus.TrialConditionType</h3>
<p>Trial can be in one of 6 conditions.</p><p>TODO (andreyvelich): Remove unused conditions.</p>
<p>Trial can be in one of 8 conditions.</p><p>TODO (andreyvelich): Remove unused conditions.</p>
<table class="enum-table">
<thead>
<tr><td>Name</td><td>Number</td><td>Description</td></tr>
Expand Down Expand Up @@ -1815,17 +1815,23 @@ <h3 id="api.v1.beta1.TrialStatus.TrialConditionType">TrialStatus.TrialConditionT
</tr>

<tr>
<td>EARLYSTOPPED</td>
<td>METRICSUNAVAILABLE</td>
<td>5</td>
<td><p></p></td>
</tr>

<tr>
<td>UNKNOWN</td>
<td>EARLYSTOPPED</td>
<td>6</td>
<td><p></p></td>
</tr>

<tr>
<td>UNKNOWN</td>
<td>7</td>
<td><p></p></td>
</tr>

</tbody>
</table>

Expand Down
124 changes: 64 additions & 60 deletions pkg/apis/manager/v1beta1/python/api_pb2.py

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions pkg/apis/v1beta1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions pkg/apis/v1beta1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,14 @@
"description": "Represents last time when the Experiment was reconciled. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC.",
"$ref": "#/definitions/v1.Time"
},
"metricsUnavailableTrialList": {
"description": "List of trial names which have been metrics unavailable",
"type": "array",
"items": {
"type": "string",
"default": ""
}
},
"pendingTrialList": {
"description": "List of trial names which are pending.",
"type": "array",
Expand Down Expand Up @@ -718,6 +726,11 @@
"default": ""
}
},
"trialMetricsUnavailable": {
"description": "How many trials are currently metrics unavailable.",
"type": "integer",
"format": "int32"
},
"trials": {
"description": "Trials is the total number of trials owned by the experiment.",
"type": "integer",
Expand Down
22 changes: 17 additions & 5 deletions pkg/controller.v1beta1/experiment/util/status_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,13 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
var bestTrialValue float64
sts := &instance.Status
sts.Trials = 0
sts.RunningTrialList, sts.PendingTrialList, sts.FailedTrialList, sts.SucceededTrialList, sts.KilledTrialList, sts.EarlyStoppedTrialList = nil, nil, nil, nil, nil, nil
sts.RunningTrialList = nil
sts.PendingTrialList = nil
sts.FailedTrialList = nil
sts.SucceededTrialList = nil
sts.KilledTrialList = nil
sts.EarlyStoppedTrialList = nil
sts.MetricsUnavailableTrialList = nil
bestTrialIndex := -1
isObjectiveGoalReached := false
var objectiveValueGoal float64
Expand All @@ -79,6 +85,8 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
sts.EarlyStoppedTrialList = append(sts.EarlyStoppedTrialList, trial.Name)
} else if trial.IsRunning() {
sts.RunningTrialList = append(sts.RunningTrialList, trial.Name)
} else if trial.IsMetricsUnavailable() {
sts.MetricsUnavailableTrialList = append(sts.MetricsUnavailableTrialList, trial.Name)
} else {
sts.PendingTrialList = append(sts.PendingTrialList, trial.Name)
}
Expand All @@ -95,7 +103,7 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
continue
}

//initialize vars to objective metric value of the first trial
// initialize vars to objective metric value of the first trial
if bestTrialIndex == -1 {
bestTrialValue = objectiveMetricValue
bestTrialIndex = index
Expand Down Expand Up @@ -126,6 +134,7 @@ func updateTrialsSummary(instance *experimentsv1beta1.Experiment, trials *trials
sts.TrialsFailed = int32(len(sts.FailedTrialList))
sts.TrialsKilled = int32(len(sts.KilledTrialList))
sts.TrialsEarlyStopped = int32(len(sts.EarlyStoppedTrialList))
sts.TrialMetricsUnavailable = int32(len(sts.MetricsUnavailableTrialList))

// if best trial is set
if bestTrialIndex != -1 {
Expand Down Expand Up @@ -177,8 +186,9 @@ func getObjectiveMetricValue(trial trialsv1beta1.Trial) string {
// UpdateExperimentStatusCondition updates the experiment status.
func UpdateExperimentStatusCondition(collector *ExperimentsCollector, instance *experimentsv1beta1.Experiment, isObjectiveGoalReached bool, getSuggestionDone bool) {
logger := log.WithValues("Experiment", types.NamespacedName{Name: instance.GetName(), Namespace: instance.GetNamespace()})
completedTrialsCount := instance.Status.TrialsSucceeded + instance.Status.TrialsFailed + instance.Status.TrialsKilled + instance.Status.TrialsEarlyStopped
failedTrialsCount := instance.Status.TrialsFailed
completedTrialsCount :=
instance.Status.TrialsSucceeded + instance.Status.TrialsFailed + instance.Status.TrialsKilled + instance.Status.TrialsEarlyStopped + instance.Status.TrialMetricsUnavailable
failedTrialsCount := instance.Status.TrialsFailed + instance.Status.TrialMetricsUnavailable
Copy link
Member

@johnugeorge johnugeorge Jun 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussion point:
Is TrialMetricsUnavailable state a failed state for a user?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that the TrialMetricsUnavailable state is a failed Trial for a user since katib can not use that Trial result to optimize hyperparameters even though the training is successful if the metrics-collector container fails to collect metrics.

What do you think? @johnugeorge

Copy link
Member

@johnugeorge johnugeorge Jun 7, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think so since it is a unrecoverable and an end state. We have to include this in docs as well to explain "Failed" trial"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes sense.
As I can see katib docs, we don't have any sections to describe Experiment status, so It might be better to explain Experiment status in this paragraph describing Experiment.

Does it sound good to you? @johnugeorge

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. We will wait for a day to get other reviews as well.

activeTrialsCount := instance.Status.TrialsPending + instance.Status.TrialsRunning
now := metav1.Now()

Expand All @@ -192,7 +202,9 @@ func UpdateExperimentStatusCondition(collector *ExperimentsCollector, instance *
}

// First check if MaxFailedTrialCount is reached.
if (instance.Spec.MaxFailedTrialCount != nil) && (failedTrialsCount > *instance.Spec.MaxFailedTrialCount) {
if (instance.Spec.MaxFailedTrialCount != nil) &&
((*instance.Spec.MaxFailedTrialCount != *instance.Spec.MaxTrialCount && failedTrialsCount > *instance.Spec.MaxFailedTrialCount) ||
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain this condition from the current one?

Copy link
Member Author

@tenzen-y tenzen-y Jun 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@johnugeorge Thanks for your review!
Currently, if *instance.Spec.MaxFailedTrialCount is equal to *instance.Spec.MaxTrialCount and failedTrialsCount is equal to *instance.Spec.MaxFailedTrialCount, katib skip this condition and gives Completed to the Experiment status.

In other words, katib gives Completed to Experiment status even though the number of failed Trials reaches *instance.Spec.MaxFailedTrialCount.

In this implementation, when the same condition as mentioned above, katib doesn't skip this condition and gives Failed to the experiment status.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does it matter with *instance.Spec.MaxTrialCount? If I understand correctly, status is not set correctly when the number of failed Trials reaches *instance.Spec.MaxFailedTrialCount. Then, Isn't condition failedTrialsCount >= *instance.Spec.MaxFailedTrialCount enough ?

Copy link
Member Author

@tenzen-y tenzen-y Jun 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the current implementation, katib create *instance.Spec.MaxFailedTrialCount + 1 times Trials if all Trials failed.
So, I implemented this part to keep this specification.
Should I change to failedTrialsCount >= *instance.Spec.MaxFailedTrialCount?

Copy link
Member

@johnugeorge johnugeorge Jun 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't failedTrialsCount >= *instance.Spec.MaxFailedTrialCount the right one? Experiment should be completed when failed trials reach MaxFailedTrialCount.
/cc @andreyvelich

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't failedTrialsCount >= *instance.Spec.MaxFailedTrialCount the right one? Experiment should be completed when failed trials reach MaxFailedTrialCount.

Yes, I agree with @johnugeorge .
However, because as you say, there is an inconsistency between maxFailedTrialCount and maxTrialCount, we need to change definition of maxFailedTrialCount in order to change to failedTrialsCount >= *instance.Spec.MaxFailedTrialCount in this part.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tenzen-y Please go ahead and make this change - failedTrialsCount >= *instance.Spec.MaxFailedTrialCount

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@johnugeorge Sure.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid being set Failed in Experiment status when failedTrialsCount and *instance.Spec.MaxFailedTrialCount is equal to 0, I added condition failedTrialsCount != 0 to this part.

(*instance.Spec.MaxFailedTrialCount == *instance.Spec.MaxTrialCount && failedTrialsCount == *instance.Spec.MaxFailedTrialCount)) {
msg := "Experiment has failed because max failed count has reached"
instance.MarkExperimentStatusFailed(ExperimentFailedReason, msg)
instance.Status.CompletionTime = &now
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -848,8 +848,8 @@ func newFakeTrials() []trialsv1beta1.Trial {
Status: trialsv1beta1.TrialStatus{
Conditions: []trialsv1beta1.TrialCondition{
{
Type: trialsv1beta1.TrialSucceeded,
Status: corev1.ConditionFalse,
Type: trialsv1beta1.TrialMetricsUnavailable,
Status: corev1.ConditionTrue,
Message: "Metrics are not available",
},
},
Expand Down
3 changes: 1 addition & 2 deletions pkg/controller.v1beta1/trial/trial_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,8 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error {

// Job already exists.
// If Trial is EarlyStopped we need to verify/update observation logs.
// TODO (andreyvelich): We can include "MetricsUnavailable" condition to "Complete".
// In that case, Trial's job will be deleted even if metrics are not available.
if deployedJob != nil && ((!instance.IsCompleted() && !instance.IsMetricsUnavailable()) || instance.IsEarlyStopped()) {
if deployedJob != nil && (!instance.IsCompleted() || instance.IsEarlyStopped()) {
jobStatus, err := trialutil.GetDeployedJobStatus(instance, deployedJob)
if err != nil {
logger.Error(err, "GetDeployedJobStatus error")
Expand Down
19 changes: 7 additions & 12 deletions pkg/controller.v1beta1/trial/trial_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ limitations under the License.
package trial

import (
"fmt"
"sync"
"testing"
"time"
Expand All @@ -41,7 +40,7 @@ import (
api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
"github.com/kubeflow/katib/pkg/controller.v1beta1/consts"
trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util"
util "github.com/kubeflow/katib/pkg/controller.v1beta1/util"
"github.com/kubeflow/katib/pkg/controller.v1beta1/util"
managerclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/trial/managerclient"
)

Expand Down Expand Up @@ -267,8 +266,8 @@ func TestReconcileBatchJob(t *testing.T) {
}
return trial.IsSucceeded() &&
len(trial.Status.Observation.Metrics) > 0 &&
trial.Status.Observation.Metrics[0].Max == "0.99" &&
trial.Status.Observation.Metrics[0].Min == "0.11" &&
trial.Status.Observation.Metrics[0].Max == "0.99" &&
trial.Status.Observation.Metrics[0].Latest == "0.11"
}, timeout).Should(gomega.BeTrue())

Expand All @@ -291,15 +290,11 @@ func TestReconcileBatchJob(t *testing.T) {
if err = c.Get(ctx, trialKey, trial); err != nil {
return false
}
isConditionCorrect := false
for _, cond := range trial.Status.Conditions {
if cond.Type == trialsv1beta1.TrialSucceeded && cond.Status == corev1.ConditionFalse &&
cond.Reason == fmt.Sprintf("%v. Job reason: %v", TrialMetricsUnavailableReason, batchJobCompleteReason) &&
cond.Message == fmt.Sprintf("Metrics are not available. Job message: %v", batchJobCompleteMessage) {
isConditionCorrect = true
}
}
return isConditionCorrect
return trial.IsMetricsUnavailable() &&
len(trial.Status.Observation.Metrics) > 0 &&
trial.Status.Observation.Metrics[0].Min == consts.UnavailableMetricValue &&
trial.Status.Observation.Metrics[0].Max == consts.UnavailableMetricValue &&
trial.Status.Observation.Metrics[0].Latest == consts.UnavailableMetricValue
}, timeout).Should(gomega.BeTrue())

// Delete the Trial
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller.v1beta1/trial/trial_controller_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria
r.collector.IncreaseTrialsSucceededCount(instance.Namespace)
}
} else if !instance.IsMetricsUnavailable() {
// TODO (andreyvelich): Is it correct to mark succeeded status false when metrics are unavailable?
// Ref issue to add new condition: https://github.com/kubeflow/katib/issues/1343
msg := "Metrics are not available"
reason := TrialMetricsUnavailableReason

Expand All @@ -81,10 +79,12 @@ func (r *ReconcileTrial) UpdateTrialStatusCondition(instance *trialsv1beta1.Tria
}

logger.Info("Trial status changed to Metrics Unavailable")
instance.MarkTrialStatusSucceeded(corev1.ConditionFalse, reason, msg)
instance.MarkTrialStatusMetricsUnavailable(reason, msg)
instance.Status.CompletionTime = &timeNow

eventMsg := fmt.Sprintf("Metrics are not available for Job %v", deployedJobName)
r.recorder.Eventf(instance, corev1.EventTypeWarning, JobMetricsUnavailableReason, eventMsg)
r.collector.IncreaseTrialsMetricsUnavailableCount(instance.Namespace)
}
} else if jobStatus.Condition == trialutil.JobFailed && !instance.IsFailed() && !instance.IsEarlyStopped() {
msg := "Trial has failed"
Expand Down
Loading