Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions cmd/katib-controller/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/spf13/viper"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
"sigs.k8s.io/controller-runtime/pkg/client/config"
"sigs.k8s.io/controller-runtime/pkg/healthz"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/manager"
Expand All @@ -44,6 +45,7 @@ func main() {

var experimentSuggestionName string
var metricsAddr string
var healthzAddr string
var webhookPort int
var injectSecurityContext bool
var enableGRPCProbeInSuggestion bool
Expand All @@ -54,6 +56,7 @@ func main() {
flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name",
"default", "The implementation of suggestion interface in experiment controller (default)")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&healthzAddr, "healthz-addr", ":18080", "The address the healthz endpoint binds to.")
flag.BoolVar(&injectSecurityContext, "webhook-inject-securitycontext", false, "Inject the securityContext of container[0] in the sidecar")
flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions")
flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)")
Expand Down Expand Up @@ -82,6 +85,8 @@ func main() {
webhookPort,
"metrics-addr",
metricsAddr,
"healthz-addr",
healthzAddr,
consts.ConfigInjectSecurityContext,
viper.GetBool(consts.ConfigInjectSecurityContext),
consts.ConfigEnableGRPCProbeInSuggestion,
Expand All @@ -99,9 +104,10 @@ func main() {

// Create a new katib controller to provide shared dependencies and start components
mgr, err := manager.New(cfg, manager.Options{
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
MetricsBindAddress: metricsAddr,
HealthProbeBindAddress: healthzAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
})
if err != nil {
log.Error(err, "Failed to create the manager")
Expand Down Expand Up @@ -129,6 +135,17 @@ func main() {
os.Exit(1)
}

log.Info("Setting up health checker.")
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
log.Error(err, "Unable to add healthz endpoint to the manager")
os.Exit(1)
}
// TODO (@anencore94): Consider a more detailed check at '/readyz' that verifies whether the controller can communicate with the k8s-apiserver or db-manager.
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
log.Error(err, "Unable to add readyz endpoint to the manager")
os.Exit(1)
}

// Start the Cmd
log.Info("Starting the Cmd.")
if err := mgr.Start(signals.SetupSignalHandler()); err != nil {
Expand Down
3 changes: 2 additions & 1 deletion docs/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ Below is a list of command-line flags accepted by Katib controller:
|---------------------------------|---------------------------|-------------------------------|------------------------------------------------------------------------------------------------------------------------|
| enable-grpc-probe-in-suggestion | bool | true | Enable grpc probe in suggestions |
| experiment-suggestion-name | string | "default" | The implementation of suggestion interface in experiment controller |
| metrics-addr | string | ":8080" | The address the metric endpoint binds to |
| metrics-addr | string | ":8080" | The address that the metrics endpoint binds to |
| healthz-addr | string | ":18080" | The address that the healthz endpoint binds to |
| trial-resources | []schema.GroupVersionKind | null | The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org) |
| webhook-inject-securitycontext | bool | false | Inject the securityContext of container[0] in the sidecar |
| webhook-port | int | 8443 | The port number to be used for admission webhook server |
Expand Down
11 changes: 11 additions & 0 deletions manifests/v1beta1/components/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ spec:
- containerPort: 8080
name: metrics
protocol: TCP
- containerPort: 18080
name: healthz
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
port: healthz
livenessProbe:
httpGet:
path: /healthz
port: healthz
env:
- name: KATIB_CORE_NAMESPACE
valueFrom:
Expand Down
3 changes: 3 additions & 0 deletions manifests/v1beta1/components/controller/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,8 @@ spec:
- name: metrics
port: 8080
targetPort: 8080
- name: healthz
port: 18080
targetPort: 18080
selector:
katib.kubeflow.org/component: controller
4 changes: 3 additions & 1 deletion pkg/webhook/v1beta1/webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
)

func AddToManager(mgr manager.Manager, port int) error {

// Create a webhook server.
hookServer := &webhook.Server{
Port: port,
Expand All @@ -36,6 +35,9 @@ func AddToManager(mgr manager.Manager, port int) error {
if err := mgr.Add(hookServer); err != nil {
return fmt.Errorf("Add webhook server to the manager failed: %v", err)
}
if err := mgr.AddHealthzCheck("healthz", hookServer.StartedChecker()); err != nil {
return fmt.Errorf("Add webhook server health checker to the manager failed: %v", err)
}

experimentValidator := experiment.NewExperimentValidator(mgr.GetClient())
experimentDefaulter := experiment.NewExperimentDefaulter(mgr.GetClient())
Expand Down
4 changes: 0 additions & 4 deletions test/e2e/v1beta1/scripts/gh-actions/setup-katib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,6 @@ kubectl wait --for=condition=complete --timeout=${TIMEOUT} -l katib.kubeflow.org
kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in ($WITH_DATABASE_TYPE,controller,db-manager,ui)" -n kubeflow pod ||
(kubectl get pods -n kubeflow && kubectl describe pods -n kubeflow && exit 1)

# Wait until all Katib pods are actually ready.
# Since Katib-controller does not use a readiness probe yet, just wait for a while.
sleep 30

echo "All Katib components are running."
echo "Katib deployments"
kubectl -n kubeflow get deploy
Expand Down