@@ -17,7 +17,10 @@ limitations under the License.
17
17
package v1beta1
18
18
19
19
import (
20
+ "bytes"
21
+ "context"
20
22
"encoding/json"
23
+ "io"
21
24
"log"
22
25
"net/http"
23
26
"path/filepath"
@@ -27,8 +30,15 @@ import (
27
30
"sigs.k8s.io/controller-runtime/pkg/client"
28
31
29
32
experimentv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1"
33
+ trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1"
30
34
api_pb_v1beta1 "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
31
35
"github.com/kubeflow/katib/pkg/util/v1beta1/katibclient"
36
+
37
+ apiv1 "k8s.io/api/core/v1"
38
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
39
+ "k8s.io/apimachinery/pkg/types"
40
+ corev1 "k8s.io/client-go/kubernetes/typed/core/v1"
41
+ "sigs.k8s.io/controller-runtime/pkg/client/config"
32
42
)
33
43
34
44
func NewKatibUIHandler (dbManagerAddr string ) * KatibUIHandler {
@@ -421,3 +431,87 @@ func (k *KatibUIHandler) FetchTrial(w http.ResponseWriter, r *http.Request) {
421
431
return
422
432
}
423
433
}
434
+
435
+ // FetchTrialLogs fetches logs for a trial in specific namespace.
436
+ func (k * KatibUIHandler ) FetchTrialLogs (w http.ResponseWriter , r * http.Request ) {
437
+ log .Printf ("Requesting logs" )
438
+
439
+ trialName := r .URL .Query ()["trialName" ][0 ]
440
+ namespace := r .URL .Query ()["namespace" ][0 ]
441
+ log .Printf ("Requesting logs" )
442
+
443
+ logs , err := getTrialLogs (k , trialName , namespace )
444
+ if err != nil {
445
+ log .Printf ("GetLogs failed: %v" , err )
446
+ http .Error (w , err .Error (), http .StatusInternalServerError )
447
+ return
448
+ }
449
+ response , err := json .Marshal (logs )
450
+ if err != nil {
451
+ log .Printf ("Marshal logs failed: %v" , err )
452
+ http .Error (w , err .Error (), http .StatusInternalServerError )
453
+ return
454
+ }
455
+ w .Write (response )
456
+ }
457
+
458
+ // GetTrialLogs returns logs of a master Pod for the given job name and namespace
459
+ func getTrialLogs (k * KatibUIHandler , trialName string , namespace string ) (string , error ) {
460
+ cfg , err := config .GetConfig ()
461
+ if err != nil {
462
+ return "" , err
463
+ }
464
+
465
+ clientset , err := corev1 .NewForConfig (cfg )
466
+ if err != nil {
467
+ return "" , err
468
+ }
469
+
470
+ trial := & trialsv1beta1.Trial {}
471
+ if err := k .katibClient .GetClient ().Get (context .TODO (), types.NamespacedName {Name : trialName , Namespace : namespace }, trial ); err != nil {
472
+ return "" , err
473
+ }
474
+
475
+ selectionLabel := "training.kubeflow.org/job-name=" + trialName + ",training.kubeflow.org/job-role=master"
476
+ if trial .Spec .RunSpec .GetKind () == "Job" {
477
+ selectionLabel = "job-name=" + trialName
478
+ }
479
+
480
+ podList , err := clientset .Pods (namespace ).List (context .Background (), metav1.ListOptions {LabelSelector : selectionLabel })
481
+ if err != nil {
482
+ return "" , err
483
+ }
484
+
485
+ if len (podList .Items ) == 0 {
486
+ message := `Logs for the trial could not be found.
487
+ Was 'retain: true' specified in the Experiment definition?
488
+ An example can be found here: https://github.com/kubeflow/katib/blob/master/examples/v1beta1/argo/argo-workflow.yaml#L33`
489
+
490
+ return message , nil
491
+ }
492
+
493
+ podLogOpts := apiv1.PodLogOptions {}
494
+ podLogOpts .Container = trial .Spec .PrimaryContainerName
495
+ for container := range podList .Items [0 ].Spec .Containers {
496
+ if podList .Items [0 ].Spec .Containers [container ].Name == "metrics-logger-and-collector" {
497
+ podLogOpts .Container = "metrics-logger-and-collector"
498
+ break
499
+ }
500
+ }
501
+
502
+ req := clientset .Pods (namespace ).GetLogs (podList .Items [0 ].Name , & podLogOpts )
503
+ podLogs , err := req .Stream (context .Background ())
504
+ if err != nil {
505
+ return "" , err
506
+ }
507
+ defer podLogs .Close ()
508
+
509
+ buf := new (bytes.Buffer )
510
+ _ , err = io .Copy (buf , podLogs )
511
+ if err != nil {
512
+ return "" , err
513
+ }
514
+ str := buf .String ()
515
+
516
+ return str , nil
517
+ }
0 commit comments