Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions cmd/nvidia-dra-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"slices"
"strings"
"sync"

resourceapi "k8s.io/api/resource/v1beta1"
Expand All @@ -29,6 +30,8 @@ import (
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"

"golang.org/x/mod/semver"

configapi "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/v1alpha1"
)

Expand Down Expand Up @@ -390,6 +393,21 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
allocatableDevices[r.Device] = s.allocatable[r.Device]
}

// Allow only devices with CUDA compute capability >= 7.5, as time-slicing and MPS do not work on older architectures.
shareableAllocatableDevices := make(AllocatableDevices)
for device, deviceType := range allocatableDevices {
if deviceType.Gpu != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean that we don't timeslice MIG devices?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, does it make sense to factor these checks into a function where we can better test the various combinations of options?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes are also needed in the unprepare function.

cudaCCv := "v" + strings.TrimPrefix(deviceType.Gpu.cudaComputeCapability, "v")
gpuUUID := deviceType.Gpu.UUID
if semver.Compare(semver.Canonical(cudaCCv), semver.Canonical("v7.5")) >= 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@guptaNswati where does the v7.5 threshold come from? In #58 we check for >= v7.0 and for MPS specifically, v3.5 is mentioned.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

klog.Infof("GPU sharing is available on this device UUID=%v with CudaComputeCapability=%v", gpuUUID, cudaCCv)
shareableAllocatableDevices[device] = deviceType
} else {
return nil, fmt.Errorf("GPU sharing is not available on this device UUID=%v", gpuUUID)
}
}
}

// Declare a device group state object to populate.
var configState DeviceConfigState

Expand All @@ -400,7 +418,7 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
if tsc != nil {
err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
err = s.tsManager.SetTimeSlice(shareableAllocatableDevices, tsc)
if err != nil {
return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
Expand All @@ -413,7 +431,8 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
if err != nil {
return nil, fmt.Errorf("error getting MPS configuration: %w", err)
}
mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices)

mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), shareableAllocatableDevices)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we distinguish between timeslicing-sharable and MPS-sharable devices?

if err := mpsControlDaemon.Start(ctx, mpsc); err != nil {
return nil, fmt.Errorf("error starting MPS control daemon: %w", err)
}
Expand Down
Loading