wip: handle driver migrations in nvidiadriver controller
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
cdesiniotis committed Aug 27, 2024
1 parent 81640fd commit 57024bb
Showing 19 changed files with 207 additions and 6 deletions.
167 changes: 167 additions & 0 deletions controllers/nvidiadriver_controller.go
@@ -20,12 +20,14 @@ import (
"context"
"fmt"
"maps"
"os"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/workqueue"
@@ -57,6 +59,7 @@ type NVIDIADriverReconciler struct {
stateManager state.Manager
nodeSelectorValidator validator.Validator
conditionUpdater conditions.Updater
operatorNamespace string
}

//+kubebuilder:rbac:groups=nvidia.com,resources=nvidiadrivers,verbs=get;list;watch;create;update;patch;delete
@@ -124,6 +127,8 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
}
clusterPolicyInstance := clusterPolicyList.Items[0]

// The operator namespace is provided via the OPERATOR_NAMESPACE environment variable set on the operator deployment.
r.operatorNamespace = os.Getenv("OPERATOR_NAMESPACE")

// Create a new InfoCatalog which is a generic interface for passing information to state managers
infoCatalog := state.NewInfoCatalog()

@@ -168,6 +173,16 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
return reconcile.Result{}, nil
}

clusterpolicyDriverLabels, err := getClusterpolicyDriverLabels(r.ClusterInfo, clusterPolicyInstance)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to get clusterpolicy driver labels: %w", err)
}

err = updateNodesManagedByDriver(ctx, r, instance, clusterpolicyDriverLabels)
if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to update nodes managed by driver: %w", err)
}

// Sync state and update status
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)

@@ -191,6 +206,9 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
}
}
// if no errors are reported from any state, then we would be waiting on driver daemonset pods
// TODO: Avoid marking 'default' NVIDIADriver instances as NotReady if DesiredNumberScheduled == 0.
// This will avoid unnecessary reconciliations when the 'default' instance is overridden on all
// GPU nodes, and thus, DesiredNumberScheduled being 0 is valid.
if errorInfo == nil {
condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.DriverNotReady, "Waiting for driver pod to be ready")
if condErr != nil {
@@ -404,5 +422,154 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
return fmt.Errorf("failed to add index key: %w", err)
}

// Index pods by spec.nodeName so that driver pods running on a given node can be
// listed directly through the cached client during driver migration handling.
if err := mgr.GetFieldIndexer().IndexField(ctx, &corev1.Pod{}, "spec.nodeName", func(rawObj client.Object) []string {
pod := rawObj.(*corev1.Pod)
return []string{pod.Spec.NodeName}
}); err != nil {
return err
}

return nil
}

func updateNodesManagedByDriver(ctx context.Context, r *NVIDIADriverReconciler, instance *nvidiav1alpha1.NVIDIADriver, clusterpolicyDriverLabels map[string]string) error {
nodes, err := getNVIDIADriverSelectedNodes(ctx, r.Client, instance)
if err != nil {
return fmt.Errorf("failed to get selected nodes for NVIDIADriver CR: %w", err)
}

// A map tracking which node objects need to be updated, e.g. because updated
// labels or annotations need to be applied.
nodesToUpdate := map[*corev1.Node]struct{}{}

for _, node := range nodes.Items {
labels := node.GetLabels()
annotations := node.GetAnnotations()

managedBy, exists := labels["nvidia.com/gpu.driver.managed-by"]
if !exists {
// if 'managed-by' label does not exist, label node with cr.Name
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
node.SetLabels(labels)
nodesToUpdate[&node] = struct{}{}
// If there is an orphan driver pod running on the node,
// indicate to the upgrade controller that an upgrade is required.
// This will occur when we are migrating from a ClusterPolicy-owned
// daemonset to an NVIDIADriver-owned daemonset.
podList := &corev1.PodList{}
err = r.Client.List(ctx, podList,
client.InNamespace(r.operatorNamespace),
client.MatchingLabels(clusterpolicyDriverLabels),
client.MatchingFields{"spec.nodeName": node.Name})
if err != nil {
return fmt.Errorf("failed to list driver pods: %w", err)
}
if len(podList.Items) == 0 {
continue
}
pod := podList.Items[0]
if len(pod.OwnerReferences) == 0 {
annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
node.SetAnnotations(annotations)
}
continue
}

// do nothing if node is already being managed by this CR
if managedBy == instance.Name {
continue
}

// If the node is 'managed-by' another CR, there are several scenarios:
// 1) There is no driver pod running on the node. Therefore the label is stale.
// 2) There is a driver pod running on the node, and it is not an orphan. There are
//    two possible actions:
//    a) If the NVIDIADriver instance we are currently reconciling is the 'default'
//       instance (the node selector is empty), do nothing. All other NVIDIADriver
//       instances take precedence.
//    b) The running pod no longer falls into the node pool of the CR it is currently
//       being managed by. Thus, the NVIDIADriver instance we are currently reconciling
//       should take ownership of the node.
podList := &corev1.PodList{}
err = r.Client.List(ctx, podList,
client.InNamespace(r.operatorNamespace),
client.MatchingLabels(map[string]string{AppComponentLabelKey: AppComponentLabelValue}),
client.MatchingFields{"spec.nodeName": node.Name})
if err != nil {
return fmt.Errorf("failed to list driver pods: %w", err)
}
if len(podList.Items) == 0 {
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
node.SetLabels(labels)
nodesToUpdate[&node] = struct{}{}
continue
}
if len(instance.Spec.NodeSelector) == 0 {
// If the nodeSelector for the NVIDIADriver instance is empty, then we
// treat it as the 'default' CR which has the lowest precedence. Allow
// the existing driver pod, owned by another NVIDIADriver CR, to continue
// to run.
continue
}
pod := podList.Items[0]
if len(pod.OwnerReferences) > 0 {
// Remove the 'app' label so that the owning daemonset releases the pod, making it an orphan.
err := r.Client.Patch(ctx, &pod, client.RawPatch(types.MergePatchType, []byte(`{"metadata":{"labels":{"app": null}}}`)))
if err != nil {
return fmt.Errorf("failed to patch pod in order to make it an orphan: %w", err)
}
}
labels["nvidia.com/gpu.driver.managed-by"] = instance.Name
annotations["nvidia.com/gpu-driver-upgrade-requested"] = "true"
node.SetLabels(labels)
node.SetAnnotations(annotations)
nodesToUpdate[&node] = struct{}{}
}

// Apply updated labels / annotations on node objects
for node := range nodesToUpdate {
err = r.Client.Update(ctx, node)
if err != nil {
return fmt.Errorf("failed to update node %s: %w", node.Name, err)
}
}

return nil
}

// getNVIDIADriverSelectedNodes returns selected nodes based on the nodeSelector labels set for a given NVIDIADriver instance
func getNVIDIADriverSelectedNodes(ctx context.Context, k8sClient client.Client, cr *nvidiav1alpha1.NVIDIADriver) (*corev1.NodeList, error) {
nodeList := &corev1.NodeList{}

if cr.Spec.NodeSelector == nil {
cr.Spec.NodeSelector = cr.GetNodeSelector()
}

selector := labels.Set(cr.Spec.NodeSelector).AsSelector()

opts := []client.ListOption{
client.MatchingLabelsSelector{Selector: selector},
}
err := k8sClient.List(ctx, nodeList, opts...)

return nodeList, err
}

// getClusterpolicyDriverLabels returns a set of labels that can be used to identify running driver pods that are owned by ClusterPolicy
func getClusterpolicyDriverLabels(clusterInfo clusterinfo.Interface, clusterpolicy gpuv1.ClusterPolicy) (map[string]string, error) {
// initialize with common app=nvidia-driver-daemonset label
driverLabelKey := DriverLabelKey
driverLabelValue := DriverLabelValue

ocpVer, err := clusterInfo.GetOpenshiftVersion()
if err != nil {
return nil, fmt.Errorf("failed to get the OpenShift version: %w", err)
}
if ocpVer != "" && clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit != nil && *clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit == true {

Check failure on line 567 in controllers/nvidiadriver_controller.go

View workflow job for this annotation

GitHub Actions / go-check

S1002: should omit comparison to bool constant, can be simplified to `*clusterpolicy.Spec.Operator.UseOpenShiftDriverToolkit` (gosimple)
// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not
// constant and changes based on rhcos version. Hence use DTK label instead
driverLabelKey = ocpDriverToolkitIdentificationLabel
driverLabelValue = ocpDriverToolkitIdentificationValue
}

return map[string]string{driverLabelKey: driverLabelValue}, nil
}
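For context on the orphaning step in updateNodesManagedByDriver above: removing the app label that the owning driver daemonset selects on causes the daemonset controller to release the pod, i.e. drop its ownerReference, and an orphaned driver pod left running on a node is exactly the signal the controller uses to request a driver upgrade. A minimal sketch of that step as a standalone helper (hypothetical name, using only APIs already imported in this file):

// orphanDriverPod is an illustrative helper, not part of this commit. It issues the
// same merge patch used above: setting the label to null removes it, detaching the
// pod from its daemonset's selector so the daemonset controller releases (orphans) it.
func orphanDriverPod(ctx context.Context, c client.Client, pod *corev1.Pod, labelKey string) error {
	patch := []byte(fmt.Sprintf(`{"metadata":{"labels":{%q: null}}}`, labelKey))
	return c.Patch(ctx, pod, client.RawPatch(types.MergePatchType, patch))
}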
1 change: 1 addition & 0 deletions internal/state/driver.go
@@ -548,6 +548,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS

return &driverSpec{
Spec: spec,
CRName: cr.Name,
AppName: nvidiaDriverAppName,
Name: nvidiaDriverName,
ImagePath: imagePath,
1 change: 1 addition & 0 deletions internal/state/driver_test.go
@@ -624,6 +624,7 @@ func getMinimalDriverRenderData() *driverRenderData {
ReadinessProbe: getDefaultContainerProbeSpec(),
DriverType: nvidiav1alpha1.GPU,
},
CRName: "test-cr",
AppName: "nvidia-gpu-driver-ubuntu22.04-7c6d7bd86b",
Name: "nvidia-gpu-driver-ubuntu22.04",
ImagePath: "nvcr.io/nvidia/driver:525.85.03-ubuntu22.04",
@@ -240,6 +240,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
3 changes: 1 addition & 2 deletions internal/state/testdata/golden/driver-full-spec.yaml
@@ -254,9 +254,8 @@ spec:
mountPropagation: HostToContainer
name: run-mellanox-drivers
nodeSelector:
example.com/bar: bar
example.com/foo: foo
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: custom-priority-class-name
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -407,6 +407,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-openshift
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-gdrcopy.yaml
@@ -296,6 +296,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-gds.yaml
@@ -296,6 +296,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-minimal.yaml
@@ -231,6 +231,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -341,6 +341,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-openshift
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-precompiled.yaml
@@ -234,6 +234,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/kernel-version.full: 5.4.0-150-generic
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-rdma-hostmofed.yaml
@@ -307,6 +307,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-rdma.yaml
@@ -301,6 +301,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
@@ -302,6 +302,7 @@ spec:
nodeSelector:
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: 413.92.202304252344-0
nvidia.com/gpu.deploy.vgpu-manager: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-vgpu-manager-openshift
tolerations:
@@ -217,6 +217,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.vgpu-manager: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-vgpu-manager-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/testdata/golden/driver-vgpu-licensing.yaml
@@ -237,6 +237,7 @@ spec:
name: run-mellanox-drivers
nodeSelector:
nvidia.com/gpu.deploy.driver: "true"
nvidia.com/gpu.driver.managed-by: test-cr
priorityClassName: system-node-critical
serviceAccountName: nvidia-gpu-driver-ubuntu22.04
tolerations:
1 change: 1 addition & 0 deletions internal/state/types.go
@@ -30,6 +30,7 @@ type SyncingSource source.SyncingSource
// which is to be populated with the fully-qualified image path.
type driverSpec struct {
Spec *nvidiav1alpha1.NVIDIADriverSpec
CRName string
AppName string
Name string
ImagePath string
24 changes: 23 additions & 1 deletion internal/validator/validator.go
@@ -51,8 +51,30 @@ func (nsv *nodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alph
return err
}

crHasEmptyNodeSelector := false
if len(cr.Spec.NodeSelector) == 0 {
crHasEmptyNodeSelector = true
}

names := map[string]struct{}{}
for di := range drivers.Items {
for di, driver := range drivers.Items {
if driver.Name == cr.Name {
continue
}

if crHasEmptyNodeSelector {
// If the CR we are validating has an empty node selector, it does not conflict
// with any other CR unless there is another CR that also is configured with an
// empty node selector. Only one NVIDIADriver instance can be configured with an
// empty node selector at any point in time. We deem such an instance as the 'default'
// instance, as it will get deployed on all GPU nodes. Other CRs, with non-empty
// node selectors, can override the 'default' NVIDIADriver instance.
if len(driver.Spec.NodeSelector) == 0 {
return fmt.Errorf("cannot have empty nodeSelector as another CR is configured with an empty nodeSelector: %s", driver.Name)
}
continue
}

nodeList, err := nsv.getNVIDIADriverSelectedNodes(ctx, &drivers.Items[di])
if err != nil {
return err
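To make the 'default' instance semantics above concrete, here is a small illustrative sketch (the CR names and the selector label are invented, and the usual metav1 import is assumed): the instance with an empty nodeSelector is the default and is overridden wherever a selector-bearing instance matches; Validate rejects a second empty-selector instance.

// Hypothetical CRs used only to illustrate the validation rule above.
var defaultDriver = nvidiav1alpha1.NVIDIADriver{
	ObjectMeta: metav1.ObjectMeta{Name: "default"},
	// Spec.NodeSelector left empty: this is the single allowed 'default' instance.
}

var rhelDriver = nvidiav1alpha1.NVIDIADriver{
	ObjectMeta: metav1.ObjectMeta{Name: "rhel-pool"},
	Spec: nvidiav1alpha1.NVIDIADriverSpec{
		// A non-empty selector overrides the default instance on the nodes it matches.
		NodeSelector: map[string]string{"example.com/driver-pool": "rhel"},
	},
}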
4 changes: 1 addition & 3 deletions manifests/state-driver/0500_daemonset.yaml
@@ -60,14 +60,12 @@ spec:
{{- end }}
spec:
nodeSelector:
nvidia.com/gpu.driver.managed-by: {{ .Driver.CRName | quote }}
{{- if eq .Driver.Spec.DriverType "vgpu-host-manager" }}
nvidia.com/gpu.deploy.vgpu-manager: "true"
{{- else }}
nvidia.com/gpu.deploy.driver: "true"
{{- end }}
{{- if .Driver.Spec.NodeSelector }}
{{- .Driver.Spec.NodeSelector | yaml | nindent 8 }}
{{- end }}
{{- if and (.Openshift) (.Runtime.OpenshiftDriverToolkitEnabled) }}
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: {{ .Openshift.RHCOSVersion | quote }}
{{- end }}
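Together with the controller changes above, the rendered driver daemonset no longer embeds the CR's own nodeSelector (see the driver-full-spec.yaml golden file change); it schedules only onto nodes that the controller has labeled as managed by that CR. Illustrative rendered values, assuming a CR named demo-driver:

// Illustrative only; compare with the golden testdata updated in this commit.
var renderedNodeSelector = map[string]string{
	"nvidia.com/gpu.deploy.driver":     "true",        // or nvidia.com/gpu.deploy.vgpu-manager for vgpu-host-manager drivers
	"nvidia.com/gpu.driver.managed-by": "demo-driver", // {{ .Driver.CRName }}
}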