Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop infinite retries of upload and clean up snapshot #532

Merged
merged 1 commit into from
Jun 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions pkg/apis/backupdriver/v1alpha1/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,33 @@ type SnapshotSpec struct {
// New - No work yet, next phase is InProgress
// InProgress - snapshot being taken
// Snapshotted - local snapshot complete, next phase is Protecting or SnapshotFailed
// SnapshotFailed - end state, snapshot was not able to be taken
// SnapshotFailed - terminal state, snapshot was not able to be taken
// Uploading - snapshot is being moved to durable storage
// Uploaded - end state, snapshot has been protected
// UploadFailed - end state, unable to move to durable storage
// Uploaded - terminal state, snapshot has been protected
// UploadFailed - unable to move to durable storage
// Canceling - when the SanpshotCancel flag is set, if the Snapshot has not already moved into a terminal state, the
// status will move to Canceling. The snapshot ID will be removed from the status status if has been filled in
// and the snapshot ID will not longer be valid for a Clone operation
// Canceled - the operation was canceled, the snapshot ID is not valid
// CleanupAfterUploadFailed - terminal state, unable to delete local snapshot
// UploadFailedAfterRetry - terminal state, local snapshot is deleted after upload CR retries reach the maximum retry count. User can set the "upload-cr-retry-max"
// parameter in config map velero-vsphere-plugin-config to specify the maximum count the upload CR will retry before trying to delete
// the local snapshot. If user does not specify this in the config map, default retry count is set to 10.

type SnapshotPhase string

const (
SnapshotPhaseNew SnapshotPhase = "New"
SnapshotPhaseInProgress SnapshotPhase = "InProgress"
SnapshotPhaseSnapshotted SnapshotPhase = "Snapshotted"
SnapshotPhaseSnapshotFailed SnapshotPhase = "SnapshotFailed"
SnapshotPhaseUploading SnapshotPhase = "Uploading"
SnapshotPhaseUploaded SnapshotPhase = "Uploaded"
SnapshotPhaseUploadFailed SnapshotPhase = "UploadFailed"
SnapshotPhaseCanceling SnapshotPhase = "Canceling"
SnapshotPhaseCanceled SnapshotPhase = "Canceled"
SnapshotPhaseCleanupFailed SnapshotPhase = "CleanupAfterUploadFailed"
SnapshotPhaseNew SnapshotPhase = "New"
SnapshotPhaseInProgress SnapshotPhase = "InProgress"
SnapshotPhaseSnapshotted SnapshotPhase = "Snapshotted"
SnapshotPhaseSnapshotFailed SnapshotPhase = "SnapshotFailed"
SnapshotPhaseUploading SnapshotPhase = "Uploading"
SnapshotPhaseUploaded SnapshotPhase = "Uploaded"
SnapshotPhaseUploadFailed SnapshotPhase = "UploadFailed"
SnapshotPhaseCanceling SnapshotPhase = "Canceling"
SnapshotPhaseCanceled SnapshotPhase = "Canceled"
SnapshotPhaseCleanupFailed SnapshotPhase = "CleanupAfterUploadFailed"
SnapshotPhaseUploadFailedAfterRetry SnapshotPhase = "UploadFailedAfterRetry"
lipingxue marked this conversation as resolved.
Show resolved Hide resolved
)

// UploadOperationProgress represents the progress of a
Expand Down Expand Up @@ -149,7 +155,7 @@ type CloneFromSnapshotSpec struct {
New - No work yet, next phase is InProgress
InProgress - snapshot being taken
Completed - new object has been created
Failed - end state, clone failed, no new object was created
Failed - terminal state, clone failed, no new object was created
Canceling - when the Clone flag is set, if the Clone has not already moved into a terminal state, the
status will move to Canceling. The object that was being created will be removed
Canceled - the Clone was canceled, no new object was created
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/backupdriver/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 19 additions & 8 deletions pkg/apis/datamover/v1alpha1/upload.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,28 @@ type UploadSpec struct {
}

// UploadPhase represents the lifecycle phase of a Upload.
// +kubebuilder:validation:Enum=New;InProgress;Completed;UploadError;CleanupFailed;Canceled;Canceling;
// +kubebuilder:validation:Enum=New;InProgress;Completed;UploadError;CleanupFailed;Canceled;Canceling;UploadFailedAfterRetry;
type UploadPhase string

// New: not processed yet
// InProgress: upload is in progress.
// Completed: upload is completed.
// UploadError: upload is failed and will be periodically retried. The status will change to "InProgress" at that point.
// CleanupFailed: delete local snapshot failed after the upload succeed or upload failed after maximum retry, this case will retry the delete local snapshot.
// Canceling: upload is being cancelled. It would happen if `velero backup delete` is called while the upload of snapshot is in progress.
// Canceled: upload is cancelled.
// UploadFailedAfterRetry: terminal state. Upload failed after retry and local snapshot is deleted. User can set the "upload-cr-retry-max"
// parameter in config map velero-vsphere-plugin-config to specify the maximum count that the upload CR will retry.
// If user does not specify this in the config map, default retry count is set to 10.
const (
UploadPhaseNew UploadPhase = "New"
UploadPhaseInProgress UploadPhase = "InProgress"
UploadPhaseCompleted UploadPhase = "Completed"
UploadPhaseUploadError UploadPhase = "UploadError"
UploadPhaseCleanupFailed UploadPhase = "CleanupFailed"
UploadPhaseCanceling UploadPhase = "Canceling"
UploadPhaseCanceled UploadPhase = "Canceled"
UploadPhaseNew UploadPhase = "New"
UploadPhaseInProgress UploadPhase = "InProgress"
UploadPhaseCompleted UploadPhase = "Completed"
UploadPhaseUploadError UploadPhase = "UploadError"
UploadPhaseCleanupFailed UploadPhase = "CleanupFailed"
UploadPhaseCanceling UploadPhase = "Canceling"
UploadPhaseCanceled UploadPhase = "Canceled"
UploadPhaseUploadFailedAfterRetry UploadPhase = "UploadFailedAfterRetry"
lipingxue marked this conversation as resolved.
Show resolved Hide resolved
)

// UploadStatus is the current status of a Upload.
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/datamover/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions pkg/backupdriver/backup_driver_controller_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ package backupdriver

import (
"context"
"strings"
"time"

"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/backuprepository"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/constants"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/utils"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/client-go/informers/core/v1"
"strings"
"time"

"k8s.io/client-go/rest"

Expand Down Expand Up @@ -957,6 +958,8 @@ func (ctrl *backupDriverController) syncUploadByKey(key string) error {
newSnapshotStatusPhase = backupdriverapi.SnapshotPhaseUploadFailed
case datamoverapi.UploadPhaseCleanupFailed:
newSnapshotStatusPhase = backupdriverapi.SnapshotPhaseCleanupFailed
case datamoverapi.UploadPhaseUploadFailedAfterRetry:
newSnapshotStatusPhase = backupdriverapi.SnapshotPhaseUploadFailedAfterRetry
default:
ctrl.logger.Debugf("syncUploadByKey: No change needed for upload phase %s", upload.Status.Phase)
return nil
Expand Down
21 changes: 15 additions & 6 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ const (
const (
RetryInterval = 5
RetryMaximum = 5
// Default maximum retry count for upload CR.
// After upload CR retry reaches the maximum and still cannot upload the snapshot to durable storage. Upload CR will stop further retry and
// try to delete the local snapshot.
// If local snapshot is deleted successfully, the status of upload CR will change to "UploadFailedAfterRetry".
// If local snapshot cannot be deleted, the status of upload CR will change to "CleanupFailed".
// If the default maximum retry count does work for user, it can be overwritten by setting the "upload-cr-retry-max"
// parameter in config map velero-vsphere-plugin-config.
DefaultUploadCRRetryMaximum = 10
lipingxue marked this conversation as resolved.
Show resolved Hide resolved
)

// Keys for supervisor cluster parameters
Expand Down Expand Up @@ -255,9 +263,9 @@ var ResourcesToBlock = map[string]bool{
// We comment the NetworkAttachmentDefinition resource out as it is not a vSphere specific resource
//"network-attachment-definitions.k8s.cni.cncf.io": true, // real name of NetworkAttachmentDefinition
//"networkattachmentdefinitions.k8s.cni.cncf.io": true, // parsed name of NetworkAttachmentDefinition
"networkinterfaces.netoperator.vmware.com": true,
"networks.netoperator.vmware.com": true,
"nsxerrors.nsx.vmware.com": true,
"networkinterfaces.netoperator.vmware.com": true,
"networks.netoperator.vmware.com": true,
"nsxerrors.nsx.vmware.com": true,
//"nsxlbmonitors.vmware.com": true, // DO NOT ADD IT BACK
//"nsxloadbalancermonitors.vmware.com": true, // DO NOT ADD IT BACK
"nsxlocks.nsx.vmware.com": true,
Expand Down Expand Up @@ -399,7 +407,7 @@ const (
const VsphereVolumeSnapshotLocationProvider = "velero.io/vsphere"

const (
VSphereCSIDriverName = "csi.vsphere.vmware.com"
VSphereCSIDriverName = "csi.vsphere.vmware.com"
VSphereCSIDriverMigrationAnnotation = "pv.kubernetes.io/migrated-to"
)

Expand All @@ -418,12 +426,13 @@ const (
VSphereSecretNameKey = "vsphere_secret_name"
DefaultSecretName = "velero-vsphere-config-secret"
DefaultSecretNamespace = "velero"
UploadCRRetryMaximumKey = "upload-cr-retry-max"
)

const (
// AnnVolumeHealth is the key for HealthStatus annotation on volume claim
// for vSphere CSI Driver.
AnnVolumeHealth = "volumehealth.storage.kubernetes.io/health"
// key for expressing timestamp for volume health annotation
AnnVolumeHealthTS = "volumehealth.storage.kubernetes.io/health-timestamp"
// key for expressing timestamp for volume health annotation
AnnVolumeHealthTS = "volumehealth.storage.kubernetes.io/health-timestamp"
lipingxue marked this conversation as resolved.
Show resolved Hide resolved
)
56 changes: 46 additions & 10 deletions pkg/controller/upload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ package controller
import (
"context"
"fmt"
"math"
"time"

backupdriverapi "github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/apis/backupdriver/v1alpha1"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/backuprepository"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/constants"
"k8s.io/apimachinery/pkg/util/wait"
"math"
"time"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -292,17 +293,22 @@ func (c *uploadController) processUpload(req *pluginv1api.Upload) error {
}).Debug("Upload request updated by retrieving from kubernetes API server")

if req.Status.Phase == pluginv1api.UploadPhaseCanceled {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of upload CR in kubernetes API server is canceled. Skipping it")
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is canceled. Skipping it")
return nil
}

if req.Status.Phase == pluginv1api.UploadPhaseCanceling {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of upload CR in kubernetes API server is canceling. Skipping it")
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is canceling. Skipping it")
return nil
}

if req.Status.Phase == pluginv1api.UploadPhaseCompleted {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of upload CR in kubernetes API server is completed. Skipping it")
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is completed. Skipping it")
return nil
}

if req.Status.Phase == pluginv1api.UploadPhaseUploadFailedAfterRetry {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is failed after retry. Skipping it")
return nil
}

Expand Down Expand Up @@ -334,7 +340,7 @@ func (c *uploadController) processUpload(req *pluginv1api.Upload) error {
errMsg := fmt.Sprintf("Failed to clean up local snapshot, %s, error: %v. will retry.", peID.String(), errors.WithStack(err))
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseCleanupFailed, errMsg)
if err != nil {
errMsg = fmt.Sprintf("Failed to patch upload status with retry, errror: %v", errors.WithStack(err))
errMsg = fmt.Sprintf("Failed to patch Upload status with retry, errror: %v", errors.WithStack(err))
log.Error(errMsg)
}
return errors.New(errMsg)
Expand Down Expand Up @@ -372,12 +378,35 @@ func (c *uploadController) processUpload(req *pluginv1api.Upload) error {
return nil
} else {
errMsg := fmt.Sprintf("Failed to upload snapshot, %v, to durable object storage. %v", peID.String(), errors.WithStack(err))
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseUploadError, errMsg)
uploadCRRetryMaximum := utils.GetUploadCRRetryMaximum(nil, c.logger)
log.Infof("UploadCRRetryMaximum is %d", uploadCRRetryMaximum)
if req.Status.RetryCount < int32(uploadCRRetryMaximum) {
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseUploadError, errMsg)
if err != nil {
errMsg = fmt.Sprintf("%v. %v", errMsg, errors.WithStack(err))
}
log.Error(errMsg)
return errors.New(errMsg)
}
log.Infof("Upload CR %s failed to upload snapshot after retrying %d times", req.Name, uploadCRRetryMaximum)
err = c.snapMgr.DeleteLocalSnapshot(peID)
if err != nil {
errMsg := fmt.Sprintf("Failed to clean up local snapshot, %s, error: %v. will retry.", peID.String(), errors.WithStack(err))
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseCleanupFailed, errMsg)
if err != nil {
errMsg = fmt.Sprintf("Failed to patch Upload status with retry, errror: %v", errors.WithStack(err))
log.Error(errMsg)
}
return errors.New(errMsg)
}
msg := fmt.Sprintf("Successfully deleted local snapshot, %v", peID.String())
log.Info(msg)
// UploadPhaseUploadFailedAfterRetry is a terminal state, and upload CR will not retry
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseUploadFailedAfterRetry, msg)
if err != nil {
errMsg = fmt.Sprintf("%v. %v", errMsg, errors.WithStack(err))
log.Error(err)
}
log.Error(errMsg)
return errors.New(errMsg)
return nil
}
}

Expand Down Expand Up @@ -487,6 +516,13 @@ func (c *uploadController) patchUploadByStatus(req *pluginv1api.Upload, newPhase
r.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
r.Status.Message = msg
})
case pluginv1api.UploadPhaseUploadFailedAfterRetry:
req, err = c.patchUpload(req, func(r *pluginv1api.Upload) {
r.Status.Phase = newPhase
r.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
r.Status.Message = msg
})

default:
err = errors.New("Unexpected upload phase")
}
Expand Down
Loading