Skip to content

Commit

Permalink
Fix upload retry. (#532)
Browse files Browse the repository at this point in the history
Signed-off-by: Liping Xue <lipingx@vmware.com>
  • Loading branch information
lipingxue authored Jun 2, 2023
1 parent 37abb4a commit 4f710f3
Show file tree
Hide file tree
Showing 12 changed files with 389 additions and 82 deletions.
34 changes: 20 additions & 14 deletions pkg/apis/backupdriver/v1alpha1/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,33 @@ type SnapshotSpec struct {
// New - No work yet, next phase is InProgress
// InProgress - snapshot being taken
// Snapshotted - local snapshot complete, next phase is Protecting or SnapshotFailed
// SnapshotFailed - end state, snapshot was not able to be taken
// SnapshotFailed - terminal state, snapshot was not able to be taken
// Uploading - snapshot is being moved to durable storage
// Uploaded - end state, snapshot has been protected
// UploadFailed - end state, unable to move to durable storage
// Uploaded - terminal state, snapshot has been protected
// UploadFailed - unable to move to durable storage
// Canceling - when the SanpshotCancel flag is set, if the Snapshot has not already moved into a terminal state, the
// status will move to Canceling. The snapshot ID will be removed from the status status if has been filled in
// and the snapshot ID will not longer be valid for a Clone operation
// Canceled - the operation was canceled, the snapshot ID is not valid
// CleanupAfterUploadFailed - terminal state, unable to delete local snapshot
// UploadFailedAfterRetry - terminal state, local snapshot is deleted after upload CR retries reach the maximum retry count. User can set the "upload-cr-retry-max"
// parameter in config map velero-vsphere-plugin-config to specify the maximum count the upload CR will retry before trying to delete
// the local snapshot. If user does not specify this in the config map, default retry count is set to 10.

type SnapshotPhase string

const (
SnapshotPhaseNew SnapshotPhase = "New"
SnapshotPhaseInProgress SnapshotPhase = "InProgress"
SnapshotPhaseSnapshotted SnapshotPhase = "Snapshotted"
SnapshotPhaseSnapshotFailed SnapshotPhase = "SnapshotFailed"
SnapshotPhaseUploading SnapshotPhase = "Uploading"
SnapshotPhaseUploaded SnapshotPhase = "Uploaded"
SnapshotPhaseUploadFailed SnapshotPhase = "UploadFailed"
SnapshotPhaseCanceling SnapshotPhase = "Canceling"
SnapshotPhaseCanceled SnapshotPhase = "Canceled"
SnapshotPhaseCleanupFailed SnapshotPhase = "CleanupAfterUploadFailed"
SnapshotPhaseNew SnapshotPhase = "New"
SnapshotPhaseInProgress SnapshotPhase = "InProgress"
SnapshotPhaseSnapshotted SnapshotPhase = "Snapshotted"
SnapshotPhaseSnapshotFailed SnapshotPhase = "SnapshotFailed"
SnapshotPhaseUploading SnapshotPhase = "Uploading"
SnapshotPhaseUploaded SnapshotPhase = "Uploaded"
SnapshotPhaseUploadFailed SnapshotPhase = "UploadFailed"
SnapshotPhaseCanceling SnapshotPhase = "Canceling"
SnapshotPhaseCanceled SnapshotPhase = "Canceled"
SnapshotPhaseCleanupFailed SnapshotPhase = "CleanupAfterUploadFailed"
SnapshotPhaseUploadFailedAfterRetry SnapshotPhase = "UploadFailedAfterRetry"
)

// UploadOperationProgress represents the progress of a
Expand Down Expand Up @@ -149,7 +155,7 @@ type CloneFromSnapshotSpec struct {
New - No work yet, next phase is InProgress
InProgress - snapshot being taken
Completed - new object has been created
Failed - end state, clone failed, no new object was created
Failed - terminal state, clone failed, no new object was created
Canceling - when the Clone flag is set, if the Clone has not already moved into a terminal state, the
status will move to Canceling. The object that was being created will be removed
Canceled - the Clone was canceled, no new object was created
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/backupdriver/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 19 additions & 8 deletions pkg/apis/datamover/v1alpha1/upload.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,28 @@ type UploadSpec struct {
}

// UploadPhase represents the lifecycle phase of a Upload.
// +kubebuilder:validation:Enum=New;InProgress;Completed;UploadError;CleanupFailed;Canceled;Canceling;
// +kubebuilder:validation:Enum=New;InProgress;Completed;UploadError;CleanupFailed;Canceled;Canceling;UploadFailedAfterRetry;
type UploadPhase string

// New: not processed yet
// InProgress: upload is in progress.
// Completed: upload is completed.
// UploadError: upload is failed and will be periodically retried. The status will change to "InProgress" at that point.
// CleanupFailed: delete local snapshot failed after the upload succeed or upload failed after maximum retry, this case will retry the delete local snapshot.
// Canceling: upload is being cancelled. It would happen if `velero backup delete` is called while the upload of snapshot is in progress.
// Canceled: upload is cancelled.
// UploadFailedAfterRetry: terminal state. Upload failed after retry and local snapshot is deleted. User can set the "upload-cr-retry-max"
// parameter in config map velero-vsphere-plugin-config to specify the maximum count that the upload CR will retry.
// If user does not specify this in the config map, default retry count is set to 10.
const (
UploadPhaseNew UploadPhase = "New"
UploadPhaseInProgress UploadPhase = "InProgress"
UploadPhaseCompleted UploadPhase = "Completed"
UploadPhaseUploadError UploadPhase = "UploadError"
UploadPhaseCleanupFailed UploadPhase = "CleanupFailed"
UploadPhaseCanceling UploadPhase = "Canceling"
UploadPhaseCanceled UploadPhase = "Canceled"
UploadPhaseNew UploadPhase = "New"
UploadPhaseInProgress UploadPhase = "InProgress"
UploadPhaseCompleted UploadPhase = "Completed"
UploadPhaseUploadError UploadPhase = "UploadError"
UploadPhaseCleanupFailed UploadPhase = "CleanupFailed"
UploadPhaseCanceling UploadPhase = "Canceling"
UploadPhaseCanceled UploadPhase = "Canceled"
UploadPhaseUploadFailedAfterRetry UploadPhase = "UploadFailedAfterRetry"
)

// UploadStatus is the current status of a Upload.
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/datamover/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions pkg/backupdriver/backup_driver_controller_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ package backupdriver

import (
"context"
"strings"
"time"

"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/backuprepository"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/constants"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/utils"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/client-go/informers/core/v1"
"strings"
"time"

"k8s.io/client-go/rest"

Expand Down Expand Up @@ -957,6 +958,8 @@ func (ctrl *backupDriverController) syncUploadByKey(key string) error {
newSnapshotStatusPhase = backupdriverapi.SnapshotPhaseUploadFailed
case datamoverapi.UploadPhaseCleanupFailed:
newSnapshotStatusPhase = backupdriverapi.SnapshotPhaseCleanupFailed
case datamoverapi.UploadPhaseUploadFailedAfterRetry:
newSnapshotStatusPhase = backupdriverapi.SnapshotPhaseUploadFailedAfterRetry
default:
ctrl.logger.Debugf("syncUploadByKey: No change needed for upload phase %s", upload.Status.Phase)
return nil
Expand Down
21 changes: 15 additions & 6 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ const (
const (
RetryInterval = 5
RetryMaximum = 5
// Default maximum retry count for upload CR.
// After upload CR retry reaches the maximum and still cannot upload the snapshot to durable storage. Upload CR will stop further retry and
// try to delete the local snapshot.
// If local snapshot is deleted successfully, the status of upload CR will change to "UploadFailedAfterRetry".
// If local snapshot cannot be deleted, the status of upload CR will change to "CleanupFailed".
// If the default maximum retry count does work for user, it can be overwritten by setting the "upload-cr-retry-max"
// parameter in config map velero-vsphere-plugin-config.
DefaultUploadCRRetryMaximum = 10
)

// Keys for supervisor cluster parameters
Expand Down Expand Up @@ -255,9 +263,9 @@ var ResourcesToBlock = map[string]bool{
// We comment the NetworkAttachmentDefinition resource out as it is not a vSphere specific resource
//"network-attachment-definitions.k8s.cni.cncf.io": true, // real name of NetworkAttachmentDefinition
//"networkattachmentdefinitions.k8s.cni.cncf.io": true, // parsed name of NetworkAttachmentDefinition
"networkinterfaces.netoperator.vmware.com": true,
"networks.netoperator.vmware.com": true,
"nsxerrors.nsx.vmware.com": true,
"networkinterfaces.netoperator.vmware.com": true,
"networks.netoperator.vmware.com": true,
"nsxerrors.nsx.vmware.com": true,
//"nsxlbmonitors.vmware.com": true, // DO NOT ADD IT BACK
//"nsxloadbalancermonitors.vmware.com": true, // DO NOT ADD IT BACK
"nsxlocks.nsx.vmware.com": true,
Expand Down Expand Up @@ -399,7 +407,7 @@ const (
const VsphereVolumeSnapshotLocationProvider = "velero.io/vsphere"

const (
VSphereCSIDriverName = "csi.vsphere.vmware.com"
VSphereCSIDriverName = "csi.vsphere.vmware.com"
VSphereCSIDriverMigrationAnnotation = "pv.kubernetes.io/migrated-to"
)

Expand All @@ -418,12 +426,13 @@ const (
VSphereSecretNameKey = "vsphere_secret_name"
DefaultSecretName = "velero-vsphere-config-secret"
DefaultSecretNamespace = "velero"
UploadCRRetryMaximumKey = "upload-cr-retry-max"
)

const (
// AnnVolumeHealth is the key for HealthStatus annotation on volume claim
// for vSphere CSI Driver.
AnnVolumeHealth = "volumehealth.storage.kubernetes.io/health"
// key for expressing timestamp for volume health annotation
AnnVolumeHealthTS = "volumehealth.storage.kubernetes.io/health-timestamp"
// key for expressing timestamp for volume health annotation
AnnVolumeHealthTS = "volumehealth.storage.kubernetes.io/health-timestamp"
)
56 changes: 46 additions & 10 deletions pkg/controller/upload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ package controller
import (
"context"
"fmt"
"math"
"time"

backupdriverapi "github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/apis/backupdriver/v1alpha1"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/backuprepository"
"github.com/vmware-tanzu/velero-plugin-for-vsphere/pkg/constants"
"k8s.io/apimachinery/pkg/util/wait"
"math"
"time"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -292,17 +293,22 @@ func (c *uploadController) processUpload(req *pluginv1api.Upload) error {
}).Debug("Upload request updated by retrieving from kubernetes API server")

if req.Status.Phase == pluginv1api.UploadPhaseCanceled {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of upload CR in kubernetes API server is canceled. Skipping it")
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is canceled. Skipping it")
return nil
}

if req.Status.Phase == pluginv1api.UploadPhaseCanceling {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of upload CR in kubernetes API server is canceling. Skipping it")
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is canceling. Skipping it")
return nil
}

if req.Status.Phase == pluginv1api.UploadPhaseCompleted {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of upload CR in kubernetes API server is completed. Skipping it")
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is completed. Skipping it")
return nil
}

if req.Status.Phase == pluginv1api.UploadPhaseUploadFailedAfterRetry {
log.WithField("phase", req.Status.Phase).WithField("generation", req.Generation).Debug("The status of Upload CR in Kubernetes API server is failed after retry. Skipping it")
return nil
}

Expand Down Expand Up @@ -334,7 +340,7 @@ func (c *uploadController) processUpload(req *pluginv1api.Upload) error {
errMsg := fmt.Sprintf("Failed to clean up local snapshot, %s, error: %v. will retry.", peID.String(), errors.WithStack(err))
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseCleanupFailed, errMsg)
if err != nil {
errMsg = fmt.Sprintf("Failed to patch upload status with retry, errror: %v", errors.WithStack(err))
errMsg = fmt.Sprintf("Failed to patch Upload status with retry, errror: %v", errors.WithStack(err))
log.Error(errMsg)
}
return errors.New(errMsg)
Expand Down Expand Up @@ -372,12 +378,35 @@ func (c *uploadController) processUpload(req *pluginv1api.Upload) error {
return nil
} else {
errMsg := fmt.Sprintf("Failed to upload snapshot, %v, to durable object storage. %v", peID.String(), errors.WithStack(err))
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseUploadError, errMsg)
uploadCRRetryMaximum := utils.GetUploadCRRetryMaximum(nil, c.logger)
log.Infof("UploadCRRetryMaximum is %d", uploadCRRetryMaximum)
if req.Status.RetryCount < int32(uploadCRRetryMaximum) {
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseUploadError, errMsg)
if err != nil {
errMsg = fmt.Sprintf("%v. %v", errMsg, errors.WithStack(err))
}
log.Error(errMsg)
return errors.New(errMsg)
}
log.Infof("Upload CR %s failed to upload snapshot after retrying %d times", req.Name, uploadCRRetryMaximum)
err = c.snapMgr.DeleteLocalSnapshot(peID)
if err != nil {
errMsg := fmt.Sprintf("Failed to clean up local snapshot, %s, error: %v. will retry.", peID.String(), errors.WithStack(err))
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseCleanupFailed, errMsg)
if err != nil {
errMsg = fmt.Sprintf("Failed to patch Upload status with retry, errror: %v", errors.WithStack(err))
log.Error(errMsg)
}
return errors.New(errMsg)
}
msg := fmt.Sprintf("Successfully deleted local snapshot, %v", peID.String())
log.Info(msg)
// UploadPhaseUploadFailedAfterRetry is a terminal state, and upload CR will not retry
_, err = c.patchUploadByStatusWithRetry(req, pluginv1api.UploadPhaseUploadFailedAfterRetry, msg)
if err != nil {
errMsg = fmt.Sprintf("%v. %v", errMsg, errors.WithStack(err))
log.Error(err)
}
log.Error(errMsg)
return errors.New(errMsg)
return nil
}
}

Expand Down Expand Up @@ -487,6 +516,13 @@ func (c *uploadController) patchUploadByStatus(req *pluginv1api.Upload, newPhase
r.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
r.Status.Message = msg
})
case pluginv1api.UploadPhaseUploadFailedAfterRetry:
req, err = c.patchUpload(req, func(r *pluginv1api.Upload) {
r.Status.Phase = newPhase
r.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
r.Status.Message = msg
})

default:
err = errors.New("Unexpected upload phase")
}
Expand Down
Loading

0 comments on commit 4f710f3

Please sign in to comment.