Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle case of recovery from resize failures #187

Merged
merged 5 commits into from
Jan 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 33 additions & 22 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,18 @@ func (ctrl *resizeController) syncPVC(key string) error {
klog.V(4).Infof("No need to resize PVC %q", util.PVCKey(pvc))
return nil
}
if !ctrl.pvNeedResize(pvc, pv) {
klog.V(4).Infof("No need to resize PV %q", pv.Name)
return nil
}

return ctrl.resizePVC(pvc, pv)
if utilfeature.DefaultFeatureGate.Enabled(features.RecoverVolumeExpansionFailure) {
_, _, err, _ := ctrl.expandAndRecover(pvc, pv)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why return pv, pvc, but without using use objects?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for now, mainly for tests.

return err
} else {
if !ctrl.pvNeedResize(pvc, pv) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seem pvNeedResize can be called before checking feature is enabled?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is pvNeedResize more like pv cannot be resized due to condition is not met? Does it consider an error?

Copy link
Contributor Author

@gnufied gnufied Jan 20, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is basically old flow - which checks if PV is resizable (such as, whether it is bound and bound to PVC which is being resized and whether it was already controller expanded and node expansion is pending). It is not a hard error if PV can not expanded at this moment. But anyways - this is old code and has kinda always worked okay.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh I confused about pvCanBeExpanded and pvNeedResize, thinking they are the same function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I am sorry about similar names. I am hoping to delete pvNeedResize when this feature goes beta.

klog.V(4).Infof("No need to resize PV %q", pv.Name)
return nil
}

return ctrl.resizePVC(pvc, pv)
}
}

// pvcNeedResize returns true is a pvc requests a resize operation.
Expand Down Expand Up @@ -476,7 +482,7 @@ func (ctrl *resizeController) resizeVolume(
}
klog.V(4).Infof("Resize volume succeeded for volume %q, start to update PV's capacity", pv.Name)

err = ctrl.updatePVCapacity(pv, pvc.Status.Capacity[v1.ResourceStorage], newSize, fsResizeRequired)
_, err = ctrl.updatePVCapacity(pv, pvc.Status.Capacity[v1.ResourceStorage], newSize, fsResizeRequired)
if err != nil {
return newSize, fsResizeRequired, err
}
Expand All @@ -496,7 +502,7 @@ func (ctrl *resizeController) markPVCAsFSResizeRequired(pvc *v1.PersistentVolume
newPVC.Status.Conditions = util.MergeResizeConditionsOfPVC(newPVC.Status.Conditions,
[]v1.PersistentVolumeClaimCondition{pvcCondition})

_, err := ctrl.patchClaim(pvc, newPVC)
_, err := ctrl.patchClaim(pvc, newPVC, true /* addResourceVersionCheck */)

if err != nil {
return fmt.Errorf("Mark PVC %q as file system resize required failed: %v", util.PVCKey(pvc), err)
Expand All @@ -509,6 +515,7 @@ func (ctrl *resizeController) markPVCAsFSResizeRequired(pvc *v1.PersistentVolume
return nil
}

// legacy markPVCResizeInProgress function, should be removed once RecoverFromVolumeExpansionFailure feature goes GA.
func (ctrl *resizeController) markPVCResizeInProgress(pvc *v1.PersistentVolumeClaim) (*v1.PersistentVolumeClaim, error) {
// Mark PVC as Resize Started
progressCondition := v1.PersistentVolumeClaimCondition{
Expand All @@ -520,11 +527,8 @@ func (ctrl *resizeController) markPVCResizeInProgress(pvc *v1.PersistentVolumeCl
newPVC.Status.Conditions = util.MergeResizeConditionsOfPVC(newPVC.Status.Conditions,
[]v1.PersistentVolumeClaimCondition{progressCondition})

updatedPVC, err := ctrl.patchClaim(pvc, newPVC)
if err != nil {
return nil, err
}
return updatedPVC, nil
updatedPVC, err := ctrl.patchClaim(pvc, newPVC, true /* addResourceVersionCheck */)
return updatedPVC, err
}

func (ctrl *resizeController) markPVCResizeFinished(
Expand All @@ -534,7 +538,7 @@ func (ctrl *resizeController) markPVCResizeFinished(
newPVC.Status.Capacity[v1.ResourceStorage] = newSize
newPVC.Status.Conditions = util.MergeResizeConditionsOfPVC(pvc.Status.Conditions, []v1.PersistentVolumeClaimCondition{})

_, err := ctrl.patchClaim(pvc, newPVC)
_, err := ctrl.patchClaim(pvc, newPVC, true /* addResourceVersionCheck */)
if err != nil {
return fmt.Errorf("Mark PVC %q as resize finished failed: %v", util.PVCKey(pvc), err)
}
Expand All @@ -545,19 +549,22 @@ func (ctrl *resizeController) markPVCResizeFinished(
return nil
}

func (ctrl *resizeController) patchClaim(oldPVC, newPVC *v1.PersistentVolumeClaim) (*v1.PersistentVolumeClaim, error) {
patchBytes, err := util.GetPVCPatchData(oldPVC, newPVC)
// Patches a given PVC with changes from newPVC. If addResourceVersionCheck is true
// then a version check is added to the patch to ensure that we are not patching
// old(and possibly outdated) PVC objects.
func (ctrl *resizeController) patchClaim(oldPVC, newPVC *v1.PersistentVolumeClaim, addResourceVersionCheck bool) (*v1.PersistentVolumeClaim, error) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you add some comments about this option addResourceVersionCheck?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

patchBytes, err := util.GetPVCPatchData(oldPVC, newPVC, addResourceVersionCheck)
if err != nil {
return nil, fmt.Errorf("can't patch status of PVC %s as generate path data failed: %v", util.PVCKey(oldPVC), err)
return oldPVC, fmt.Errorf("can't patch status of PVC %s as generate path data failed: %v", util.PVCKey(oldPVC), err)
}
updatedClaim, updateErr := ctrl.kubeClient.CoreV1().PersistentVolumeClaims(oldPVC.Namespace).
Patch(context.TODO(), oldPVC.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status")
if updateErr != nil {
return nil, fmt.Errorf("can't patch status of PVC %s with %v", util.PVCKey(oldPVC), updateErr)
return oldPVC, fmt.Errorf("can't patch status of PVC %s with %v", util.PVCKey(oldPVC), updateErr)
}
err = ctrl.claims.Update(updatedClaim)
if err != nil {
return nil, fmt.Errorf("error updating PVC %s in local cache: %v", util.PVCKey(newPVC), err)
return oldPVC, fmt.Errorf("error updating PVC %s in local cache: %v", util.PVCKey(newPVC), err)
}

return updatedClaim, nil
Expand All @@ -575,7 +582,11 @@ func (ctrl *resizeController) deletePreResizeCapAnnotation(pv *v1.PersistentVolu
return err
}

func (ctrl *resizeController) updatePVCapacity(pv *v1.PersistentVolume, oldCapacity, newCapacity resource.Quantity, fsResizeRequired bool) error {
func (ctrl *resizeController) updatePVCapacity(
pv *v1.PersistentVolume,
oldCapacity, newCapacity resource.Quantity,
fsResizeRequired bool) (*v1.PersistentVolume, error) {

klog.V(4).Infof("Resize volume succeeded for volume %q, start to update PV's capacity", pv.Name)
newPV := pv.DeepCopy()
newPV.Spec.Capacity[v1.ResourceStorage] = newCapacity
Expand All @@ -590,11 +601,11 @@ func (ctrl *resizeController) updatePVCapacity(pv *v1.PersistentVolume, oldCapac
}
}

_, err := ctrl.patchPersistentVolume(pv, newPV)
updatedPV, err := ctrl.patchPersistentVolume(pv, newPV)
if err != nil {
return fmt.Errorf("updating capacity of PV %q to %s failed: %v", pv.Name, newCapacity.String(), err)
return pv, fmt.Errorf("updating capacity of PV %q to %s failed: %v", pv.Name, newCapacity.String(), err)
}
return nil
return updatedPV, nil
}

func (ctrl *resizeController) patchPersistentVolume(oldPV, newPV *v1.PersistentVolume) (*v1.PersistentVolume, error) {
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ func createPV(capacityGB int, pvcName, pvcNamespace string, pvcUID types.UID, vo
Name: pvcName,
UID: pvcUID,
}
pv.Status.Phase = v1.VolumeBound
}
return pv
}
Expand Down
234 changes: 234 additions & 0 deletions pkg/controller/expand_and_recover.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
"fmt"
"github.com/kubernetes-csi/external-resizer/pkg/util"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
)

func (ctrl *resizeController) expandAndRecover(pvc *v1.PersistentVolumeClaim, pv *v1.PersistentVolume) (*v1.PersistentVolumeClaim, *v1.PersistentVolume, error, bool) {
if !ctrl.pvCanBeExpanded(pv, pvc) {
klog.V(4).Infof("No need to resize PV %q", pv.Name)
return pvc, pv, nil, false
}
// only used as a sentinel value when function returns without
// actually performing expansion on the volume.
resizeNotCalled := false

// if we are here that already means pvc.Spec.Size > pvc.Status.Size
pvcSpecSize := pvc.Spec.Resources.Requests[v1.ResourceStorage]
pvcStatusSize := pvc.Status.Capacity[v1.ResourceStorage]
pvSize := pv.Spec.Capacity[v1.ResourceStorage]

newSize := pvcSpecSize
resizeStatus := v1.PersistentVolumeClaimNoExpansionInProgress
if pvc.Status.ResizeStatus != nil {
resizeStatus = *pvc.Status.ResizeStatus
}

var allocatedSize *resource.Quantity
t, ok := pvc.Status.AllocatedResources[v1.ResourceStorage]
if ok {
allocatedSize = &t
}

if pvSize.Cmp(pvcSpecSize) < 0 {
// PV is smaller than user requested size. In general some control-plane volume expansion
// is necessary at this point but if we were expanding the PVC before we should let
// previous operation finish before starting expansion to new user requested size.
switch resizeStatus {
case v1.PersistentVolumeClaimControllerExpansionInProgress,
v1.PersistentVolumeClaimNodeExpansionFailed:
if allocatedSize != nil {
newSize = *allocatedSize
}
case v1.PersistentVolumeClaimNodeExpansionPending,
v1.PersistentVolumeClaimNodeExpansionInProgress:
if allocatedSize != nil {
newSize = *allocatedSize
}
// If PV is already greater or equal to whatever newSize is, then we should
// let previously issued node expansion finish before starting new one.
// This case is essentially an optimization on previous case, generally it is safe
// to let volume plugin receive the expansion call (expansion calls are idempotent)
// but having this check here *avoids* unnecessary RPC calls to plugin.
if pvSize.Cmp(newSize) >= 0 {
return pvc, pv, nil, resizeNotCalled
}
default:
newSize = pvcSpecSize
}
} else {
// PV has already been expanded and hence we can be here for following reasons:
// 1. If expansion is pending on the node and this was just a spurious update event
// we don't need to do anything and let kubelet handle it.
// 2. It could be that - although we successfully expanded the volume, we failed to
// record our work in API objects, in which case - we should resume resizing operation
// and let API objects be updated.
// 3. Controller successfully expanded the volume, but expansion is failing on the node
// and before kubelet can retry failed node expansion - controller must verify if it is
// safe to do so.
// 4. While expansion was still pending on the node, user reduced the pvc size.
switch resizeStatus {
case v1.PersistentVolumeClaimNodeExpansionInProgress,
v1.PersistentVolumeClaimNodeExpansionPending:
// we don't need to do any work. We could be here because of a spurious update event.
// This is case #1
return pvc, pv, nil, resizeNotCalled
case v1.PersistentVolumeClaimNodeExpansionFailed:
// This is case#3, we need to reset the pvc status in such a way that kubelet can safely retry volume
// expansion.
if ctrl.resizer.DriverSupportsControlPlaneExpansion() && allocatedSize != nil {
newSize = *allocatedSize
} else {
newSize = pvcSpecSize
}
case v1.PersistentVolumeClaimControllerExpansionInProgress,
v1.PersistentVolumeClaimControllerExpansionFailed,
v1.PersistentVolumeClaimNoExpansionInProgress:
// This is case#2 or it could also be case#4 when user manually shrunk the PVC
// after expanding it.
if allocatedSize != nil {
newSize = *allocatedSize
}
default:
// It is impossible for ResizeStatus to be nil and allocatedSize to be not nil but somehow
// if we do end up in this state, it is safest to resume expansion to last recorded size in
// allocatedSize variable.
if pvc.Status.ResizeStatus == nil && allocatedSize != nil {
newSize = *allocatedSize
} else {
newSize = pvcSpecSize
}
}
}
var err error
pvc, err = ctrl.markControllerResizeInProgress(pvc, newSize)
if err != nil {
return pvc, pv, fmt.Errorf("marking pvc %q as resizing failed: %v", util.PVCKey(pvc), err), resizeNotCalled
}

// if pvc previously failed to expand because it can't be expanded when in-use
// we must not try expansion here
if ctrl.usedPVCs.hasInUseErrors(pvc) && ctrl.usedPVCs.checkForUse(pvc) {
// Record an event to indicate that resizer is not expanding the pvc
msg := fmt.Sprintf("Unable to expand %s because CSI driver %s only supports offline expansion and volume is currently in-use", util.PVCKey(pvc), ctrl.resizer.Name())
ctrl.eventRecorder.Event(pvc, v1.EventTypeWarning, util.VolumeResizeFailed, msg)
return pvc, pv, fmt.Errorf(msg), resizeNotCalled
}

// Record an event to indicate that external resizer is resizing this volume.
ctrl.eventRecorder.Event(pvc, v1.EventTypeNormal, util.VolumeResizing,
fmt.Sprintf("External resizer is resizing volume %s", pv.Name))

// before trying expansion we will remove the PVC from map
// that tracks PVCs which can't be expanded when in-use. If
// pvc indeed can not be expanded when in-use then it will be added
// back when expansion fails with in-use error.
ctrl.usedPVCs.removePVCWithInUseError(pvc)
pvc, pv, err = ctrl.callResizeOnPlugin(pvc, pv, newSize, pvcStatusSize)

if err != nil {
// Record an event to indicate that resize operation is failed.
ctrl.eventRecorder.Eventf(pvc, v1.EventTypeWarning, util.VolumeResizeFailed, err.Error())
return pvc, pv, err, true
}

klog.V(4).Infof("Update capacity of PV %q to %s succeeded", pv.Name, newSize.String())
return pvc, pv, nil, true
}

func (ctrl *resizeController) callResizeOnPlugin(
pvc *v1.PersistentVolumeClaim,
pv *v1.PersistentVolume,
newSize, oldSize resource.Quantity) (*v1.PersistentVolumeClaim, *v1.PersistentVolume, error) {
updatedSize, fsResizeRequired, err := ctrl.resizer.Resize(pv, newSize)

if err != nil {
// if this error was a in-use error then it must be tracked so as we don't retry without
// first verifying if volume is in-use
if inUseError(err) {
ctrl.usedPVCs.addPVCWithInUseError(pvc)
}
if isFinalError(err) {
var markExpansionFailedError error
pvc, markExpansionFailedError = ctrl.markControllerExpansionFailed(pvc)
if markExpansionFailedError != nil {
return pvc, pv, fmt.Errorf("resizing failed in controller with %v but failed to update PVC %s with: %v", err, util.PVCKey(pvc), markExpansionFailedError)
}
}
return pvc, pv, fmt.Errorf("resize volume %q by resizer %q failed: %v", pv.Name, ctrl.name, err)
}

klog.V(4).Infof("Resize volume succeeded for volume %q, start to update PV's capacity", pv.Name)

pv, err = ctrl.updatePVCapacity(pv, oldSize, updatedSize, fsResizeRequired)
if err != nil {
return pvc, pv, fmt.Errorf("error updating pv %q by resizer: %v", pv.Name, err)
}

if fsResizeRequired {
pvc, err = ctrl.markForPendingNodeExpansion(pvc)
return pvc, pv, err
}
pvc, err = ctrl.markOverallExpansionAsFinished(pvc, updatedSize)
return pvc, pv, err
}

// checks if pv can be expanded
func (ctrl *resizeController) pvCanBeExpanded(pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) bool {
if !ctrl.resizer.CanSupport(pv, pvc) {
klog.V(4).Infof("Resizer %q doesn't support PV %q", ctrl.name, pv.Name)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe it can be a warning because you are calling resize, but resizer does not support it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it can get called on all sort of PVs. such as intree pv,pvc etc, which are not resizable by external-resizer. I think Info is suitable since most people will be running both intree expand_controller and this controller side-by-side for now.

return false
}

if (pv.Spec.ClaimRef == nil) || (pvc.Namespace != pv.Spec.ClaimRef.Namespace) || (pvc.UID != pv.Spec.ClaimRef.UID) {
klog.V(4).Infof("persistent volume is not bound to PVC being updated: %s", util.PVCKey(pvc))
return false
}
return true
}

func isFinalError(err error) bool {
// Sources:
// https://github.com/grpc/grpc/blob/master/doc/statuscodes.md
// https://github.com/container-storage-interface/spec/blob/master/spec.md
st, ok := status.FromError(err)
if !ok {
// This is not gRPC error. The operation must have failed before gRPC
// method was called, otherwise we would get gRPC error.
// We don't know if any previous volume operation is in progress, be on the safe side.
return false
}
switch st.Code() {
case codes.Canceled, // gRPC: Client Application cancelled the request
codes.DeadlineExceeded, // gRPC: Timeout
codes.Unavailable, // gRPC: Server shutting down, TCP connection broken - previous volume operation may be still in progress.
codes.ResourceExhausted, // gRPC: Server temporarily out of resources - previous volume operation may be still in progress.
codes.Aborted: // CSI: Operation pending for volume
return false
}
// All other errors mean that operation either did not
// even start or failed. It is for sure not in progress.
return true
}
Loading