-
Notifications
You must be signed in to change notification settings - Fork 137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle case of recovery from resize failures #187
Changes from all commits
ff87c34
dea6889
4127d62
9f08c69
8c552c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -343,12 +343,18 @@ func (ctrl *resizeController) syncPVC(key string) error { | |
klog.V(4).Infof("No need to resize PVC %q", util.PVCKey(pvc)) | ||
return nil | ||
} | ||
if !ctrl.pvNeedResize(pvc, pv) { | ||
klog.V(4).Infof("No need to resize PV %q", pv.Name) | ||
return nil | ||
} | ||
|
||
return ctrl.resizePVC(pvc, pv) | ||
if utilfeature.DefaultFeatureGate.Enabled(features.RecoverVolumeExpansionFailure) { | ||
_, _, err, _ := ctrl.expandAndRecover(pvc, pv) | ||
return err | ||
} else { | ||
if !ctrl.pvNeedResize(pvc, pv) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seem pvNeedResize can be called before checking feature is enabled? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is pvNeedResize more like pv cannot be resized due to condition is not met? Does it consider an error? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is basically old flow - which checks if PV is resizable (such as, whether it is bound and bound to PVC which is being resized and whether it was already controller expanded and node expansion is pending). It is not a hard error if PV can not expanded at this moment. But anyways - this is old code and has kinda always worked okay. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh I confused about pvCanBeExpanded and pvNeedResize, thinking they are the same function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah I am sorry about similar names. I am hoping to delete |
||
klog.V(4).Infof("No need to resize PV %q", pv.Name) | ||
return nil | ||
} | ||
|
||
return ctrl.resizePVC(pvc, pv) | ||
} | ||
} | ||
|
||
// pvcNeedResize returns true is a pvc requests a resize operation. | ||
|
@@ -476,7 +482,7 @@ func (ctrl *resizeController) resizeVolume( | |
} | ||
klog.V(4).Infof("Resize volume succeeded for volume %q, start to update PV's capacity", pv.Name) | ||
|
||
err = ctrl.updatePVCapacity(pv, pvc.Status.Capacity[v1.ResourceStorage], newSize, fsResizeRequired) | ||
_, err = ctrl.updatePVCapacity(pv, pvc.Status.Capacity[v1.ResourceStorage], newSize, fsResizeRequired) | ||
if err != nil { | ||
return newSize, fsResizeRequired, err | ||
} | ||
|
@@ -496,7 +502,7 @@ func (ctrl *resizeController) markPVCAsFSResizeRequired(pvc *v1.PersistentVolume | |
newPVC.Status.Conditions = util.MergeResizeConditionsOfPVC(newPVC.Status.Conditions, | ||
[]v1.PersistentVolumeClaimCondition{pvcCondition}) | ||
|
||
_, err := ctrl.patchClaim(pvc, newPVC) | ||
_, err := ctrl.patchClaim(pvc, newPVC, true /* addResourceVersionCheck */) | ||
|
||
if err != nil { | ||
return fmt.Errorf("Mark PVC %q as file system resize required failed: %v", util.PVCKey(pvc), err) | ||
|
@@ -509,6 +515,7 @@ func (ctrl *resizeController) markPVCAsFSResizeRequired(pvc *v1.PersistentVolume | |
return nil | ||
} | ||
|
||
// legacy markPVCResizeInProgress function, should be removed once RecoverFromVolumeExpansionFailure feature goes GA. | ||
func (ctrl *resizeController) markPVCResizeInProgress(pvc *v1.PersistentVolumeClaim) (*v1.PersistentVolumeClaim, error) { | ||
// Mark PVC as Resize Started | ||
progressCondition := v1.PersistentVolumeClaimCondition{ | ||
|
@@ -520,11 +527,8 @@ func (ctrl *resizeController) markPVCResizeInProgress(pvc *v1.PersistentVolumeCl | |
newPVC.Status.Conditions = util.MergeResizeConditionsOfPVC(newPVC.Status.Conditions, | ||
[]v1.PersistentVolumeClaimCondition{progressCondition}) | ||
|
||
updatedPVC, err := ctrl.patchClaim(pvc, newPVC) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return updatedPVC, nil | ||
updatedPVC, err := ctrl.patchClaim(pvc, newPVC, true /* addResourceVersionCheck */) | ||
return updatedPVC, err | ||
} | ||
|
||
func (ctrl *resizeController) markPVCResizeFinished( | ||
|
@@ -534,7 +538,7 @@ func (ctrl *resizeController) markPVCResizeFinished( | |
newPVC.Status.Capacity[v1.ResourceStorage] = newSize | ||
newPVC.Status.Conditions = util.MergeResizeConditionsOfPVC(pvc.Status.Conditions, []v1.PersistentVolumeClaimCondition{}) | ||
|
||
_, err := ctrl.patchClaim(pvc, newPVC) | ||
_, err := ctrl.patchClaim(pvc, newPVC, true /* addResourceVersionCheck */) | ||
if err != nil { | ||
return fmt.Errorf("Mark PVC %q as resize finished failed: %v", util.PVCKey(pvc), err) | ||
} | ||
|
@@ -545,19 +549,22 @@ func (ctrl *resizeController) markPVCResizeFinished( | |
return nil | ||
} | ||
|
||
func (ctrl *resizeController) patchClaim(oldPVC, newPVC *v1.PersistentVolumeClaim) (*v1.PersistentVolumeClaim, error) { | ||
patchBytes, err := util.GetPVCPatchData(oldPVC, newPVC) | ||
// Patches a given PVC with changes from newPVC. If addResourceVersionCheck is true | ||
// then a version check is added to the patch to ensure that we are not patching | ||
// old(and possibly outdated) PVC objects. | ||
func (ctrl *resizeController) patchClaim(oldPVC, newPVC *v1.PersistentVolumeClaim, addResourceVersionCheck bool) (*v1.PersistentVolumeClaim, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could you add some comments about this option addResourceVersionCheck? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
patchBytes, err := util.GetPVCPatchData(oldPVC, newPVC, addResourceVersionCheck) | ||
if err != nil { | ||
return nil, fmt.Errorf("can't patch status of PVC %s as generate path data failed: %v", util.PVCKey(oldPVC), err) | ||
return oldPVC, fmt.Errorf("can't patch status of PVC %s as generate path data failed: %v", util.PVCKey(oldPVC), err) | ||
} | ||
updatedClaim, updateErr := ctrl.kubeClient.CoreV1().PersistentVolumeClaims(oldPVC.Namespace). | ||
Patch(context.TODO(), oldPVC.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") | ||
if updateErr != nil { | ||
return nil, fmt.Errorf("can't patch status of PVC %s with %v", util.PVCKey(oldPVC), updateErr) | ||
return oldPVC, fmt.Errorf("can't patch status of PVC %s with %v", util.PVCKey(oldPVC), updateErr) | ||
} | ||
err = ctrl.claims.Update(updatedClaim) | ||
if err != nil { | ||
return nil, fmt.Errorf("error updating PVC %s in local cache: %v", util.PVCKey(newPVC), err) | ||
return oldPVC, fmt.Errorf("error updating PVC %s in local cache: %v", util.PVCKey(newPVC), err) | ||
} | ||
|
||
return updatedClaim, nil | ||
|
@@ -575,7 +582,11 @@ func (ctrl *resizeController) deletePreResizeCapAnnotation(pv *v1.PersistentVolu | |
return err | ||
} | ||
|
||
func (ctrl *resizeController) updatePVCapacity(pv *v1.PersistentVolume, oldCapacity, newCapacity resource.Quantity, fsResizeRequired bool) error { | ||
func (ctrl *resizeController) updatePVCapacity( | ||
pv *v1.PersistentVolume, | ||
oldCapacity, newCapacity resource.Quantity, | ||
fsResizeRequired bool) (*v1.PersistentVolume, error) { | ||
|
||
klog.V(4).Infof("Resize volume succeeded for volume %q, start to update PV's capacity", pv.Name) | ||
newPV := pv.DeepCopy() | ||
newPV.Spec.Capacity[v1.ResourceStorage] = newCapacity | ||
|
@@ -590,11 +601,11 @@ func (ctrl *resizeController) updatePVCapacity(pv *v1.PersistentVolume, oldCapac | |
} | ||
} | ||
|
||
_, err := ctrl.patchPersistentVolume(pv, newPV) | ||
updatedPV, err := ctrl.patchPersistentVolume(pv, newPV) | ||
if err != nil { | ||
return fmt.Errorf("updating capacity of PV %q to %s failed: %v", pv.Name, newCapacity.String(), err) | ||
return pv, fmt.Errorf("updating capacity of PV %q to %s failed: %v", pv.Name, newCapacity.String(), err) | ||
} | ||
return nil | ||
return updatedPV, nil | ||
} | ||
|
||
func (ctrl *resizeController) patchPersistentVolume(oldPV, newPV *v1.PersistentVolume) (*v1.PersistentVolume, error) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,234 @@ | ||
/* | ||
Copyright 2022 The Kubernetes Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package controller | ||
|
||
import ( | ||
"fmt" | ||
"github.com/kubernetes-csi/external-resizer/pkg/util" | ||
"google.golang.org/grpc/codes" | ||
"google.golang.org/grpc/status" | ||
v1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/api/resource" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
func (ctrl *resizeController) expandAndRecover(pvc *v1.PersistentVolumeClaim, pv *v1.PersistentVolume) (*v1.PersistentVolumeClaim, *v1.PersistentVolume, error, bool) { | ||
if !ctrl.pvCanBeExpanded(pv, pvc) { | ||
klog.V(4).Infof("No need to resize PV %q", pv.Name) | ||
return pvc, pv, nil, false | ||
} | ||
// only used as a sentinel value when function returns without | ||
// actually performing expansion on the volume. | ||
resizeNotCalled := false | ||
gnufied marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// if we are here that already means pvc.Spec.Size > pvc.Status.Size | ||
pvcSpecSize := pvc.Spec.Resources.Requests[v1.ResourceStorage] | ||
pvcStatusSize := pvc.Status.Capacity[v1.ResourceStorage] | ||
pvSize := pv.Spec.Capacity[v1.ResourceStorage] | ||
|
||
newSize := pvcSpecSize | ||
resizeStatus := v1.PersistentVolumeClaimNoExpansionInProgress | ||
if pvc.Status.ResizeStatus != nil { | ||
resizeStatus = *pvc.Status.ResizeStatus | ||
} | ||
|
||
var allocatedSize *resource.Quantity | ||
t, ok := pvc.Status.AllocatedResources[v1.ResourceStorage] | ||
if ok { | ||
allocatedSize = &t | ||
} | ||
|
||
if pvSize.Cmp(pvcSpecSize) < 0 { | ||
// PV is smaller than user requested size. In general some control-plane volume expansion | ||
// is necessary at this point but if we were expanding the PVC before we should let | ||
// previous operation finish before starting expansion to new user requested size. | ||
switch resizeStatus { | ||
case v1.PersistentVolumeClaimControllerExpansionInProgress, | ||
v1.PersistentVolumeClaimNodeExpansionFailed: | ||
if allocatedSize != nil { | ||
newSize = *allocatedSize | ||
} | ||
case v1.PersistentVolumeClaimNodeExpansionPending, | ||
v1.PersistentVolumeClaimNodeExpansionInProgress: | ||
if allocatedSize != nil { | ||
newSize = *allocatedSize | ||
} | ||
// If PV is already greater or equal to whatever newSize is, then we should | ||
// let previously issued node expansion finish before starting new one. | ||
// This case is essentially an optimization on previous case, generally it is safe | ||
// to let volume plugin receive the expansion call (expansion calls are idempotent) | ||
// but having this check here *avoids* unnecessary RPC calls to plugin. | ||
if pvSize.Cmp(newSize) >= 0 { | ||
return pvc, pv, nil, resizeNotCalled | ||
} | ||
default: | ||
newSize = pvcSpecSize | ||
} | ||
} else { | ||
// PV has already been expanded and hence we can be here for following reasons: | ||
// 1. If expansion is pending on the node and this was just a spurious update event | ||
// we don't need to do anything and let kubelet handle it. | ||
// 2. It could be that - although we successfully expanded the volume, we failed to | ||
// record our work in API objects, in which case - we should resume resizing operation | ||
// and let API objects be updated. | ||
// 3. Controller successfully expanded the volume, but expansion is failing on the node | ||
// and before kubelet can retry failed node expansion - controller must verify if it is | ||
// safe to do so. | ||
// 4. While expansion was still pending on the node, user reduced the pvc size. | ||
switch resizeStatus { | ||
case v1.PersistentVolumeClaimNodeExpansionInProgress, | ||
v1.PersistentVolumeClaimNodeExpansionPending: | ||
// we don't need to do any work. We could be here because of a spurious update event. | ||
// This is case #1 | ||
return pvc, pv, nil, resizeNotCalled | ||
case v1.PersistentVolumeClaimNodeExpansionFailed: | ||
// This is case#3, we need to reset the pvc status in such a way that kubelet can safely retry volume | ||
// expansion. | ||
if ctrl.resizer.DriverSupportsControlPlaneExpansion() && allocatedSize != nil { | ||
newSize = *allocatedSize | ||
} else { | ||
newSize = pvcSpecSize | ||
} | ||
case v1.PersistentVolumeClaimControllerExpansionInProgress, | ||
v1.PersistentVolumeClaimControllerExpansionFailed, | ||
v1.PersistentVolumeClaimNoExpansionInProgress: | ||
// This is case#2 or it could also be case#4 when user manually shrunk the PVC | ||
// after expanding it. | ||
if allocatedSize != nil { | ||
newSize = *allocatedSize | ||
} | ||
default: | ||
// It is impossible for ResizeStatus to be nil and allocatedSize to be not nil but somehow | ||
// if we do end up in this state, it is safest to resume expansion to last recorded size in | ||
// allocatedSize variable. | ||
if pvc.Status.ResizeStatus == nil && allocatedSize != nil { | ||
newSize = *allocatedSize | ||
} else { | ||
newSize = pvcSpecSize | ||
} | ||
} | ||
} | ||
var err error | ||
pvc, err = ctrl.markControllerResizeInProgress(pvc, newSize) | ||
if err != nil { | ||
return pvc, pv, fmt.Errorf("marking pvc %q as resizing failed: %v", util.PVCKey(pvc), err), resizeNotCalled | ||
} | ||
|
||
// if pvc previously failed to expand because it can't be expanded when in-use | ||
// we must not try expansion here | ||
if ctrl.usedPVCs.hasInUseErrors(pvc) && ctrl.usedPVCs.checkForUse(pvc) { | ||
// Record an event to indicate that resizer is not expanding the pvc | ||
msg := fmt.Sprintf("Unable to expand %s because CSI driver %s only supports offline expansion and volume is currently in-use", util.PVCKey(pvc), ctrl.resizer.Name()) | ||
ctrl.eventRecorder.Event(pvc, v1.EventTypeWarning, util.VolumeResizeFailed, msg) | ||
return pvc, pv, fmt.Errorf(msg), resizeNotCalled | ||
} | ||
|
||
// Record an event to indicate that external resizer is resizing this volume. | ||
ctrl.eventRecorder.Event(pvc, v1.EventTypeNormal, util.VolumeResizing, | ||
fmt.Sprintf("External resizer is resizing volume %s", pv.Name)) | ||
|
||
// before trying expansion we will remove the PVC from map | ||
// that tracks PVCs which can't be expanded when in-use. If | ||
// pvc indeed can not be expanded when in-use then it will be added | ||
// back when expansion fails with in-use error. | ||
ctrl.usedPVCs.removePVCWithInUseError(pvc) | ||
pvc, pv, err = ctrl.callResizeOnPlugin(pvc, pv, newSize, pvcStatusSize) | ||
|
||
if err != nil { | ||
// Record an event to indicate that resize operation is failed. | ||
ctrl.eventRecorder.Eventf(pvc, v1.EventTypeWarning, util.VolumeResizeFailed, err.Error()) | ||
return pvc, pv, err, true | ||
} | ||
|
||
klog.V(4).Infof("Update capacity of PV %q to %s succeeded", pv.Name, newSize.String()) | ||
return pvc, pv, nil, true | ||
} | ||
|
||
func (ctrl *resizeController) callResizeOnPlugin( | ||
pvc *v1.PersistentVolumeClaim, | ||
pv *v1.PersistentVolume, | ||
newSize, oldSize resource.Quantity) (*v1.PersistentVolumeClaim, *v1.PersistentVolume, error) { | ||
updatedSize, fsResizeRequired, err := ctrl.resizer.Resize(pv, newSize) | ||
|
||
if err != nil { | ||
// if this error was a in-use error then it must be tracked so as we don't retry without | ||
// first verifying if volume is in-use | ||
if inUseError(err) { | ||
ctrl.usedPVCs.addPVCWithInUseError(pvc) | ||
} | ||
if isFinalError(err) { | ||
var markExpansionFailedError error | ||
pvc, markExpansionFailedError = ctrl.markControllerExpansionFailed(pvc) | ||
if markExpansionFailedError != nil { | ||
return pvc, pv, fmt.Errorf("resizing failed in controller with %v but failed to update PVC %s with: %v", err, util.PVCKey(pvc), markExpansionFailedError) | ||
} | ||
} | ||
return pvc, pv, fmt.Errorf("resize volume %q by resizer %q failed: %v", pv.Name, ctrl.name, err) | ||
} | ||
|
||
klog.V(4).Infof("Resize volume succeeded for volume %q, start to update PV's capacity", pv.Name) | ||
|
||
pv, err = ctrl.updatePVCapacity(pv, oldSize, updatedSize, fsResizeRequired) | ||
if err != nil { | ||
return pvc, pv, fmt.Errorf("error updating pv %q by resizer: %v", pv.Name, err) | ||
} | ||
|
||
if fsResizeRequired { | ||
pvc, err = ctrl.markForPendingNodeExpansion(pvc) | ||
return pvc, pv, err | ||
} | ||
pvc, err = ctrl.markOverallExpansionAsFinished(pvc, updatedSize) | ||
return pvc, pv, err | ||
} | ||
|
||
// checks if pv can be expanded | ||
func (ctrl *resizeController) pvCanBeExpanded(pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) bool { | ||
jsafrane marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if !ctrl.resizer.CanSupport(pv, pvc) { | ||
klog.V(4).Infof("Resizer %q doesn't support PV %q", ctrl.name, pv.Name) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe it can be a warning because you are calling resize, but resizer does not support it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it can get called on all sort of PVs. such as intree pv,pvc etc, which are not resizable by external-resizer. I think Info is suitable since most people will be running both intree expand_controller and this controller side-by-side for now. |
||
return false | ||
} | ||
|
||
if (pv.Spec.ClaimRef == nil) || (pvc.Namespace != pv.Spec.ClaimRef.Namespace) || (pvc.UID != pv.Spec.ClaimRef.UID) { | ||
klog.V(4).Infof("persistent volume is not bound to PVC being updated: %s", util.PVCKey(pvc)) | ||
return false | ||
} | ||
return true | ||
} | ||
|
||
func isFinalError(err error) bool { | ||
// Sources: | ||
// https://github.com/grpc/grpc/blob/master/doc/statuscodes.md | ||
// https://github.com/container-storage-interface/spec/blob/master/spec.md | ||
st, ok := status.FromError(err) | ||
if !ok { | ||
// This is not gRPC error. The operation must have failed before gRPC | ||
// method was called, otherwise we would get gRPC error. | ||
// We don't know if any previous volume operation is in progress, be on the safe side. | ||
return false | ||
} | ||
switch st.Code() { | ||
case codes.Canceled, // gRPC: Client Application cancelled the request | ||
codes.DeadlineExceeded, // gRPC: Timeout | ||
codes.Unavailable, // gRPC: Server shutting down, TCP connection broken - previous volume operation may be still in progress. | ||
codes.ResourceExhausted, // gRPC: Server temporarily out of resources - previous volume operation may be still in progress. | ||
codes.Aborted: // CSI: Operation pending for volume | ||
return false | ||
} | ||
// All other errors mean that operation either did not | ||
// even start or failed. It is for sure not in progress. | ||
return true | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why return pv, pvc, but without using use objects?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for now, mainly for tests.