diff --git a/apis/apps/v1alpha1/cloneset_types.go b/apis/apps/v1alpha1/cloneset_types.go
index 4d95dd224e..8f93bc0caa 100644
--- a/apis/apps/v1alpha1/cloneset_types.go
+++ b/apis/apps/v1alpha1/cloneset_types.go
@@ -93,6 +93,9 @@ type CloneSetScaleStrategy struct {
 	// The scale will fail if the number of unavailable pods were greater than this MaxUnavailable at scaling up.
 	// MaxUnavailable works only when scaling up.
 	MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
+
+	// DisablePVCReuse indicates whether to disable PVC reuse when the CloneSet recreates a Pod.
+	DisablePVCReuse bool `json:"disablePVCReuse,omitempty"`
 }
 
 // CloneSetUpdateStrategy defines strategies for pods update.
diff --git a/config/crd/bases/apps.kruise.io_clonesets.yaml b/config/crd/bases/apps.kruise.io_clonesets.yaml
index da2fb66cdc..1497af75db 100644
--- a/config/crd/bases/apps.kruise.io_clonesets.yaml
+++ b/config/crd/bases/apps.kruise.io_clonesets.yaml
@@ -149,6 +149,9 @@ spec:
               description: ScaleStrategy indicates the ScaleStrategy that will be
                 employed to create and delete Pods in the CloneSet.
               properties:
+                disablePVCReuse:
+                  description: DisablePVCReuse indicates whether to disable PVC reuse when the CloneSet recreates a Pod.
+                  type: boolean
                 maxUnavailable:
                   anyOf:
                   - type: integer
diff --git a/config/crd/bases/apps.kruise.io_uniteddeployments.yaml b/config/crd/bases/apps.kruise.io_uniteddeployments.yaml
index a274937b18..911fff2d3f 100644
--- a/config/crd/bases/apps.kruise.io_uniteddeployments.yaml
+++ b/config/crd/bases/apps.kruise.io_uniteddeployments.yaml
@@ -617,6 +617,10 @@ spec:
                           that will be employed to create and delete Pods in the
                           CloneSet.
                         properties:
+                          disablePVCReuse:
+                            description: DisablePVCReuse indicates whether to disable
+                              PVC reuse when the CloneSet recreates a Pod.
+                            type: boolean
                           maxUnavailable:
                             anyOf:
                             - type: integer
diff --git a/pkg/controller/cloneset/sync/cloneset_scale.go b/pkg/controller/cloneset/sync/cloneset_scale.go
index 69ca12c685..14e55db753 100644
--- a/pkg/controller/cloneset/sync/cloneset_scale.go
+++ b/pkg/controller/cloneset/sync/cloneset_scale.go
@@ -29,11 +29,16 @@ import (
 	clonesetutils "github.com/openkruise/kruise/pkg/controller/cloneset/utils"
 	"github.com/openkruise/kruise/pkg/util"
 	"github.com/openkruise/kruise/pkg/util/expectations"
+	"github.com/openkruise/kruise/pkg/util/fieldindex"
 	"github.com/openkruise/kruise/pkg/util/lifecycle"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/rand"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/klog/v2"
+	kubecontroller "k8s.io/kubernetes/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 const (
@@ -60,6 +65,21 @@ func (r *realControl) Scale(
 		return false, nil
 	}
 
+	// If PVC reuse is disabled, clean up the PVCs that are no longer
+	// used by any existing Pod first, so that their Pods appear to
+	// have been deleted and replacement Pods will get fresh PVCs.
+	if updateCS.Spec.ScaleStrategy.DisablePVCReuse {
+		ins := getInstanceIDsFromPods(pods)
+		usingPVCs, uselessPVCs := classifyPVCs(ins, pvcs)
+		if len(uselessPVCs) > 0 {
+			klog.V(3).Infof("Begin to clean up useless PVCs for CloneSet %s", controllerKey)
+			if modified, err := r.cleanupPVCs(updateCS, uselessPVCs); err != nil || modified {
+				return modified, err
+			}
+			pvcs = usingPVCs
+		}
+	}
+
 	// 1. manage pods to delete and in preDelete
 	podsSpecifiedToDelete, podsInPreDelete, numToDelete := getPlannedDeletedPods(updateCS, pods)
 	if modified, err := r.managePreparingDelete(updateCS, pods, podsInPreDelete, numToDelete); err != nil || modified {
@@ -403,3 +423,78 @@ func (r *realControl) choosePodsToDelete(cs *appsv1alpha1.CloneSet, totalDiff in
 
 	return podsToDelete
 }
+
+// cleanupPVCs deletes the PVCs that are no longer used by any Pod of the CloneSet.
+// It returns true if at least one PVC deletion has been triggered.
+func (r *realControl) cleanupPVCs(cs *appsv1alpha1.CloneSet, pvcs []*v1.PersistentVolumeClaim) (bool, error) {
+	var modified bool
+	for _, pvc := range pvcs {
+		// Skip the PVC if it has owner references other than the CloneSet.
+		// To avoid deleting a PVC by mistake because of a transient Pod status change,
+		// re-query the latest Pod status before deleting it.
+		if len(pvc.OwnerReferences) > 1 || !isOwnerPodInactive(r.Client, cs, pvc) {
+			klog.V(3).Infof("Skip deleting PVC %s/%s that is still in use", pvc.Namespace, pvc.Name)
+			continue
+		}
+		clonesetutils.ScaleExpectations.ExpectScale(clonesetutils.GetControllerKey(cs), expectations.Delete, pvc.Name)
+		if err := r.Delete(context.TODO(), pvc); err != nil {
+			clonesetutils.ScaleExpectations.ObserveScale(clonesetutils.GetControllerKey(cs), expectations.Delete, pvc.Name)
+			r.recorder.Eventf(cs, v1.EventTypeWarning, "FailedCleanUp", "failed to clean up PVC %s: %v", pvc.Name, err)
+			return modified, err
+		}
+		modified = true
+	}
+	return modified, nil
+}
+
+// getInstanceIDsFromPods returns the set of instance IDs of the given Pods.
+func getInstanceIDsFromPods(pods []*v1.Pod) sets.String {
+	ins := sets.NewString()
+	for _, pod := range pods {
+		ins.Insert(pod.Labels[appsv1alpha1.CloneSetInstanceID])
+	}
+	return ins
+}
+
+// classifyPVCs splits the PVCs into those whose instance ID matches an existing Pod
+// and those that are no longer referenced by any Pod.
+func classifyPVCs(ids sets.String, pvcs []*v1.PersistentVolumeClaim) (using, useless []*v1.PersistentVolumeClaim) {
+	usingMap := map[types.UID]*v1.PersistentVolumeClaim{}
+	uselessMap := map[types.UID]*v1.PersistentVolumeClaim{}
+	for _, pvc := range pvcs {
+		if ids.Has(pvc.Labels[appsv1alpha1.CloneSetInstanceID]) {
+			usingMap[pvc.UID] = pvc
+		} else {
+			uselessMap[pvc.UID] = pvc
+		}
+	}
+
+	for _, p := range usingMap {
+		using = append(using, p)
+	}
+	for _, p := range uselessMap {
+		useless = append(useless, p)
+	}
+	return using, useless
+}
+
+// isOwnerPodInactive returns true if no active Pod of the CloneSet still refers to the PVC,
+// i.e. the Pod the PVC belonged to is inactive or has already been deleted.
+func isOwnerPodInactive(reader client.Reader, cs *appsv1alpha1.CloneSet, pvc *v1.PersistentVolumeClaim) bool {
+	opts := &client.ListOptions{
+		Namespace:     cs.Namespace,
+		FieldSelector: fields.SelectorFromSet(fields.Set{fieldindex.IndexNameForOwnerRefUID: string(cs.UID)}),
+	}
+	podList, err := clonesetutils.GetAllPods(reader, opts)
+	if err != nil {
+		klog.Errorf("Failed to list Pods owned by CloneSet %s: %v", clonesetutils.GetControllerKey(cs), err)
+		return false
+	}
+
+	for _, pod := range podList {
+		if clonesetutils.IsPVCAndPodRelated(pvc, pod) && kubecontroller.IsPodActive(pod) {
+			return false
+		}
+	}
+	return true
+}
diff --git a/pkg/controller/cloneset/utils/cloneset_utils.go b/pkg/controller/cloneset/utils/cloneset_utils.go
index 9b0cc0a332..8910755c9b 100644
--- a/pkg/controller/cloneset/utils/cloneset_utils.go
+++ b/pkg/controller/cloneset/utils/cloneset_utils.go
@@ -93,17 +93,17 @@ func GetControllerKey(cs *appsv1alpha1.CloneSet) string {
 
 // GetActivePods returns all active pods in this namespace.
 func GetActivePods(reader client.Reader, opts *client.ListOptions) ([]*v1.Pod, error) {
-	podList := &v1.PodList{}
-	if err := reader.List(context.TODO(), podList, opts, utilclient.DisableDeepCopy); err != nil {
+	podList, err := GetAllPods(reader, opts)
+	if err != nil {
 		return nil, err
 	}
 
 	// Ignore inactive pods
 	var activePods []*v1.Pod
-	for i, pod := range podList.Items {
+	for i, pod := range podList {
 		// Consider all rebuild pod as active pod, should not recreate
-		if kubecontroller.IsPodActive(&pod) {
-			activePods = append(activePods, &podList.Items[i])
+		if kubecontroller.IsPodActive(pod) {
+			activePods = append(activePods, podList[i])
 		}
 	}
 	return activePods, nil
@@ -233,3 +233,27 @@ func DoItSlowly(count int, initialBatchSize int, fn func() error) (int, error) {
 	}
 	return successes, nil
 }
+
+// GetAllPods returns all pods in this namespace.
+func GetAllPods(reader client.Reader, opts *client.ListOptions) ([]*v1.Pod, error) {
+	podList := &v1.PodList{}
+	if err := reader.List(context.TODO(), podList, opts, utilclient.DisableDeepCopy); err != nil {
+		return nil, err
+	}
+
+	var pods []*v1.Pod
+	for i := range podList.Items {
+		pods = append(pods, &podList.Items[i])
+	}
+	return pods, nil
+}
+
+// IsPVCAndPodRelated returns true if the PVC and the Pod share the same non-empty CloneSet instance ID.
+func IsPVCAndPodRelated(pvc *v1.PersistentVolumeClaim, pod *v1.Pod) bool {
+	pvcIns := pvc.Labels[appsv1alpha1.CloneSetInstanceID]
+	podIns := pod.Labels[appsv1alpha1.CloneSetInstanceID]
+	if pvcIns == "" || podIns == "" {
+		return false
+	}
+	return pvcIns == podIns
+}
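
For reference, a minimal CloneSet manifest that exercises the new field might look like the sketch below. The name, labels, image, and volume claim template are illustrative only and are not part of this change; the only piece introduced here is scaleStrategy.disablePVCReuse.

apiVersion: apps.kruise.io/v1alpha1
kind: CloneSet
metadata:
  name: sample-data
spec:
  replicas: 3
  selector:
    matchLabels:
      app: sample-data
  template:
    metadata:
      labels:
        app: sample-data
    spec:
      containers:
      - name: main
        image: nginx:alpine
        volumeMounts:
        - name: data
          mountPath: /data
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes: [ "ReadWriteOnce" ]
      resources:
        requests:
          storage: 1Gi
  scaleStrategy:
    # New field from this change; defaults to false (keep reusing PVCs).
    disablePVCReuse: true

With disablePVCReuse set to true, PVCs left behind by deleted Pods are cleaned up during scaling instead of being re-attached to replacement Pods; leaving the field unset preserves the existing reuse behavior.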