Set READONLY flag in CSI PV based on PVC accessmode #469
Changes from all commits
@@ -417,7 +417,7 @@ func TestCreateDriverReturnsInvalidCapacityDuringProvision(t *testing.T) {
	pluginCaps, controllerCaps := provisionCapabilities()
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test",
		5, csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, nil, nil, false, defaultfsType, nil)
		5, csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, nil, nil, false, defaultfsType, nil, true)
We should investigate this separately, but I think the number of function arguments is getting to the point that we should probably have an options struct.

Completely agree; the argument list has grown a lot, and even the test case organization is difficult in its current state. I will revisit this part in a follow-up PR. Thanks.
	// Requested PVC with requestedBytes storage
	deletePolicy := v1.PersistentVolumeReclaimDelete
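For reference, a minimal sketch of what such an options struct could look like. The struct and field names here are illustrative only; they are not part of this PR and do not match the real NewCSIProvisioner signature.

```go
package main

import (
	"fmt"
	"time"
)

// ProvisionerOptions is a hypothetical bundle of the configuration currently
// passed to NewCSIProvisioner as positional arguments. Field names are
// illustrative, not the real parameter names.
type ProvisionerOptions struct {
	Timeout                   time.Duration
	ProvisionerName           string
	DefaultFSType             string
	ExtraCreateMetadata       bool
	ControllerPublishReadOnly bool
}

// newProvisionerFromOptions stands in for a constructor that would accept the
// struct instead of 20+ positional arguments.
func newProvisionerFromOptions(opts ProvisionerOptions) string {
	return fmt.Sprintf("%s timeout=%s fsType=%s readOnly=%v",
		opts.ProvisionerName, opts.Timeout, opts.DefaultFSType, opts.ControllerPublishReadOnly)
}

func main() {
	fmt.Println(newProvisionerFromOptions(ProvisionerOptions{
		Timeout:                   5 * time.Second,
		ProvisionerName:           "test-provisioner",
		DefaultFSType:             "ext4",
		ControllerPublishReadOnly: true,
	}))
}
```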
@@ -835,6 +835,7 @@ type provisioningTestcase struct {
	expectSelectedNode            string // a specific selected-node of the PVC in the apiserver after the test, same as before if empty
	expectNoProvision             bool   // if true, then ShouldProvision should return false
	supportsSingleNodeMultiWriter bool   // if true, then provision with single node multi writer capabilities
	controllerPublishReadOnly     bool
}

type provisioningFSTypeTestcase struct {

@@ -1216,7 +1217,7 @@ func provisionTestcases() (int64, map[string]provisioningTestcase) {
			},
			expectState: controller.ProvisioningFinished,
		},
		"provision with access mode multi node multi readonly": {
		"provision with access mode multi node multi readonly with sidecar arg false": {
			volOpts: controller.ProvisionOptions{
				StorageClass: &storagev1.StorageClass{
					ReclaimPolicy: &deletePolicy,

@@ -1239,6 +1240,7 @@ func provisionTestcases() (int64, map[string]provisioningTestcase) {
					},
				},
			},
			controllerPublishReadOnly: false,
			expectedPVSpec: &pvSpec{
				Name:          "test-testi",
				ReclaimPolicy: v1.PersistentVolumeReclaimDelete,
@@ -1250,6 +1252,61 @@ func provisionTestcases() (int64, map[string]provisioningTestcase) {
					Driver:       "test-driver",
					VolumeHandle: "test-volume-id",
					FSType:       "ext4",
					ReadOnly:     false,
					VolumeAttributes: map[string]string{
						"storage.kubernetes.io/csiProvisionerIdentity": "test-provisioner",
					},
				},
			},
			expectCreateVolDo: func(t *testing.T, ctx context.Context, req *csi.CreateVolumeRequest) {
				if len(req.GetVolumeCapabilities()) != 1 {
					t.Errorf("Incorrect length in volume capabilities")
				}
				if req.GetVolumeCapabilities()[0].GetAccessMode() == nil {
					t.Errorf("Expected access mode to be set")
				}
				if req.GetVolumeCapabilities()[0].GetAccessMode().GetMode() != csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY {
					t.Errorf("Expected multi_node_reader_only")
				}
			},
			expectState: controller.ProvisioningFinished,
		},
		"provision with access mode multi node multi readonly with sidecar arg true": {
			volOpts: controller.ProvisionOptions{
				StorageClass: &storagev1.StorageClass{
					ReclaimPolicy: &deletePolicy,
					Parameters:    map[string]string{},
				},
				PVName: "test-name",
				PVC: &v1.PersistentVolumeClaim{
					ObjectMeta: metav1.ObjectMeta{
						UID:         "testid",
						Annotations: driverNameAnnotation,
					},
					Spec: v1.PersistentVolumeClaimSpec{
						Selector: nil,
						Resources: v1.ResourceRequirements{
							Requests: v1.ResourceList{
								v1.ResourceName(v1.ResourceStorage): resource.MustParse(strconv.FormatInt(requestedBytes, 10)),
							},
						},
						AccessModes: []v1.PersistentVolumeAccessMode{v1.ReadOnlyMany},
					},
				},
			},
			controllerPublishReadOnly: true,
			expectedPVSpec: &pvSpec{
				Name:          "test-testi",
				ReclaimPolicy: v1.PersistentVolumeReclaimDelete,
				AccessModes:   []v1.PersistentVolumeAccessMode{v1.ReadOnlyMany},
				Capacity: v1.ResourceList{
					v1.ResourceName(v1.ResourceStorage): bytesToQuantity(requestedBytes),
				},
				CSIPVS: &v1.CSIPersistentVolumeSource{
					Driver:       "test-driver",
					VolumeHandle: "test-volume-id",
					FSType:       "ext4",
					ReadOnly:     true,
					VolumeAttributes: map[string]string{
						"storage.kubernetes.io/csiProvisionerIdentity": "test-provisioner",
					},
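Taken together, these two test cases pin down the intended behaviour: the PV's CSI source is marked ReadOnly only when the claim requests a read-only access mode and the new sidecar flag is enabled. Below is a minimal sketch of that rule, using the CSI capability types that appear in the diff; it is not the actual provisioner code from this PR.

```go
package main

import (
	"fmt"

	"github.com/container-storage-interface/spec/lib/go/csi"
)

// pvReadOnlyFromCapabilities sketches the behaviour the tests above assert:
// the PV is marked ReadOnly only if the requested access mode is read-only
// AND the controller-publish-readonly sidecar flag is enabled.
func pvReadOnlyFromCapabilities(caps []*csi.VolumeCapability, controllerPublishReadOnly bool) bool {
	if !controllerPublishReadOnly {
		return false
	}
	for _, c := range caps {
		if c.GetAccessMode().GetMode() == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY {
			return true
		}
	}
	return false
}

func main() {
	roxCap := &csi.VolumeCapability{
		AccessMode: &csi.VolumeCapability_AccessMode{
			Mode: csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY,
		},
	}
	fmt.Println(pvReadOnlyFromCapabilities([]*csi.VolumeCapability{roxCap}, false)) // false: sidecar arg disabled
	fmt.Println(pvReadOnlyFromCapabilities([]*csi.VolumeCapability{roxCap}, true))  // true: ROX plus sidecar arg enabled
}
```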
@@ -2244,7 +2301,7 @@ func runFSTypeProvisionTest(t *testing.T, k string, tc provisioningFSTypeTestcas
		myDefaultfsType = ""
	}
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5, csiConn.conn,
		nil, provisionDriverName, pluginCaps, controllerCaps, supportsMigrationFromInTreePluginName, false, true, csitrans.New(), nil, nil, nil, nil, nil, false, myDefaultfsType, nil)
		nil, provisionDriverName, pluginCaps, controllerCaps, supportsMigrationFromInTreePluginName, false, true, csitrans.New(), nil, nil, nil, nil, nil, false, myDefaultfsType, nil, false)
	out := &csi.CreateVolumeResponse{
		Volume: &csi.Volume{
			CapacityBytes: requestedBytes,

@@ -2422,9 +2479,9 @@ func runProvisionTest(t *testing.T, tc provisioningTestcase, requestedBytes int6
	} else {
		pluginCaps, controllerCaps = provisionCapabilities()
	}

	mycontrollerPublishReadOnly := tc.controllerPublishReadOnly
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5, csiConn.conn,
		nil, provisionDriverName, pluginCaps, controllerCaps, supportsMigrationFromInTreePluginName, false, true, csitrans.New(), scInformer.Lister(), csiNodeInformer.Lister(), nodeInformer.Lister(), nil, nil, tc.withExtraMetadata, defaultfsType, nodeDeployment)
		nil, provisionDriverName, pluginCaps, controllerCaps, supportsMigrationFromInTreePluginName, false, true, csitrans.New(), scInformer.Lister(), csiNodeInformer.Lister(), nodeInformer.Lister(), nil, nil, tc.withExtraMetadata, defaultfsType, nodeDeployment, mycontrollerPublishReadOnly)

	// Adding objects to the informer ensures that they are consistent with
	// the fake storage without having to start the informers.
@@ -3167,7 +3224,7 @@ func TestProvisionFromSnapshot(t *testing.T) {
	pluginCaps, controllerCaps := provisionFromSnapshotCapabilities()
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5, csiConn.conn,
		client, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, nil, nil, false, defaultfsType, nil)
		client, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, nil, nil, false, defaultfsType, nil, true)

	out := &csi.CreateVolumeResponse{
		Volume: &csi.Volume{

@@ -3347,7 +3404,7 @@ func TestProvisionWithTopologyEnabled(t *testing.T) {
	defer close(stopChan)

	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5,
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), scLister, csiNodeLister, nodeLister, claimLister, vaLister, false, defaultfsType, nil)
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), scLister, csiNodeLister, nodeLister, claimLister, vaLister, false, defaultfsType, nil, true)

	pv, _, err := csiProvisioner.Provision(context.Background(), controller.ProvisionOptions{
		StorageClass: &storagev1.StorageClass{},

@@ -3441,7 +3498,7 @@ func TestProvisionErrorHandling(t *testing.T) {
	defer close(stopChan)

	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5,
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), scLister, csiNodeLister, nodeLister, claimLister, vaLister, false, defaultfsType, nil)
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), scLister, csiNodeLister, nodeLister, claimLister, vaLister, false, defaultfsType, nil, true)

	options := controller.ProvisionOptions{
		StorageClass: &storagev1.StorageClass{},
@@ -3514,7 +3571,7 @@ func TestProvisionWithTopologyDisabled(t *testing.T) {
	clientSet := fakeclientset.NewSimpleClientset()
	pluginCaps, controllerCaps := provisionWithTopologyCapabilities()
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5,
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, nil, nil, false, defaultfsType, nil)
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, nil, nil, false, defaultfsType, nil, true)

	out := &csi.CreateVolumeResponse{
		Volume: &csi.Volume{

@@ -3962,7 +4019,7 @@ func runDeleteTest(t *testing.T, k string, tc deleteTestcase) {
	pluginCaps, controllerCaps := provisionCapabilities()
	scLister, _, _, _, vaLister, _ := listers(clientSet)
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5,
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), scLister, nil, nil, nil, vaLister, false, defaultfsType, nodeDeployment)
		csiConn.conn, nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), scLister, nil, nil, nil, vaLister, false, defaultfsType, nodeDeployment, true)

	err = csiProvisioner.Delete(context.Background(), tc.persistentVolume)
	if tc.expectErr && err == nil {

@@ -4383,7 +4440,7 @@ func TestProvisionFromPVC(t *testing.T) {
	// Phase: execute the test
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner", "test", 5, csiConn.conn,
		nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, claimLister, nil, false, defaultfsType, nil)
		nil, driverName, pluginCaps, controllerCaps, "", false, true, csitrans.New(), nil, nil, nil, claimLister, nil, false, defaultfsType, nil, true)

	pv, _, err = csiProvisioner.Provision(context.Background(), tc.volOpts)
	if tc.expectErr && err == nil {
@@ -4501,7 +4558,7 @@ func TestProvisionWithMigration(t *testing.T) {
	pluginCaps, controllerCaps := provisionCapabilities()
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner",
		"test", 5, csiConn.conn, nil, driverName, pluginCaps, controllerCaps,
		inTreePluginName, false, true, mockTranslator, nil, nil, nil, nil, nil, false, defaultfsType, nil)
		inTreePluginName, false, true, mockTranslator, nil, nil, nil, nil, nil, false, defaultfsType, nil, true)

	// Set up return values (AnyTimes to avoid overfitting on implementation)

@@ -4678,7 +4735,7 @@ func TestDeleteMigration(t *testing.T) {
	defer close(stopCh)
	csiProvisioner := NewCSIProvisioner(clientSet, 5*time.Second, "test-provisioner",
		"test", 5, csiConn.conn, nil, driverName, pluginCaps, controllerCaps, inTreePluginName,
		false, true, mockTranslator, scLister, nil, nil, nil, vaLister, false, defaultfsType, nil)
		false, true, mockTranslator, scLister, nil, nil, nil, vaLister, false, defaultfsType, nil, true)

	// Set mock return values (AnyTimes to avoid overfitting on implementation details)
	mockTranslator.EXPECT().IsPVMigratable(gomock.Any()).Return(tc.expectTranslation).AnyTimes()
I am not sure if the presence of the ROX access mode means the volume should be controller-published as read-only. To me, these can be orthogonal concerns. cc @bswartz
True, but regardless of how the volume is controller-published, the PV source could still be set read-only.
Yes, but interpreting ROX as read-only in the PV source should be done with caution. It could make the PV unmountable if, for example, a journal needs to be replayed.
Yes, I was going to argue the same. Volumes should not be controller-published as read-only unless the PV is marked read-only. There are too many good reasons for publishing them writable to the node and then having the node re-mount them as read-only to the pod.
Journal replays are one use case; pending expand operations are another.
I interpret PVs with the readOnly: true flag to be PERMANENTLY read-only, which you almost never want in practice. We might want to make it easier to cause that PV flag to get set to true in some cases, but I think the decision for ControllerPublishVolume should look at that flag only and not look at access modes.
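Expressed as code, that position amounts to a check on the PV spec alone. A hypothetical CO-side helper, illustrative only and not part of this PR:

```go
package sketch

import (
	v1 "k8s.io/api/core/v1"
)

// controllerPublishReadOnly reflects the argument above: derive the read-only
// flag for ControllerPublishVolume from the PV's CSI source alone, ignoring
// the claim's access modes.
func controllerPublishReadOnly(pv *v1.PersistentVolume) bool {
	if pv == nil || pv.Spec.CSI == nil {
		return false
	}
	return pv.Spec.CSI.ReadOnly
}
```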
I agree the FS operations may need the volume RW during controller publish. But the SP driver is free to do those operations, or default to RW, while controller-publishing or node-staging, isn't it?
I can think of a scenario where a user provisions a new volume as "RO" with dataSource set to a PVC or a Snapshot. Isn't that a valid use case where the user wants a PV to be READONLY? @saad-ali @msau42 @jsafrane @xing-yang inviting your thoughts.
I'd like to question this part. I expect filesystems to be prepared for this case. Consider a normal device that degrades such that writes start to fail. The normal reaction of the kernel is to remount the filesystem as read-only. Being able to mount read-only after a system reboot to extract more data seems like a sensible feature to support - much better than refusing to mount completely.
Whether data and files are consistent is another question, but it's not different from protecting against a sudden power loss.
First of all, I want to be clear that I'm not trying to block this issue. I think it's a good change, and I'd like to see it merge.
I was just trying to answer the question of what potential downsides this change could have, because it does introduce this new corner case with journaled filesystems. We shouldn't be surprised to see support cases from people with problems mounting readonly volumes that were cloned from in-use filesystems or snapshots taken of in-use filesystems, if journaling is in use.
There are some workarounds we can either recommend to users, or possibly even build into our CSI drivers to automatically overcome this class of problems. The most obvious workaround is to create a volume that's writable and use that instead. Or, create a writable volume, attach that to a pod that does nothing (to let the journal replay), then clone that volume after the pod has terminated cleanly (so the journal is empty).
The specific kernel error I see when I reproduce this problem with ext4 is:
[587843.214656] EXT4-fs (nbd2): INFO: recovery required on readonly filesystem
[587843.214657] EXT4-fs (nbd2): write access unavailable, cannot proceed (try mounting with noload)
Possibly CSI drivers could use the -o noload option with ext4 filesystems when mounting readonly to avoid the problem.
When using XFS, the kernel errors look like this:
[588571.823845] XFS (nbd1): Mounting V5 Filesystem
[588571.881098] XFS (nbd1): recovery required on read-only device.
[588571.881100] XFS (nbd1): write access unavailable, cannot proceed.
[588571.881101] XFS (nbd1): log mount/recovery failed: error -30
[588571.881123] XFS (nbd1): log mount failed
Some searching turns up the -o norecovery option for XFS, which might also work.
In summary, I like this change but I'd urge that CSI driver authors consider special handling for mounting of volumes published read-only at the controller level. And if users do get stuck with unmountable volumes, be ready to suggest the workaround of using a writable volume.
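As an illustration of such special handling, a driver's mount path could append the recovery-skipping option per fstype when the volume is published read-only. The helper below is a hypothetical sketch, not taken from any real driver, and whether a driver should do this automatically is a driver decision:

```go
package main

import (
	"fmt"
)

// readOnlyMountOptions appends the fs-specific option that skips journal/log
// recovery so a read-only published volume with an unclean journal can still
// be mounted. Hypothetical helper for illustration only.
func readOnlyMountOptions(fsType string, opts []string) []string {
	opts = append(opts, "ro")
	switch fsType {
	case "ext3", "ext4":
		opts = append(opts, "noload") // skip journal replay on ext*
	case "xfs":
		opts = append(opts, "norecovery") // skip log recovery on XFS
	}
	return opts
}

func main() {
	fmt.Println(readOnlyMountOptions("ext4", nil)) // [ro noload]
	fmt.Println(readOnlyMountOptions("xfs", nil))  // [ro norecovery]
}
```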
@bswartz @pohly thanks for the inputs, and good discussion about the area of concern.
Let me share some thoughts: in general, a storage vendor quiesces in-flight I/O and puts write barriers in place to make sure data is persisted correctly while a snapshot is taken; that is one of the main requirements of consistent snapshots. So in the scenario above this would ideally be handled by the SP, and from the CSI layer I assume it should not be of much concern. In other words, a snapshot taken of an in-use volume, or a snapshot taken after a crash or node/pod loss, comes down to pretty much the same situation, and the SP (or whichever layer is responsible at the low storage level) should take care of it. When it comes to controller publish/node stage, again, the SP can decide whether to node-stage or controller-publish by respecting the PV spec or not.
Regardless, the point about a possible 'unclean journal' situation and a workaround/solution is indeed a good one 👍. I am not sure about the different ext4 or xfs mount options mentioned; I will have to look into the details.
About the direction of this PR and its completion, I think it's good to wait for some more input from SIG Storage.
Probably the best thing to do before merging this fix would be to add documentation to the CSI spec around the PUBLISH_READONLY controller capability. Drivers that assert that capability will need to be prepared to mount unmodifiable volumes on the nodes, which means understanding how to deal with unclean journals for each of the fstypes the driver supports.
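For context, asserting that capability in a driver looks roughly like the following sketch of a ControllerGetCapabilities handler; the handler and its capability list are illustrative, not from any particular driver:

```go
package main

import (
	"context"
	"fmt"

	"github.com/container-storage-interface/spec/lib/go/csi"
)

// illustrativeController shows what advertising the PUBLISH_READONLY controller
// capability looks like; drivers that advertise it must be ready to receive
// readonly=true in ControllerPublishVolume requests.
type illustrativeController struct{}

func (c *illustrativeController) ControllerGetCapabilities(ctx context.Context, req *csi.ControllerGetCapabilitiesRequest) (*csi.ControllerGetCapabilitiesResponse, error) {
	newCap := func(t csi.ControllerServiceCapability_RPC_Type) *csi.ControllerServiceCapability {
		return &csi.ControllerServiceCapability{
			Type: &csi.ControllerServiceCapability_Rpc{
				Rpc: &csi.ControllerServiceCapability_RPC{Type: t},
			},
		}
	}
	return &csi.ControllerGetCapabilitiesResponse{
		Capabilities: []*csi.ControllerServiceCapability{
			newCap(csi.ControllerServiceCapability_RPC_CREATE_DELETE_VOLUME),
			newCap(csi.ControllerServiceCapability_RPC_PUBLISH_UNPUBLISH_VOLUME),
			newCap(csi.ControllerServiceCapability_RPC_PUBLISH_READONLY),
		},
	}, nil
}

func main() {
	resp, err := (&illustrativeController{}).ControllerGetCapabilities(context.Background(), &csi.ControllerGetCapabilitiesRequest{})
	if err != nil {
		panic(err)
	}
	fmt.Println(len(resp.Capabilities), "controller capabilities advertised")
}
```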
While we wait for others to share their thoughts, let me rebase the PR and keep it updated with the latest code.