
Commit 2140f89

Support draining daemonset pods that use sriov devices
With this commit we also take care of removing DaemonSet-owned pods that use SR-IOV devices. We only do this when a drain is requested; we don't do it for reboot requests.

Signed-off-by: Sebastian Sch <[email protected]>
1 parent b2c487c commit 2140f89

File tree: 3 files changed (+248 -17)

pkg/drain/drainer.go (+71 -15)

```diff
@@ -7,6 +7,7 @@ import (
 	"time"
 
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/kubectl/pkg/drain"
@@ -85,12 +86,38 @@ func (d *Drainer) DrainNode(ctx context.Context, node *corev1.Node, fullNodeDrai
 			return false, nil
 		}
 		err = drain.RunNodeDrain(drainHelper, node.Name)
-		if err == nil {
+		if err != nil {
+			lastErr = err
+			reqLogger.Info("drainNode(): Draining failed, retrying", "error", err)
+			return false, nil
+		}
+
+		// on full drain there is no need to try and remove pods that are owned by DaemonSets
+		// as we are going to reboot the node in any case.
+		if fullNodeDrain {
 			return true, nil
 		}
-		lastErr = err
-		reqLogger.Info("drainNode(): Draining failed, retrying", "error", err)
-		return false, nil
+
+		// Go over all the remain pods search for DaemonSets that have SR-IOV devices and remove them
+		// we can't use the drain from core kubernetes as it doesn't support removing pods that are part of a DaemonSets
+		podList, err := d.kubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("spec.nodeName=%s", node.Name)})
+		if err != nil {
+			lastErr = err
+			reqLogger.Info("drainNode(): Failed to list pods, retrying", "error", err)
+			return false, nil
+		}
+
+		// remove pods that are owned by a DaemonSet and use SR-IOV devices
+		dsPodsList := getDsPodsToRemove(podList)
+		for _, pod := range dsPodsList {
+			err = d.kubeClient.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+			if err != nil {
+				lastErr = err
+				reqLogger.Info("drainNode(): Failed to delete pod, retrying", "error", err)
+				return false, nil
+			}
+		}
+		return true, nil
 	}); err != nil {
 		if wait.Interrupted(err) {
 			reqLogger.Info("drainNode(): failed to drain node", "steps", backoff.Steps, "error", lastErr)
@@ -158,17 +185,11 @@ func createDrainHelper(kubeClient kubernetes.Interface, ctx context.Context, ful
 	// when we just want to drain and not reboot we can only remove the pods using sriov devices
 	if !fullDrain {
 		deleteFunction := func(p corev1.Pod) drain.PodDeleteStatus {
-			for _, c := range p.Spec.Containers {
-				if c.Resources.Requests != nil {
-					for r := range c.Resources.Requests {
-						if strings.HasPrefix(r.String(), vars.ResourcePrefix) {
-							return drain.PodDeleteStatus{
-								Delete:  true,
-								Reason:  "pod contain SR-IOV device",
-								Message: "SR-IOV network operator draining the node",
-							}
-						}
-					}
+			if podHasSRIOVDevice(&p) {
+				return drain.PodDeleteStatus{
+					Delete:  true,
+					Reason:  "pod contains SR-IOV device",
+					Message: "SR-IOV network operator draining the node",
 				}
 			}
 			return drain.PodDeleteStatus{Delete: false}
@@ -179,3 +200,38 @@ func createDrainHelper(kubeClient kubernetes.Interface, ctx context.Context, ful
 
 	return drainer
 }
+
+func podHasSRIOVDevice(p *corev1.Pod) bool {
+	for _, c := range p.Spec.Containers {
+		if c.Resources.Requests != nil {
+			for r := range c.Resources.Requests {
+				if strings.HasPrefix(r.String(), vars.ResourcePrefix) {
+					return true
+				}
+			}
+		}
+	}
+
+	return false
+}
+
+func podsHasDSOwner(p *corev1.Pod) bool {
+	for _, o := range p.OwnerReferences {
+		if o.Kind == "DaemonSet" {
+			return true
+		}
+	}
+
+	return false
+}
+
+func getDsPodsToRemove(pl *corev1.PodList) []corev1.Pod {
+	podsToRemove := []corev1.Pod{}
+	for _, pod := range pl.Items {
+		if podsHasDSOwner(&pod) && podHasSRIOVDevice(&pod) {
+			podsToRemove = append(podsToRemove, pod)
+		}
+	}
+
+	return podsToRemove
+}
```
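The eviction path above relies on kubectl's drain library, and the `deleteFunction` built in `createDrainHelper` is a pod filter of the shape that library accepts. Below is a rough, standalone sketch of how such a filter can be wired into a `drain.Helper` through its `AdditionalFilters` field. It is illustrative only, not the operator's actual `createDrainHelper`; the `openshift.io/` prefix, the `worker-0` node name, and the kubeconfig handling are assumptions made for the example.

```go
package main

import (
	"context"
	"os"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/kubectl/pkg/drain"
)

// assumed resource prefix; the operator takes this from vars.ResourcePrefix
const resourcePrefix = "openshift.io/"

// sriovPodFilter mirrors the shape of deleteFunction in createDrainHelper:
// only pods requesting a resource under the SR-IOV prefix are marked for deletion.
func sriovPodFilter(p corev1.Pod) drain.PodDeleteStatus {
	for _, c := range p.Spec.Containers {
		for r := range c.Resources.Requests {
			if strings.HasPrefix(r.String(), resourcePrefix) {
				return drain.PodDeleteStatus{
					Delete:  true,
					Reason:  "pod contains SR-IOV device",
					Message: "SR-IOV network operator draining the node",
				}
			}
		}
	}
	return drain.PodDeleteStatus{Delete: false}
}

func main() {
	// build a clientset from the local kubeconfig (assumption: KUBECONFIG is set)
	cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	kubeClient := kubernetes.NewForConfigOrDie(cfg)

	helper := &drain.Helper{
		Ctx:                 context.Background(),
		Client:              kubeClient,
		Force:               true,
		IgnoreAllDaemonSets: true, // the drain library never evicts DaemonSet pods
		DeleteEmptyDirData:  true,
		GracePeriodSeconds:  -1,
		Timeout:             90 * time.Second,
		Out:                 os.Stdout,
		ErrOut:              os.Stderr,
		// the custom filter limits eviction to pods that request SR-IOV resources
		AdditionalFilters: []drain.PodFilter{sriovPodFilter},
	}

	// "worker-0" is a placeholder node name for the example
	if err := drain.RunNodeDrain(helper, "worker-0"); err != nil {
		panic(err)
	}
}
```

In the sketch, `IgnoreAllDaemonSets` reflects the constraint called out in the diff comment above: the upstream drain code does not remove DaemonSet-managed pods, which is why the commit deletes DaemonSet-owned SR-IOV pods explicitly after `RunNodeDrain` returns.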

test/conformance/tests/test_policy_configuration.go (+1 -0)

```diff
@@ -448,6 +448,7 @@ var _ = Describe("[sriov] operator", Ordered, func() {
 			createTestPod(nodeToTest, []string{sriovNetworkName})
 		})
 	})
+
 	Context("PF shutdown", func() {
 		// 29398
 		It("Should be able to create pods successfully if PF is down.Pods are able to communicate with each other on the same node", func() {
```

test/conformance/tests/test_sriov_operator.go (+176 -2)

```diff
@@ -1045,7 +1045,7 @@ var _ = Describe("[sriov] operator", func() {
 	})
 
 	Describe("Custom SriovNetworkNodePolicy", func() {
-		BeforeEach(func() {
+		AfterEach(func() {
 			err := namespaces.Clean(operatorNamespace, namespaces.Test, clients, discovery.Enabled())
 			Expect(err).ToNot(HaveOccurred())
 			WaitForSRIOVStable()
@@ -1722,6 +1722,111 @@ var _ = Describe("[sriov] operator", func() {
 			})
 		})
 
+		Context("Draining Daemons using SR-IOV", func() {
+			var node string
+			resourceName := "drainresource"
+			sriovNetworkName := "test-drainnetwork"
+			var drainPolicy *sriovv1.SriovNetworkNodePolicy
+
+			BeforeEach(func() {
+				isSingleNode, err := cluster.IsSingleNode(clients)
+				Expect(err).ToNot(HaveOccurred())
+				if isSingleNode {
+					// TODO: change this when we add support for draining on single node
+					Skip("This test is not supported on single node as we don't drain on single node")
+				}
+
+				node = sriovInfos.Nodes[0]
+				sriovDeviceList, err := sriovInfos.FindSriovDevices(node)
+				Expect(err).ToNot(HaveOccurred())
+				intf := sriovDeviceList[0]
+				By("Using device " + intf.Name + " on node " + node)
+
+				drainPolicy = &sriovv1.SriovNetworkNodePolicy{
+					ObjectMeta: metav1.ObjectMeta{
+						GenerateName: "test-drainpolicy",
+						Namespace:    operatorNamespace,
+					},
+
+					Spec: sriovv1.SriovNetworkNodePolicySpec{
+						NodeSelector: map[string]string{
+							"kubernetes.io/hostname": node,
+						},
+						Mtu:          1500,
+						NumVfs:       5,
+						ResourceName: resourceName,
+						Priority:     99,
+						NicSelector: sriovv1.SriovNetworkNicSelector{
+							PfNames: []string{intf.Name},
+						},
+						DeviceType: "netdevice",
+					},
+				}
+
+				err = clients.Create(context.Background(), drainPolicy)
+				Expect(err).ToNot(HaveOccurred())
+
+				WaitForSRIOVStable()
+				By("waiting for the resources to be available")
+				Eventually(func() int64 {
+					testedNode, err := clients.CoreV1Interface.Nodes().Get(context.Background(), node, metav1.GetOptions{})
+					Expect(err).ToNot(HaveOccurred())
+					resNum := testedNode.Status.Allocatable[corev1.ResourceName("openshift.io/"+resourceName)]
+					allocatable, _ := resNum.AsInt64()
+					return allocatable
+				}, 10*time.Minute, time.Second).Should(Equal(int64(5)))
+
+				sriovNetwork := &sriovv1.SriovNetwork{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      sriovNetworkName,
+						Namespace: operatorNamespace,
+					},
+					Spec: sriovv1.SriovNetworkSpec{
+						ResourceName:     resourceName,
+						IPAM:             `{"type":"host-local","subnet":"10.10.10.0/24","rangeStart":"10.10.10.171","rangeEnd":"10.10.10.181"}`,
+						NetworkNamespace: namespaces.Test,
+					}}
+
+				// We need this to be able to run the connectivity checks on Mellanox cards
+				if intf.DeviceID == "1015" {
+					sriovNetwork.Spec.SpoofChk = off
+				}
+
+				err = clients.Create(context.Background(), sriovNetwork)
+
+				Expect(err).ToNot(HaveOccurred())
+				waitForNetAttachDef("test-drainnetwork", namespaces.Test)
+
+				createTestDaemonSet(node, []string{sriovNetworkName})
+			})
+
+			It("should reconcile managed VF if status is changed", func() {
+				By("getting running config-daemon and test pod on the node")
+				daemonTestPod, err := findPodOnNodeWithLabelsAndNamespace(node, namespaces.Test, map[string]string{"app": "test"})
+				Expect(err).ToNot(HaveOccurred())
+				daemonConfigPod, err := findPodOnNodeWithLabelsAndNamespace(node, operatorNamespace, map[string]string{"app": "sriov-network-config-daemon"})
+				Expect(err).ToNot(HaveOccurred())
+
+				By("deleting the sriov policy to start a drain")
+				err = clients.Delete(context.Background(), drainPolicy)
+				Expect(err).ToNot(HaveOccurred())
+				WaitForSRIOVStable()
+
+				tmpDaemon := &appsv1.DaemonSet{}
+				By("Checking the pod owned by a DaemonSet requesting sriov device was deleted")
+				Eventually(func(g Gomega) bool {
+					err = clients.Client.Get(context.Background(), runtimeclient.ObjectKey{Name: daemonTestPod.Name, Namespace: daemonTestPod.Namespace}, tmpDaemon)
+					return err != nil && k8serrors.IsNotFound(err)
+				}, time.Minute, 5*time.Second).Should(BeTrue())
+
+				By("Checking the pod owned by a DaemonSet not requesting an sriov device was not deleted")
+				Consistently(func(g Gomega) bool {
+					err = clients.Client.Get(context.Background(), runtimeclient.ObjectKey{Name: daemonConfigPod.Name, Namespace: daemonConfigPod.Namespace}, tmpDaemon)
+					return err != nil && k8serrors.IsNotFound(err)
+				}, time.Minute, 10*time.Second).Should(BeTrue())
+			})
+		})
+
 	})
 })
```
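In the test above, the `Eventually` block treats a NotFound error from the client as proof that the drain removed the DaemonSet pod that was using the SR-IOV device. The same check can be expressed with plain client-go polling; the sketch below is a standalone approximation (not part of the conformance suite), and the kubeconfig lookup, namespace, and pod name are placeholders.

```go
package main

import (
	"context"
	"fmt"
	"os"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// waitForPodGone polls until the named pod is no longer found, which is what the
// Eventually block above asserts for the DaemonSet pod that uses an SR-IOV device.
func waitForPodGone(ctx context.Context, c kubernetes.Interface, namespace, name string) error {
	return wait.PollUntilContextTimeout(ctx, 5*time.Second, time.Minute, true,
		func(ctx context.Context) (bool, error) {
			_, err := c.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
			if apierrors.IsNotFound(err) {
				return true, nil // pod was deleted by the drain
			}
			return false, err // still present (err == nil) or a real error
		})
}

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	// placeholder namespace and pod name for the example
	if err := waitForPodGone(context.Background(), client, "sriov-conformance-testing", "test-xyz12"); err != nil {
		panic(err)
	}
	fmt.Println("pod removed by drain")
}
```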

```diff
@@ -1842,6 +1947,11 @@ func findMainSriovDevice(executorPod *corev1.Pod, sriovDevices []*sriovv1.Interf
 
 func findUnusedSriovDevices(testNode string, sriovDevices []*sriovv1.InterfaceExt) ([]*sriovv1.InterfaceExt, error) {
 	createdPod := createCustomTestPod(testNode, []string{}, true, nil)
+	defer func() {
+		err := clients.Delete(context.Background(), createdPod)
+		Expect(err).ToNot(HaveOccurred())
+	}()
+
 	filteredDevices := []*sriovv1.InterfaceExt{}
 	stdout, _, err := pod.ExecCommand(clients, createdPod, "ip", "route")
 	Expect(err).ToNot(HaveOccurred())
@@ -1989,6 +2099,24 @@ func isDaemonsetScheduledOnNodes(nodeSelector, daemonsetLabelSelector string) bo
 	return true
 }
 
+func findPodOnNodeWithLabelsAndNamespace(nodeName string, namespace string, labels map[string]string) (*corev1.Pod, error) {
+	podList := &corev1.PodList{}
+	err := clients.List(context.Background(), podList, runtimeclient.MatchingLabels(labels), &runtimeclient.ListOptions{Namespace: namespace}, runtimeclient.MatchingFields{"spec.nodeName": nodeName})
+	if err != nil {
+		return nil, err
+	}
+
+	if len(podList.Items) == 0 {
+		return nil, fmt.Errorf("no pod found")
+	}
+
+	if len(podList.Items) > 1 {
+		return nil, fmt.Errorf("multiple pods found")
+	}
+
+	return &podList.Items[0], nil
+}
+
 func createSriovPolicy(sriovDevice string, testNode string, numVfs int, resourceName string) {
 	_, err := network.CreateSriovPolicy(clients, "test-policy-", operatorNamespace, sriovDevice, testNode, numVfs, resourceName, "netdevice")
 	Expect(err).ToNot(HaveOccurred())
@@ -2017,7 +2145,6 @@ func createCustomTestPod(node string, networks []string, hostNetwork bool, podCa
 			node,
 		)
 	}
-
 	if len(podCapabilities) != 0 {
 		if podDefinition.Spec.Containers[0].SecurityContext == nil {
 			podDefinition.Spec.Containers[0].SecurityContext = &corev1.SecurityContext{}
@@ -2034,6 +2161,41 @@ func createCustomTestPod(node string, networks []string, hostNetwork bool, podCa
 	return waitForPodRunning(createdPod)
 }
 
+func createTestDaemonSet(node string, networks []string) *appsv1.DaemonSet {
+	podDefinition := pod.RedefineWithNodeSelector(
+		pod.GetDefinition(),
+		node,
+	)
+
+	// remove NET_ADMIN to not have issues in OCP
+	podDefinition.Spec.Containers[0].SecurityContext = nil
+
+	daemonDefinition := &appsv1.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{GenerateName: "test-", Namespace: namespaces.Test},
+		Spec: appsv1.DaemonSetSpec{
+			Selector: &metav1.LabelSelector{
+				MatchLabels: map[string]string{
+					"app": "test",
+				},
+			},
+			Template: corev1.PodTemplateSpec{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"app": "test",
+					},
+					Annotations: map[string]string{"k8s.v1.cni.cncf.io/networks": strings.Join(networks, ",")},
+				},
+				Spec: podDefinition.Spec,
+			},
+		},
+	}
+
+	err := clients.Create(context.Background(), daemonDefinition)
+	Expect(err).ToNot(HaveOccurred())
+
+	return waitForDaemonReady(daemonDefinition)
+}
+
 func pingPod(ip string, nodeSelector string, sriovNetworkAttachment string) {
 	ipProtocolVersion := "6"
 	if len(strings.Split(ip, ".")) == 4 {
@@ -2376,6 +2538,18 @@ func waitForPodRunning(p *corev1.Pod) *corev1.Pod {
 	return ret
 }
 
+func waitForDaemonReady(d *appsv1.DaemonSet) *appsv1.DaemonSet {
+	Eventually(func(g Gomega) bool {
+		err := clients.Get(context.Background(), runtimeclient.ObjectKey{Name: d.Name, Namespace: d.Namespace}, d)
+		g.Expect(err).ToNot(HaveOccurred())
+		g.Expect(d.Status.DesiredNumberScheduled).To(BeNumerically(">", 0))
+		g.Expect(d.Status.CurrentNumberScheduled).To(BeNumerically(">", 0))
+		return d.Status.DesiredNumberScheduled == d.Status.NumberReady
+	}, 3*time.Minute, 1*time.Second).Should(BeTrue(), "DaemonSet [%s/%s] should have running pods", d.Namespace, d.Name)
+
+	return d
+}
+
 // assertNodeStateHasVFMatching asserts that the given node state has at least one VF matching the given fields
 func assertNodeStateHasVFMatching(nodeName string, fields Fields) {
 	EventuallyWithOffset(1, func(g Gomega) sriovv1.InterfaceExts {
```
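`waitForDaemonReady` above polls with Gomega until the DaemonSet reports as many ready pods as it wants scheduled. For reference, the same readiness condition expressed with plain client-go might look like the sketch below; the kubeconfig handling and the namespace/DaemonSet names are placeholders, not values from the suite.

```go
package main

import (
	"context"
	"fmt"
	"os"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// waitForDaemonSetReady polls until the DaemonSet reports as many ready pods as it
// wants scheduled, the same condition waitForDaemonReady checks in the test helper above.
func waitForDaemonSetReady(ctx context.Context, c kubernetes.Interface, namespace, name string) error {
	return wait.PollUntilContextTimeout(ctx, time.Second, 3*time.Minute, true,
		func(ctx context.Context) (bool, error) {
			ds, err := c.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
			if err != nil {
				return false, err
			}
			if ds.Status.DesiredNumberScheduled == 0 {
				return false, nil // nothing scheduled yet
			}
			return ds.Status.DesiredNumberScheduled == ds.Status.NumberReady, nil
		})
}

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(cfg)

	// placeholder namespace and DaemonSet name for the example
	if err := waitForDaemonSetReady(context.Background(), client, "sriov-conformance-testing", "test-abcde"); err != nil {
		panic(err)
	}
	fmt.Println("DaemonSet is ready")
}
```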
