From 9a23374edea66a260fa6f66b838b8437fcda2bdf Mon Sep 17 00:00:00 2001
From: Sebastian Scherer
Date: Wed, 20 Nov 2024 14:52:55 +0000
Subject: [PATCH 1/2] karpenter: stop expiring ALL nodes after 30m; add long-run GPU nodepool

- Remove the expireAfter of 30m, which was removing ALL nodes after half
  an hour. Nodes are now consolidated 15m after becoming empty and
  expire after 24h (CPU) / 12h (GPU).
- Rename default-cpu -> on-demand-cpu and default-gpu -> on-demand-gpu,
  and restrict both pools to on-demand capacity (was spot + on-demand).
- Add the long-run-gpu nodepool for long running training jobs. It
  carries an extra long-run-gpu taint, consolidates after 30m and
  expires after 240h.

---
Review notes (text in this section is ignored by `git am`):

- consolidationPolicy was written as "WhenUnderutilized | WhenEmpty",
  which lists both allowed values instead of picking one and is not a
  valid value. It is set to WhenEmpty here because every pool also sets
  consolidateAfter, which karpenter.sh/v1beta1 only accepts together
  with WhenEmpty. If consolidation of underutilized nodes is wanted
  instead, use WhenUnderutilized and drop consolidateAfter.
- Inline comments corrected ("ours" -> "hours"; the 240h comment no
  longer claims 12h).
- The new file content now ends with a trailing newline.
- The original whitespace of this patch was lost; line breaks and YAML
  indentation below are reconstructed from the hunk headers. Verify
  them against the target file before applying.
- The blob ids on the "index" lines are carried over unchanged and no
  longer describe the edited content; regenerate the series with
  `git format-patch` if they matter (e.g. for `git am --3way`).

 .../karpenter/karpenter-nodepool.yaml | 53 ++++++++++++++++---
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/kubernetes/manifests/karpenter/karpenter-nodepool.yaml b/kubernetes/manifests/karpenter/karpenter-nodepool.yaml
index 1caca69..4319289 100644
--- a/kubernetes/manifests/karpenter/karpenter-nodepool.yaml
+++ b/kubernetes/manifests/karpenter/karpenter-nodepool.yaml
@@ -1,7 +1,7 @@
 apiVersion: karpenter.sh/v1beta1
 kind: NodePool
 metadata:
-  name: default-cpu
+  name: on-demand-cpu
 spec:
   template:
     spec:
@@ -22,18 +22,19 @@ spec:
           values: ["2"]
         - key: karpenter.sh/capacity-type
           operator: In
-          values: ["spot", "on-demand"]
+          values: ["on-demand"]
   limits:
     cpu: 100
     memory: 1000Gi
   disruption:
-    consolidationPolicy: WhenUnderutilized
-    expireAfter: 30m # ~5 minutes required for larger images, otherwise karpenter will deprovision before pods are running
+    consolidationPolicy: WhenEmpty
+    consolidateAfter: 15m # ~5 minutes required for larger images, otherwise karpenter will deprovision before pods are running
+    expireAfter: 24h
 ---
 apiVersion: karpenter.sh/v1beta1
 kind: NodePool
 metadata:
-  name: default-gpu
+  name: on-demand-gpu
 spec:
   template:
     spec:
@@ -48,7 +49,7 @@ spec:
           values: ["2"]
         - key: karpenter.sh/capacity-type
           operator: In
-          values: ["spot","on-demand"]
+          values: ["on-demand"]
       taints:
         - key: nvidia.com/gpu
           value: "true"
@@ -58,5 +59,41 @@ spec:
     memory: 1000Gi
     nvidia.com/gpu: 5
   disruption:
-    consolidationPolicy: WhenUnderutilized
-    expireAfter: 30m # ~5 minutes required for larger images, otherwise karpenter will deprovision before pods are running
\ No newline at end of file
+    consolidationPolicy: WhenEmpty
+    consolidateAfter: 15m # don't go lower: prevents decommissioning during the image pull phase
+    expireAfter: 12h # limit GPU nodes to 12 hours
+---
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool
+metadata:
+  name: long-run-gpu
+spec:
+  template:
+    spec:
+      nodeClassRef:
+        name: bettmensch-ai-default
+      requirements:
+        - key: "karpenter.k8s.aws/instance-category"
+          operator: In
+          values: ["p","g"]
+        - key: "karpenter.k8s.aws/instance-generation"
+          operator: Gt
+          values: ["2"]
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: ["on-demand"]
+      taints:
+        - key: nvidia.com/gpu
+          value: "true"
+          effect: "NoSchedule"
+        - key: long-run-gpu
+          value: "true"
+          effect: NoSchedule
+  limits:
+    cpu: 100
+    memory: 1000Gi
+    nvidia.com/gpu: 5
+  disruption:
+    consolidationPolicy: WhenEmpty
+    consolidateAfter: 30m # larger training images take longer to pull
+    expireAfter: 240h # limit long-running GPU nodes to 240 hours (10 days)

From b6f3455fd85647c5052c6a8296d6aa7b4fb4bba9 Mon Sep 17 00:00:00 2001
From: Sebastian Scherer
Date: Wed, 20 Nov 2024 14:56:32 +0000
Subject: [PATCH 2/2] karpenter: rename on-demand-gpu to short-run-gpu with a hard 3h node lifetime

- Rename the on-demand-gpu nodepool to short-run-gpu and lower its
  expireAfter from 12h to 3h.
- Raise the long-run-gpu limits to cpu: 200 / memory: 2000Gi.

---
Review notes (text in this section is ignored by `git am`):

- Context and removed lines are kept in sync with the corrected PATCH
  1/2 (consolidationPolicy: WhenEmpty, fixed comment spelling), so the
  two patches must be applied together as a series.
- As in PATCH 1/2, YAML indentation is reconstructed and the "index"
  blob ids are carried over unchanged.

 kubernetes/manifests/karpenter/karpenter-nodepool.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kubernetes/manifests/karpenter/karpenter-nodepool.yaml b/kubernetes/manifests/karpenter/karpenter-nodepool.yaml
index 4319289..9ebc593 100644
--- a/kubernetes/manifests/karpenter/karpenter-nodepool.yaml
+++ b/kubernetes/manifests/karpenter/karpenter-nodepool.yaml
@@ -34,7 +34,7 @@ spec:
 apiVersion: karpenter.sh/v1beta1
 kind: NodePool
 metadata:
-  name: on-demand-gpu
+  name: short-run-gpu
 spec:
   template:
     spec:
@@ -61,7 +61,7 @@ spec:
   disruption:
     consolidationPolicy: WhenEmpty
     consolidateAfter: 15m # don't go lower: prevents decommissioning during the image pull phase
-    expireAfter: 12h # limit GPU nodes to 12 hours
+    expireAfter: 3h # limit GPU nodes to 3 hours - enough to train annotated transformer on one GPU (~2.8h)
 ---
 apiVersion: karpenter.sh/v1beta1
 kind: NodePool
@@ -90,8 +90,8 @@ spec:
           value: "true"
           effect: NoSchedule
   limits:
-    cpu: 100
-    memory: 1000Gi
+    cpu: 200
+    memory: 2000Gi
     nvidia.com/gpu: 5
   disruption:
     consolidationPolicy: WhenEmpty