Skip to content

Commit

Permalink
Merge pull request #45 from SebastianScherer88/fix-karpenter-configur…
Browse files Browse the repository at this point in the history
…ation

Fix karpenter configuration
  • Loading branch information
SebastianScherer88 authored Nov 20, 2024
2 parents 85ad4b6 + b6f3455 commit 50cad38
Showing 1 changed file with 45 additions and 8 deletions.
53 changes: 45 additions & 8 deletions kubernetes/manifests/karpenter/karpenter-nodepool.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
name: default-cpu
name: on-demand-cpu
spec:
template:
spec:
Expand All @@ -22,18 +22,19 @@ spec:
values: ["2"]
- key: karpenter.sh/capacity-type
operator: In
values: ["spot", "on-demand"]
values: ["on-demand"]
limits:
cpu: 100
memory: 1000Gi
disruption:
consolidationPolicy: WhenUnderutilized
expireAfter: 30m # ~5 minutes required for larger images, otherwise karpenter will deprovision before pods are running
consolidationPolicy: WhenUnderutilized | WhenEmpty
consolidateAfter: 15m # ~5 minutes required for larger images, otherwise karpenter will deprovision before pods are running
expireAfter: 24h
---
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
name: default-gpu
name: short-run-gpu
spec:
template:
spec:
Expand All @@ -48,7 +49,7 @@ spec:
values: ["2"]
- key: karpenter.sh/capacity-type
operator: In
values: ["spot","on-demand"]
values: ["on-demand"]
taints:
- key: nvidia.com/gpu
value: "true"
Expand All @@ -58,5 +59,41 @@ spec:
memory: 1000Gi
nvidia.com/gpu: 5
disruption:
consolidationPolicy: WhenUnderutilized
expireAfter: 30m # ~5 minutes required for larger images, otherwise karpenter will deprovision before pods are running
consolidationPolicy: WhenUnderutilized | WhenEmpty
consolidateAfter: 15m # don't go lower, to prevent decommissioning during the image pull phase
expireAfter: 3h # limit GPU nodes to 3 hours - enough to train the annotated transformer on one GPU (~2.8h)
---
# NodePool for long-running GPU workloads.
# Provisions on-demand "p"/"g" category instances (generation > 2) and taints
# them so that only pods tolerating both `nvidia.com/gpu` and `long-run-gpu`
# can be scheduled onto these nodes.
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: long-run-gpu
spec:
  template:
    spec:
      nodeClassRef:
        name: bettmensch-ai-default
      requirements:
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["p", "g"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
      taints:
        - key: nvidia.com/gpu
          value: "true"
          effect: NoSchedule
        # Extra taint so that only workloads explicitly opting in to the
        # long-lived pool land on these nodes.
        - key: long-run-gpu
          value: "true"
          effect: NoSchedule
  # Upper bound on the total resources this pool may provision.
  limits:
    cpu: 200
    memory: 2000Gi
    nvidia.com/gpu: 5
  disruption:
    # Must be exactly one of `WhenEmpty` or `WhenUnderutilized`; the literal
    # "WhenUnderutilized | WhenEmpty" is not a valid value. In v1beta1,
    # `consolidateAfter` may only be combined with `WhenEmpty`.
    consolidationPolicy: WhenEmpty
    # Larger training images take longer to pull; do not reclaim an empty
    # node before pods had a chance to start.
    consolidateAfter: 30m
    # Hard upper bound on node lifetime: 240h = 10 days.
    # NOTE(review): the previous comment said "12h" - confirm intended value.
    expireAfter: 240h

0 comments on commit 50cad38

Please sign in to comment.