From 0e1412b3eb9999a1cc2b27437ef86998678e9e10 Mon Sep 17 00:00:00 2001
From: Lis
Date: Thu, 28 Jul 2022 09:31:21 +0800
Subject: [PATCH] fix(platform): pre allocate svc ip for qgpu scheduler (#2040)

---
Review notes (this section is ignored by git am):
- Pre-allocates a service IP (index 8 of the service CIDR) for the qgpu
  scheduler extender, stores it in the qgpu-quota-admission-ip annotation,
  and renders it into the scheduler policy as QGPUQuotaAdmissionHost.
- The empty-annotation fallback now assigns QGPUQuotaAdmissionHost; it
  previously overwrote GPUQuotaAdmissionHost and left the qgpu host empty.
- Leading whitespace of the hunks was reconstructed; apply with
  --ignore-whitespace if context does not match.

 .../provider/baremetal/cluster/create.go      | 10 ++++++++--
 .../provider/baremetal/cluster/manifests.go   | 17 +++++++++++++++++
 .../provider/baremetal/constants/constants.go | 10 ++++++----
 3 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/pkg/platform/provider/baremetal/cluster/create.go b/pkg/platform/provider/baremetal/cluster/create.go
index 4c0c3493c..217e42309 100644
--- a/pkg/platform/provider/baremetal/cluster/create.go
+++ b/pkg/platform/provider/baremetal/cluster/create.go
@@ -380,7 +380,8 @@ func completeServiceIP(cluster *v1.Cluster) error {
 		cluster.Annotations = make(map[string]string)
 	}
 	for index, name := range map[int]string{
-		constants.GPUQuotaAdmissionIPIndex: constants.GPUQuotaAdmissionIPAnnotaion,
+		constants.GPUQuotaAdmissionIPIndex:  constants.GPUQuotaAdmissionIPAnnotaion,
+		constants.QGPUQuotaAdmissionIPIndex: constants.QGPUQuotaAdmissionIPAnnotaion,
 	} {
 		ip, err := GetIndexedIP(cluster.Status.ServiceCIDR, index)
 		if err != nil {
@@ -756,11 +757,16 @@ func (p *Provider) EnsurePrepareForControlplane(ctx context.Context, c *v1.Clust
 	oidcCa, _ := ioutil.ReadFile(constants.OIDCConfigFile)
 	auditPolicyData, _ := ioutil.ReadFile(constants.AuditPolicyConfigFile)
 	GPUQuotaAdmissionHost := c.Annotations[constants.GPUQuotaAdmissionIPAnnotaion]
+	QGPUQuotaAdmissionHost := c.Annotations[constants.QGPUQuotaAdmissionIPAnnotaion]
 	if GPUQuotaAdmissionHost == "" {
 		GPUQuotaAdmissionHost = "gpu-quota-admission"
 	}
+	if QGPUQuotaAdmissionHost == "" {
+		QGPUQuotaAdmissionHost = "qgpu-quota-admission"
+	}
 	schedulerPolicyConfig, err := template.ParseString(schedulerPolicyConfig, map[string]interface{}{
-		"GPUQuotaAdmissionHost": GPUQuotaAdmissionHost,
+		"GPUQuotaAdmissionHost":  GPUQuotaAdmissionHost,
+		"QGPUQuotaAdmissionHost": QGPUQuotaAdmissionHost,
 	})
 	if err != nil {
 		return errors.Wrap(err, "parse schedulerPolicyConfig error")
diff --git a/pkg/platform/provider/baremetal/cluster/manifests.go b/pkg/platform/provider/baremetal/cluster/manifests.go
index debf68f2a..c1dde7565 100644
--- a/pkg/platform/provider/baremetal/cluster/manifests.go
+++ b/pkg/platform/provider/baremetal/cluster/manifests.go
@@ -52,6 +52,23 @@ const (
             }
          ],
          "nodeCacheCapable" : false
+      },
+      {
+         "urlPrefix": "http://{{.QGPUQuotaAdmissionHost}}:12345/scheduler",
+         "filterVerb" : "filter",
+         "prebindVerb": "prebind",
+         "unreserveVerb": "unreserve",
+         "prioritizeVerb": "priorities",
+         "nodeCacheCapable": true,
+         "weight": 10,
+         "managedResources" : [
+            {
+               "name": "tke.cloud.tencent.com/qgpu-core"
+            },
+            {
+               "name" : "tke.cloud.tencent.com/qgpu-memory"
+            }
+         ]
       }
    ],
    "kind" : "Policy"
diff --git a/pkg/platform/provider/baremetal/constants/constants.go b/pkg/platform/provider/baremetal/constants/constants.go
index 20972db3f..da4dfe7ae 100644
--- a/pkg/platform/provider/baremetal/constants/constants.go
+++ b/pkg/platform/provider/baremetal/constants/constants.go
@@ -120,10 +120,12 @@ const (
 	MetricsServerManifest = ManifestsDir + "metrics-server/metrics-server.yaml"
 	CiliumManifest        = SrcDir + "cilium/*.yaml"
 
-	KUBERNETES                   = 1
-	DNSIPIndex                   = 10
-	GPUQuotaAdmissionIPIndex     = 9
-	GPUQuotaAdmissionIPAnnotaion = platformv1.GroupName + "/gpu-quota-admission-ip"
+	KUBERNETES                    = 1
+	DNSIPIndex                    = 10
+	GPUQuotaAdmissionIPIndex      = 9
+	QGPUQuotaAdmissionIPIndex     = 8
+	GPUQuotaAdmissionIPAnnotaion  = platformv1.GroupName + "/gpu-quota-admission-ip"
+	QGPUQuotaAdmissionIPAnnotaion = platformv1.GroupName + "/qgpu-quota-admission-ip"
 
 	// RenewCertsTimeThreshold control how long time left to renew certs
 	RenewCertsTimeThreshold = 30 * 24 * time.Hour