Skip to content

Commit

Permalink
Remove nvidia.com/gpu.imex-domain label
Browse files Browse the repository at this point in the history
This removes the logic for constructing an nvidia.com/gpu.imex-domain label.
This also means that the logic to include the imex nodes_config.cfg file
in the GFD container is not required.

Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Feb 6, 2025
1 parent 9ae728c commit f563493
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 132 deletions.
3 changes: 1 addition & 2 deletions deployments/devel/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

# This Dockerfile is also used to define the golang version used in this project
# This allows dependabot to manage this version in addition to other images.
FROM golang:1.23.3
FROM golang:1.23.1

WORKDIR /work
COPY * .
Expand All @@ -24,4 +24,3 @@ RUN make install-tools
# We need to set the /work directory as a safe directory.
# This allows git commands to run in the container.
RUN git config --file=/.gitconfig --add safe.directory /work

6 changes: 4 additions & 2 deletions deployments/devel/go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
module github.com/NVIDIA/k8s-device-plugin/deployments/devel
module github.com/NVIDIA/k8s-dra-driver/deployments/devel

go 1.23

toolchain go1.23.2
toolchain go1.23.1

require github.com/matryer/moq v0.5.0

Expand All @@ -11,3 +11,5 @@ require (
golang.org/x/sync v0.8.0 // indirect
golang.org/x/tools v0.24.0 // indirect
)


26 changes: 0 additions & 26 deletions deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,32 +59,6 @@ spec:
shareProcessNamespace: true
{{- end }}
initContainers:
- image: {{ include "nvidia-device-plugin.fullimage" . }}
name: gpu-feature-discovery-imex-init
command: ["/bin/bash", "-c"]
args:
- |
IMEX_NODES_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg
if [[ -f /config/${IMEX_NODES_CONFIG_FILE} ]]; then
echo "Removing cached IMEX nodes config"
rm -f /config/${IMEX_NODES_CONFIG_FILE}
fi
if [[ ! -f /driver-root/${IMEX_NODES_CONFIG_FILE} ]]; then
echo "No IMEX nodes config path detected; Skipping"
exit 0
fi
echo "Copying IMEX nodes config"
mkdir -p $(dirname /config/${IMEX_NODES_CONFIG_FILE})
cp /driver-root/${IMEX_NODES_CONFIG_FILE} /config/${IMEX_NODES_CONFIG_FILE}
volumeMounts:
- name: config
mountPath: /config
- name: driver-root
mountPath: /driver-root/etc
subPath: etc
readOnly: true
{{- if $options.hasConfigMap }}
- image: {{ include "nvidia-device-plugin.fullimage" . }}
name: gpu-feature-discovery-init
Expand Down
1 change: 0 additions & 1 deletion docs/gpu-feature-discovery/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@ For a similar list of labels generated or used by the device plugin, see [here](
| nvidia.com/gpu.replicas | String | Number of GPU replicas available. Will be equal to the number of physical GPUs unless some sharing strategy is employed in which case the GPU count will be multiplied by replicas. | 4 |
| nvidia.com/gpu.mode | String | Mode of the GPU. Can be either "compute" or "display". Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute |
| nvidia.com/gpu.clique | String | GPUFabric ClusterUUID + CliqueID | 7b968a6d-c8aa-45e1-9e07-e1e51be99c31.1 |
| nvidia.com/gpu.imex-domain | String | IMEX domain Ip list(Hashed) + CliqueID | 79b326e7-d566-3483-c2a3-9b38fa5cb1c8.1 |
Depending on the MIG strategy used, the following set of labels may also be
available (or override the default values for some of the labels listed above):
Expand Down
103 changes: 2 additions & 101 deletions internal/lm/imex.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,92 +17,26 @@
package lm

import (
"bufio"
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"sort"
"strings"

"github.com/google/uuid"
"k8s.io/klog/v2"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
)

const (
// ImexNodesConfigFilePath is the path to the IMEX nodes config file.
// This file contains a list of IP addresses of the nodes in the IMEX domain.
ImexNodesConfigFilePath = "/etc/nvidia-imex/nodes_config.cfg"
)

func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) {
var errs error
for _, root := range imexNodesConfigFilePathSearchRoots(config) {
configFilePath := filepath.Join(root, ImexNodesConfigFilePath)
imexLabeler, err := imexLabelerForConfigFile(configFilePath, devices)
if err != nil {
errs = errors.Join(errs, err)
continue
}
if imexLabeler != nil {
klog.Infof("Using labeler for IMEX config %v", configFilePath)
return imexLabeler, nil
}
}
if errs != nil {
return nil, errs
}

return empty{}, nil
}

// imexNodesConfigFilePathSearchRoots returns a list of roots to search for the IMEX nodes config file.
func imexNodesConfigFilePathSearchRoots(config *spec.Config) []string {
// By default, search / and /config for config files.
roots := []string{"/", "/config"}

if config == nil || config.Flags.Plugin == nil || config.Flags.Plugin.ContainerDriverRoot == nil {
return roots
}

// If a driver root is specified, it is also searched.
return append(roots, *config.Flags.Plugin.ContainerDriverRoot)
}

func imexLabelerForConfigFile(configFilePath string, devices []resource.Device) (Labeler, error) {
imexConfigFile, err := os.Open(configFilePath)
if os.IsNotExist(err) {
// No imex config file, return empty labels
return nil, nil
} else if err != nil {
return nil, fmt.Errorf("failed to open imex config file: %v", err)
}
defer imexConfigFile.Close()

clusterUUID, cliqueID, err := getFabricIDs(devices)
if err != nil {
return nil, err
}
if clusterUUID == "" || cliqueID == "" {
return nil, nil
}

imexDomainID, err := getImexDomainID(imexConfigFile)
if err != nil {
return nil, err
}
if imexDomainID == "" {
return nil, nil
return empty{}, nil
}

labels := Labels{
"nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."),
"nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."),
"nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."),
}

return labels, nil
Expand Down Expand Up @@ -147,36 +81,3 @@ func getFabricIDs(devices []resource.Device) (string, string, error) {
}
return "", "", nil
}

// getImexDomainID reads the imex config file and returns a unique identifier
// based on the sorted list of IP addresses in the file.
func getImexDomainID(r io.Reader) (string, error) {
// Read the file line by line
var ips []string
scanner := bufio.NewScanner(r)
for scanner.Scan() {
ip := strings.TrimSpace(scanner.Text())
if net.ParseIP(ip) == nil {
return "", fmt.Errorf("invalid IP address in imex config file: %s", ip)
}
ips = append(ips, ip)
}

if err := scanner.Err(); err != nil {
return "", fmt.Errorf("failed to read imex config file: %v", err)
}

if len(ips) == 0 {
// No IPs in the file, return empty labels
return "", nil
}

sort.Strings(ips)

return generateContentUUID(strings.Join(ips, "\n")), nil

}

func generateContentUUID(seed string) string {
return uuid.NewSHA1(uuid.Nil, []byte(seed)).String()
}

0 comments on commit f563493

Please sign in to comment.