Skip to content

Commit

Permalink
Merge pull request #1999 from microsoft/kabaldau/nvidia_log_files
Browse files Browse the repository at this point in the history
Update nvidia hook log file paths to use container bundle path as base dir
  • Loading branch information
katiewasnothere authored Jan 31, 2024
2 parents d4494c7 + 4283479 commit a1319d5
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 8 deletions.
13 changes: 8 additions & 5 deletions internal/guest/runtime/hcsv2/nvidia_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"

oci "github.com/opencontainers/runtime-spec/specs-go"
Expand All @@ -19,20 +20,20 @@ import (
"github.com/Microsoft/hcsshim/pkg/annotations"
)

const nvidiaDebugFilePath = "/nvidia-container.log"

const nvidiaDebugFilePath = "nvidia-container.log"
const nvidiaToolBinary = "nvidia-container-cli"

// addNvidiaDeviceHook builds the arguments for nvidia-container-cli and adds them to the spec as a
// CreateRuntime OCI hook that is invoked through the generic hook binary.
// OCI hooks are described here: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec) error {
func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec, ociBundlePath string) error {
genericHookBinary := "generichook"
genericHookPath, err := exec.LookPath(genericHookBinary)
if err != nil {
return errors.Wrapf(err, "failed to find %s for container device support", genericHookBinary)
}

debugOption := fmt.Sprintf("--debug=%s", nvidiaDebugFilePath)
toolDebugPath := filepath.Join(ociBundlePath, nvidiaDebugFilePath)
debugOption := fmt.Sprintf("--debug=%s", toolDebugPath)
args := []string{
genericHookPath,
nvidiaToolBinary,
Expand Down Expand Up @@ -63,8 +64,10 @@ func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec) error {
// add template for pid argument to be injected later by the generic hook binary
args = append(args, "--no-cgroups", "--pid={{pid}}", spec.Root.Path)

hookLogDebugFileEnvOpt := fmt.Sprintf("%s=%s", generichook.LogDebugFileEnvKey, nvidiaDebugFilePath)
// set up the environment variables that the hook runs with
hookLogDebugFileEnvOpt := fmt.Sprintf("%s=%s", generichook.LogDebugFileEnvKey, toolDebugPath)
hookEnv := append(updateEnvWithNvidiaVariables(), hookLogDebugFileEnvOpt)

nvidiaHook := hooks.NewOCIHook(genericHookPath, args, hookEnv)
return hooks.AddOCIHook(spec, hooks.CreateRuntime, nvidiaHook)
}
Expand Down
2 changes: 1 addition & 1 deletion internal/guest/runtime/hcsv2/uvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
if !ok || sid == "" {
return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid)
}
if err := setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification); err != nil {
if err := setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath); err != nil {
return nil, err
}

Expand Down
4 changes: 2 additions & 2 deletions internal/guest/runtime/hcsv2/workload_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func specHasGPUDevice(spec *oci.Spec) bool {
return false
}

func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.Spec) (err error) {
func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.Spec, ociBundlePath string) (err error) {
ctx, span := oc.StartSpan(ctx, "hcsv2::setupWorkloadContainerSpec")
defer span.End()
defer func() { oc.SetSpanStatus(span, err) }()
Expand Down Expand Up @@ -150,7 +150,7 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
if spec.Windows != nil {
// we only support Nvidia gpus right now
if specHasGPUDevice(spec) {
if err := addNvidiaDeviceHook(ctx, spec); err != nil {
if err := addNvidiaDeviceHook(ctx, spec, ociBundlePath); err != nil {
return err
}
}
Expand Down

0 comments on commit a1319d5

Please sign in to comment.