diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 117ada881d..61ff9c7ac5 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -125,6 +125,11 @@ func (c *Container) Delete(ctx context.Context) error { if err := storage.UnmountAllInPath(ctx, getSandboxMountsDir(c.id), true); err != nil { log.G(ctx).WithError(err).Error("failed to unmount sandbox mounts") } + + // remove hugepages mounts in sandbox container + if err := storage.UnmountAllInPath(ctx, getSandboxHugePageMountsDir(c.id), true); err != nil { + log.G(ctx).WithError(err).Error("failed to unmount hugepages mounts") + } } return c.container.Delete() } diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index bba803f940..a5d80aeeb8 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -20,6 +20,10 @@ func getSandboxRootDir(id string) string { return filepath.Join("/run/gcs/c", id) } +func getSandboxHugePageMountsDir(id string) string { + return filepath.Join(getSandboxRootDir(id), "hugepages") +} + func getSandboxMountsDir(id string) string { return filepath.Join(getSandboxRootDir(id), "sandboxMounts") } diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 06b49155e7..e494118cc1 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -142,6 +142,15 @@ func setupSandboxMountsPath(id string) (err error) { return storage.MountRShared(mountPath) } +func setupSandboxHugePageMountsPath(id string) error { + mountPath := getSandboxHugePageMountsDir(id) + if err := os.MkdirAll(mountPath, 0755); err != nil { + return errors.Wrapf(err, "failed to create hugepage Mounts dir in sandbox %v", id) + } + + return storage.MountRShared(mountPath) +} + func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { h.containersMutex.Lock() defer h.containersMutex.Unlock() @@ -171,9 +180,14 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM _ = os.RemoveAll(getSandboxRootDir(id)) } }() + if err = setupSandboxMountsPath(id); err != nil { return nil, err } + + if err = setupSandboxHugePageMountsPath(id); err != nil { + return nil, err + } case "container": sid, ok := settings.OCISpecification.Annotations["io.kubernetes.cri.sandbox-id"] if !ok || sid == "" { diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index d8179807d3..1e83f6a7f3 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -14,6 +14,7 @@ import ( oci "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "go.opencensus.io/trace" + "golang.org/x/sys/unix" ) func getWorkloadRootDir(id string) string { @@ -28,6 +29,8 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { subPath := strings.TrimPrefix(m.Source, sandboxMountPrefix) sandboxSource := filepath.Join(mountsDir, subPath) + // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. + // Hence, we need to ensure that the resolved path is still under the correct directory if !strings.HasPrefix(sandboxSource, mountsDir) { return errors.Errorf("mount path %v for mount %v is not within sandbox's mounts dir", sandboxSource, m.Source) } @@ -45,6 +48,38 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { return nil } +func updateHugePageMounts(sbid string, spec *oci.Spec) error { + mountPrefix := "hugepages://" + for i, m := range spec.Mounts { + if strings.HasPrefix(m.Source, mountPrefix) { + mountsDir := getSandboxHugePageMountsDir(sbid) + subPath := strings.TrimPrefix(m.Source, mountPrefix) + pageSize := strings.Split(subPath, string(os.PathSeparator))[0] + hugePageMountSource := filepath.Join(mountsDir, subPath) + + // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. + // Hence, we need to ensure that the resolved path is still under the correct directory + if !strings.HasPrefix(hugePageMountSource, mountsDir) { + return errors.Errorf("mount path %v for mount %v is not within hugepages's mounts dir", hugePageMountSource, m.Source) + } + + spec.Mounts[i].Source = hugePageMountSource + + _, err := os.Stat(hugePageMountSource) + if os.IsNotExist(err) { + if err := os.MkdirAll(hugePageMountSource, 0755); err != nil { + return err + } + + if err := unix.Mount("none", hugePageMountSource, "hugetlbfs", 0, "pagesize="+pageSize); err != nil { + return errors.Errorf("mount operation failed for %v failed with error %v", hugePageMountSource, err) + } + } + } + } + return nil +} + func specHasGPUDevice(spec *oci.Spec) bool { for _, d := range spec.Windows.Devices { if d.IDType == "gpu" { @@ -72,6 +107,10 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci. return errors.Wrapf(err, "failed to update sandbox mounts for container %v in sandbox %v", id, sbid) } + if err = updateHugePageMounts(sbid, spec); err != nil { + return errors.Wrapf(err, "failed to update hugepages mounts for container %v in sandbox %v", id, sbid) + } + // Add /etc/hostname if the spec did not override it. if !isInMounts("/etc/hostname", spec.Mounts) { mt := oci.Mount{ diff --git a/internal/hcsoci/resources_lcow.go b/internal/hcsoci/resources_lcow.go index a6f6e78949..c01a1c18f6 100644 --- a/internal/hcsoci/resources_lcow.go +++ b/internal/hcsoci/resources_lcow.go @@ -120,6 +120,19 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * // Mounts that map to a path in UVM are specified with 'sandbox://' prefix. // example: sandbox:///a/dirInUvm destination:/b/dirInContainer uvmPathForFile = mount.Source + } else if strings.HasPrefix(mount.Source, "hugepages://") { + // currently we only support 2M hugepage size + hugePageSubDirs := strings.Split(strings.TrimPrefix(mount.Source, "hugepages://"), "/") + if len(hugePageSubDirs) < 2 { + return errors.Errorf(`%s mount path is invalid, expected format: hugepages:///`, mount.Source) + } + + // hugepages:// should be followed by pagesize + if hugePageSubDirs[0] != "2M" { + return errors.Errorf(`only 2M (megabytes) pagesize is supported, got %s`, hugePageSubDirs[0]) + } + // Hugepages inside a container are backed by a mount created inside a UVM. + uvmPathForFile = mount.Source } else { st, err := os.Stat(hostPath) if err != nil { diff --git a/test/cri-containerd/container_test.go b/test/cri-containerd/container_test.go index c92c920c25..84eb8368d8 100644 --- a/test/cri-containerd/container_test.go +++ b/test/cri-containerd/container_test.go @@ -826,3 +826,70 @@ func Test_CreateContainer_DevShmSize(t *testing.T) { t.Fatalf("expected the size of /dev/shm to be 64MB. Got output instead: %s", string(execResponse1.Stdout)) } } + +func Test_CreateContainer_HugePageMount_LCOW(t *testing.T) { + requireFeatures(t, featureLCOW) + + client := newTestRuntimeClient(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + pullRequiredLcowImages(t, []string{imageLcowK8sPause, imageLcowAlpine}) + + annotations := map[string]string{ + oci.AnnotationFullyPhysicallyBacked: "true", + oci.AnnotationMemorySizeInMB: "2048", + oci.AnnotationKernelBootOptions: "hugepagesz=2M hugepages=10", + } + sandboxRequest := getRunPodSandboxRequest(t, lcowRuntimeHandler, annotations) + + podID := runPodSandbox(t, client, ctx, sandboxRequest) + defer removePodSandbox(t, client, ctx, podID) + defer stopPodSandbox(t, client, ctx, podID) + + request := &runtime.CreateContainerRequest{ + Config: &runtime.ContainerConfig{ + Metadata: &runtime.ContainerMetadata{ + Name: t.Name() + "-Container", + }, + Image: &runtime.ImageSpec{ + Image: imageLcowAlpine, + }, + // Hold this command open until killed + Command: []string{ + "top", + }, + Mounts: []*runtime.Mount{ + { + HostPath: "hugepages://2M/hugepage2M", + ContainerPath: "/mnt/hugepage2M", + Readonly: false, + Propagation: runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL, + }, + }, + }, + } + + request.PodSandboxId = podID + request.SandboxConfig = sandboxRequest.Config + + containerId := createContainer(t, client, ctx, request) + defer removeContainer(t, client, ctx, containerId) + startContainer(t, client, ctx, containerId) + defer stopContainer(t, client, ctx, containerId) + + execCommand := []string{"grep", "-i", "/mnt/hugepage2M", "/proc/mounts"} + + output, errorMsg, exitCode := execContainer(t, client, ctx, containerId, execCommand) + if exitCode != 0 || len(errorMsg) > 0 { + t.Fatalf("Failed to exec in hugepage container errorMsg: %s, exitcode: %v\n", errorMsg, exitCode) + } + + if !strings.Contains(output, "hugetlbfs") { + t.Fatalf("Output is supposed to contain hugetlbfs, output: %s", output) + } + + if !strings.Contains(output, "pagesize=2M") { + t.Fatalf("Output is supposed to contain pagesize=2M, output: %s", output) + } +} diff --git a/test/vendor/github.com/Microsoft/hcsshim/internal/hcsoci/resources_lcow.go b/test/vendor/github.com/Microsoft/hcsshim/internal/hcsoci/resources_lcow.go index a6f6e78949..00253e8928 100644 --- a/test/vendor/github.com/Microsoft/hcsshim/internal/hcsoci/resources_lcow.go +++ b/test/vendor/github.com/Microsoft/hcsshim/internal/hcsoci/resources_lcow.go @@ -116,9 +116,10 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r * uvmPathForFile = scsiMount.UVMPath r.Add(scsiMount) coi.Spec.Mounts[i].Type = "none" - } else if strings.HasPrefix(mount.Source, "sandbox://") { + } else if strings.HasPrefix(mount.Source, "sandbox://") || strings.HasPrefix(mount.Source, "hugepages://") { // Mounts that map to a path in UVM are specified with 'sandbox://' prefix. // example: sandbox:///a/dirInUvm destination:/b/dirInContainer + // Hugepages inside a container are backed by a mount created inside a UVM. uvmPathForFile = mount.Source } else { st, err := os.Stat(hostPath)