Skip to content

Commit

Permalink
util, server: fix memory detection with non-root cgroups
Browse files Browse the repository at this point in the history
The cgroup information is constructed by looking at `/proc/self/cgroup`,
matching with the mount point in `/proc/self/mountinfo`
and then inpsecting the `memory.stat` file. The matching between the
cgroup and the mount point was working only in case that the mount and
the cgroup are exact match relative to cgroup NS. This is however
inadequate as while true for the docker case, it isn't true in the general
case. In this example determining the cgroup memory limit isn't possible:
```
cgcreate -t $USER:$USER -a $USER:$USER -g memory:crdb_test
echo 3999997952 > /sys/fs/cgroup/memory/crdb_test/memory.limit_in_bytes
cgexec -g memory:crdb_test ./cockroach start-single-nod
```
To address this, this patch instead constructs the expected
relative path of the cgroup's memory info by starting with the
mount path and then adding the cgroup.

Release note: None
  • Loading branch information
darinpp committed Nov 19, 2020
1 parent 897b99f commit 4a2177f
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 2 deletions.
25 changes: 23 additions & 2 deletions pkg/util/cgroups/cgroups.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,29 @@ func getCgroupDetails(mountinfoPath string, cRoot string, controller string) (st
}

ver, ok := detectCgroupVersion(fields, controller)
if ok && (ver == 1 && string(fields[3]) == cRoot) || ver == 2 {
return string(fields[4]), ver, nil
if ok {
mountPoint := string(fields[4])
if ver == 2 {
return mountPoint, ver, nil
}
// It is possible that the controller mount and the cgroup path are not the same (both are relative to the NS root).
// So start with the mount and construct the relative path of the cgroup.
// To test:
// cgcreate -t $USER:$USER -a $USER:$USER -g memory:crdb_test
// echo 3999997952 > /sys/fs/cgroup/memory/crdb_test/memory.limit_in_bytes
// cgexec -g memory:crdb_test ./cockroach start-single-node
// cockroach.log -> server/config.go:433 ⋮ system total memory: ‹3.7 GiB›
// without constructing the relative path
// cockroach.log -> server/config.go:433 ⋮ system total memory: ‹63 GiB›
nsRelativePath := string(fields[3])
if !strings.Contains(nsRelativePath, "..") {
// We don't expect to see err here ever but in case that it happens
// the best action is to ignore the line and hope that the rest of the lines
// will allow us to extract a valid path.
if relPath, err := filepath.Rel(nsRelativePath, cRoot); err == nil {
return filepath.Join(mountPoint, relPath), ver, nil
}
}
}
}

Expand Down
92 changes: 92 additions & 0 deletions pkg/util/cgroups/cgroups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,15 @@ func TestCgroupsGetMemory(t *testing.T) {
},
limit: 2936016896,
},
{
name: "fetches the limit for cgroup v1 when the NS relative paths of mount and cgroup don't match",
paths: map[string]string{
"/proc/self/cgroup": v1CgroupWithMemoryControllerNS,
"/proc/self/mountinfo": v1MountsWithMemControllerNS,
"/sys/fs/cgroup/memory/cgroup_test/memory.stat": v1MemoryStat,
},
limit: 2936016896,
},
{
name: "fails when the stat file is missing for cgroup v2",
paths: map[string]string{
Expand Down Expand Up @@ -167,6 +176,59 @@ func TestCgroupsGetCPU(t *testing.T) {
system: uint64(123),
user: uint64(456),
},
{
name: "fetches the cpu quota and usage for cgroup v1 where the mount and cgroup path don't match",
paths: map[string]string{
"/proc/self/cgroup": v1CgroupWithCPUControllerNS,
"/proc/self/mountinfo": v1MountsWithCPUControllerNS,
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpu.cfs_quota_us": "12345",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpu.cfs_period_us": "67890",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpuacct.usage_sys": "123",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpuacct.usage_user": "456",
},
quota: int64(12345),
period: int64(67890),
system: uint64(123),
user: uint64(456),
},
{
name: "fetches the cpu quota and usage for cgroup v1 where the mount is relative",
paths: map[string]string{
"/proc/self/cgroup": v1CgroupWithCPUControllerNSMountRel,
"/proc/self/mountinfo": v1MountsWithCPUControllerNSMountRel,
},
errMsg: "failed to detect cgroup root mount and version",
},
{
name: "fetches the cpu quota and usage for cgroup v1 where the mount and cgroup match but there is extra mount",
paths: map[string]string{
"/proc/self/cgroup": v1CgroupWithCPUControllerNSMountRelRemount,
"/proc/self/mountinfo": v1MountsWithCPUControllerNSMountRelRemount,
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpu.cfs_quota_us": "12345",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpu.cfs_period_us": "67890",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpuacct.usage_sys": "123",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpuacct.usage_user": "456",
},
quota: int64(12345),
period: int64(67890),
system: uint64(123),
user: uint64(456),
},
{
name: "fetches the cpu quota and usage for cgroup v1 where the mount and cgroup match",
paths: map[string]string{
"/proc/self/cgroup": v1CgroupWithCPUControllerNS2,
"/proc/self/mountinfo": v1MountsWithCPUControllerNS2,
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpu.cfs_quota_us": "12345",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpu.cfs_period_us": "67890",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpuacct.usage_sys": "123",
"/sys/fs/cgroup/cpu,cpuacct/crdb_test/cpuacct.usage_user": "456",
},
quota: int64(12345),
period: int64(67890),
system: uint64(123),
user: uint64(456),
},
{
name: "fetches the cpu quota for cgroup v1 even if usage nonexistent",
paths: map[string]string{
Expand Down Expand Up @@ -534,4 +596,34 @@ total_inactive_file 1363746816
total_active_file 308867072
total_unevictable 0
`

// Both /proc/<pid>/mountinfo and /proc/<pid>/cgroup will show the mount and the cgroup relative to the cgroup NS root
// This tests the case where the memory controller mount and the cgroup are not exactly the same (as is with k8s pods).
v1CgroupWithMemoryControllerNS = "12:memory:/cgroup_test"
v1MountsWithMemControllerNS = "50 35 0:44 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:25 - cgroup cgroup rw,memory"

// Example where the paths in /proc/self/mountinfo and /proc/self/cgroup are not the same for the cpu controller
//
// sudo cgcreate -t $USER:$USER -a $USER:$USER -g cpu:crdb_test
// echo 100000 > /sys/fs/cgroup/cpu/crdb_test/cpu.cfs_period_us
// echo 33300 > /sys/fs/cgroup/cpu/crdb_test/cpu.cfs_quota_us
// cgexec -g cpu:crdb_test ./cockroach ...
v1CgroupWithCPUControllerNS = "5:cpu,cpuacct:/crdb_test"
v1MountsWithCPUControllerNS = "43 35 0:37 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,cpu,cpuacct"

// Same as above but with unshare -C
// Can't determine the location of the mount
v1CgroupWithCPUControllerNSMountRel = "5:cpu,cpuacct:/"
v1MountsWithCPUControllerNSMountRel = "43 35 0:37 /.. /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,cpu,cpuacct"

// Same as above but with mounting the cgroup fs one more time in the NS
// sudo mount -t cgroup -o cpu,cpuacct none /sys/fs/cgroup/cpu,cpuacct/crdb_test
v1CgroupWithCPUControllerNSMountRelRemount = "5:cpu,cpuacct:/"
v1MountsWithCPUControllerNSMountRelRemount = `
43 35 0:37 /.. /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,cpu,cpuacct
161 43 0:37 / /sys/fs/cgroup/cpu,cpuacct/crdb_test rw,relatime shared:95 - cgroup none rw,cpu,cpuacct
`
// Same as above but exiting the NS w/o unmounting
v1CgroupWithCPUControllerNS2 = "5:cpu,cpuacct:/crdb_test"
v1MountsWithCPUControllerNS2 = "161 43 0:37 /crdb_test /sys/fs/cgroup/cpu,cpuacct/crdb_test rw,relatime shared:95 - cgroup none rw,cpu,cpuacct"
)

0 comments on commit 4a2177f

Please sign in to comment.