Skip to content

Commit f82e6bf

Browse files
yosrym93 authored and akpm00 committed
mm: memcg: use rstat for non-hierarchical stats
Currently, memcg uses rstat to maintain aggregated hierarchical stats. Counters are maintained for hierarchical stats at each memcg. Rstat tracks which cgroups have updates on which cpus to keep those counters fresh on the read-side.

Non-hierarchical stats are currently not covered by rstat. Their per-cpu counters are summed up on every read, which is expensive. The original implementation did the same. At some point before rstat, non-hierarchical aggregated counters were introduced by commit a983b5e ("mm: memcontrol: fix excessive complexity in memory.stat reporting"). However, those counters were updated on the performance critical write-side, which caused regressions, so they were later removed by commit 815744d ("mm: memcontrol: don't batch updates of local VM stats and events"). See [1] for more detailed history.

Kernel versions in between a983b5e & 815744d (a year and a half) enjoyed cheap reads of non-hierarchical stats, specifically on cgroup v1. When moving to more recent kernels, a performance regression for reading non-hierarchical stats is observed.

Now that we have rstat, we know exactly which percpu counters have updates for each stat. We can maintain non-hierarchical counters again, making reads much more efficient, without affecting the performance critical write-side. Hence, add non-hierarchical (i.e. local) counters for the stats, and extend rstat flushing to keep those up-to-date.

A caveat is that we now need a stats flush before reading local/non-hierarchical stats through {memcg/lruvec}_page_state_local() or memcg_events_local(), where we previously only needed a flush to read hierarchical stats. Most contexts reading non-hierarchical stats are already doing a flush; add a flush to the only missing context, in count_shadow_nodes().
With this patch, reading memory.stat from 1000 memcgs is 3x faster on a machine with 256 cpus on cgroup v1:

    # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
    # time cat /sys/fs/cgroup/memory/cg*/memory.stat > /dev/null
    real 0m0.125s
    user 0m0.005s
    sys  0m0.120s

After:

    real 0m0.032s
    user 0m0.005s
    sys  0m0.027s

To make sure there are no regressions on cgroup v2, I ran an artificial reclaim/refault stress test [2] that creates (NR_CPUS * 2) cgroups, assigns them limits, runs a worker process in each cgroup that allocates tmpfs memory equal to quadruple the limit (to invoke reclaim continuously), and then reads back the entire file (to invoke refaults). All workers are run in parallel, and zram is used as a swapping backend. Both reclaim and refault have conditional stats flushing. I ran this on a machine with 112 cpus, once on mm-unstable, and once on mm-unstable with this patch reverted.

(1) A few runs without this patch:

    # time ./stress_reclaim_refault.sh
    real 0m9.949s
    user 0m0.496s
    sys  14m44.974s

    # time ./stress_reclaim_refault.sh
    real 0m10.049s
    user 0m0.486s
    sys  14m55.791s

    # time ./stress_reclaim_refault.sh
    real 0m9.984s
    user 0m0.481s
    sys  14m53.841s

(2) A few runs with this patch:

    # time ./stress_reclaim_refault.sh
    real 0m9.885s
    user 0m0.486s
    sys  14m48.753s

    # time ./stress_reclaim_refault.sh
    real 0m9.903s
    user 0m0.495s
    sys  14m48.339s

    # time ./stress_reclaim_refault.sh
    real 0m9.861s
    user 0m0.507s
    sys  14m49.317s

No regressions are observed with this patch. There is actually a very slight improvement. If I have to guess, maybe it's because we avoid the percpu loop in count_shadow_nodes() when calling lruvec_page_state_local(), but I could not prove this using perf; it's probably in the noise.
[1] https://lore.kernel.org/lkml/[email protected]/
[2] https://lore.kernel.org/lkml/CAJD7tkb17x=qwoO37uxyYXLEUVp15BQKR+Xfh7Sg9Hx-wTQ_=w@mail.gmail.com/

Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Yosry Ahmed <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Acked-by: Roman Gushchin <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Cc: Muchun Song <[email protected]>
Cc: Shakeel Butt <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 29a22b9 commit f82e6bf

File tree

3 files changed

+43
-32
lines changed

3 files changed

+43
-32
lines changed

include/linux/memcontrol.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ struct lruvec_stats {
111111
/* Aggregated (CPU and subtree) state */
112112
long state[NR_VM_NODE_STAT_ITEMS];
113113

114+
/* Non-hierarchical (CPU aggregated) state */
115+
long state_local[NR_VM_NODE_STAT_ITEMS];
116+
114117
/* Pending child counts during tree propagation */
115118
long state_pending[NR_VM_NODE_STAT_ITEMS];
116119
};
@@ -1018,14 +1021,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
10181021
{
10191022
struct mem_cgroup_per_node *pn;
10201023
long x = 0;
1021-
int cpu;
10221024

10231025
if (mem_cgroup_disabled())
10241026
return node_page_state(lruvec_pgdat(lruvec), idx);
10251027

10261028
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1027-
for_each_possible_cpu(cpu)
1028-
x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
1029+
x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
10291030
#ifdef CONFIG_SMP
10301031
if (x < 0)
10311032
x = 0;

mm/memcontrol.c

+38-29
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,10 @@ struct memcg_vmstats {
742742
long state[MEMCG_NR_STAT];
743743
unsigned long events[NR_MEMCG_EVENTS];
744744

745+
/* Non-hierarchical (CPU aggregated) page state & events */
746+
long state_local[MEMCG_NR_STAT];
747+
unsigned long events_local[NR_MEMCG_EVENTS];
748+
745749
/* Pending child counts during tree propagation */
746750
long state_pending[MEMCG_NR_STAT];
747751
unsigned long events_pending[NR_MEMCG_EVENTS];
@@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
775779
/* idx can be of type enum memcg_stat_item or node_stat_item. */
776780
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
777781
{
778-
long x = 0;
779-
int cpu;
782+
long x = READ_ONCE(memcg->vmstats->state_local[idx]);
780783

781-
for_each_possible_cpu(cpu)
782-
x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
783784
#ifdef CONFIG_SMP
784785
if (x < 0)
785786
x = 0;
@@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
926927

927928
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
928929
{
929-
long x = 0;
930-
int cpu;
931930
int index = memcg_events_index(event);
932931

933932
if (index < 0)
934933
return 0;
935934

936-
for_each_possible_cpu(cpu)
937-
x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
938-
return x;
935+
return READ_ONCE(memcg->vmstats->events_local[index]);
939936
}
940937

941938
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -5516,7 +5513,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
55165513
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
55175514
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
55185515
struct memcg_vmstats_percpu *statc;
5519-
long delta, v;
5516+
long delta, delta_cpu, v;
55205517
int i, nid;
55215518

55225519
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5532,38 +5529,46 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
55325529
memcg->vmstats->state_pending[i] = 0;
55335530

55345531
/* Add CPU changes on this level since the last flush */
5532+
delta_cpu = 0;
55355533
v = READ_ONCE(statc->state[i]);
55365534
if (v != statc->state_prev[i]) {
5537-
delta += v - statc->state_prev[i];
5535+
delta_cpu = v - statc->state_prev[i];
5536+
delta += delta_cpu;
55385537
statc->state_prev[i] = v;
55395538
}
55405539

5541-
if (!delta)
5542-
continue;
5543-
55445540
/* Aggregate counts on this level and propagate upwards */
5545-
memcg->vmstats->state[i] += delta;
5546-
if (parent)
5547-
parent->vmstats->state_pending[i] += delta;
5541+
if (delta_cpu)
5542+
memcg->vmstats->state_local[i] += delta_cpu;
5543+
5544+
if (delta) {
5545+
memcg->vmstats->state[i] += delta;
5546+
if (parent)
5547+
parent->vmstats->state_pending[i] += delta;
5548+
}
55485549
}
55495550

55505551
for (i = 0; i < NR_MEMCG_EVENTS; i++) {
55515552
delta = memcg->vmstats->events_pending[i];
55525553
if (delta)
55535554
memcg->vmstats->events_pending[i] = 0;
55545555

5556+
delta_cpu = 0;
55555557
v = READ_ONCE(statc->events[i]);
55565558
if (v != statc->events_prev[i]) {
5557-
delta += v - statc->events_prev[i];
5559+
delta_cpu = v - statc->events_prev[i];
5560+
delta += delta_cpu;
55585561
statc->events_prev[i] = v;
55595562
}
55605563

5561-
if (!delta)
5562-
continue;
5564+
if (delta_cpu)
5565+
memcg->vmstats->events_local[i] += delta_cpu;
55635566

5564-
memcg->vmstats->events[i] += delta;
5565-
if (parent)
5566-
parent->vmstats->events_pending[i] += delta;
5567+
if (delta) {
5568+
memcg->vmstats->events[i] += delta;
5569+
if (parent)
5570+
parent->vmstats->events_pending[i] += delta;
5571+
}
55675572
}
55685573

55695574
for_each_node_state(nid, N_MEMORY) {
@@ -5581,18 +5586,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
55815586
if (delta)
55825587
pn->lruvec_stats.state_pending[i] = 0;
55835588

5589+
delta_cpu = 0;
55845590
v = READ_ONCE(lstatc->state[i]);
55855591
if (v != lstatc->state_prev[i]) {
5586-
delta += v - lstatc->state_prev[i];
5592+
delta_cpu = v - lstatc->state_prev[i];
5593+
delta += delta_cpu;
55875594
lstatc->state_prev[i] = v;
55885595
}
55895596

5590-
if (!delta)
5591-
continue;
5597+
if (delta_cpu)
5598+
pn->lruvec_stats.state_local[i] += delta_cpu;
55925599

5593-
pn->lruvec_stats.state[i] += delta;
5594-
if (ppn)
5595-
ppn->lruvec_stats.state_pending[i] += delta;
5600+
if (delta) {
5601+
pn->lruvec_stats.state[i] += delta;
5602+
if (ppn)
5603+
ppn->lruvec_stats.state_pending[i] += delta;
5604+
}
55965605
}
55975606
}
55985607
}

mm/workingset.c

+1
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
664664
struct lruvec *lruvec;
665665
int i;
666666

667+
mem_cgroup_flush_stats();
667668
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
668669
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
669670
pages += lruvec_page_state_local(lruvec,

0 commit comments

Comments
 (0)