Skip to content

Commit

Permalink
Reduce memory arena contention
Browse files Browse the repository at this point in the history
Previously there was one memory arena for all threads, making it
the bottleneck for multi-threaded performance. As the number of
threads increased, the contention for the lock on the arena would
grow, causing other threads to wait to acquire it.

This commit makes it use 8 memory arenas, and round-robbins how
they are assigned to threads. Threads keep track of the index that
they should use into the arena array, assigned the first time the
arena is accessed on a given thread.

When an image is first created, it is allocated from an arena.
When the logic to have multiple arenas is enabled, it then keeps
track of the index on the image, so that when deleted it can be
returned to the correct arena.

Effectively this means that in single-threaded programs, this
should not really have an effect. We also do not do this logic if
the GIL is enabled, as it effectively acts as the lock on the
default arena for us.

As expected, this approach has no real noticable effect on regular
CPython. On free-threaded CPython, however, there is a massive
difference (measuring up to about 70%).
  • Loading branch information
Kevin Newton committed Jan 30, 2025
1 parent 51df142 commit b2a97bb
Show file tree
Hide file tree
Showing 4 changed files with 284 additions and 68 deletions.
64 changes: 57 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,21 @@
# ------------------------------
from __future__ import annotations

import distutils.ccompiler
import os
import re
import shutil
import struct
import subprocess
import sys
import tempfile
import warnings
from collections.abc import Iterator
from typing import Any

from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools.errors import CompileError


def get_version() -> str:
Expand Down Expand Up @@ -292,6 +295,47 @@ def _pkg_config(name: str) -> tuple[list[str], list[str]] | None:
return None


def _try_compile(compiler: distutils.ccompiler.CCompiler, code: str) -> bool:
try:
with tempfile.TemporaryDirectory() as d:
fn = os.path.join(d, "test.c")
with open(fn, "w") as f:
f.write(code)
compiler.compile([fn], output_dir=d, extra_preargs=["-Werror"])
return True
except CompileError:
return False


def _try_compile_attr(compiler: distutils.ccompiler.CCompiler, attr: str) -> bool:
code = f"""
#pragma GCC diagnostic error "-Wattributes"
#pragma clang diagnostic error "-Wattributes"
int {attr} foo;
int main() {{
return 0;
}}
"""

return _try_compile(compiler, code)


def _try_compile_tls_define_macros(
compiler: distutils.ccompiler.CCompiler,
) -> list[tuple[str, str | None]]:
if _try_compile_attr(compiler, "thread_local"): # C23
return [("HAVE_THREAD_LOCAL", None)]
elif _try_compile_attr(compiler, "_Thread_local"): # C11/C17
return [("HAVE__THREAD_LOCAL", None)]
elif _try_compile_attr(compiler, "__thread"): # GCC/clang
return [("HAVE___THREAD", None)]
elif _try_compile_attr(compiler, "__declspec(thread)"): # MSVC
return [("HAVE___DECLSPEC_THREAD_", None)]
else:
return []


class pil_build_ext(build_ext):
class ext_feature:
features = [
Expand Down Expand Up @@ -426,13 +470,14 @@ def finalize_options(self) -> None:
def _update_extension(
self,
name: str,
libraries: list[str] | list[str | bool | None],
libraries: list[str] | list[str | bool | None] | None = None,
define_macros: list[tuple[str, str | None]] | None = None,
sources: list[str] | None = None,
) -> None:
for extension in self.extensions:
if extension.name == name:
extension.libraries += libraries
if libraries is not None:
extension.libraries += libraries
if define_macros is not None:
extension.define_macros += define_macros
if sources is not None:
Expand Down Expand Up @@ -890,7 +935,10 @@ def build_extensions(self) -> None:

defs.append(("PILLOW_VERSION", f'"{PILLOW_VERSION}"'))

self._update_extension("PIL._imaging", libs, defs)
tls_define_macros = _try_compile_tls_define_macros(self.compiler)
self._update_extension("PIL._imaging", libs, defs + tls_define_macros)
self._update_extension("PIL._imagingmath", define_macros=tls_define_macros)
self._update_extension("PIL._imagingmorph", define_macros=tls_define_macros)

#
# additional libraries
Expand All @@ -913,7 +961,9 @@ def build_extensions(self) -> None:
libs.append(feature.get("fribidi"))
else: # building FriBiDi shim from src/thirdparty
srcs.append("src/thirdparty/fribidi-shim/fribidi.c")
self._update_extension("PIL._imagingft", libs, defs, srcs)
self._update_extension(
"PIL._imagingft", libs, defs + tls_define_macros, srcs
)

else:
self._remove_extension("PIL._imagingft")
Expand All @@ -922,19 +972,19 @@ def build_extensions(self) -> None:
libs = [feature.get("lcms")]
if sys.platform == "win32":
libs.extend(["user32", "gdi32"])
self._update_extension("PIL._imagingcms", libs)
self._update_extension("PIL._imagingcms", libs, tls_define_macros)
else:
self._remove_extension("PIL._imagingcms")

webp = feature.get("webp")
if isinstance(webp, str):
libs = [webp, webp + "mux", webp + "demux"]
self._update_extension("PIL._webp", libs)
self._update_extension("PIL._webp", libs, tls_define_macros)
else:
self._remove_extension("PIL._webp")

tk_libs = ["psapi"] if sys.platform in ("win32", "cygwin") else []
self._update_extension("PIL._imagingtk", tk_libs)
self._update_extension("PIL._imagingtk", tk_libs, tls_define_macros)

build_ext.build_extensions(self)

Expand Down
116 changes: 75 additions & 41 deletions src/_imaging.c
Original file line number Diff line number Diff line change
Expand Up @@ -3938,34 +3938,49 @@ _get_stats(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = &ImagingDefaultArena;

v = PyLong_FromLong(arena->stats_new_count);
long stats_new_count = 0;
long stats_allocated_blocks = 0;
long stats_reused_blocks = 0;
long stats_reallocated_blocks = 0;
long stats_freed_blocks = 0;
long blocks_cached = 0;

ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(arena) {
MUTEX_LOCK(&arena->mutex);
stats_new_count += arena->stats_new_count;
stats_allocated_blocks += arena->stats_allocated_blocks;
stats_reused_blocks += arena->stats_reused_blocks;
stats_reallocated_blocks += arena->stats_reallocated_blocks;
stats_freed_blocks += arena->stats_freed_blocks;
blocks_cached += arena->blocks_cached;
MUTEX_UNLOCK(&arena->mutex);
}

v = PyLong_FromLong(stats_new_count);
PyDict_SetItemString(d, "new_count", v ? v : Py_None);
Py_XDECREF(v);

v = PyLong_FromLong(arena->stats_allocated_blocks);
v = PyLong_FromLong(stats_allocated_blocks);
PyDict_SetItemString(d, "allocated_blocks", v ? v : Py_None);
Py_XDECREF(v);

v = PyLong_FromLong(arena->stats_reused_blocks);
v = PyLong_FromLong(stats_reused_blocks);
PyDict_SetItemString(d, "reused_blocks", v ? v : Py_None);
Py_XDECREF(v);

v = PyLong_FromLong(arena->stats_reallocated_blocks);
v = PyLong_FromLong(stats_reallocated_blocks);
PyDict_SetItemString(d, "reallocated_blocks", v ? v : Py_None);
Py_XDECREF(v);

v = PyLong_FromLong(arena->stats_freed_blocks);
v = PyLong_FromLong(stats_freed_blocks);
PyDict_SetItemString(d, "freed_blocks", v ? v : Py_None);
Py_XDECREF(v);

v = PyLong_FromLong(arena->blocks_cached);
v = PyLong_FromLong(blocks_cached);
PyDict_SetItemString(d, "blocks_cached", v ? v : Py_None);
Py_XDECREF(v);

MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
return d;
}

Expand All @@ -3975,14 +3990,16 @@ _reset_stats(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = &ImagingDefaultArena;
arena->stats_new_count = 0;
arena->stats_allocated_blocks = 0;
arena->stats_reused_blocks = 0;
arena->stats_reallocated_blocks = 0;
arena->stats_freed_blocks = 0;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(arena) {
MUTEX_LOCK(&arena->mutex);
arena->stats_new_count = 0;
arena->stats_allocated_blocks = 0;
arena->stats_reused_blocks = 0;
arena->stats_reallocated_blocks = 0;
arena->stats_freed_blocks = 0;
MUTEX_UNLOCK(&arena->mutex);
}

Py_INCREF(Py_None);
return Py_None;
Expand All @@ -3994,9 +4011,10 @@ _get_alignment(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
int alignment = ImagingDefaultArena.alignment;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
int alignment = arena->alignment;
MUTEX_UNLOCK(&arena->mutex);
return PyLong_FromLong(alignment);
}

Expand All @@ -4006,9 +4024,10 @@ _get_block_size(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
int block_size = ImagingDefaultArena.block_size;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
int block_size = arena->block_size;
MUTEX_UNLOCK(&arena->mutex);
return PyLong_FromLong(block_size);
}

Expand All @@ -4018,9 +4037,10 @@ _get_blocks_max(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
int blocks_max = ImagingDefaultArena.blocks_max;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena = ImagingGetArena();
MUTEX_LOCK(&arena->mutex);
int blocks_max = arena->blocks_max;
MUTEX_UNLOCK(&arena->mutex);
return PyLong_FromLong(blocks_max);
}

Expand All @@ -4041,9 +4061,12 @@ _set_alignment(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.alignment = alignment;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(arena) {
MUTEX_LOCK(&arena->mutex);
arena->alignment = alignment;
MUTEX_UNLOCK(&arena->mutex);
}

Py_INCREF(Py_None);
return Py_None;
Expand All @@ -4066,9 +4089,12 @@ _set_block_size(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingDefaultArena.block_size = block_size;
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(arena) {
MUTEX_LOCK(&arena->mutex);
arena->block_size = block_size;
MUTEX_UNLOCK(&arena->mutex);
}

Py_INCREF(Py_None);
return Py_None;
Expand All @@ -4087,15 +4113,20 @@ _set_blocks_max(PyObject *self, PyObject *args) {
}

if ((unsigned long)blocks_max >
SIZE_MAX / sizeof(ImagingDefaultArena.blocks_pool[0])) {
SIZE_MAX / sizeof(ImagingGetArena()->blocks_pool[0])) {
PyErr_SetString(PyExc_ValueError, "blocks_max is too large");
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
int status = ImagingMemorySetBlocksMax(&ImagingDefaultArena, blocks_max);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
if (!status) {
int error = 0;
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(arena) {
MUTEX_LOCK(&arena->mutex);
error |= ImagingMemorySetBlocksMax(arena, blocks_max);
MUTEX_UNLOCK(&arena->mutex);
}

if (error) {
return ImagingError_MemoryError();
}

Expand All @@ -4111,9 +4142,12 @@ _clear_cache(PyObject *self, PyObject *args) {
return NULL;
}

MUTEX_LOCK(&ImagingDefaultArena.mutex);
ImagingMemoryClearCache(&ImagingDefaultArena, i);
MUTEX_UNLOCK(&ImagingDefaultArena.mutex);
ImagingMemoryArena arena;
IMAGING_ARENAS_FOREACH(arena) {
MUTEX_LOCK(&arena->mutex);
ImagingMemoryClearCache(arena, i);
MUTEX_UNLOCK(&arena->mutex);
}

Py_INCREF(Py_None);
return Py_None;
Expand Down
Loading

0 comments on commit b2a97bb

Please sign in to comment.