From 410840a806ea462c94d7c3baaad1a963138ce6ed Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 24 Aug 2023 12:57:04 -0700 Subject: [PATCH 01/28] gh-108724: Add PyMutex and _PyParkingLot APIs `PyMutex` is a one byte lock with fast, inlineable lock and unlock functions for the common uncontended case. The design is based on WebKit's `WTF::Lock`. PyMutex is built using the `_PyParkingLot` APIs, which provides a cross-platform futex-like API (based on WebKit's `WTF::ParkingLot`). This internal API will be used for building other synchronization primitives used to implement PEP 703, such as one-time initialization and events. --- Include/Python.h | 1 + Include/cpython/pyatomic.h | 3 +- Include/internal/pycore_llist.h | 107 ++++ Include/internal/pycore_lock.h | 161 ++++++ Include/internal/pycore_parking_lot.h | 129 +++++ Lib/test/test_capi/test_lock.py | 15 + Makefile.pre.in | 5 + Modules/Setup.stdlib.in | 2 +- Modules/_testcapi/parts.h | 1 + Modules/_testinternalcapi.c | 3 + .../_testinternalcapi/clinic/test_lock.c.h | 74 +++ Modules/_testinternalcapi/parts.h | 1 + Modules/_testinternalcapi/test_lock.c | 353 +++++++++++++ PCbuild/_testinternalcapi.vcxproj | 1 + PCbuild/_testinternalcapi.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 5 + PCbuild/pythoncore.vcxproj.filters | 12 + Python/lock.c | 292 +++++++++++ Python/parking_lot.c | 466 ++++++++++++++++++ Python/pystate.c | 5 + Tools/lockbench/lockbench.py | 53 ++ 21 files changed, 1690 insertions(+), 2 deletions(-) create mode 100644 Include/internal/pycore_llist.h create mode 100644 Include/internal/pycore_lock.h create mode 100644 Include/internal/pycore_parking_lot.h create mode 100644 Lib/test/test_capi/test_lock.py create mode 100644 Modules/_testinternalcapi/clinic/test_lock.c.h create mode 100644 Modules/_testinternalcapi/test_lock.c create mode 100644 Python/lock.c create mode 100644 Python/parking_lot.c create mode 100644 Tools/lockbench/lockbench.py diff --git a/Include/Python.h b/Include/Python.h index 
8b28200000ab56..d6429856b4b9f4 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -48,6 +48,7 @@ #include "pytypedefs.h" #include "pybuffer.h" #include "pystats.h" +#include "cpython/pyatomic.h" #include "object.h" #include "objimpl.h" #include "typeslots.h" diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h index 73712db847087d..066a969e0e486a 100644 --- a/Include/cpython/pyatomic.h +++ b/Include/cpython/pyatomic.h @@ -83,6 +83,7 @@ // # release // ... +#ifndef Py_LIMITED_API #ifndef Py_ATOMIC_H #define Py_ATOMIC_H @@ -503,4 +504,4 @@ static inline void _Py_atomic_fence_release(void); #endif #endif /* Py_ATOMIC_H */ - +#endif /* Py_LIMITED_API */ diff --git a/Include/internal/pycore_llist.h b/Include/internal/pycore_llist.h new file mode 100644 index 00000000000000..5fd261da05fa5d --- /dev/null +++ b/Include/internal/pycore_llist.h @@ -0,0 +1,107 @@ +// A doubly-linked list that can be embedded in a struct. +// +// Usage: +// struct llist_node head = LLIST_INIT(head); +// typedef struct { +// ... +// struct llist_node node; +// ... +// } MyObj; +// +// llist_insert_tail(&head, &obj->node); +// llist_remove(&obj->node); +// +// struct llist_node *node; +// llist_for_each(node, &head) { +// MyObj *obj = llist_data(node, MyObj, node); +// ... +// } +// + +#ifndef Py_INTERNAL_LLIST_H +#define Py_INTERNAL_LLIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "Py_BUILD_CORE must be defined to include this header" +#endif + +struct llist_node { + struct llist_node *next; + struct llist_node *prev; +}; + +// Get the struct containing a node. +#define llist_data(node, type, member) \ + (type*)((char*)node - offsetof(type, member)) + +// Iterate over a list. +#define llist_for_each(node, head) \ + for (node = (head)->next; node != (head); node = node->next) + +// Iterate over a list, but allow removal of the current node. 
+#define llist_for_each_safe(node, head) \ + for (struct llist_node *_next = (node = (head)->next, node->next); \ + node != (head); node = _next, _next = node->next) + +#define LLIST_INIT(head) { &head, &head } + +static inline void +llist_init(struct llist_node *head) +{ + head->next = head; + head->prev = head; +} + +// Returns 1 if the list is empty, 0 otherwise. +static inline int +llist_empty(struct llist_node *head) +{ + return head->next == head; +} + +// Appends to the tail of the list. +static inline void +llist_insert_tail(struct llist_node *head, struct llist_node *node) +{ + node->prev = head->prev; + node->next = head; + head->prev->next = node; + head->prev = node; +} + +// Remove a node from the list. +static inline void +llist_remove(struct llist_node *node) +{ + struct llist_node *prev = node->prev; + struct llist_node *next = node->next; + prev->next = next; + next->prev = prev; + node->prev = NULL; + node->next = NULL; +} + +// Append all nodes from head2 onto head1. head2 is left empty. +static inline void +llist_concat(struct llist_node *head1, struct llist_node *head2) +{ + if (!llist_empty(head2)) { + head1->prev->next = head2->next; + head2->next->prev = head1->prev; + + head1->prev = head2->prev; + head2->prev->next = head1; + llist_init(head2); + } +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_LLIST_H */ diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h new file mode 100644 index 00000000000000..36c76358f5bcae --- /dev/null +++ b/Include/internal/pycore_lock.h @@ -0,0 +1,161 @@ +// Lightweight locks and other synchronization mechanisms. +// +// These implementations are based on WebKit's WTF::Lock. See +// https://webkit.org/blog/6161/locking-in-webkit/ for a description of the +// design. 
+#ifndef Py_INTERNAL_LOCK_H +#define Py_INTERNAL_LOCK_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_time.h" // _PyTime_t + + +// A mutex that occupies one byte. The lock can be zero initialized. +// +// Only the two least significant bits are used. The remaining bits should be +// zero: +// 0b00: unlocked +// 0b01: locked +// 0b10: unlocked and has parked threads +// 0b11: locked and has parked threads +// +// Typical initialization: +// PyMutex m; +// memset(&m, 0, sizeof(m)); +// +// Typical usage: +// PyMutex_Lock(&m); +// ... +// PyMutex_Unlock(&m); +typedef struct _PyMutex { + uint8_t v; +} PyMutex; + +typedef enum { + _Py_UNLOCKED = 0, + _Py_LOCKED = 1, + _Py_HAS_PARKED = 2, +} _PyMutex_State; + +// (private) slow path for locking the mutex +PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m); + +// (private) slow path for unlocking the mutex +PyAPI_FUNC(void) _PyMutex_UnlockSlow(PyMutex *m); + +// Locks the mutex. +// +// If the mutex is currently locked, the calling thread will be parked until +// the mutex is unlocked. If the current thread holds the GIL, then the GIL +// will be released while the thread is parked. +static inline void +PyMutex_Lock(PyMutex *m) +{ + uint8_t expected = _Py_UNLOCKED; + if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_LOCKED)) { + _PyMutex_LockSlow(m); + } +} + +// Unlocks the mutex. +static inline void +PyMutex_Unlock(PyMutex *m) +{ + uint8_t expected = _Py_LOCKED; + if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_UNLOCKED)) { + _PyMutex_UnlockSlow(m); + } +} + +// Checks if the mutex is currently locked. +static inline int +PyMutex_IsLocked(PyMutex *m) +{ + return (_Py_atomic_load_uint8(&m->v) & _Py_LOCKED) != 0; +} + +typedef enum _PyLockFlags { + // Do not detach/release the GIL when waiting on the lock. + _Py_LOCK_DONT_DETACH = 0, + + // Detach/release the GIL when waiting on the lock. 
+ _PY_LOCK_DETACH = 1, + + // Handle signals if interrupted while waiting on the lock. + _PY_LOCK_MAKE_PENDING_CALLS = 2, +} _PyLockFlags; + +// Lock a mutex with an optional timeout and additional options. See +// _PyLockFlags for details. +extern PyLockStatus +_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags); + +// Unlock a mutex, returns 0 if the mutex is not locked (used for improved +// error messages). +extern int _PyMutex_TryUnlock(PyMutex *m); + + +// PyEvent is a one-time event notification +typedef struct { + uint8_t v; +} PyEvent; + +// Set the event and notify any waiting threads. +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(void) _PyEvent_Notify(PyEvent *evt); + +// Wait for the event to be set. If the event is already set, then this returns +// immediately. +PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt); + +// Wait for the event to be set, or until the timeout expires. If the event is +// already set, then this returns immediately. Returns 1 if the event was set, +// and 0 if the timeout expired or thread was interrupted. +PyAPI_FUNC(int) PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns); + + +// _PyRawMutex implements a word-sized mutex that that does not depend on the +// parking lot API, and therefore can be used in the parking lot +// implementation. +// +// The mutex uses a packed representation: the least significant bit is used to +// indicate whether the mutex is locked or not. The remaining bits are either +// zero or a pointer to a `struct raw_mutex_entry` (see lock.c). 
+typedef struct { + uintptr_t v; +} _PyRawMutex; + +// Slow paths for lock/unlock +extern void _PyRawMutex_LockSlow(_PyRawMutex *m); +extern void _PyRawMutex_UnlockSlow(_PyRawMutex *m); + +static inline void +_PyRawMutex_Lock(_PyRawMutex *m) +{ + uintptr_t unlocked = _Py_UNLOCKED; + if (_Py_atomic_compare_exchange_uintptr(&m->v, &unlocked, _Py_LOCKED)) { + return; + } + _PyRawMutex_LockSlow(m); +} + +static inline void +_PyRawMutex_Unlock(_PyRawMutex *m) +{ + uintptr_t locked = _Py_LOCKED; + if (_Py_atomic_compare_exchange_uintptr(&m->v, &locked, _Py_UNLOCKED)) { + return; + } + _PyRawMutex_UnlockSlow(m); +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_LOCK_H */ diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h new file mode 100644 index 00000000000000..32a289912888f7 --- /dev/null +++ b/Include/internal/pycore_parking_lot.h @@ -0,0 +1,129 @@ +// ParkingLot is an internal API for building efficient synchronization +// primitives like mutexes and events. +// +// The API and name is inspired by WebKit's WTF::ParkingLot, which in turn +// is inspired Linux's futex API. +// See https://webkit.org/blog/6161/locking-in-webkit/. +// +// The core functionality is an atomic "compare-and-sleep" operation along with +// an atomic "wake-up" operation. + +#ifndef Py_INTERNAL_PARKING_LOT_H +#define Py_INTERNAL_PARKING_LOT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_time.h" // _PyTime_t +#include "cpython/pyatomic.h" + + +enum { + // The thread was unparked by another thread. + Py_PARK_OK = 0, + + // The value of `address` did not match `expected`. + Py_PARK_AGAIN = -1, + + // The thread was unparked due to a timeout. + Py_PARK_TIMEOUT = -2, + + // The thread was interrupted by a signal. 
+ Py_PARK_INTR = -3, +}; + +// Checks that `*address == *expected` and puts the thread to sleep until an +// unpark operation is called on the same `address`. Otherwise, the function +// returns `Py_PARK_AGAIN`. The comparison is performed atomically +// with respect to unpark operations. +// +// The `address_size` argument is the size of the data pointed to by the +// `address` and `expected` pointers (i.e., sizeof(*address)). +// +// `arg`, which can be NULL, is passed to the unpark operation. +// +// The `timeout_ns` argument specifies the maximum amount of time to wait, with +// -1 indicating an infinite wait. +PyAPI_FUNC(int) +_PyParkingLot_Park(const void *address, const void *expected, + size_t address_size, _PyTime_t timeout_ns, + void *arg, int detach); + +struct _PyUnpark { + // The `arg` value passed to _PyParkingLot_Park(). + void *arg; + + // Are there more threads waiting on the address? May be true in cases + // where threads are waiting on a different address that maps to the same + // internal bucket. + int more_waiters; +}; + +// Unpark a single thread waiting on `address`. +// +// The `unpark` is a pointer to a `struct _PyUnpark`. +// +// Usage: +// _PyParkingLot_Unpark(address, unpark, { +// if (unpark) { +// void *arg = unpark->arg; +// int more_waiters = unpark->more_waiters; +// ... +// } +// }); +#define _PyParkingLot_Unpark(address, unpark, ...) \ + do { \ + struct _PyUnpark *(unpark); \ + unpark = _PyParkingLot_BeginUnpark((address)); \ + __VA_ARGS__ \ + _PyParkingLot_FinishUnpark((address), unpark); \ + } while (0); + +// Implements half of an unpark operation. +// Prefer using the _PyParkingLot_Unpark() macro. +PyAPI_FUNC(struct _PyUnpark *) +_PyParkingLot_BeginUnpark(const void *address); + +// Finishes the unpark operation and wakes up the thread selected by +// _PyParkingLot_BeginUnpark. +// Prefer using the _PyParkingLot_Unpark() macro. 
+PyAPI_FUNC(void) +_PyParkingLot_FinishUnpark(const void *address, struct _PyUnpark *unpark); + +// Unparks all threads waiting on `address`. +PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address); + +// Initialize/deinitialize the thread-local state used by parking lot. +void _PyParkingLot_InitThread(void); +void _PyParkingLot_DeinitThread(void); + +// Resets the parking lot state after a fork. Forgets all parked threads. +PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); + + +// The _PySemaphore API a simplified cross-platform semaphore used to implement +// parking lot. It is not intended to be used directly by other modules. +typedef struct _PySemaphore _PySemaphore; + +// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +PyAPI_FUNC(int) +_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); + +// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() +// can be called before _PySemaphore_Wait(). +PyAPI_FUNC(void) +_PySemaphore_Wakeup(_PySemaphore *sema); + +// Allocates/releases a semaphore from the thread-local pool. +PyAPI_FUNC(_PySemaphore *) _PySemaphore_Alloc(void); +PyAPI_FUNC(void) _PySemaphore_Free(_PySemaphore *sema); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PARKING_LOT_H */ diff --git a/Lib/test/test_capi/test_lock.py b/Lib/test/test_capi/test_lock.py new file mode 100644 index 00000000000000..756fd8a5ce793b --- /dev/null +++ b/Lib/test/test_capi/test_lock.py @@ -0,0 +1,15 @@ +import unittest +from test.support import import_helper + +# Skip this test if the _testcapi module isn't available. 
+_testinternalcapi = import_helper.import_module('_testinternalcapi') + +class PyAtomicTests(unittest.TestCase): + pass + +for name in sorted(dir(_testinternalcapi)): + if name.startswith('test_lock_'): + setattr(PyAtomicTests, name, getattr(_testinternalcapi, name)) + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index 19a802997838a4..e634d3f30f857e 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -400,12 +400,14 @@ PYTHON_OBJS= \ Python/instrumentation.o \ Python/intrinsics.o \ Python/legacy_tracing.o \ + Python/lock.o \ Python/marshal.o \ Python/modsupport.o \ Python/mysnprintf.o \ Python/mystrtoul.o \ Python/optimizer.o \ Python/optimizer_analysis.o \ + Python/parking_lot.o \ Python/pathconfig.o \ Python/preconfig.o \ Python/pyarena.o \ @@ -1779,6 +1781,8 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_interp.h \ $(srcdir)/Include/internal/pycore_intrinsics.h \ $(srcdir)/Include/internal/pycore_list.h \ + $(srcdir)/Include/internal/pycore_llist.h \ + $(srcdir)/Include/internal/pycore_lock.h \ $(srcdir)/Include/internal/pycore_long.h \ $(srcdir)/Include/internal/pycore_modsupport.h \ $(srcdir)/Include/internal/pycore_moduleobject.h \ @@ -1790,6 +1794,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_opcode_metadata.h \ $(srcdir)/Include/internal/pycore_opcode_utils.h \ $(srcdir)/Include/internal/pycore_optimizer.h \ + $(srcdir)/Include/internal/pycore_parking_lot.h \ $(srcdir)/Include/internal/pycore_pathconfig.h \ $(srcdir)/Include/internal/pycore_pyarena.h \ $(srcdir)/Include/internal/pycore_pyerrors.h \ diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index 56c1badf6b44a0..7b3216a50bb284 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -158,7 +158,7 @@ @MODULE_XXSUBTYPE_TRUE@xxsubtype xxsubtype.c @MODULE__XXTESTFUZZ_TRUE@_xxtestfuzz _xxtestfuzz/_xxtestfuzz.c _xxtestfuzz/fuzzer.c @MODULE__TESTBUFFER_TRUE@_testbuffer _testbuffer.c 
-@MODULE__TESTINTERNALCAPI_TRUE@_testinternalcapi _testinternalcapi.c _testinternalcapi/pytime.c +@MODULE__TESTINTERNALCAPI_TRUE@_testinternalcapi _testinternalcapi.c _testinternalcapi/test_lock.c _testinternalcapi/pytime.c @MODULE__TESTCAPI_TRUE@_testcapi _testcapimodule.c _testcapi/vectorcall.c _testcapi/vectorcall_limited.c _testcapi/heaptype.c _testcapi/abstract.c _testcapi/unicode.c _testcapi/dict.c _testcapi/getargs.c _testcapi/datetime.c _testcapi/docstring.c _testcapi/mem.c _testcapi/watchers.c _testcapi/long.c _testcapi/float.c _testcapi/structmember.c _testcapi/exceptions.c _testcapi/code.c _testcapi/buffer.c _testcapi/pyatomic.c _testcapi/pyos.c _testcapi/immortal.c _testcapi/heaptype_relative.c _testcapi/gc.c @MODULE__TESTCLINIC_TRUE@_testclinic _testclinic.c @MODULE__TESTCLINIC_LIMITED_TRUE@_testclinic_limited _testclinic_limited.c diff --git a/Modules/_testcapi/parts.h b/Modules/_testcapi/parts.h index c162dbc65db81a..a21bc381d709db 100644 --- a/Modules/_testcapi/parts.h +++ b/Modules/_testcapi/parts.h @@ -27,6 +27,7 @@ int _PyTestCapi_Init_PyOS(PyObject *module); int _PyTestCapi_Init_Immortal(PyObject *module); int _PyTestCapi_Init_GC(PyObject *mod); + int _PyTestCapi_Init_VectorcallLimited(PyObject *module); int _PyTestCapi_Init_HeaptypeRelative(PyObject *module); diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 922672d1a9f915..934e3637a9164d 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1543,6 +1543,9 @@ static PyMethodDef module_functions[] = { static int module_exec(PyObject *module) { + if (_PyTestInternalCapi_Init_Lock(module) < 0) { + return 1; + } if (_PyTestInternalCapi_Init_PyTime(module) < 0) { return 1; } diff --git a/Modules/_testinternalcapi/clinic/test_lock.c.h b/Modules/_testinternalcapi/clinic/test_lock.c.h new file mode 100644 index 00000000000000..3cbe5ef12c5fa6 --- /dev/null +++ b/Modules/_testinternalcapi/clinic/test_lock.c.h @@ -0,0 +1,74 @@ +/*[clinic input] +preserve 
+[clinic start generated code]*/ + +#include "pycore_abstract.h" // _PyNumber_Index() + +PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__, +"benchmark_locks($module, num_threads, use_pymutex=True,\n" +" critical_section_length=1, time_ms=1000, /)\n" +"--\n" +"\n"); + +#define _TESTINTERNALCAPI_BENCHMARK_LOCKS_METHODDEF \ + {"benchmark_locks", _PyCFunction_CAST(_testinternalcapi_benchmark_locks), METH_FASTCALL, _testinternalcapi_benchmark_locks__doc__}, + +static PyObject * +_testinternalcapi_benchmark_locks_impl(PyObject *module, + Py_ssize_t num_threads, + int use_pymutex, + int critical_section_length, + int time_ms); + +static PyObject * +_testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + Py_ssize_t num_threads; + int use_pymutex = 1; + int critical_section_length = 1; + int time_ms = 1000; + + if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 4)) { + goto exit; + } + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[0]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + num_threads = ival; + } + if (nargs < 2) { + goto skip_optional; + } + use_pymutex = PyObject_IsTrue(args[1]); + if (use_pymutex < 0) { + goto exit; + } + if (nargs < 3) { + goto skip_optional; + } + critical_section_length = PyLong_AsInt(args[2]); + if (critical_section_length == -1 && PyErr_Occurred()) { + goto exit; + } + if (nargs < 4) { + goto skip_optional; + } + time_ms = PyLong_AsInt(args[3]); + if (time_ms == -1 && PyErr_Occurred()) { + goto exit; + } +skip_optional: + return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, use_pymutex, critical_section_length, time_ms); + +exit: + return return_value; +} +/*[clinic end generated code: output=97c85dff601fed4b input=a9049054013a1b77]*/ diff --git a/Modules/_testinternalcapi/parts.h b/Modules/_testinternalcapi/parts.h index 
43e7714b235156..bbb8e62ddaf7a2 100644 --- a/Modules/_testinternalcapi/parts.h +++ b/Modules/_testinternalcapi/parts.h @@ -10,6 +10,7 @@ #include "Python.h" +int _PyTestInternalCapi_Init_Lock(PyObject *module); int _PyTestInternalCapi_Init_PyTime(PyObject *module); #endif // Py_TESTINTERNALCAPI_PARTS_H diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c new file mode 100644 index 00000000000000..adfb694908c836 --- /dev/null +++ b/Modules/_testinternalcapi/test_lock.c @@ -0,0 +1,353 @@ +// C Extension module to test pycore_lock.h API + +#include "parts.h" + +#include "pycore_lock.h" +#include "clinic/test_lock.c.h" + +#ifdef _WIN32 +#include +#else +#include // usleep() +#endif + +/*[clinic input] +module _testinternalcapi +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7bb583d8c9eb9a78]*/ + + +static void +pysleep(int ms) +{ +#ifdef _WIN32 + Sleep(ms); +#else + usleep(ms * 1000); +#endif +} + +static PyObject * +test_lock_basic(PyObject *self, PyObject *obj) +{ + PyMutex m; + memset(&m, 0, sizeof(m)); + + // uncontended lock and unlock + PyMutex_Lock(&m); + assert(m.v == 1); + PyMutex_Unlock(&m); + assert(m.v == 0); + + Py_RETURN_NONE; +} + +struct test_lock2_data { + PyMutex m; + PyEvent done; + int started; +}; + +static void +lock_thread(void *arg) +{ + struct test_lock2_data *test_data = arg; + PyMutex *m = &test_data->m; + _Py_atomic_store_int(&test_data->started, 1); + + PyMutex_Lock(m); + assert(m->v == 1); + + PyMutex_Unlock(m); + assert(m->v == 0); + + _PyEvent_Notify(&test_data->done); +} + +static PyObject * +test_lock_two_threads(PyObject *self, PyObject *obj) +{ + // lock attempt by two threads + struct test_lock2_data test_data; + memset(&test_data, 0, sizeof(test_data)); + + PyMutex_Lock(&test_data.m); + assert(test_data.m.v == 1); + + PyThread_start_new_thread(lock_thread, &test_data); + while (!_Py_atomic_load_int(&test_data.started)) { + pysleep(10); + } + 
pysleep(10); // allow some time for the other thread to try to lock + assert(test_data.m.v == 3); + + PyMutex_Unlock(&test_data.m); + PyEvent_Wait(&test_data.done); + assert(test_data.m.v == 0); + + Py_RETURN_NONE; +} + +#define COUNTER_THREADS 5 +#define COUNTER_ITERS 10000 + +struct test_data_counter { + PyMutex m; + Py_ssize_t counter; +}; + +struct thread_data_counter { + struct test_data_counter *test_data; + PyEvent done_event; +}; + +static void +counter_thread(void *arg) +{ + struct thread_data_counter *thread_data = arg; + struct test_data_counter *test_data = thread_data->test_data; + + for (Py_ssize_t i = 0; i < COUNTER_ITERS; i++) { + PyMutex_Lock(&test_data->m); + test_data->counter++; + PyMutex_Unlock(&test_data->m); + } + _PyEvent_Notify(&thread_data->done_event); +} + +static PyObject * +test_lock_counter(PyObject *self, PyObject *obj) +{ + // Test with rapidly locking and unlocking mutex + struct test_data_counter test_data; + memset(&test_data, 0, sizeof(test_data)); + + struct thread_data_counter thread_data[COUNTER_THREADS]; + memset(&thread_data, 0, sizeof(thread_data)); + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + thread_data[i].test_data = &test_data; + PyThread_start_new_thread(counter_thread, &thread_data[i]); + } + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + PyEvent_Wait(&thread_data[i].done_event); + } + + assert(test_data.counter == COUNTER_THREADS * COUNTER_ITERS); + Py_RETURN_NONE; +} + +#define SLOW_COUNTER_ITERS 100 + +static void +slow_counter_thread(void *arg) +{ + struct thread_data_counter *thread_data = arg; + struct test_data_counter *test_data = thread_data->test_data; + + for (Py_ssize_t i = 0; i < SLOW_COUNTER_ITERS; i++) { + PyMutex_Lock(&test_data->m); + if (i % 7 == 0) { + pysleep(2); + } + test_data->counter++; + PyMutex_Unlock(&test_data->m); + } + _PyEvent_Notify(&thread_data->done_event); +} + +static PyObject * +test_lock_counter_slow(PyObject *self, PyObject *obj) +{ + // Test lock/unlock with 
occasional "long" critical section, which will + // trigger handoff of the lock. + struct test_data_counter test_data; + memset(&test_data, 0, sizeof(test_data)); + + struct thread_data_counter thread_data[COUNTER_THREADS]; + memset(&thread_data, 0, sizeof(thread_data)); + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + thread_data[i].test_data = &test_data; + PyThread_start_new_thread(slow_counter_thread, &thread_data[i]); + } + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + PyEvent_Wait(&thread_data[i].done_event); + } + + assert(test_data.counter == COUNTER_THREADS * SLOW_COUNTER_ITERS); + Py_RETURN_NONE; +} + +struct bench_data_locks { + int stop; + int use_pymutex; + int critical_section_length; + char padding[200]; + PyThread_type_lock lock; + PyMutex m; + double value; + Py_ssize_t total_iters; +}; + +struct bench_thread_data { + struct bench_data_locks *bench_data; + Py_ssize_t iters; + PyEvent done; +}; + +static void +thread_benchmark_locks(void *arg) +{ + struct bench_thread_data *thread_data = arg; + struct bench_data_locks *bench_data = thread_data->bench_data; + int use_pymutex = bench_data->use_pymutex; + int critical_section_length = bench_data->critical_section_length; + + double my_value = 1.0; + Py_ssize_t iters = 0; + while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) { + if (use_pymutex) { + PyMutex_Lock(&bench_data->m); + for (int i = 0; i < critical_section_length; i++) { + bench_data->value += my_value; + my_value = bench_data->value; + } + PyMutex_Unlock(&bench_data->m); + } + else { + PyThread_acquire_lock(bench_data->lock, 1); + for (int i = 0; i < critical_section_length; i++) { + bench_data->value += my_value; + my_value = bench_data->value; + } + PyThread_release_lock(bench_data->lock); + } + iters++; + } + + thread_data->iters = iters; + _Py_atomic_add_ssize(&bench_data->total_iters, iters); + _PyEvent_Notify(&thread_data->done); +} + +/*[clinic input] +_testinternalcapi.benchmark_locks + + num_threads: Py_ssize_t 
+ use_pymutex: bool = True + critical_section_length: int = 1 + time_ms: int = 1000 + / + +[clinic start generated code]*/ + +static PyObject * +_testinternalcapi_benchmark_locks_impl(PyObject *module, + Py_ssize_t num_threads, + int use_pymutex, + int critical_section_length, + int time_ms) +/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/ +{ + // Run from Tools/lockbench/lockbench.py + // Based on the WebKit lock benchmarks: + // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp + // See also https://webkit.org/blog/6161/locking-in-webkit/ + PyObject *thread_iters = NULL; + PyObject *res = NULL; + + struct bench_data_locks bench_data; + memset(&bench_data, 0, sizeof(bench_data)); + bench_data.use_pymutex = use_pymutex; + bench_data.critical_section_length = critical_section_length; + + bench_data.lock = PyThread_allocate_lock(); + if (bench_data.lock == NULL) { + return PyErr_NoMemory(); + } + + struct bench_thread_data *thread_data = NULL; + thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data)); + if (thread_data == NULL) { + PyErr_NoMemory(); + goto exit; + } + + thread_iters = PyList_New(num_threads); + if (thread_iters == NULL) { + goto exit; + } + + _PyTime_t start = _PyTime_GetMonotonicClock(); + + for (Py_ssize_t i = 0; i < num_threads; i++) { + thread_data[i].bench_data = &bench_data; + PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]); + } + + // Let the threads run for `time_ms` milliseconds + pysleep(time_ms); + _Py_atomic_store_int(&bench_data.stop, 1); + + // Wait for the threads to finish + for (Py_ssize_t i = 0; i < num_threads; i++) { + PyEvent_Wait(&thread_data[i].done); + } + + Py_ssize_t total_iters = bench_data.total_iters; + _PyTime_t end = _PyTime_GetMonotonicClock(); + + // Return the total number of acquisitions and the number of acquisitions + // for each thread. 
+ for (Py_ssize_t i = 0; i < num_threads; i++) { + PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters); + if (iter == NULL) { + goto exit; + } + PyList_SET_ITEM(thread_iters, i, iter); + } + + double rate = total_iters * 1000000000.0 / (end - start); + res = Py_BuildValue("(dO)", rate, thread_iters); + +exit: + PyThread_free_lock(bench_data.lock); + PyMem_Free(thread_data); + Py_XDECREF(thread_iters); + return res; +} + +static PyObject * +test_lock_benchmark(PyObject *module, PyObject *obj) +{ + // Just make sure the benchmark runs without crashing + PyObject *res = _testinternalcapi_benchmark_locks_impl( + module, 1, 1, 1, 100); + if (res == NULL) { + return NULL; + } + Py_DECREF(res); + Py_RETURN_NONE; +} + +static PyMethodDef test_methods[] = { + {"test_lock_basic", test_lock_basic, METH_NOARGS}, + {"test_lock_two_threads", test_lock_two_threads, METH_NOARGS}, + {"test_lock_counter", test_lock_counter, METH_NOARGS}, + {"test_lock_counter_slow", test_lock_counter_slow, METH_NOARGS}, + _TESTINTERNALCAPI_BENCHMARK_LOCKS_METHODDEF + {"test_lock_benchmark", test_lock_benchmark, METH_NOARGS}, + {NULL, NULL} /* sentinel */ +}; + +int +_PyTestInternalCapi_Init_Lock(PyObject *mod) +{ + if (PyModule_AddFunctions(mod, test_methods) < 0) { + return -1; + } + return 0; +} diff --git a/PCbuild/_testinternalcapi.vcxproj b/PCbuild/_testinternalcapi.vcxproj index 59491c644b6655..fb474f06f38fe8 100644 --- a/PCbuild/_testinternalcapi.vcxproj +++ b/PCbuild/_testinternalcapi.vcxproj @@ -95,6 +95,7 @@ + diff --git a/PCbuild/_testinternalcapi.vcxproj.filters b/PCbuild/_testinternalcapi.vcxproj.filters index 21a66a2aa79f76..9c8a5d793ee0f4 100644 --- a/PCbuild/_testinternalcapi.vcxproj.filters +++ b/PCbuild/_testinternalcapi.vcxproj.filters @@ -15,6 +15,9 @@ Source Files + + Source Files + diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 04752a8029acc2..b35aeb58e11460 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -245,6 
+245,8 @@ + + @@ -254,6 +256,7 @@ + @@ -552,12 +555,14 @@ + + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 4ad02778466925..c2430f00cb7e9a 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -645,6 +645,12 @@ Include\internal + + Include\internal + + + Include\internal + Include\internal @@ -1241,6 +1247,9 @@ Source Files + + Source Files + Python @@ -1259,6 +1268,9 @@ Python + + Python + Python diff --git a/Python/lock.c b/Python/lock.c new file mode 100644 index 00000000000000..fce4c4c4363f2d --- /dev/null +++ b/Python/lock.c @@ -0,0 +1,292 @@ +// Lock implementation + +#include "Python.h" + +#include "pycore_lock.h" +#include "pycore_parking_lot.h" + +#ifdef _WIN32 +#include // SwitchToThread() +#elif defined(HAVE_SCHED_H) +#include // sched_yield() +#endif + +// If a thread waits on a lock for longer than TIME_TO_BE_FAIR_NS (1 ms), then +// the unlocking thread directly hands off ownership of the lock. This avoids +// starvation. +static const _PyTime_t TIME_TO_BE_FAIR_NS = 1000*1000; + +// Spin for a bit before parking the thread. This is only enabled for +// `--disable-gil` builds because it is unlikely to be helpful if the GIL is +// enabled. +#if Py_NOGIL +static const int MAX_SPIN_COUNT = 40; +#else +static const int MAX_SPIN_COUNT = 0; +#endif + +struct mutex_entry { + // The time at which the thread should be handed off the lock. Written by + // the waiting thread. + _PyTime_t time_to_be_fair; + + // Set to 1 if the lock was handed off. Written by the unlocking thread. 
+ int handoff; +}; + +static void +_Py_yield(void) +{ +#ifdef _WIN32 + SwitchToThread(); +#elif defined(HAVE_SCHED_H) + sched_yield(); +#endif +} + +void +_PyMutex_LockSlow(PyMutex *m) +{ + _PyMutex_TimedLock(m, -1, _PY_LOCK_DETACH); +} + +PyLockStatus +_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) +{ + uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { + return PY_LOCK_ACQUIRED; + } + } + else if (timeout == 0) { + return PY_LOCK_FAILURE; + } + + _PyTime_t now = _PyTime_GetMonotonicClock(); + _PyTime_t endtime = 0; + if (timeout > 0) { + endtime = _PyTime_Add(now, timeout); + } + + struct mutex_entry entry = { + .time_to_be_fair = now + TIME_TO_BE_FAIR_NS, + .handoff = 0, + }; + + Py_ssize_t spin_count = 0; + for (;;) { + if (!(v & _Py_LOCKED)) { + // The lock is unlocked. Try to grab it. + if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { + return PY_LOCK_ACQUIRED; + } + continue; + } + + if (!(v & _Py_HAS_PARKED) && spin_count < MAX_SPIN_COUNT) { + // Spin for a bit. + _Py_yield(); + spin_count++; + continue; + } + + if (timeout == 0) { + return PY_LOCK_FAILURE; + } + + uint8_t newv = v; + if (!(v & _Py_HAS_PARKED)) { + // We are the first waiter. Set the _Py_HAS_PARKED flag. + newv = v | _Py_HAS_PARKED; + if (!_Py_atomic_compare_exchange_uint8(&m->v, &v, newv)) { + continue; + } + } + + int ret = _PyParkingLot_Park(&m->v, &newv, sizeof(newv), timeout, + &entry, (flags & _PY_LOCK_DETACH) != 0); + if (ret == Py_PARK_OK) { + if (entry.handoff) { + // We own the lock now. 
+ assert(_Py_atomic_load_uint8_relaxed(&m->v) & _Py_LOCKED); + return PY_LOCK_ACQUIRED; + } + } + else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_MAKE_PENDING_CALLS)) { + if (Py_MakePendingCalls() < 0) { + return PY_LOCK_INTR; + } + } + else if (ret == Py_PARK_TIMEOUT) { + assert(timeout >= 0); + return PY_LOCK_FAILURE; + } + + if (timeout > 0) { + timeout = _PyDeadline_Get(endtime); + if (timeout <= 0) { + // Avoid negative values because those mean block forever. + timeout = 0; + } + } + + v = _Py_atomic_load_uint8_relaxed(&m->v); + } +} + +int +_PyMutex_TryUnlock(PyMutex *m) +{ + uint8_t v = _Py_atomic_load_uint8(&m->v); + for (;;) { + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + // error: the mutex is not locked + return -1; + } + else if ((v & _Py_HAS_PARKED)) { + // wake up a single thread + _PyParkingLot_Unpark(&m->v, unpark, { + v = 0; + if (unpark) { + struct mutex_entry *entry = unpark->arg; + _PyTime_t now = _PyTime_GetMonotonicClock(); + int should_be_fair = now > entry->time_to_be_fair; + + entry->handoff = should_be_fair; + if (should_be_fair) { + v |= _Py_LOCKED; + } + if (unpark->more_waiters) { + v |= _Py_HAS_PARKED; + } + } + _Py_atomic_store_uint8(&m->v, v); + }); + return 0; + } + else if (_Py_atomic_compare_exchange_uint8(&m->v, &v, _Py_UNLOCKED)) { + // fast-path: no waiters + return 0; + } + } +} + +void +_PyMutex_UnlockSlow(PyMutex *m) +{ + if (_PyMutex_TryUnlock(m) < 0) { + Py_FatalError("unlocking mutex that is not locked"); + } +} + +// _PyRawMutex stores a linked list of `struct raw_mutex_entry`, one for each +// thread waiting on the mutex, directly in the mutex itself. +struct raw_mutex_entry { + struct raw_mutex_entry *next; + _PySemaphore *sema; +}; + +void +_PyRawMutex_LockSlow(_PyRawMutex *m) +{ + struct raw_mutex_entry waiter; + waiter.sema = _PySemaphore_Alloc(); + + uintptr_t v = _Py_atomic_load_uintptr(&m->v); + for (;;) { + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + // Unlocked: try to grab it (even if it has a waiter). 
+ if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, v|_Py_LOCKED)) { + break; + } + continue; + } + + // Locked: try to add ourselves as a waiter. + waiter.next = (struct raw_mutex_entry *)(v & ~1); + uintptr_t desired = ((uintptr_t)&waiter)|_Py_LOCKED; + if (!_Py_atomic_compare_exchange_uintptr(&m->v, &v, desired)) { + continue; + } + + // Wait for us to be woken up. Note that we still have to lock the + // mutex ourselves: it is NOT handed off to us. + _PySemaphore_Wait(waiter.sema, -1, /*detach=*/0); + } + + _PySemaphore_Free(waiter.sema); +} + +void +_PyRawMutex_UnlockSlow(_PyRawMutex *m) +{ + uintptr_t v = _Py_atomic_load_uintptr(&m->v); + for (;;) { + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + Py_FatalError("unlocking mutex that is not locked"); + } + + struct raw_mutex_entry *waiter = (struct raw_mutex_entry *)(v & ~1); + if (waiter) { + uintptr_t next_waiter = (uintptr_t)waiter->next; + if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, next_waiter)) { + _PySemaphore_Wakeup(waiter->sema); + return; + } + } + else { + if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, _Py_UNLOCKED)) { + return; + } + } + } +} + +void +_PyEvent_Notify(PyEvent *evt) +{ + uintptr_t v = _Py_atomic_exchange_uint8(&evt->v, _Py_LOCKED); + if (v == _Py_UNLOCKED) { + // no waiters + return; + } + else if (v == _Py_LOCKED) { + // event already set + return; + } + else { + assert(v == _Py_HAS_PARKED); + _PyParkingLot_UnparkAll(&evt->v); + } +} + +void +PyEvent_Wait(PyEvent *evt) +{ + while (!PyEvent_TimedWait(evt, -1)) + ; +} + +int +PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns) +{ + for (;;) { + uint8_t v = _Py_atomic_load_uint8(&evt->v); + if (v == _Py_LOCKED) { + // event already set + return 1; + } + if (v == _Py_UNLOCKED) { + if (!_Py_atomic_compare_exchange_uint8(&evt->v, &v, _Py_HAS_PARKED)) { + continue; + } + } + + uint8_t expected = _Py_HAS_PARKED; + (void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v), + timeout_ns, NULL, 1); + + return 
_Py_atomic_load_uint8(&evt->v) == _Py_LOCKED; + } +} diff --git a/Python/parking_lot.c b/Python/parking_lot.c new file mode 100644 index 00000000000000..8e6995653ce6f3 --- /dev/null +++ b/Python/parking_lot.c @@ -0,0 +1,466 @@ +#include "Python.h" + +#include "pycore_llist.h" +#include "pycore_lock.h" // _PyRawMutex +#include "pycore_parking_lot.h" +#include "pycore_pyerrors.h" // _Py_FatalErrorFormat +#include "pycore_pystate.h" // _PyThreadState_GET + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ + defined(HAVE_SEM_TIMEDWAIT)) +#define USE_SEMAPHORES +#include +#else +#include +#endif + +// A simple, cross-platform binary semaphore that can be used to implement +// wakeup/sleep. +struct _PySemaphore { +#if defined(_WIN32) + HANDLE platform_sem; +#elif defined(USE_SEMAPHORES) + sem_t platform_sem; +#else + PyMUTEX_T mutex; + PyCOND_T cond; + int counter; +#endif +}; + +typedef struct { + // The mutex protects the waiter queue and the num_waiters counter. + _PyRawMutex mutex; + + // Linked list of `struct wait_entry` waiters in this bucket. 
+ struct llist_node root; + size_t num_waiters; +} Bucket; + +struct wait_entry { + struct _PyUnpark unpark; + uintptr_t addr; + _PySemaphore *sema; + struct llist_node node; +}; + +#define MAX_SEMA_DEPTH 3 + +typedef struct { + Py_ssize_t refcount; + + int depth; + _PySemaphore semas[MAX_SEMA_DEPTH]; +} ThreadData; + +#define NUM_BUCKETS 251 + +// Table of waiters (hashed by address) +static Bucket buckets[NUM_BUCKETS]; + +#ifdef HAVE_THREAD_LOCAL +_Py_thread_local ThreadData *thread_data = NULL; +#else +#error "no supported thread-local variable storage classifier" +#endif + +static void +_PySemaphore_Init(_PySemaphore *sema) +{ +#if defined(_WIN32) + sema->platform_sem = CreateSemaphore( + NULL, // attributes + 0, // initial count + 10, // maximum count + NULL // unnamed + ); + if (!sema->platform_sem) { + Py_FatalError("parking_lot: CreateSemaphore failed"); + } +#elif defined(USE_SEMAPHORES) + if (sem_init(&sema->platform_sem, /*pshared=*/0, /*value=*/0) < 0) { + Py_FatalError("parking_lot: sem_init failed"); + } +#else + if (pthread_mutex_init(&sema->mutex, NULL) != 0) { + Py_FatalError("parking_lot: pthread_mutex_init failed"); + } + if (pthread_cond_init(&sema->cond, NULL)) { + Py_FatalError("parking_lot: pthread_cond_init failed"); + } +#endif +} + +static void +_PySemaphore_Destroy(_PySemaphore *sema) +{ +#if defined(_WIN32) + CloseHandle(sema->platform_sem); +#elif defined(USE_SEMAPHORES) + sem_destroy(&sema->platform_sem); +#else + pthread_mutex_destroy(&sema->mutex); + pthread_cond_destroy(&sema->cond); +#endif +} + +static int +_PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) +{ + int res = Py_PARK_INTR; +#if defined(_WIN32) + DWORD wait; + DWORD millis = 0; + if (timeout < 0) { + millis = INFINITE; + } + else { + millis = (DWORD) (timeout / 1000000); + } + wait = WaitForSingleObjectEx(sema->platform_sem, millis, FALSE); + if (wait == WAIT_OBJECT_0) { + res = Py_PARK_OK; + } + else if (wait == WAIT_TIMEOUT) { + res = Py_PARK_TIMEOUT; + 
} +#elif defined(USE_SEMAPHORES) + int err; + if (timeout >= 0) { + struct timespec ts; + + _PyTime_t deadline = _PyTime_Add(_PyTime_GetSystemClock(), timeout); + _PyTime_AsTimespec(deadline, &ts); + + err = sem_timedwait(&sema->platform_sem, &ts); + } + else { + err = sem_wait(&sema->platform_sem); + } + if (err == -1) { + err = errno; + if (err == EINTR) { + res = Py_PARK_INTR; + } + else if (err == ETIMEDOUT) { + res = Py_PARK_TIMEOUT; + } + else { + _Py_FatalErrorFormat(__func__, + "unexpected error from semaphore: %d", + err); + } + } + else { + res = Py_PARK_OK; + } +#else + pthread_mutex_lock(&sema->mutex); + if (sema->counter == 0) { + int err; + if (timeout >= 0) { + struct timespec ts; + + _PyTime_t deadline = _PyTime_Add(_PyTime_GetSystemClock(), timeout); + _PyTime_AsTimespec(deadline, &ts); + + err = pthread_cond_timedwait(&sema->cond, &sema->mutex, &ts); + } + else { + err = pthread_cond_wait(&sema->cond, &sema->mutex); + } + if (err) { + res = Py_PARK_TIMEOUT; + } + } + if (sema->counter > 0) { + sema->counter--; + res = Py_PARK_OK; + } + pthread_mutex_unlock(&sema->mutex); +#endif + return res; +} + +int +_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout, int detach) +{ + PyThreadState *tstate = _PyThreadState_GET(); + int was_attached = 0; + if (tstate) { + was_attached = (tstate->_status.active); + if (was_attached && detach) { + PyEval_ReleaseThread(tstate); + } + } + + int res = _PySemaphore_PlatformWait(sema, timeout); + + if (tstate) { + if (was_attached && detach) { + PyEval_AcquireThread(tstate); + } + } + return res; +} + +void +_PySemaphore_Wakeup(_PySemaphore *sema) +{ +#if defined(_WIN32) + if (!ReleaseSemaphore(sema->platform_sem, 1, NULL)) { + Py_FatalError("parking_lot: ReleaseSemaphore failed"); + } +#elif defined(USE_SEMAPHORES) + int err = sem_post(&sema->platform_sem); + if (err != 0) { + Py_FatalError("parking_lot: sem_post failed"); + } +#else + pthread_mutex_lock(&sema->mutex); + sema->counter++; + 
pthread_cond_signal(&sema->cond); + pthread_mutex_unlock(&sema->mutex); +#endif +} + +_PySemaphore * +_PySemaphore_Alloc(void) +{ + // Make sure we have a valid thread_data. We need to acquire + // some locks before we have a fully initialized PyThreadState. + _PyParkingLot_InitThread(); + + ThreadData *this_thread = thread_data; + if (this_thread->depth >= MAX_SEMA_DEPTH) { + Py_FatalError("_PySemaphore_Alloc(): too many calls"); + } + return &this_thread->semas[this_thread->depth++]; +} + +void +_PySemaphore_Free(_PySemaphore *sema) +{ + ThreadData *this_thread = thread_data; + this_thread->depth--; + if (&this_thread->semas[this_thread->depth] != sema) { + Py_FatalError("_PySemaphore_Free(): mismatch wakeup"); + } + _PyParkingLot_DeinitThread(); +} + +void +_PyParkingLot_InitThread(void) +{ + if (thread_data != NULL) { + thread_data->refcount++; + return; + } + ThreadData *this_thread = PyMem_RawMalloc(sizeof(ThreadData)); + if (this_thread == NULL) { + Py_FatalError("_PyParkingLot_InitThread: unable to allocate thread data"); + } + memset(this_thread, 0, sizeof(*this_thread)); + this_thread->refcount = 1; + this_thread->depth = 0; + for (int i = 0; i < MAX_SEMA_DEPTH; i++) { + _PySemaphore_Init(&this_thread->semas[i]); + } + thread_data = this_thread; +} + +void +_PyParkingLot_DeinitThread(void) +{ + ThreadData *td = thread_data; + if (td == NULL) { + return; + } + + if (--td->refcount != 0) { + assert(td->refcount > 0); + return; + } + + thread_data = NULL; + for (int i = 0; i < MAX_SEMA_DEPTH; i++) { + _PySemaphore_Destroy(&td->semas[i]); + } + + PyMem_RawFree(td); +} + +static void +enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) +{ + if (!bucket->root.next) { + // initialize bucket + llist_init(&bucket->root); + } + llist_insert_tail(&bucket->root, &wait->node); + ++bucket->num_waiters; +} + +static struct wait_entry * +dequeue(Bucket *bucket, const void *address) +{ + struct llist_node *root = &bucket->root; + if (!root->next) { + // 
bucket was not yet initialized + return NULL; + } + + // find the first waiter that is waiting on `address` + struct llist_node *node; + llist_for_each(node, root) { + struct wait_entry *wait = llist_data(node, struct wait_entry, node); + if (wait->addr == (uintptr_t)address) { + llist_remove(node); + --bucket->num_waiters; + return wait; + } + } + return NULL; +} + +static void +dequeue_all(Bucket *bucket, const void *address, struct llist_node *dst) +{ + struct llist_node *root = &bucket->root; + if (!root->next) { + // bucket was not yet initialized + return; + } + + // remove and append all matching waiters to dst + struct llist_node *node; + llist_for_each_safe(node, root) { + struct wait_entry *wait = llist_data(node, struct wait_entry, node); + if (wait->addr == (uintptr_t)address) { + llist_remove(node); + llist_insert_tail(dst, node); + --bucket->num_waiters; + } + } +} + +// Checks that `*addr == *expected` +static int +validate_addr(const void *addr, const void *expected, size_t addr_size) +{ + switch (addr_size) { + case 1: return _Py_atomic_load_uint8(addr) == *(const uint8_t *)expected; + case 2: return _Py_atomic_load_uint16(addr) == *(const uint16_t *)expected; + case 4: return _Py_atomic_load_uint32(addr) == *(const uint32_t *)expected; + case 8: return _Py_atomic_load_uint64(addr) == *(const uint64_t *)expected; + default: Py_UNREACHABLE(); + } +} + +int +_PyParkingLot_Park(const void *addr, const void *expected, size_t size, + _PyTime_t timeout_ns, void *arg, int detach) +{ + struct wait_entry wait; + wait.unpark.arg = arg; + wait.addr = (uintptr_t)addr; + + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + + _PyRawMutex_Lock(&bucket->mutex); + if (!validate_addr(addr, expected, size)) { + _PyRawMutex_Unlock(&bucket->mutex); + return Py_PARK_AGAIN; + } + wait.sema = _PySemaphore_Alloc(); + enqueue(bucket, addr, &wait); + _PyRawMutex_Unlock(&bucket->mutex); + + int res = _PySemaphore_Wait(wait.sema, timeout_ns, detach); + if (res == 
Py_PARK_OK) { + goto done; + } + + // timeout or interrupt + _PyRawMutex_Lock(&bucket->mutex); + if (wait.node.next == NULL) { + _PyRawMutex_Unlock(&bucket->mutex); + // We've been removed the waiter queue. Wait until we process the + // wakeup signal. + do { + res = _PySemaphore_Wait(wait.sema, -1, detach); + } while (res != Py_PARK_OK); + goto done; + } + else { + llist_remove(&wait.node); + --bucket->num_waiters; + } + _PyRawMutex_Unlock(&bucket->mutex); + +done: + _PySemaphore_Free(wait.sema); + return res; + +} + +struct _PyUnpark * +_PyParkingLot_BeginUnpark(const void *addr) +{ + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + _PyRawMutex_Lock(&bucket->mutex); + + struct wait_entry *waiter = dequeue(bucket, addr); + if (!waiter) { + return NULL; + } + + waiter->unpark.more_waiters = (bucket->num_waiters > 0); + return &waiter->unpark; +} + +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - offsetof(type, member))) + +void +_PyParkingLot_FinishUnpark(const void *addr, struct _PyUnpark *unpark) +{ + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + _PyRawMutex_Unlock(&bucket->mutex); + + if (unpark) { + struct wait_entry *waiter; + waiter = container_of(unpark, struct wait_entry, unpark); + + _PySemaphore_Wakeup(waiter->sema); + } +} + +void +_PyParkingLot_UnparkAll(const void *addr) +{ + struct llist_node head = LLIST_INIT(head); + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + + _PyRawMutex_Lock(&bucket->mutex); + dequeue_all(bucket, addr, &head); + _PyRawMutex_Unlock(&bucket->mutex); + + struct llist_node *node; + llist_for_each_safe(node, &head) { + struct wait_entry *waiter = llist_data(node, struct wait_entry, node); + llist_remove(node); + _PySemaphore_Wakeup(waiter->sema); + } +} + +void +_PyParkingLot_AfterFork(void) +{ + // After a fork only one thread remains. That thread cannot be blocked + // so all entries in the parking lot are for dead threads. 
+ memset(buckets, 0, sizeof(buckets)); +} diff --git a/Python/pystate.c b/Python/pystate.c index b5c4fd7fb50616..b7e8429b1ca403 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -8,6 +8,7 @@ #include "pycore_frame.h" #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_object.h" // _PyType_InitCache() +#include "pycore_parking_lot.h" // _PyParkingLot_AfterFork() #include "pycore_pyerrors.h" // _PyErr_Clear() #include "pycore_pylifecycle.h" // _PyAST_Fini() #include "pycore_pymem.h" // _PyMem_SetDefaultAllocator() @@ -549,6 +550,10 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime) PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc); + // Clears the parking lot. Any waiting threads are dead. This must be + // called before releasing any locks that use the parking lot. + _PyParkingLot_AfterFork(); + /* bpo-42540: id_mutex is freed by _PyInterpreterState_Delete, which does * not force the default allocator. */ reinit_err += _PyThread_at_fork_reinit(&runtime->interpreters.main->id_mutex); diff --git a/Tools/lockbench/lockbench.py b/Tools/lockbench/lockbench.py new file mode 100644 index 00000000000000..9833d703e00cbb --- /dev/null +++ b/Tools/lockbench/lockbench.py @@ -0,0 +1,53 @@ +# Measure the performance of PyMutex and PyThread_type_lock locks +# with short critical sections. +# +# Usage: python Tools/lockbench/lockbench.py [CRITICAL_SECTION_LENGTH] +# +# How to interpret the results: +# +# Acquisitions (kHz): Reports the total number of lock acquisitions in +# thousands of acquisitions per second. This is the most important metric, +# particularly for the 1 thread case because even in multithreaded programs, +# most locks acquisitions are not contended. Values for 2+ threads are +# only meaningful for `--disable-gil` builds, because the GIL prevents most +# situations where there is lock contention with short critical sections. +# +# Fairness: A measure of how evenly the lock acquisitions are distributed. 
+# A fairness of 1.0 means that all threads acquired the lock the same number +# of times. A fairness of 1/N means that only one thread ever acquired the +# lock. +# See https://en.wikipedia.org/wiki/Fairness_measure#Jain's_fairness_index + +from _testinternalcapi import benchmark_locks +import sys + +# Max number of threads to test +MAX_THREADS = 10 + +# How much "work" to do while holding the lock +CRITICAL_SECTION_LENGTH = 1 + + +def jains_fairness(values): + # Jain's fairness index + # See https://en.wikipedia.org/wiki/Fairness_measure + return (sum(values) ** 2) / (len(values) * sum(x ** 2 for x in values)) + +def main(): + print("Lock Type Threads Acquisitions (kHz) Fairness") + for lock_type in ["PyMutex", "PyThread_type_lock"]: + use_pymutex = (lock_type == "PyMutex") + for num_threads in range(1, MAX_THREADS + 1): + acquisitions, thread_iters = benchmark_locks( + num_threads, use_pymutex, CRITICAL_SECTION_LENGTH) + + acquisitions /= 1000 # report in kHz for readability + fairness = jains_fairness(thread_iters) + + print(f"{lock_type: <20}{num_threads: <18}{acquisitions: >5.0f}{fairness: >20.2f}") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + CRITICAL_SECTION_LENGTH = int(sys.argv[1]) + main() From 6db39dc2132f686e37396d47cffd7806a3adc3ec Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:12:32 -0700 Subject: [PATCH 02/28] Add news entry --- .../next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst diff --git a/Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst b/Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst new file mode 100644 index 00000000000000..5cddf9bc239700 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst @@ -0,0 +1 @@ +Add :c:type:`PyMutex` internal-only lightweight locking API. 
From e6247337e32eb87662844502cef14f3440bc26c6 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:12:46 -0700 Subject: [PATCH 03/28] Revert change to Modules/_testcapi/parts.h --- Modules/_testcapi/parts.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/_testcapi/parts.h b/Modules/_testcapi/parts.h index a21bc381d709db..c162dbc65db81a 100644 --- a/Modules/_testcapi/parts.h +++ b/Modules/_testcapi/parts.h @@ -27,7 +27,6 @@ int _PyTestCapi_Init_PyOS(PyObject *module); int _PyTestCapi_Init_Immortal(PyObject *module); int _PyTestCapi_Init_GC(PyObject *mod); - int _PyTestCapi_Init_VectorcallLimited(PyObject *module); int _PyTestCapi_Init_HeaptypeRelative(PyObject *module); From 83e60fbf29f412c876114e7323ff73a01154ae46 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:34:14 -0700 Subject: [PATCH 04/28] Update Tools/c-analyzer/cpython/ignored.tsv --- Python/parking_lot.c | 2 +- Tools/c-analyzer/cpython/ignored.tsv | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 8e6995653ce6f3..a60d954cf95a6c 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -62,7 +62,7 @@ typedef struct { static Bucket buckets[NUM_BUCKETS]; #ifdef HAVE_THREAD_LOCAL -_Py_thread_local ThreadData *thread_data = NULL; +static _Py_thread_local ThreadData *thread_data = NULL; #else #error "no supported thread-local variable storage classifier" #endif diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index d1ac0410619c96..0e0c9980958efb 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -50,6 +50,9 @@ Python/getversion.c - version - Python/bootstrap_hash.c - _Py_HashSecret_Initialized - Python/pyhash.c - _Py_HashSecret - +## thread-safe hashtable (internal locks) +Python/parking_lot.c - buckets - + ################################## ## state tied to Py_Main() @@ -173,6 +176,7 @@ Python/pyfpe.c - 
PyFPE_counter - Python/import.c - pkgcontext - Python/pystate.c - _Py_tss_tstate - +Python/parking_lot.c - thread_data - ##----------------------- ## should be const From 3161e17da49312af3bdfc7eed3302cdfdebcdff7 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:42:33 -0700 Subject: [PATCH 05/28] Fix _Py_atomic_store_ptr_release on winarm64 --- Include/cpython/pyatomic_msc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h index c88bb03cc8f94a..287ed43b5714cd 100644 --- a/Include/cpython/pyatomic_msc.h +++ b/Include/cpython/pyatomic_msc.h @@ -906,7 +906,7 @@ _Py_atomic_store_ptr_release(void *obj, void *value) #if defined(_M_X64) || defined(_M_IX86) *(void * volatile *)obj = value; #elif defined(_M_ARM64) - __stlr64(obj, (uintptr_t)value); + __stlr64((unsigned __int64 volatile *)obj, (uintptr_t)value); #else # error "no implementation of _Py_atomic_store_ptr_release" #endif From 3ab7110915e49bc1ed7243bf75603007f19c0254 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 14:17:13 -0700 Subject: [PATCH 06/28] Support pthread_stubs.h and require threads for lock tests --- Lib/test/test_capi/test_lock.py | 10 +++++++--- Python/parking_lot.c | 13 ++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_capi/test_lock.py b/Lib/test/test_capi/test_lock.py index 756fd8a5ce793b..cfef5a26c4eb61 100644 --- a/Lib/test/test_capi/test_lock.py +++ b/Lib/test/test_capi/test_lock.py @@ -1,15 +1,19 @@ import unittest -from test.support import import_helper +from test.support import import_helper, threading_helper # Skip this test if the _testcapi module isn't available. 
_testinternalcapi = import_helper.import_module('_testinternalcapi') -class PyAtomicTests(unittest.TestCase): +# Lock tests require threads +threading_helper.requires_working_threading(module=True) + + +class PyLockTests(unittest.TestCase): pass for name in sorted(dir(_testinternalcapi)): if name.startswith('test_lock_'): - setattr(PyAtomicTests, name, getattr(_testinternalcapi, name)) + setattr(PyLockTests, name, getattr(_testinternalcapi, name)) if __name__ == "__main__": unittest.main() diff --git a/Python/parking_lot.c b/Python/parking_lot.c index a60d954cf95a6c..66f99484ab879d 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -7,14 +7,17 @@ #include "pycore_pystate.h" // _PyThreadState_GET #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include +# include #elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ defined(HAVE_SEM_TIMEDWAIT)) -#define USE_SEMAPHORES -#include +# define USE_SEMAPHORES +# include +#elif defined(HAVE_PTHREAD_H) +# include +#elif defined(HAVE_PTHREAD_STUBS) +# include "cpython/pthread_stubs.h" #else -#include +# error "Require native threads. 
See https://bugs.python.org/issue31370" #endif // A simple, cross-platform binary semaphore that can be used to implement From 153a0b35aa940d87cb76fa1cf1f76f4cdfe226e1 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 14:21:42 -0700 Subject: [PATCH 07/28] Clean-up Windows includes and ifdefs --- Modules/_testinternalcapi/test_lock.c | 5 +++-- Python/lock.c | 5 +++-- Python/parking_lot.c | 13 +++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c index adfb694908c836..fe25417a2f315d 100644 --- a/Modules/_testinternalcapi/test_lock.c +++ b/Modules/_testinternalcapi/test_lock.c @@ -5,7 +5,8 @@ #include "pycore_lock.h" #include "clinic/test_lock.c.h" -#ifdef _WIN32 +#ifdef MS_WINDOWS +#define WIN32_LEAN_AND_MEAN #include #else #include // usleep() @@ -20,7 +21,7 @@ module _testinternalcapi static void pysleep(int ms) { -#ifdef _WIN32 +#ifdef MS_WINDOWS Sleep(ms); #else usleep(ms * 1000); diff --git a/Python/lock.c b/Python/lock.c index fce4c4c4363f2d..b8a1f64a838118 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -5,7 +5,8 @@ #include "pycore_lock.h" #include "pycore_parking_lot.h" -#ifdef _WIN32 +#ifdef MS_WINDOWS +#define WIN32_LEAN_AND_MEAN #include // SwitchToThread() #elif defined(HAVE_SCHED_H) #include // sched_yield() @@ -37,7 +38,7 @@ struct mutex_entry { static void _Py_yield(void) { -#ifdef _WIN32 +#ifdef MS_WINDOWS SwitchToThread(); #elif defined(HAVE_SCHED_H) sched_yield(); diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 66f99484ab879d..4ec6657c033ed5 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -6,7 +6,8 @@ #include "pycore_pyerrors.h" // _Py_FatalErrorFormat #include "pycore_pystate.h" // _PyThreadState_GET -#ifdef _WIN32 +#ifdef MS_WINDOWS +# define WIN32_LEAN_AND_MEAN # include #elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ defined(HAVE_SEM_TIMEDWAIT)) @@ -23,7 +24,7 @@ 
// A simple, cross-platform binary semaphore that can be used to implement // wakeup/sleep. struct _PySemaphore { -#if defined(_WIN32) +#if defined(MS_WINDOWS) HANDLE platform_sem; #elif defined(USE_SEMAPHORES) sem_t platform_sem; @@ -73,7 +74,7 @@ static _Py_thread_local ThreadData *thread_data = NULL; static void _PySemaphore_Init(_PySemaphore *sema) { -#if defined(_WIN32) +#if defined(MS_WINDOWS) sema->platform_sem = CreateSemaphore( NULL, // attributes 0, // initial count @@ -100,7 +101,7 @@ _PySemaphore_Init(_PySemaphore *sema) static void _PySemaphore_Destroy(_PySemaphore *sema) { -#if defined(_WIN32) +#if defined(MS_WINDOWS) CloseHandle(sema->platform_sem); #elif defined(USE_SEMAPHORES) sem_destroy(&sema->platform_sem); @@ -114,7 +115,7 @@ static int _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) { int res = Py_PARK_INTR; -#if defined(_WIN32) +#if defined(MS_WINDOWS) DWORD wait; DWORD millis = 0; if (timeout < 0) { @@ -213,7 +214,7 @@ _PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout, int detach) void _PySemaphore_Wakeup(_PySemaphore *sema) { -#if defined(_WIN32) +#if defined(MS_WINDOWS) if (!ReleaseSemaphore(sema->platform_sem, 1, NULL)) { Py_FatalError("parking_lot: ReleaseSemaphore failed"); } From f963fd873331b6a93e31a923ba860f75294916d5 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 15:22:13 -0700 Subject: [PATCH 08/28] Fix Raspbian build --- configure | 6 +++--- configure.ac | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/configure b/configure index 8326a1db06c2da..1c9a3fe7d0e4fc 100755 --- a/configure +++ b/configure @@ -27760,7 +27760,7 @@ fi # # Avoid #include or #include . The header # requires header which is only written below by AC_OUTPUT below. -# If the check is done after AC_OUTPUT, modifying LIBATOMIC has no effect +# If the check is done after AC_OUTPUT, modifying LIBS has no effect # anymore. 
cannot be included alone, it's designed to be included # by : it expects other includes and macros to be defined. save_CPPFLAGS=$CPPFLAGS @@ -27825,7 +27825,7 @@ printf "%s\n" "$ac_cv_libatomic_needed" >&6; } if test "x$ac_cv_libatomic_needed" = xyes then : - LIBATOMIC=${LIBATOMIC-"-latomic"} + LIBS="${LIBS} -latomic" fi CPPFLAGS=$save_CPPFLAGS @@ -29979,7 +29979,7 @@ fi then : - as_fn_append MODULE_BLOCK "MODULE__TESTCAPI_LDFLAGS=$LIBATOMIC$as_nl" + fi if test "$py_cv_module__testcapi" = yes; then diff --git a/configure.ac b/configure.ac index 843f2b267a5253..fde8cfb97fcfa7 100644 --- a/configure.ac +++ b/configure.ac @@ -6970,7 +6970,7 @@ fi # # Avoid #include or #include . The header # requires header which is only written below by AC_OUTPUT below. -# If the check is done after AC_OUTPUT, modifying LIBATOMIC has no effect +# If the check is done after AC_OUTPUT, modifying LIBS has no effect # anymore. cannot be included alone, it's designed to be included # by : it expects other includes and macros to be defined. _SAVE_VAR([CPPFLAGS]) @@ -7014,7 +7014,7 @@ int main() ]) AS_VAR_IF([ac_cv_libatomic_needed], [yes], - [LIBATOMIC=${LIBATOMIC-"-latomic"}]) + [LIBS="${LIBS} -latomic"]) _RESTORE_VAR([CPPFLAGS]) @@ -7286,10 +7286,7 @@ PY_STDLIB_MOD([_hashlib], [], [test "$ac_cv_working_openssl_hashlib" = yes], [$OPENSSL_INCLUDES], [$OPENSSL_LDFLAGS $OPENSSL_LDFLAGS_RPATH $LIBCRYPTO_LIBS]) dnl test modules -PY_STDLIB_MOD([_testcapi], - [test "$TEST_MODULES" = yes], [] - dnl Modules/_testcapi/pyatomic.c uses header - [], [], [$LIBATOMIC]) +PY_STDLIB_MOD([_testcapi], [test "$TEST_MODULES" = yes]) PY_STDLIB_MOD([_testclinic], [test "$TEST_MODULES" = yes]) PY_STDLIB_MOD([_testclinic_limited], [test "$TEST_MODULES" = yes]) PY_STDLIB_MOD([_testinternalcapi], [test "$TEST_MODULES" = yes]) From 6f49ae75fd2b7c85c4decd067cb3fca95207cf6b Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 13 Sep 2023 10:52:49 -0700 Subject: [PATCH 09/28] Fix wasm tests. 
The lock tests were being picked up twice: once in test_lock.py and once in Test_testinternalcapi from test_misc.py. The Test_testinternalcapi was not skipping tests when the platform doesn't have threads. This moves the tests to test_misc.py, doesn't include them in Test_testinternalcapi, and skips them if the platform doesn't support threads. --- Lib/test/test_capi/test_lock.py | 19 ------------------- Lib/test/test_capi/test_misc.py | 10 +++++++++- 2 files changed, 9 insertions(+), 20 deletions(-) delete mode 100644 Lib/test/test_capi/test_lock.py diff --git a/Lib/test/test_capi/test_lock.py b/Lib/test/test_capi/test_lock.py deleted file mode 100644 index cfef5a26c4eb61..00000000000000 --- a/Lib/test/test_capi/test_lock.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest -from test.support import import_helper, threading_helper - -# Skip this test if the _testcapi module isn't available. -_testinternalcapi = import_helper.import_module('_testinternalcapi') - -# Lock tests require threads -threading_helper.requires_working_threading(module=True) - - -class PyLockTests(unittest.TestCase): - pass - -for name in sorted(dir(_testinternalcapi)): - if name.startswith('test_lock_'): - setattr(PyLockTests, name, getattr(_testinternalcapi, name)) - -if __name__ == "__main__": - unittest.main() diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 964886ad1ca0d8..6b0237d4971e9f 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2067,7 +2067,15 @@ def test_version_api_data(self): class Test_testinternalcapi(unittest.TestCase): locals().update((name, getattr(_testinternalcapi, name)) for name in dir(_testinternalcapi) - if name.startswith('test_')) + if name.startswith('test_') + and not name.startswith('test_lock_')) + + +@threading_helper.requires_working_threading() +class Test_PyLock(unittest.TestCase): + locals().update((name, getattr(_testinternalcapi, name)) + for name in dir(_testinternalcapi) + if 
name.startswith('test_lock_')) @unittest.skipIf(_testmultiphase is None, "test requires _testmultiphase module") From 779a4019132fda37771de0d60195687f5f854ac3 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 13 Sep 2023 12:04:40 -0700 Subject: [PATCH 10/28] Fix indentation --- Lib/test/test_capi/test_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 6b0237d4971e9f..84cbde75f8bb78 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2073,9 +2073,9 @@ class Test_testinternalcapi(unittest.TestCase): @threading_helper.requires_working_threading() class Test_PyLock(unittest.TestCase): - locals().update((name, getattr(_testinternalcapi, name)) - for name in dir(_testinternalcapi) - if name.startswith('test_lock_')) + locals().update((name, getattr(_testinternalcapi, name)) + for name in dir(_testinternalcapi) + if name.startswith('test_lock_')) @unittest.skipIf(_testmultiphase is None, "test requires _testmultiphase module") From 3200ef5ac37f81e44d3deaa566e746b959954b5e Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:15:14 -0700 Subject: [PATCH 11/28] Rename PyEvent_TimedWait and _PyMutex_LockTimed --- Include/internal/pycore_lock.h | 4 ++-- Python/lock.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 36c76358f5bcae..cd660758f8c360 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -94,7 +94,7 @@ typedef enum _PyLockFlags { // Lock a mutex with an optional timeout and additional options. See // _PyLockFlags for details. extern PyLockStatus -_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags); +_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags); // Unlock a mutex, returns 0 if the mutex is not locked (used for improved // error messages). 
@@ -117,7 +117,7 @@ PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt); // Wait for the event to be set, or until the timeout expires. If the event is // already set, then this returns immediately. Returns 1 if the event was set, // and 0 if the timeout expired or thread was interrupted. -PyAPI_FUNC(int) PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns); +PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns); // _PyRawMutex implements a word-sized mutex that that does not depend on the diff --git a/Python/lock.c b/Python/lock.c index b8a1f64a838118..d5ad7a602052f6 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -48,11 +48,11 @@ _Py_yield(void) void _PyMutex_LockSlow(PyMutex *m) { - _PyMutex_TimedLock(m, -1, _PY_LOCK_DETACH); + _PyMutex_LockTimed(m, -1, _PY_LOCK_DETACH); } PyLockStatus -_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) +_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) { uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); if ((v & _Py_LOCKED) == _Py_UNLOCKED) { @@ -265,12 +265,12 @@ _PyEvent_Notify(PyEvent *evt) void PyEvent_Wait(PyEvent *evt) { - while (!PyEvent_TimedWait(evt, -1)) + while (!PyEvent_WaitTimed(evt, -1)) ; } int -PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns) +PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns) { for (;;) { uint8_t v = _Py_atomic_load_uint8(&evt->v); From 880a2634c6f1d05db84e85c9ed79ffdd690bd6c3 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:22:35 -0700 Subject: [PATCH 12/28] more_waiters -> has_more_waiters --- Include/internal/pycore_parking_lot.h | 4 ++-- Python/lock.c | 2 +- Python/parking_lot.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index 32a289912888f7..e30d4be452be3a 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -60,7 +60,7 @@ struct _PyUnpark { // Are there more threads 
waiting on the address? May be true in cases // where threads are waiting on a different address that maps to the same // internal bucket. - int more_waiters; + int has_more_waiters; }; // Unpark a single thread waiting on `address`. @@ -71,7 +71,7 @@ struct _PyUnpark { // _PyParkingLot_Unpark(address, unpark, { // if (unpark) { // void *arg = unpark->arg; -// int more_waiters = unpark->more_waiters; +// int has_more_waiters = unpark->has_more_waiters; // ... // } // }); diff --git a/Python/lock.c b/Python/lock.c index d5ad7a602052f6..b01665db7d54a9 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -158,7 +158,7 @@ _PyMutex_TryUnlock(PyMutex *m) if (should_be_fair) { v |= _Py_LOCKED; } - if (unpark->more_waiters) { + if (unpark->has_more_waiters) { v |= _Py_HAS_PARKED; } } diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 4ec6657c033ed5..a0542e1973ff0d 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -422,7 +422,7 @@ _PyParkingLot_BeginUnpark(const void *addr) return NULL; } - waiter->unpark.more_waiters = (bucket->num_waiters > 0); + waiter->unpark.has_more_waiters = (bucket->num_waiters > 0); return &waiter->unpark; } From 717b3c9c5a0ca0f9ceb320803b605c2aaa7a8142 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:25:14 -0700 Subject: [PATCH 13/28] Remove _PyMutex_State typedef --- Include/internal/pycore_lock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index cd660758f8c360..61767b87c5494e 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -37,11 +37,11 @@ typedef struct _PyMutex { uint8_t v; } PyMutex; -typedef enum { +enum { _Py_UNLOCKED = 0, _Py_LOCKED = 1, _Py_HAS_PARKED = 2, -} _PyMutex_State; +}; // (private) slow path for locking the mutex PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m); From 23b91b0217913159206c0018bbae976c929cf61a Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 
2023 08:35:39 -0700 Subject: [PATCH 14/28] Update docs --- Include/internal/pycore_parking_lot.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index e30d4be452be3a..d88549aa0dd6ab 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -42,12 +42,24 @@ enum { // with respect to unpark operations. // // The `address_size` argument is the size of the data pointed to by the -// `address` and `expected` pointers (i.e., sizeof(*address)). -// -// `arg`, which can be NULL, is passed to the unpark operation. +// `address` and `expected` pointers (i.e., sizeof(*address)). It must be +// 1, 2, 4, or 8. // // The `timeout_ns` argument specifies the maximum amount of time to wait, with // -1 indicating an infinite wait. +// +// `arg`, which can be NULL, is passed to the unpark operation. +// +// If `detach` is true, then the thread will detach/release the GIL while +// waiting. +// +// Example usage: +// +// if (_Py_atomic_compare_exchange_uint8(address, &expected, new_value)) { +// int res = _PyParkingLot_Park(address, &new_value, sizeof(*address), +// timeout_ns, NULL, 1); +// ... +// } PyAPI_FUNC(int) _PyParkingLot_Park(const void *address, const void *expected, size_t address_size, _PyTime_t timeout_ns, @@ -110,6 +122,8 @@ PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); typedef struct _PySemaphore _PySemaphore; // Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +// If `detach` is true, then the thread will detach/release the GIL while +// sleeping. 
PyAPI_FUNC(int) _PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); From 279c56afee60c1d98476cab179003ff2bbd02c0d Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:50:12 -0700 Subject: [PATCH 15/28] Include cpython/pyatomic.h via pyatomic.h --- Include/Python.h | 2 +- Include/cpython/pyatomic.h | 10 +++------- Include/internal/pycore_parking_lot.h | 1 - Include/pyatomic.h | 16 ++++++++++++++++ Modules/_testcapi/pyatomic.c | 1 - PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 +++ configure | 2 +- configure.ac | 2 +- 9 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 Include/pyatomic.h diff --git a/Include/Python.h b/Include/Python.h index d6429856b4b9f4..7312cc87d5cc33 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -48,7 +48,7 @@ #include "pytypedefs.h" #include "pybuffer.h" #include "pystats.h" -#include "cpython/pyatomic.h" +#include "pyatomic.h" #include "object.h" #include "objimpl.h" #include "typeslots.h" diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h index 066a969e0e486a..ab182381b39f00 100644 --- a/Include/cpython/pyatomic.h +++ b/Include/cpython/pyatomic.h @@ -83,10 +83,9 @@ // # release // ... 
-#ifndef Py_LIMITED_API -#ifndef Py_ATOMIC_H -#define Py_ATOMIC_H - +#ifndef Py_CPYTHON_ATOMIC_H +# error "this header file must not be included directly" +#endif // --- _Py_atomic_add -------------------------------------------------------- // Atomically adds `value` to `obj` and returns the previous value @@ -502,6 +501,3 @@ static inline void _Py_atomic_fence_release(void); #else # error "no available pyatomic implementation for this platform/compiler" #endif - -#endif /* Py_ATOMIC_H */ -#endif /* Py_LIMITED_API */ diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index d88549aa0dd6ab..d6410cb1e2d4a5 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -19,7 +19,6 @@ extern "C" { #endif #include "pycore_time.h" // _PyTime_t -#include "cpython/pyatomic.h" enum { diff --git a/Include/pyatomic.h b/Include/pyatomic.h new file mode 100644 index 00000000000000..2ce2c81cf5251a --- /dev/null +++ b/Include/pyatomic.h @@ -0,0 +1,16 @@ +#ifndef Py_ATOMIC_H +#define Py_ATOMIC_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_ATOMIC_H +# include "cpython/pyatomic.h" +# undef Py_CPYTHON_ATOMIC_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ATOMIC_H */ diff --git a/Modules/_testcapi/pyatomic.c b/Modules/_testcapi/pyatomic.c index 15602ce3f4ab3b..f0be2cfccccc98 100644 --- a/Modules/_testcapi/pyatomic.c +++ b/Modules/_testcapi/pyatomic.c @@ -8,7 +8,6 @@ #undef NDEBUG #include "Python.h" -#include "cpython/pyatomic.h" #include "parts.h" // We define atomic bitwise operations on these types diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index b35aeb58e11460..60126ac085e1f2 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -310,6 +310,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index c2430f00cb7e9a..20d3df3de1f31a 100644 --- 
a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -144,6 +144,9 @@ Include + + Include + Include diff --git a/configure b/configure index 1c9a3fe7d0e4fc..91408efb384de6 100755 --- a/configure +++ b/configure @@ -27793,7 +27793,7 @@ typedef intptr_t Py_ssize_t; # error "unable to define Py_ssize_t" #endif -#include "cpython/pyatomic.h" +#include "pyatomic.h" int main() { diff --git a/configure.ac b/configure.ac index fde8cfb97fcfa7..d1b7c7a1817d67 100644 --- a/configure.ac +++ b/configure.ac @@ -6993,7 +6993,7 @@ typedef intptr_t Py_ssize_t; # error "unable to define Py_ssize_t" #endif -#include "cpython/pyatomic.h" +#include "pyatomic.h" int main() { From 8d8035c65fb93634a788f94c23ec107fe137fff8 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:57:29 -0700 Subject: [PATCH 16/28] Use compound initializer in test_lock.c --- Modules/_testinternalcapi/test_lock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c index fe25417a2f315d..33b49dacaa946e 100644 --- a/Modules/_testinternalcapi/test_lock.c +++ b/Modules/_testinternalcapi/test_lock.c @@ -31,8 +31,7 @@ pysleep(int ms) static PyObject * test_lock_basic(PyObject *self, PyObject *obj) { - PyMutex m; - memset(&m, 0, sizeof(m)); + PyMutex m = (PyMutex){0}; // uncontended lock and unlock PyMutex_Lock(&m); From 417e75499481ee25b5f35e34ba39d8f071299a73 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 11:58:46 -0400 Subject: [PATCH 17/28] Apply suggestions from code review Co-authored-by: Eric Snow --- Include/internal/pycore_lock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 61767b87c5494e..61a0874731e081 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -26,8 +26,7 @@ extern "C" { // 0b11: locked and has parked threads // // Typical 
initialization: -// PyMutex m; -// memset(&m, 0, sizeof(m)); +// PyMutex m = (PyMutex){0}; // // Typical usage: // PyMutex_Lock(&m); @@ -84,7 +83,7 @@ typedef enum _PyLockFlags { // Do not detach/release the GIL when waiting on the lock. _Py_LOCK_DONT_DETACH = 0, - // Detach/release the GIL when waiting on the lock. + // Detach/release the GIL while waiting on the lock. _PY_LOCK_DETACH = 1, // Handle signals if interrupted while waiting on the lock. From 3ee97b1e75cedd65c01e3e2a1b1d6db4104e22e7 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 10:54:13 -0700 Subject: [PATCH 18/28] Changes from review: * Rename _PY_LOCK_MAKE_PENDING_CALLS to _PY_LOCK_HANDLE_SIGNALS * Use define instead of enum for PyMutex states (e.g., _Py_UNLOCKED) --- Include/internal/pycore_lock.h | 10 ++++------ Python/lock.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 61a0874731e081..c4bb76a40e7b12 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -36,11 +36,9 @@ typedef struct _PyMutex { uint8_t v; } PyMutex; -enum { - _Py_UNLOCKED = 0, - _Py_LOCKED = 1, - _Py_HAS_PARKED = 2, -}; +#define _Py_UNLOCKED 0 +#define _Py_LOCKED 1 +#define _Py_HAS_PARKED 2 // (private) slow path for locking the mutex PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m); @@ -87,7 +85,7 @@ typedef enum _PyLockFlags { _PY_LOCK_DETACH = 1, // Handle signals if interrupted while waiting on the lock. - _PY_LOCK_MAKE_PENDING_CALLS = 2, + _PY_LOCK_HANDLE_SIGNALS = 2, } _PyLockFlags; // Lock a mutex with an optional timeout and additional options. 
See diff --git a/Python/lock.c b/Python/lock.c index b01665db7d54a9..4ec76d11c33602 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -114,7 +114,7 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) return PY_LOCK_ACQUIRED; } } - else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_MAKE_PENDING_CALLS)) { + else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_HANDLE_SIGNALS)) { if (Py_MakePendingCalls() < 0) { return PY_LOCK_INTR; } From 95b0d871dd0d2825945187df79ccc2c15ac40734 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 05:33:40 -0700 Subject: [PATCH 19/28] Remove thread-local data in parking_lot.c Move _PySemaphore definition to its own header to avoid leaking platform details in parking_lot.h --- Include/internal/pycore_parking_lot.h | 27 +---- Include/internal/pycore_semaphore.h | 63 ++++++++++++ Makefile.pre.in | 1 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 6 ++ Python/lock.c | 11 ++- Python/parking_lot.c | 137 +++----------------------- Tools/c-analyzer/cpython/ignored.tsv | 1 - 8 files changed, 93 insertions(+), 154 deletions(-) create mode 100644 Include/internal/pycore_semaphore.h diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index d6410cb1e2d4a5..29285d2b980ec8 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -18,7 +18,7 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif -#include "pycore_time.h" // _PyTime_t +#include "pycore_time.h" // _PyTime_t enum { @@ -108,34 +108,9 @@ _PyParkingLot_FinishUnpark(const void *address, struct _PyUnpark *unpark); // Unparks all threads waiting on `address`. PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address); -// Initialize/deinitialize the thread-local state used by parking lot. -void _PyParkingLot_InitThread(void); -void _PyParkingLot_DeinitThread(void); - // Resets the parking lot state after a fork. Forgets all parked threads. 
PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); - -// The _PySemaphore API a simplified cross-platform semaphore used to implement -// parking lot. It is not intended to be used directly by other modules. -typedef struct _PySemaphore _PySemaphore; - -// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. -// If `detach` is true, then the thread will detach/release the GIL while -// sleeping. -PyAPI_FUNC(int) -_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); - -// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() -// can be called before _PySemaphore_Wait(). -PyAPI_FUNC(void) -_PySemaphore_Wakeup(_PySemaphore *sema); - -// Allocates/releases a semaphore from the thread-local pool. -PyAPI_FUNC(_PySemaphore *) _PySemaphore_Alloc(void); -PyAPI_FUNC(void) _PySemaphore_Free(_PySemaphore *sema); - - #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_semaphore.h b/Include/internal/pycore_semaphore.h new file mode 100644 index 00000000000000..c1df8333629066 --- /dev/null +++ b/Include/internal/pycore_semaphore.h @@ -0,0 +1,63 @@ +// The _PySemaphore API a simplified cross-platform semaphore used to implement +// wakeup/sleep. +#ifndef Py_INTERNAL_SEMAPHORE_H +#define Py_INTERNAL_SEMAPHORE_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_time.h" // _PyTime_t + +#ifdef MS_WINDOWS +# define WIN32_LEAN_AND_MEAN +# include +#elif defined(HAVE_PTHREAD_H) +# include +#elif defined(HAVE_PTHREAD_STUBS) +# include "cpython/pthread_stubs.h" +#else +# error "Require native threads. 
See https://bugs.python.org/issue31370" +#endif + +#if defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES+0) != -1 +# define _Py_USE_SEMAPHORES +# include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _PySemaphore { +#if defined(MS_WINDOWS) + HANDLE platform_sem; +#elif defined(_Py_USE_SEMAPHORES) + sem_t platform_sem; +#else + pthread_mutex_t mutex; + pthread_cond_t cond; + int counter; +#endif +} _PySemaphore; + +// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +// If `detach` is true, then the thread will detach/release the GIL while +// sleeping. +PyAPI_FUNC(int) +_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); + +// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() +// can be called before _PySemaphore_Wait(). +PyAPI_FUNC(void) +_PySemaphore_Wakeup(_PySemaphore *sema); + +// Initializes/destroys a semaphore +PyAPI_FUNC(void) _PySemaphore_Init(_PySemaphore *sema); +PyAPI_FUNC(void) _PySemaphore_Destroy(_PySemaphore *sema); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_SEMAPHORE_H */ diff --git a/Makefile.pre.in b/Makefile.pre.in index e634d3f30f857e..ce72f9cee86eef 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1810,6 +1810,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_runtime.h \ $(srcdir)/Include/internal/pycore_runtime_init_generated.h \ $(srcdir)/Include/internal/pycore_runtime_init.h \ + $(srcdir)/Include/internal/pycore_semaphore.h \ $(srcdir)/Include/internal/pycore_setobject.h \ $(srcdir)/Include/internal/pycore_signal.h \ $(srcdir)/Include/internal/pycore_sliceobject.h \ diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 60126ac085e1f2..190eaa16daa8af 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -272,6 +272,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 20d3df3de1f31a..f4fddfdd11f4c1 100644 --- 
a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -681,6 +681,9 @@ Include\internal + + Include\internal + Include\internal @@ -726,6 +729,9 @@ Include\internal + + Include\internal + Include\internal diff --git a/Python/lock.c b/Python/lock.c index 4ec76d11c33602..d220990e8719df 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -4,6 +4,7 @@ #include "pycore_lock.h" #include "pycore_parking_lot.h" +#include "pycore_semaphore.h" #ifdef MS_WINDOWS #define WIN32_LEAN_AND_MEAN @@ -185,14 +186,14 @@ _PyMutex_UnlockSlow(PyMutex *m) // thread waiting on the mutex, directly in the mutex itself. struct raw_mutex_entry { struct raw_mutex_entry *next; - _PySemaphore *sema; + _PySemaphore sema; }; void _PyRawMutex_LockSlow(_PyRawMutex *m) { struct raw_mutex_entry waiter; - waiter.sema = _PySemaphore_Alloc(); + _PySemaphore_Init(&waiter.sema); uintptr_t v = _Py_atomic_load_uintptr(&m->v); for (;;) { @@ -213,10 +214,10 @@ _PyRawMutex_LockSlow(_PyRawMutex *m) // Wait for us to be woken up. Note that we still have to lock the // mutex ourselves: it is NOT handed off to us. 
- _PySemaphore_Wait(waiter.sema, -1, /*detach=*/0); + _PySemaphore_Wait(&waiter.sema, -1, /*detach=*/0); } - _PySemaphore_Free(waiter.sema); + _PySemaphore_Destroy(&waiter.sema); } void @@ -232,7 +233,7 @@ _PyRawMutex_UnlockSlow(_PyRawMutex *m) if (waiter) { uintptr_t next_waiter = (uintptr_t)waiter->next; if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, next_waiter)) { - _PySemaphore_Wakeup(waiter->sema); + _PySemaphore_Wakeup(&waiter->sema); return; } } diff --git a/Python/parking_lot.c b/Python/parking_lot.c index a0542e1973ff0d..d90f7affed94ef 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -5,35 +5,8 @@ #include "pycore_parking_lot.h" #include "pycore_pyerrors.h" // _Py_FatalErrorFormat #include "pycore_pystate.h" // _PyThreadState_GET +#include "pycore_semaphore.h" // _PySemaphore -#ifdef MS_WINDOWS -# define WIN32_LEAN_AND_MEAN -# include -#elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ - defined(HAVE_SEM_TIMEDWAIT)) -# define USE_SEMAPHORES -# include -#elif defined(HAVE_PTHREAD_H) -# include -#elif defined(HAVE_PTHREAD_STUBS) -# include "cpython/pthread_stubs.h" -#else -# error "Require native threads. See https://bugs.python.org/issue31370" -#endif - -// A simple, cross-platform binary semaphore that can be used to implement -// wakeup/sleep. -struct _PySemaphore { -#if defined(MS_WINDOWS) - HANDLE platform_sem; -#elif defined(USE_SEMAPHORES) - sem_t platform_sem; -#else - PyMUTEX_T mutex; - PyCOND_T cond; - int counter; -#endif -}; typedef struct { // The mutex protects the waiter queue and the num_waiters counter. 
@@ -47,31 +20,16 @@ typedef struct { struct wait_entry { struct _PyUnpark unpark; uintptr_t addr; - _PySemaphore *sema; + _PySemaphore sema; struct llist_node node; }; -#define MAX_SEMA_DEPTH 3 - -typedef struct { - Py_ssize_t refcount; - - int depth; - _PySemaphore semas[MAX_SEMA_DEPTH]; -} ThreadData; - #define NUM_BUCKETS 251 // Table of waiters (hashed by address) static Bucket buckets[NUM_BUCKETS]; -#ifdef HAVE_THREAD_LOCAL -static _Py_thread_local ThreadData *thread_data = NULL; -#else -#error "no supported thread-local variable storage classifier" -#endif - -static void +void _PySemaphore_Init(_PySemaphore *sema) { #if defined(MS_WINDOWS) @@ -84,7 +42,7 @@ _PySemaphore_Init(_PySemaphore *sema) if (!sema->platform_sem) { Py_FatalError("parking_lot: CreateSemaphore failed"); } -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) if (sem_init(&sema->platform_sem, /*pshared=*/0, /*value=*/0) < 0) { Py_FatalError("parking_lot: sem_init failed"); } @@ -95,15 +53,16 @@ _PySemaphore_Init(_PySemaphore *sema) if (pthread_cond_init(&sema->cond, NULL)) { Py_FatalError("parking_lot: pthread_cond_init failed"); } + sema->counter = 0; #endif } -static void +void _PySemaphore_Destroy(_PySemaphore *sema) { #if defined(MS_WINDOWS) CloseHandle(sema->platform_sem); -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) sem_destroy(&sema->platform_sem); #else pthread_mutex_destroy(&sema->mutex); @@ -131,7 +90,7 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) else if (wait == WAIT_TIMEOUT) { res = Py_PARK_TIMEOUT; } -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) int err; if (timeout >= 0) { struct timespec ts; @@ -218,7 +177,7 @@ _PySemaphore_Wakeup(_PySemaphore *sema) if (!ReleaseSemaphore(sema->platform_sem, 1, NULL)) { Py_FatalError("parking_lot: ReleaseSemaphore failed"); } -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) int err = sem_post(&sema->platform_sem); if (err != 0) { 
Py_FatalError("parking_lot: sem_post failed"); @@ -231,72 +190,6 @@ _PySemaphore_Wakeup(_PySemaphore *sema) #endif } -_PySemaphore * -_PySemaphore_Alloc(void) -{ - // Make sure we have a valid thread_data. We need to acquire - // some locks before we have a fully initialized PyThreadState. - _PyParkingLot_InitThread(); - - ThreadData *this_thread = thread_data; - if (this_thread->depth >= MAX_SEMA_DEPTH) { - Py_FatalError("_PySemaphore_Alloc(): too many calls"); - } - return &this_thread->semas[this_thread->depth++]; -} - -void -_PySemaphore_Free(_PySemaphore *sema) -{ - ThreadData *this_thread = thread_data; - this_thread->depth--; - if (&this_thread->semas[this_thread->depth] != sema) { - Py_FatalError("_PySemaphore_Free(): mismatch wakeup"); - } - _PyParkingLot_DeinitThread(); -} - -void -_PyParkingLot_InitThread(void) -{ - if (thread_data != NULL) { - thread_data->refcount++; - return; - } - ThreadData *this_thread = PyMem_RawMalloc(sizeof(ThreadData)); - if (this_thread == NULL) { - Py_FatalError("_PyParkingLot_InitThread: unable to allocate thread data"); - } - memset(this_thread, 0, sizeof(*this_thread)); - this_thread->refcount = 1; - this_thread->depth = 0; - for (int i = 0; i < MAX_SEMA_DEPTH; i++) { - _PySemaphore_Init(&this_thread->semas[i]); - } - thread_data = this_thread; -} - -void -_PyParkingLot_DeinitThread(void) -{ - ThreadData *td = thread_data; - if (td == NULL) { - return; - } - - if (--td->refcount != 0) { - assert(td->refcount > 0); - return; - } - - thread_data = NULL; - for (int i = 0; i < MAX_SEMA_DEPTH; i++) { - _PySemaphore_Destroy(&td->semas[i]); - } - - PyMem_RawFree(td); -} - static void enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) { @@ -379,11 +272,11 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, _PyRawMutex_Unlock(&bucket->mutex); return Py_PARK_AGAIN; } - wait.sema = _PySemaphore_Alloc(); + _PySemaphore_Init(&wait.sema); enqueue(bucket, addr, &wait); 
_PyRawMutex_Unlock(&bucket->mutex); - int res = _PySemaphore_Wait(wait.sema, timeout_ns, detach); + int res = _PySemaphore_Wait(&wait.sema, timeout_ns, detach); if (res == Py_PARK_OK) { goto done; } @@ -395,7 +288,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, // We've been removed the waiter queue. Wait until we process the // wakeup signal. do { - res = _PySemaphore_Wait(wait.sema, -1, detach); + res = _PySemaphore_Wait(&wait.sema, -1, detach); } while (res != Py_PARK_OK); goto done; } @@ -406,7 +299,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, _PyRawMutex_Unlock(&bucket->mutex); done: - _PySemaphore_Free(wait.sema); + _PySemaphore_Destroy(&wait.sema); return res; } @@ -439,7 +332,7 @@ _PyParkingLot_FinishUnpark(const void *addr, struct _PyUnpark *unpark) struct wait_entry *waiter; waiter = container_of(unpark, struct wait_entry, unpark); - _PySemaphore_Wakeup(waiter->sema); + _PySemaphore_Wakeup(&waiter->sema); } } @@ -457,7 +350,7 @@ _PyParkingLot_UnparkAll(const void *addr) llist_for_each_safe(node, &head) { struct wait_entry *waiter = llist_data(node, struct wait_entry, node); llist_remove(node); - _PySemaphore_Wakeup(waiter->sema); + _PySemaphore_Wakeup(&waiter->sema); } } diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index 0e0c9980958efb..841cfc72fe631f 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -176,7 +176,6 @@ Python/pyfpe.c - PyFPE_counter - Python/import.c - pkgcontext - Python/pystate.c - _Py_tss_tstate - -Python/parking_lot.c - thread_data - ##----------------------- ## should be const From 3d1d2e166be8caf7b52e0dc06aac1276113f5d16 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 06:34:24 -0700 Subject: [PATCH 20/28] Add doc for NUM_BUCKETS --- Python/parking_lot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 
d90f7affed94ef..fdf4a10fa25ff9 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -24,6 +24,10 @@ struct wait_entry { struct llist_node node; }; +// Prime number to avoid correlations with memory addresses. +// We want this to be roughly proportional to the number of CPU cores +// to minimize contention on the bucket locks, but not too big to avoid +// wasting memory. The exact choice does not matter much. #define NUM_BUCKETS 251 // Table of waiters (hashed by address) From e043a8e6e1a41cc0d9c43ed096169b77030fd95b Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 08:50:26 -0700 Subject: [PATCH 21/28] Make use of Py_PARK_INTR more explicit in _PySemaphore_PlatformWait --- Python/parking_lot.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index fdf4a10fa25ff9..1c3bab5331e145 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -77,7 +77,7 @@ _PySemaphore_Destroy(_PySemaphore *sema) static int _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) { - int res = Py_PARK_INTR; + int res; #if defined(MS_WINDOWS) DWORD wait; DWORD millis = 0; @@ -94,6 +94,9 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) else if (wait == WAIT_TIMEOUT) { res = Py_PARK_TIMEOUT; } + else { + res = Py_PARK_INTR; + } #elif defined(_Py_USE_SEMAPHORES) int err; if (timeout >= 0) { @@ -126,8 +129,8 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) } #else pthread_mutex_lock(&sema->mutex); + int err = 0; if (sema->counter == 0) { - int err; if (timeout >= 0) { struct timespec ts; @@ -139,14 +142,17 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) else { err = pthread_cond_wait(&sema->cond, &sema->mutex); } - if (err) { - res = Py_PARK_TIMEOUT; - } } if (sema->counter > 0) { sema->counter--; res = Py_PARK_OK; } + else if (err) { + res = Py_PARK_TIMEOUT; + } + else { + res = Py_PARK_INTR; + } 
pthread_mutex_unlock(&sema->mutex); #endif return res; From 6b1d8f968e870ac4ae35622901272f1535d5b3ce Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 08:59:11 -0700 Subject: [PATCH 22/28] Statically initialize buckets in parking_lot.c --- Python/parking_lot.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 1c3bab5331e145..a9dbca3b57fbeb 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -28,10 +28,23 @@ struct wait_entry { // We want this to be roughly proportional to the number of CPU cores // to minimize contention on the bucket locks, but not too big to avoid // wasting memory. The exact choice does not matter much. -#define NUM_BUCKETS 251 +#define NUM_BUCKETS 257 + +#define BUCKET_INIT(b, i) [i] = { .root = LLIST_INIT(b[i].root) } +#define BUCKET_INIT_2(b, i) BUCKET_INIT(b, i), BUCKET_INIT(b, i+1) +#define BUCKET_INIT_4(b, i) BUCKET_INIT_2(b, i), BUCKET_INIT_2(b, i+2) +#define BUCKET_INIT_8(b, i) BUCKET_INIT_4(b, i), BUCKET_INIT_4(b, i+4) +#define BUCKET_INIT_16(b, i) BUCKET_INIT_8(b, i), BUCKET_INIT_8(b, i+8) +#define BUCKET_INIT_32(b, i) BUCKET_INIT_16(b, i), BUCKET_INIT_16(b, i+16) +#define BUCKET_INIT_64(b, i) BUCKET_INIT_32(b, i), BUCKET_INIT_32(b, i+32) +#define BUCKET_INIT_128(b, i) BUCKET_INIT_64(b, i), BUCKET_INIT_64(b, i+64) +#define BUCKET_INIT_256(b, i) BUCKET_INIT_128(b, i), BUCKET_INIT_128(b, i+128) // Table of waiters (hashed by address) -static Bucket buckets[NUM_BUCKETS]; +static Bucket buckets[NUM_BUCKETS] = { + BUCKET_INIT_256(buckets, 0), + BUCKET_INIT(buckets, 256), +}; void _PySemaphore_Init(_PySemaphore *sema) @@ -203,10 +216,6 @@ _PySemaphore_Wakeup(_PySemaphore *sema) static void enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) { - if (!bucket->root.next) { - // initialize bucket - llist_init(&bucket->root); - } llist_insert_tail(&bucket->root, &wait->node); ++bucket->num_waiters; 
} @@ -214,13 +223,8 @@ enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) static struct wait_entry * dequeue(Bucket *bucket, const void *address) { - struct llist_node *root = &bucket->root; - if (!root->next) { - // bucket was not yet initialized - return NULL; - } - // find the first waiter that is waiting on `address` + struct llist_node *root = &bucket->root; struct llist_node *node; llist_for_each(node, root) { struct wait_entry *wait = llist_data(node, struct wait_entry, node); @@ -236,13 +240,8 @@ dequeue(Bucket *bucket, const void *address) static void dequeue_all(Bucket *bucket, const void *address, struct llist_node *dst) { - struct llist_node *root = &bucket->root; - if (!root->next) { - // bucket was not yet initialized - return; - } - // remove and append all matching waiters to dst + struct llist_node *root = &bucket->root; struct llist_node *node; llist_for_each_safe(node, root) { struct wait_entry *wait = llist_data(node, struct wait_entry, node); @@ -370,4 +369,7 @@ _PyParkingLot_AfterFork(void) // After a fork only one thread remains. That thread cannot be blocked // so all entries in the parking lot are for dead threads. 
memset(buckets, 0, sizeof(buckets)); + for (Py_ssize_t i = 0; i < NUM_BUCKETS; i++) { + llist_init(&buckets[i].root); + } } From 1af0bffdd340c851b4bddc2a41f81fe195462166 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 09:07:00 -0700 Subject: [PATCH 23/28] Rename validate_addr to atomic_memcmp --- Python/parking_lot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index a9dbca3b57fbeb..e45319a0ebe17b 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -253,9 +253,9 @@ dequeue_all(Bucket *bucket, const void *address, struct llist_node *dst) } } -// Checks that `*addr == *expected` +// Checks that `*addr == *expected` (only works for 1, 2, 4, or 8 bytes) static int -validate_addr(const void *addr, const void *expected, size_t addr_size) +atomic_memcmp(const void *addr, const void *expected, size_t addr_size) { switch (addr_size) { case 1: return _Py_atomic_load_uint8(addr) == *(const uint8_t *)expected; @@ -277,7 +277,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; _PyRawMutex_Lock(&bucket->mutex); - if (!validate_addr(addr, expected, size)) { + if (!atomic_memcmp(addr, expected, size)) { _PyRawMutex_Unlock(&bucket->mutex); return Py_PARK_AGAIN; } From f2073d00ddbde9de6812cb9a623870280778d34e Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 09:18:57 -0700 Subject: [PATCH 24/28] Add is_unparking to struct wait_entry in parking_lot.c --- Python/parking_lot.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index e45319a0ebe17b..33247fb5b38afe 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -7,6 +7,8 @@ #include "pycore_pystate.h" // _PyThreadState_GET #include "pycore_semaphore.h" // _PySemaphore +#include + typedef struct { // The mutex protects the waiter queue and the 
num_waiters counter. @@ -22,6 +24,7 @@ struct wait_entry { uintptr_t addr; _PySemaphore sema; struct llist_node node; + bool is_unparking; }; // Prime number to avoid correlations with memory addresses. @@ -270,9 +273,11 @@ int _PyParkingLot_Park(const void *addr, const void *expected, size_t size, _PyTime_t timeout_ns, void *arg, int detach) { - struct wait_entry wait; - wait.unpark.arg = arg; - wait.addr = (uintptr_t)addr; + struct wait_entry wait = { + .unpark.arg = arg, + .addr = (uintptr_t)addr, + .is_unparking = false, + }; Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; @@ -292,9 +297,9 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, // timeout or interrupt _PyRawMutex_Lock(&bucket->mutex); - if (wait.node.next == NULL) { + if (wait.is_unparking) { _PyRawMutex_Unlock(&bucket->mutex); - // We've been removed the waiter queue. Wait until we process the + // Another thread has started to unpark us. Wait until we process the // wakeup signal. do { res = _PySemaphore_Wait(&wait.sema, -1, detach); @@ -324,6 +329,7 @@ _PyParkingLot_BeginUnpark(const void *addr) return NULL; } + waiter->is_unparking = true; waiter->unpark.has_more_waiters = (bucket->num_waiters > 0); return &waiter->unpark; } From 8f1645b6818a0028951a97ce87216539c44cf6c1 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 10:00:14 -0700 Subject: [PATCH 25/28] Use callback in _PyParkingLot_Unpark --- Include/internal/pycore_parking_lot.h | 64 ++++++++++----------------- Python/lock.c | 37 +++++++++------- Python/parking_lot.c | 42 +++++++----------- 3 files changed, 60 insertions(+), 83 deletions(-) diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index 29285d2b980ec8..f444da730055e8 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -37,8 +37,8 @@ enum { // Checks that `*address == *expected` and puts the thread to sleep until an // unpark operation is called on 
the same `address`. Otherwise, the function -// returns `Py_PARK_AGAIN`. The comparison is performed atomically -// with respect to unpark operations. +// returns `Py_PARK_AGAIN`. The comparison behaves like memcmp, but is +// performed atomically with respect to unpark operations. // // The `address_size` argument is the size of the data pointed to by the // `address` and `expected` pointers (i.e., sizeof(*address)). It must be @@ -47,7 +47,7 @@ enum { // The `timeout_ns` argument specifies the maximum amount of time to wait, with // -1 indicating an infinite wait. // -// `arg`, which can be NULL, is passed to the unpark operation. +// `park_arg`, which can be NULL, is passed to the unpark operation. // // If `detach` is true, then the thread will detach/release the GIL while // waiting. @@ -62,48 +62,30 @@ enum { PyAPI_FUNC(int) _PyParkingLot_Park(const void *address, const void *expected, size_t address_size, _PyTime_t timeout_ns, - void *arg, int detach); + void *park_arg, int detach); -struct _PyUnpark { - // The `arg` value passed to _PyParkingLot_Park(). - void *arg; - - // Are there more threads waiting on the address? May be true in cases - // where threads are waiting on a different address that maps to the same - // internal bucket. - int has_more_waiters; -}; - -// Unpark a single thread waiting on `address`. +// Callback for _PyParkingLot_Unpark: // -// The `unpark` is a pointer to a `struct _PyUnpark`. +// `arg` is the data of the same name provided to the _PyParkingLot_Unpark() +// call. +// `park_arg` is the data provided to the _PyParkingLot_Park() call or NULL if +// no waiting thread was found. +// `has_more_waiters` is true if there are more threads waiting on the same +// address. May be true in cases where threads are waiting on a different +// address that maps to the same internal bucket. +typedef void _Py_unpark_fn_t(void *arg, void *park_arg, int has_more_waiters); + +// Unparks a single thread waiting on `address`. 
// -// Usage: -// _PyParkingLot_Unpark(address, unpark, { -// if (unpark) { -// void *arg = unpark->arg; -// int has_more_waiters = unpark->has_more_waiters; -// ... -// } -// }); -#define _PyParkingLot_Unpark(address, unpark, ...) \ - do { \ - struct _PyUnpark *(unpark); \ - unpark = _PyParkingLot_BeginUnpark((address)); \ - __VA_ARGS__ \ - _PyParkingLot_FinishUnpark((address), unpark); \ - } while (0); - -// Implements half of an unpark operation. -// Prefer using the _PyParkingLot_Unpark() macro. -PyAPI_FUNC(struct _PyUnpark *) -_PyParkingLot_BeginUnpark(const void *address); - -// Finishes the unpark operation and wakes up the thread selected by -// _PyParkingLot_BeginUnpark. -// Prefer using the _PyParkingLot_Unpark() macro. +// Note that fn() is called regardless of whether a thread was unparked. If +// no threads are waiting on `address` then the `park_arg` argument to fn() +// will be NULL. +// +// Example usage: +// void callback(void *arg, void *park_arg, int has_more_waiters); +// _PyParkingLot_Unpark(address, &callback, arg); PyAPI_FUNC(void) -_PyParkingLot_FinishUnpark(const void *address, struct _PyUnpark *unpark); +_PyParkingLot_Unpark(const void *address, _Py_unpark_fn_t *fn, void *arg); // Unparks all threads waiting on `address`. 
PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address); diff --git a/Python/lock.c b/Python/lock.c index d220990e8719df..da364f37a6f057 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -137,6 +137,25 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) } } +static void +mutex_unpark(PyMutex *m, struct mutex_entry *entry, int has_more_waiters) +{ + uint8_t v = 0; + if (entry) { + _PyTime_t now = _PyTime_GetMonotonicClock(); + int should_be_fair = now > entry->time_to_be_fair; + + entry->handoff = should_be_fair; + if (should_be_fair) { + v |= _Py_LOCKED; + } + if (has_more_waiters) { + v |= _Py_HAS_PARKED; + } + } + _Py_atomic_store_uint8(&m->v, v); +} + int _PyMutex_TryUnlock(PyMutex *m) { @@ -148,23 +167,7 @@ _PyMutex_TryUnlock(PyMutex *m) } else if ((v & _Py_HAS_PARKED)) { // wake up a single thread - _PyParkingLot_Unpark(&m->v, unpark, { - v = 0; - if (unpark) { - struct mutex_entry *entry = unpark->arg; - _PyTime_t now = _PyTime_GetMonotonicClock(); - int should_be_fair = now > entry->time_to_be_fair; - - entry->handoff = should_be_fair; - if (should_be_fair) { - v |= _Py_LOCKED; - } - if (unpark->has_more_waiters) { - v |= _Py_HAS_PARKED; - } - } - _Py_atomic_store_uint8(&m->v, v); - }); + _PyParkingLot_Unpark(&m->v, (_Py_unpark_fn_t *)mutex_unpark, m); return 0; } else if (_Py_atomic_compare_exchange_uint8(&m->v, &v, _Py_UNLOCKED)) { diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 33247fb5b38afe..170108b0edb6ae 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -20,7 +20,7 @@ typedef struct { } Bucket; struct wait_entry { - struct _PyUnpark unpark; + void *park_arg; uintptr_t addr; _PySemaphore sema; struct llist_node node; @@ -271,10 +271,10 @@ atomic_memcmp(const void *addr, const void *expected, size_t addr_size) int _PyParkingLot_Park(const void *addr, const void *expected, size_t size, - _PyTime_t timeout_ns, void *arg, int detach) + _PyTime_t timeout_ns, void *park_arg, int detach) { struct 
wait_entry wait = { - .unpark.arg = arg, + .park_arg = park_arg, .addr = (uintptr_t)addr, .is_unparking = false, }; @@ -318,35 +318,27 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, } -struct _PyUnpark * -_PyParkingLot_BeginUnpark(const void *addr) +void +_PyParkingLot_Unpark(const void *addr, _Py_unpark_fn_t *fn, void *arg) { Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; - _PyRawMutex_Lock(&bucket->mutex); + // Find the first waiter that is waiting on `addr` + _PyRawMutex_Lock(&bucket->mutex); struct wait_entry *waiter = dequeue(bucket, addr); - if (!waiter) { - return NULL; - } - - waiter->is_unparking = true; - waiter->unpark.has_more_waiters = (bucket->num_waiters > 0); - return &waiter->unpark; -} - -#define container_of(ptr, type, member) \ - ((type *)((char *)(ptr) - offsetof(type, member))) + if (waiter) { + waiter->is_unparking = true; -void -_PyParkingLot_FinishUnpark(const void *addr, struct _PyUnpark *unpark) -{ - Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + int has_more_waiters = (bucket->num_waiters > 0); + fn(arg, waiter->park_arg, has_more_waiters); + } + else { + fn(arg, NULL, 0); + } _PyRawMutex_Unlock(&bucket->mutex); - if (unpark) { - struct wait_entry *waiter; - waiter = container_of(unpark, struct wait_entry, unpark); - + if (waiter) { + // Wakeup the waiter outside of the bucket lock _PySemaphore_Wakeup(&waiter->sema); } } From a0261c16511566c8a80381ee5b50fd825d21460a Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 10:12:28 -0700 Subject: [PATCH 26/28] Simplify _PySemaphore_Wait. Release the GIL if `detach` is `1` and there is a current thread state. It's the caller's responsibility to make sure that if there is a valid thread state (and `detach` is 1), then the current thread holds the GIL. 
--- Python/parking_lot.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 170108b0edb6ae..664e622cc17474 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -177,21 +177,18 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) int _PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout, int detach) { - PyThreadState *tstate = _PyThreadState_GET(); - int was_attached = 0; - if (tstate) { - was_attached = (tstate->_status.active); - if (was_attached && detach) { + PyThreadState *tstate = NULL; + if (detach) { + tstate = _PyThreadState_GET(); + if (tstate) { PyEval_ReleaseThread(tstate); } } int res = _PySemaphore_PlatformWait(sema, timeout); - if (tstate) { - if (was_attached && detach) { - PyEval_AcquireThread(tstate); - } + if (detach && tstate) { + PyEval_AcquireThread(tstate); } return res; } From 6d4565de6c389d1b62a62ca9360904ea650743f6 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 14:51:32 -0700 Subject: [PATCH 27/28] check-c-globals: increase limit for parking_lot.c --- Tools/c-analyzer/cpython/_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index 90334d0e79da80..4523b2ed5b9fdf 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -314,6 +314,7 @@ def clean_lines(text): _abs('Objects/stringlib/unicode_format.h'): (10_000, 400), _abs('Objects/typeobject.c'): (35_000, 200), _abs('Python/compile.c'): (20_000, 500), + _abs('Python/parking_lot.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (500_000, 5000), _abs('Python/pystate.c'): (500_000, 5000), From a8ce6be7026f14f9f8e788bc2380eb58cd8ad7ef Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 18 Sep 2023 06:43:45 -0700 Subject: [PATCH 28/28] Changes from review --- Python/lock.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff 
--git a/Python/lock.c b/Python/lock.c index da364f37a6f057..3dad2aa93b5cc9 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -28,12 +28,12 @@ static const int MAX_SPIN_COUNT = 0; #endif struct mutex_entry { - // The time at which the thread should be handed off the lock. Written by - // the waiting thread. + // The time after which the unlocking thread should hand off lock ownership + // directly to the waiting thread. Written by the waiting thread. _PyTime_t time_to_be_fair; // Set to 1 if the lock was handed off. Written by the unlocking thread. - int handoff; + int handed_off; }; static void @@ -56,7 +56,7 @@ PyLockStatus _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) { uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { return PY_LOCK_ACQUIRED; } @@ -73,12 +73,12 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) struct mutex_entry entry = { .time_to_be_fair = now + TIME_TO_BE_FAIR_NS, - .handoff = 0, + .handed_off = 0, }; Py_ssize_t spin_count = 0; for (;;) { - if (!(v & _Py_LOCKED)) { + if ((v & _Py_LOCKED) == 0) { // The lock is unlocked. Try to grab it. if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { return PY_LOCK_ACQUIRED; @@ -109,7 +109,7 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) int ret = _PyParkingLot_Park(&m->v, &newv, sizeof(newv), timeout, &entry, (flags & _PY_LOCK_DETACH) != 0); if (ret == Py_PARK_OK) { - if (entry.handoff) { + if (entry.handed_off) { // We own the lock now. 
assert(_Py_atomic_load_uint8_relaxed(&m->v) & _Py_LOCKED); return PY_LOCK_ACQUIRED; @@ -145,7 +145,7 @@ mutex_unpark(PyMutex *m, struct mutex_entry *entry, int has_more_waiters) _PyTime_t now = _PyTime_GetMonotonicClock(); int should_be_fair = now > entry->time_to_be_fair; - entry->handoff = should_be_fair; + entry->handed_off = should_be_fair; if (should_be_fair) { v |= _Py_LOCKED; } @@ -161,7 +161,7 @@ _PyMutex_TryUnlock(PyMutex *m) { uint8_t v = _Py_atomic_load_uint8(&m->v); for (;;) { - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { // error: the mutex is not locked return -1; } @@ -200,7 +200,7 @@ _PyRawMutex_LockSlow(_PyRawMutex *m) uintptr_t v = _Py_atomic_load_uintptr(&m->v); for (;;) { - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { // Unlocked: try to grab it (even if it has a waiter). if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, v|_Py_LOCKED)) { break; @@ -228,7 +228,7 @@ _PyRawMutex_UnlockSlow(_PyRawMutex *m) { uintptr_t v = _Py_atomic_load_uintptr(&m->v); for (;;) { - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { Py_FatalError("unlocking mutex that is not locked"); }