From 410840a806ea462c94d7c3baaad1a963138ce6ed Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 24 Aug 2023 12:57:04 -0700 Subject: [PATCH 01/28] gh-108724: Add PyMutex and _PyParkingLot APIs `PyMutex` is a one byte lock with fast, inlineable lock and unlock functions for the common uncontended case. The design is based on WebKit's `WTF::Lock`. PyMutex is built using the `_PyParkingLot` APIs, which provides a cross-platform futex-like API (based on WebKit's `WTF::ParkingLot`). This internal API will be used for building other synchronization primitives used to implement PEP 703, such as one-time initialization and events. --- Include/Python.h | 1 + Include/cpython/pyatomic.h | 3 +- Include/internal/pycore_llist.h | 107 ++++ Include/internal/pycore_lock.h | 161 ++++++ Include/internal/pycore_parking_lot.h | 129 +++++ Lib/test/test_capi/test_lock.py | 15 + Makefile.pre.in | 5 + Modules/Setup.stdlib.in | 2 +- Modules/_testcapi/parts.h | 1 + Modules/_testinternalcapi.c | 3 + .../_testinternalcapi/clinic/test_lock.c.h | 74 +++ Modules/_testinternalcapi/parts.h | 1 + Modules/_testinternalcapi/test_lock.c | 353 +++++++++++++ PCbuild/_testinternalcapi.vcxproj | 1 + PCbuild/_testinternalcapi.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 5 + PCbuild/pythoncore.vcxproj.filters | 12 + Python/lock.c | 292 +++++++++++ Python/parking_lot.c | 466 ++++++++++++++++++ Python/pystate.c | 5 + Tools/lockbench/lockbench.py | 53 ++ 21 files changed, 1690 insertions(+), 2 deletions(-) create mode 100644 Include/internal/pycore_llist.h create mode 100644 Include/internal/pycore_lock.h create mode 100644 Include/internal/pycore_parking_lot.h create mode 100644 Lib/test/test_capi/test_lock.py create mode 100644 Modules/_testinternalcapi/clinic/test_lock.c.h create mode 100644 Modules/_testinternalcapi/test_lock.c create mode 100644 Python/lock.c create mode 100644 Python/parking_lot.c create mode 100644 Tools/lockbench/lockbench.py diff --git a/Include/Python.h b/Include/Python.h index 
8b28200000ab56..d6429856b4b9f4 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -48,6 +48,7 @@ #include "pytypedefs.h" #include "pybuffer.h" #include "pystats.h" +#include "cpython/pyatomic.h" #include "object.h" #include "objimpl.h" #include "typeslots.h" diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h index 73712db847087d..066a969e0e486a 100644 --- a/Include/cpython/pyatomic.h +++ b/Include/cpython/pyatomic.h @@ -83,6 +83,7 @@ // # release // ... +#ifndef Py_LIMITED_API #ifndef Py_ATOMIC_H #define Py_ATOMIC_H @@ -503,4 +504,4 @@ static inline void _Py_atomic_fence_release(void); #endif #endif /* Py_ATOMIC_H */ - +#endif /* Py_LIMITED_API */ diff --git a/Include/internal/pycore_llist.h b/Include/internal/pycore_llist.h new file mode 100644 index 00000000000000..5fd261da05fa5d --- /dev/null +++ b/Include/internal/pycore_llist.h @@ -0,0 +1,107 @@ +// A doubly-linked list that can be embedded in a struct. +// +// Usage: +// struct llist_node head = LLIST_INIT(head); +// typedef struct { +// ... +// struct llist_node node; +// ... +// } MyObj; +// +// llist_insert_tail(&head, &obj->node); +// llist_remove(&obj->node); +// +// struct llist_node *node; +// llist_for_each(node, &head) { +// MyObj *obj = llist_data(node, MyObj, node); +// ... +// } +// + +#ifndef Py_INTERNAL_LLIST_H +#define Py_INTERNAL_LLIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "Py_BUILD_CORE must be defined to include this header" +#endif + +struct llist_node { + struct llist_node *next; + struct llist_node *prev; +}; + +// Get the struct containing a node. +#define llist_data(node, type, member) \ + (type*)((char*)node - offsetof(type, member)) + +// Iterate over a list. +#define llist_for_each(node, head) \ + for (node = (head)->next; node != (head); node = node->next) + +// Iterate over a list, but allow removal of the current node. 
+#define llist_for_each_safe(node, head) \ + for (struct llist_node *_next = (node = (head)->next, node->next); \ + node != (head); node = _next, _next = node->next) + +#define LLIST_INIT(head) { &head, &head } + +static inline void +llist_init(struct llist_node *head) +{ + head->next = head; + head->prev = head; +} + +// Returns 1 if the list is empty, 0 otherwise. +static inline int +llist_empty(struct llist_node *head) +{ + return head->next == head; +} + +// Appends to the tail of the list. +static inline void +llist_insert_tail(struct llist_node *head, struct llist_node *node) +{ + node->prev = head->prev; + node->next = head; + head->prev->next = node; + head->prev = node; +} + +// Remove a node from the list. +static inline void +llist_remove(struct llist_node *node) +{ + struct llist_node *prev = node->prev; + struct llist_node *next = node->next; + prev->next = next; + next->prev = prev; + node->prev = NULL; + node->next = NULL; +} + +// Append all nodes from head2 onto head1. head2 is left empty. +static inline void +llist_concat(struct llist_node *head1, struct llist_node *head2) +{ + if (!llist_empty(head2)) { + head1->prev->next = head2->next; + head2->next->prev = head1->prev; + + head1->prev = head2->prev; + head2->prev->next = head1; + llist_init(head2); + } +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_LLIST_H */ diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h new file mode 100644 index 00000000000000..36c76358f5bcae --- /dev/null +++ b/Include/internal/pycore_lock.h @@ -0,0 +1,161 @@ +// Lightweight locks and other synchronization mechanisms. +// +// These implementations are based on WebKit's WTF::Lock. See +// https://webkit.org/blog/6161/locking-in-webkit/ for a description of the +// design. 
+#ifndef Py_INTERNAL_LOCK_H +#define Py_INTERNAL_LOCK_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_time.h" // _PyTime_t + + +// A mutex that occupies one byte. The lock can be zero initialized. +// +// Only the two least significant bits are used. The remaining bits should be +// zero: +// 0b00: unlocked +// 0b01: locked +// 0b10: unlocked and has parked threads +// 0b11: locked and has parked threads +// +// Typical initialization: +// PyMutex m; +// memset(&m, 0, sizeof(m)); +// +// Typical usage: +// PyMutex_Lock(&m); +// ... +// PyMutex_Unlock(&m); +typedef struct _PyMutex { + uint8_t v; +} PyMutex; + +typedef enum { + _Py_UNLOCKED = 0, + _Py_LOCKED = 1, + _Py_HAS_PARKED = 2, +} _PyMutex_State; + +// (private) slow path for locking the mutex +PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m); + +// (private) slow path for unlocking the mutex +PyAPI_FUNC(void) _PyMutex_UnlockSlow(PyMutex *m); + +// Locks the mutex. +// +// If the mutex is currently locked, the calling thread will be parked until +// the mutex is unlocked. If the current thread holds the GIL, then the GIL +// will be released while the thread is parked. +static inline void +PyMutex_Lock(PyMutex *m) +{ + uint8_t expected = _Py_UNLOCKED; + if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_LOCKED)) { + _PyMutex_LockSlow(m); + } +} + +// Unlocks the mutex. +static inline void +PyMutex_Unlock(PyMutex *m) +{ + uint8_t expected = _Py_LOCKED; + if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_UNLOCKED)) { + _PyMutex_UnlockSlow(m); + } +} + +// Checks if the mutex is currently locked. +static inline int +PyMutex_IsLocked(PyMutex *m) +{ + return (_Py_atomic_load_uint8(&m->v) & _Py_LOCKED) != 0; +} + +typedef enum _PyLockFlags { + // Do not detach/release the GIL when waiting on the lock. + _Py_LOCK_DONT_DETACH = 0, + + // Detach/release the GIL when waiting on the lock. 
+ _PY_LOCK_DETACH = 1, + + // Handle signals if interrupted while waiting on the lock. + _PY_LOCK_MAKE_PENDING_CALLS = 2, +} _PyLockFlags; + +// Lock a mutex with an optional timeout and additional options. See +// _PyLockFlags for details. +extern PyLockStatus +_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags); + +// Unlock a mutex, returns 0 if the mutex is not locked (used for improved +// error messages). +extern int _PyMutex_TryUnlock(PyMutex *m); + + +// PyEvent is a one-time event notification +typedef struct { + uint8_t v; +} PyEvent; + +// Set the event and notify any waiting threads. +// Export for '_testinternalcapi' shared extension +PyAPI_FUNC(void) _PyEvent_Notify(PyEvent *evt); + +// Wait for the event to be set. If the event is already set, then this returns +// immediately. +PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt); + +// Wait for the event to be set, or until the timeout expires. If the event is +// already set, then this returns immediately. Returns 1 if the event was set, +// and 0 if the timeout expired or thread was interrupted. +PyAPI_FUNC(int) PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns); + + +// _PyRawMutex implements a word-sized mutex that that does not depend on the +// parking lot API, and therefore can be used in the parking lot +// implementation. +// +// The mutex uses a packed representation: the least significant bit is used to +// indicate whether the mutex is locked or not. The remaining bits are either +// zero or a pointer to a `struct raw_mutex_entry` (see lock.c). 
+typedef struct { + uintptr_t v; +} _PyRawMutex; + +// Slow paths for lock/unlock +extern void _PyRawMutex_LockSlow(_PyRawMutex *m); +extern void _PyRawMutex_UnlockSlow(_PyRawMutex *m); + +static inline void +_PyRawMutex_Lock(_PyRawMutex *m) +{ + uintptr_t unlocked = _Py_UNLOCKED; + if (_Py_atomic_compare_exchange_uintptr(&m->v, &unlocked, _Py_LOCKED)) { + return; + } + _PyRawMutex_LockSlow(m); +} + +static inline void +_PyRawMutex_Unlock(_PyRawMutex *m) +{ + uintptr_t locked = _Py_LOCKED; + if (_Py_atomic_compare_exchange_uintptr(&m->v, &locked, _Py_UNLOCKED)) { + return; + } + _PyRawMutex_UnlockSlow(m); +} + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_LOCK_H */ diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h new file mode 100644 index 00000000000000..32a289912888f7 --- /dev/null +++ b/Include/internal/pycore_parking_lot.h @@ -0,0 +1,129 @@ +// ParkingLot is an internal API for building efficient synchronization +// primitives like mutexes and events. +// +// The API and name is inspired by WebKit's WTF::ParkingLot, which in turn +// is inspired Linux's futex API. +// See https://webkit.org/blog/6161/locking-in-webkit/. +// +// The core functionality is an atomic "compare-and-sleep" operation along with +// an atomic "wake-up" operation. + +#ifndef Py_INTERNAL_PARKING_LOT_H +#define Py_INTERNAL_PARKING_LOT_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_time.h" // _PyTime_t +#include "cpython/pyatomic.h" + + +enum { + // The thread was unparked by another thread. + Py_PARK_OK = 0, + + // The value of `address` did not match `expected`. + Py_PARK_AGAIN = -1, + + // The thread was unparked due to a timeout. + Py_PARK_TIMEOUT = -2, + + // The thread was interrupted by a signal. 
+ Py_PARK_INTR = -3, +}; + +// Checks that `*address == *expected` and puts the thread to sleep until an +// unpark operation is called on the same `address`. Otherwise, the function +// returns `Py_PARK_AGAIN`. The comparison is performed atomically +// with respect to unpark operations. +// +// The `address_size` argument is the size of the data pointed to by the +// `address` and `expected` pointers (i.e., sizeof(*address)). +// +// `arg`, which can be NULL, is passed to the unpark operation. +// +// The `timeout_ns` argument specifies the maximum amount of time to wait, with +// -1 indicating an infinite wait. +PyAPI_FUNC(int) +_PyParkingLot_Park(const void *address, const void *expected, + size_t address_size, _PyTime_t timeout_ns, + void *arg, int detach); + +struct _PyUnpark { + // The `arg` value passed to _PyParkingLot_Park(). + void *arg; + + // Are there more threads waiting on the address? May be true in cases + // where threads are waiting on a different address that maps to the same + // internal bucket. + int more_waiters; +}; + +// Unpark a single thread waiting on `address`. +// +// The `unpark` is a pointer to a `struct _PyUnpark`. +// +// Usage: +// _PyParkingLot_Unpark(address, unpark, { +// if (unpark) { +// void *arg = unpark->arg; +// int more_waiters = unpark->more_waiters; +// ... +// } +// }); +#define _PyParkingLot_Unpark(address, unpark, ...) \ + do { \ + struct _PyUnpark *(unpark); \ + unpark = _PyParkingLot_BeginUnpark((address)); \ + __VA_ARGS__ \ + _PyParkingLot_FinishUnpark((address), unpark); \ + } while (0); + +// Implements half of an unpark operation. +// Prefer using the _PyParkingLot_Unpark() macro. +PyAPI_FUNC(struct _PyUnpark *) +_PyParkingLot_BeginUnpark(const void *address); + +// Finishes the unpark operation and wakes up the thread selected by +// _PyParkingLot_BeginUnpark. +// Prefer using the _PyParkingLot_Unpark() macro. 
+PyAPI_FUNC(void) +_PyParkingLot_FinishUnpark(const void *address, struct _PyUnpark *unpark); + +// Unparks all threads waiting on `address`. +PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address); + +// Initialize/deinitialize the thread-local state used by parking lot. +void _PyParkingLot_InitThread(void); +void _PyParkingLot_DeinitThread(void); + +// Resets the parking lot state after a fork. Forgets all parked threads. +PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); + + +// The _PySemaphore API a simplified cross-platform semaphore used to implement +// parking lot. It is not intended to be used directly by other modules. +typedef struct _PySemaphore _PySemaphore; + +// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +PyAPI_FUNC(int) +_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); + +// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() +// can be called before _PySemaphore_Wait(). +PyAPI_FUNC(void) +_PySemaphore_Wakeup(_PySemaphore *sema); + +// Allocates/releases a semaphore from the thread-local pool. +PyAPI_FUNC(_PySemaphore *) _PySemaphore_Alloc(void); +PyAPI_FUNC(void) _PySemaphore_Free(_PySemaphore *sema); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_PARKING_LOT_H */ diff --git a/Lib/test/test_capi/test_lock.py b/Lib/test/test_capi/test_lock.py new file mode 100644 index 00000000000000..756fd8a5ce793b --- /dev/null +++ b/Lib/test/test_capi/test_lock.py @@ -0,0 +1,15 @@ +import unittest +from test.support import import_helper + +# Skip this test if the _testcapi module isn't available. 
+_testinternalcapi = import_helper.import_module('_testinternalcapi') + +class PyAtomicTests(unittest.TestCase): + pass + +for name in sorted(dir(_testinternalcapi)): + if name.startswith('test_lock_'): + setattr(PyAtomicTests, name, getattr(_testinternalcapi, name)) + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index 19a802997838a4..e634d3f30f857e 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -400,12 +400,14 @@ PYTHON_OBJS= \ Python/instrumentation.o \ Python/intrinsics.o \ Python/legacy_tracing.o \ + Python/lock.o \ Python/marshal.o \ Python/modsupport.o \ Python/mysnprintf.o \ Python/mystrtoul.o \ Python/optimizer.o \ Python/optimizer_analysis.o \ + Python/parking_lot.o \ Python/pathconfig.o \ Python/preconfig.o \ Python/pyarena.o \ @@ -1779,6 +1781,8 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_interp.h \ $(srcdir)/Include/internal/pycore_intrinsics.h \ $(srcdir)/Include/internal/pycore_list.h \ + $(srcdir)/Include/internal/pycore_llist.h \ + $(srcdir)/Include/internal/pycore_lock.h \ $(srcdir)/Include/internal/pycore_long.h \ $(srcdir)/Include/internal/pycore_modsupport.h \ $(srcdir)/Include/internal/pycore_moduleobject.h \ @@ -1790,6 +1794,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_opcode_metadata.h \ $(srcdir)/Include/internal/pycore_opcode_utils.h \ $(srcdir)/Include/internal/pycore_optimizer.h \ + $(srcdir)/Include/internal/pycore_parking_lot.h \ $(srcdir)/Include/internal/pycore_pathconfig.h \ $(srcdir)/Include/internal/pycore_pyarena.h \ $(srcdir)/Include/internal/pycore_pyerrors.h \ diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index 56c1badf6b44a0..7b3216a50bb284 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -158,7 +158,7 @@ @MODULE_XXSUBTYPE_TRUE@xxsubtype xxsubtype.c @MODULE__XXTESTFUZZ_TRUE@_xxtestfuzz _xxtestfuzz/_xxtestfuzz.c _xxtestfuzz/fuzzer.c @MODULE__TESTBUFFER_TRUE@_testbuffer _testbuffer.c 
-@MODULE__TESTINTERNALCAPI_TRUE@_testinternalcapi _testinternalcapi.c _testinternalcapi/pytime.c +@MODULE__TESTINTERNALCAPI_TRUE@_testinternalcapi _testinternalcapi.c _testinternalcapi/test_lock.c _testinternalcapi/pytime.c @MODULE__TESTCAPI_TRUE@_testcapi _testcapimodule.c _testcapi/vectorcall.c _testcapi/vectorcall_limited.c _testcapi/heaptype.c _testcapi/abstract.c _testcapi/unicode.c _testcapi/dict.c _testcapi/getargs.c _testcapi/datetime.c _testcapi/docstring.c _testcapi/mem.c _testcapi/watchers.c _testcapi/long.c _testcapi/float.c _testcapi/structmember.c _testcapi/exceptions.c _testcapi/code.c _testcapi/buffer.c _testcapi/pyatomic.c _testcapi/pyos.c _testcapi/immortal.c _testcapi/heaptype_relative.c _testcapi/gc.c @MODULE__TESTCLINIC_TRUE@_testclinic _testclinic.c @MODULE__TESTCLINIC_LIMITED_TRUE@_testclinic_limited _testclinic_limited.c diff --git a/Modules/_testcapi/parts.h b/Modules/_testcapi/parts.h index c162dbc65db81a..a21bc381d709db 100644 --- a/Modules/_testcapi/parts.h +++ b/Modules/_testcapi/parts.h @@ -27,6 +27,7 @@ int _PyTestCapi_Init_PyOS(PyObject *module); int _PyTestCapi_Init_Immortal(PyObject *module); int _PyTestCapi_Init_GC(PyObject *mod); + int _PyTestCapi_Init_VectorcallLimited(PyObject *module); int _PyTestCapi_Init_HeaptypeRelative(PyObject *module); diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 922672d1a9f915..934e3637a9164d 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1543,6 +1543,9 @@ static PyMethodDef module_functions[] = { static int module_exec(PyObject *module) { + if (_PyTestInternalCapi_Init_Lock(module) < 0) { + return 1; + } if (_PyTestInternalCapi_Init_PyTime(module) < 0) { return 1; } diff --git a/Modules/_testinternalcapi/clinic/test_lock.c.h b/Modules/_testinternalcapi/clinic/test_lock.c.h new file mode 100644 index 00000000000000..3cbe5ef12c5fa6 --- /dev/null +++ b/Modules/_testinternalcapi/clinic/test_lock.c.h @@ -0,0 +1,74 @@ +/*[clinic input] +preserve 
+[clinic start generated code]*/ + +#include "pycore_abstract.h" // _PyNumber_Index() + +PyDoc_STRVAR(_testinternalcapi_benchmark_locks__doc__, +"benchmark_locks($module, num_threads, use_pymutex=True,\n" +" critical_section_length=1, time_ms=1000, /)\n" +"--\n" +"\n"); + +#define _TESTINTERNALCAPI_BENCHMARK_LOCKS_METHODDEF \ + {"benchmark_locks", _PyCFunction_CAST(_testinternalcapi_benchmark_locks), METH_FASTCALL, _testinternalcapi_benchmark_locks__doc__}, + +static PyObject * +_testinternalcapi_benchmark_locks_impl(PyObject *module, + Py_ssize_t num_threads, + int use_pymutex, + int critical_section_length, + int time_ms); + +static PyObject * +_testinternalcapi_benchmark_locks(PyObject *module, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + Py_ssize_t num_threads; + int use_pymutex = 1; + int critical_section_length = 1; + int time_ms = 1000; + + if (!_PyArg_CheckPositional("benchmark_locks", nargs, 1, 4)) { + goto exit; + } + { + Py_ssize_t ival = -1; + PyObject *iobj = _PyNumber_Index(args[0]); + if (iobj != NULL) { + ival = PyLong_AsSsize_t(iobj); + Py_DECREF(iobj); + } + if (ival == -1 && PyErr_Occurred()) { + goto exit; + } + num_threads = ival; + } + if (nargs < 2) { + goto skip_optional; + } + use_pymutex = PyObject_IsTrue(args[1]); + if (use_pymutex < 0) { + goto exit; + } + if (nargs < 3) { + goto skip_optional; + } + critical_section_length = PyLong_AsInt(args[2]); + if (critical_section_length == -1 && PyErr_Occurred()) { + goto exit; + } + if (nargs < 4) { + goto skip_optional; + } + time_ms = PyLong_AsInt(args[3]); + if (time_ms == -1 && PyErr_Occurred()) { + goto exit; + } +skip_optional: + return_value = _testinternalcapi_benchmark_locks_impl(module, num_threads, use_pymutex, critical_section_length, time_ms); + +exit: + return return_value; +} +/*[clinic end generated code: output=97c85dff601fed4b input=a9049054013a1b77]*/ diff --git a/Modules/_testinternalcapi/parts.h b/Modules/_testinternalcapi/parts.h index 
43e7714b235156..bbb8e62ddaf7a2 100644 --- a/Modules/_testinternalcapi/parts.h +++ b/Modules/_testinternalcapi/parts.h @@ -10,6 +10,7 @@ #include "Python.h" +int _PyTestInternalCapi_Init_Lock(PyObject *module); int _PyTestInternalCapi_Init_PyTime(PyObject *module); #endif // Py_TESTINTERNALCAPI_PARTS_H diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c new file mode 100644 index 00000000000000..adfb694908c836 --- /dev/null +++ b/Modules/_testinternalcapi/test_lock.c @@ -0,0 +1,353 @@ +// C Extension module to test pycore_lock.h API + +#include "parts.h" + +#include "pycore_lock.h" +#include "clinic/test_lock.c.h" + +#ifdef _WIN32 +#include +#else +#include // usleep() +#endif + +/*[clinic input] +module _testinternalcapi +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=7bb583d8c9eb9a78]*/ + + +static void +pysleep(int ms) +{ +#ifdef _WIN32 + Sleep(ms); +#else + usleep(ms * 1000); +#endif +} + +static PyObject * +test_lock_basic(PyObject *self, PyObject *obj) +{ + PyMutex m; + memset(&m, 0, sizeof(m)); + + // uncontended lock and unlock + PyMutex_Lock(&m); + assert(m.v == 1); + PyMutex_Unlock(&m); + assert(m.v == 0); + + Py_RETURN_NONE; +} + +struct test_lock2_data { + PyMutex m; + PyEvent done; + int started; +}; + +static void +lock_thread(void *arg) +{ + struct test_lock2_data *test_data = arg; + PyMutex *m = &test_data->m; + _Py_atomic_store_int(&test_data->started, 1); + + PyMutex_Lock(m); + assert(m->v == 1); + + PyMutex_Unlock(m); + assert(m->v == 0); + + _PyEvent_Notify(&test_data->done); +} + +static PyObject * +test_lock_two_threads(PyObject *self, PyObject *obj) +{ + // lock attempt by two threads + struct test_lock2_data test_data; + memset(&test_data, 0, sizeof(test_data)); + + PyMutex_Lock(&test_data.m); + assert(test_data.m.v == 1); + + PyThread_start_new_thread(lock_thread, &test_data); + while (!_Py_atomic_load_int(&test_data.started)) { + pysleep(10); + } + 
pysleep(10); // allow some time for the other thread to try to lock + assert(test_data.m.v == 3); + + PyMutex_Unlock(&test_data.m); + PyEvent_Wait(&test_data.done); + assert(test_data.m.v == 0); + + Py_RETURN_NONE; +} + +#define COUNTER_THREADS 5 +#define COUNTER_ITERS 10000 + +struct test_data_counter { + PyMutex m; + Py_ssize_t counter; +}; + +struct thread_data_counter { + struct test_data_counter *test_data; + PyEvent done_event; +}; + +static void +counter_thread(void *arg) +{ + struct thread_data_counter *thread_data = arg; + struct test_data_counter *test_data = thread_data->test_data; + + for (Py_ssize_t i = 0; i < COUNTER_ITERS; i++) { + PyMutex_Lock(&test_data->m); + test_data->counter++; + PyMutex_Unlock(&test_data->m); + } + _PyEvent_Notify(&thread_data->done_event); +} + +static PyObject * +test_lock_counter(PyObject *self, PyObject *obj) +{ + // Test with rapidly locking and unlocking mutex + struct test_data_counter test_data; + memset(&test_data, 0, sizeof(test_data)); + + struct thread_data_counter thread_data[COUNTER_THREADS]; + memset(&thread_data, 0, sizeof(thread_data)); + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + thread_data[i].test_data = &test_data; + PyThread_start_new_thread(counter_thread, &thread_data[i]); + } + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + PyEvent_Wait(&thread_data[i].done_event); + } + + assert(test_data.counter == COUNTER_THREADS * COUNTER_ITERS); + Py_RETURN_NONE; +} + +#define SLOW_COUNTER_ITERS 100 + +static void +slow_counter_thread(void *arg) +{ + struct thread_data_counter *thread_data = arg; + struct test_data_counter *test_data = thread_data->test_data; + + for (Py_ssize_t i = 0; i < SLOW_COUNTER_ITERS; i++) { + PyMutex_Lock(&test_data->m); + if (i % 7 == 0) { + pysleep(2); + } + test_data->counter++; + PyMutex_Unlock(&test_data->m); + } + _PyEvent_Notify(&thread_data->done_event); +} + +static PyObject * +test_lock_counter_slow(PyObject *self, PyObject *obj) +{ + // Test lock/unlock with 
occasional "long" critical section, which will + // trigger handoff of the lock. + struct test_data_counter test_data; + memset(&test_data, 0, sizeof(test_data)); + + struct thread_data_counter thread_data[COUNTER_THREADS]; + memset(&thread_data, 0, sizeof(thread_data)); + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + thread_data[i].test_data = &test_data; + PyThread_start_new_thread(slow_counter_thread, &thread_data[i]); + } + + for (Py_ssize_t i = 0; i < COUNTER_THREADS; i++) { + PyEvent_Wait(&thread_data[i].done_event); + } + + assert(test_data.counter == COUNTER_THREADS * SLOW_COUNTER_ITERS); + Py_RETURN_NONE; +} + +struct bench_data_locks { + int stop; + int use_pymutex; + int critical_section_length; + char padding[200]; + PyThread_type_lock lock; + PyMutex m; + double value; + Py_ssize_t total_iters; +}; + +struct bench_thread_data { + struct bench_data_locks *bench_data; + Py_ssize_t iters; + PyEvent done; +}; + +static void +thread_benchmark_locks(void *arg) +{ + struct bench_thread_data *thread_data = arg; + struct bench_data_locks *bench_data = thread_data->bench_data; + int use_pymutex = bench_data->use_pymutex; + int critical_section_length = bench_data->critical_section_length; + + double my_value = 1.0; + Py_ssize_t iters = 0; + while (!_Py_atomic_load_int_relaxed(&bench_data->stop)) { + if (use_pymutex) { + PyMutex_Lock(&bench_data->m); + for (int i = 0; i < critical_section_length; i++) { + bench_data->value += my_value; + my_value = bench_data->value; + } + PyMutex_Unlock(&bench_data->m); + } + else { + PyThread_acquire_lock(bench_data->lock, 1); + for (int i = 0; i < critical_section_length; i++) { + bench_data->value += my_value; + my_value = bench_data->value; + } + PyThread_release_lock(bench_data->lock); + } + iters++; + } + + thread_data->iters = iters; + _Py_atomic_add_ssize(&bench_data->total_iters, iters); + _PyEvent_Notify(&thread_data->done); +} + +/*[clinic input] +_testinternalcapi.benchmark_locks + + num_threads: Py_ssize_t 
+ use_pymutex: bool = True + critical_section_length: int = 1 + time_ms: int = 1000 + / + +[clinic start generated code]*/ + +static PyObject * +_testinternalcapi_benchmark_locks_impl(PyObject *module, + Py_ssize_t num_threads, + int use_pymutex, + int critical_section_length, + int time_ms) +/*[clinic end generated code: output=381df8d7e9a74f18 input=f3aeaf688738c121]*/ +{ + // Run from Tools/lockbench/lockbench.py + // Based on the WebKit lock benchmarks: + // https://github.com/WebKit/WebKit/blob/main/Source/WTF/benchmarks/LockSpeedTest.cpp + // See also https://webkit.org/blog/6161/locking-in-webkit/ + PyObject *thread_iters = NULL; + PyObject *res = NULL; + + struct bench_data_locks bench_data; + memset(&bench_data, 0, sizeof(bench_data)); + bench_data.use_pymutex = use_pymutex; + bench_data.critical_section_length = critical_section_length; + + bench_data.lock = PyThread_allocate_lock(); + if (bench_data.lock == NULL) { + return PyErr_NoMemory(); + } + + struct bench_thread_data *thread_data = NULL; + thread_data = PyMem_Calloc(num_threads, sizeof(*thread_data)); + if (thread_data == NULL) { + PyErr_NoMemory(); + goto exit; + } + + thread_iters = PyList_New(num_threads); + if (thread_iters == NULL) { + goto exit; + } + + _PyTime_t start = _PyTime_GetMonotonicClock(); + + for (Py_ssize_t i = 0; i < num_threads; i++) { + thread_data[i].bench_data = &bench_data; + PyThread_start_new_thread(thread_benchmark_locks, &thread_data[i]); + } + + // Let the threads run for `time_ms` milliseconds + pysleep(time_ms); + _Py_atomic_store_int(&bench_data.stop, 1); + + // Wait for the threads to finish + for (Py_ssize_t i = 0; i < num_threads; i++) { + PyEvent_Wait(&thread_data[i].done); + } + + Py_ssize_t total_iters = bench_data.total_iters; + _PyTime_t end = _PyTime_GetMonotonicClock(); + + // Return the total number of acquisitions and the number of acquisitions + // for each thread. 
+ for (Py_ssize_t i = 0; i < num_threads; i++) { + PyObject *iter = PyLong_FromSsize_t(thread_data[i].iters); + if (iter == NULL) { + goto exit; + } + PyList_SET_ITEM(thread_iters, i, iter); + } + + double rate = total_iters * 1000000000.0 / (end - start); + res = Py_BuildValue("(dO)", rate, thread_iters); + +exit: + PyThread_free_lock(bench_data.lock); + PyMem_Free(thread_data); + Py_XDECREF(thread_iters); + return res; +} + +static PyObject * +test_lock_benchmark(PyObject *module, PyObject *obj) +{ + // Just make sure the benchmark runs without crashing + PyObject *res = _testinternalcapi_benchmark_locks_impl( + module, 1, 1, 1, 100); + if (res == NULL) { + return NULL; + } + Py_DECREF(res); + Py_RETURN_NONE; +} + +static PyMethodDef test_methods[] = { + {"test_lock_basic", test_lock_basic, METH_NOARGS}, + {"test_lock_two_threads", test_lock_two_threads, METH_NOARGS}, + {"test_lock_counter", test_lock_counter, METH_NOARGS}, + {"test_lock_counter_slow", test_lock_counter_slow, METH_NOARGS}, + _TESTINTERNALCAPI_BENCHMARK_LOCKS_METHODDEF + {"test_lock_benchmark", test_lock_benchmark, METH_NOARGS}, + {NULL, NULL} /* sentinel */ +}; + +int +_PyTestInternalCapi_Init_Lock(PyObject *mod) +{ + if (PyModule_AddFunctions(mod, test_methods) < 0) { + return -1; + } + return 0; +} diff --git a/PCbuild/_testinternalcapi.vcxproj b/PCbuild/_testinternalcapi.vcxproj index 59491c644b6655..fb474f06f38fe8 100644 --- a/PCbuild/_testinternalcapi.vcxproj +++ b/PCbuild/_testinternalcapi.vcxproj @@ -95,6 +95,7 @@ + diff --git a/PCbuild/_testinternalcapi.vcxproj.filters b/PCbuild/_testinternalcapi.vcxproj.filters index 21a66a2aa79f76..9c8a5d793ee0f4 100644 --- a/PCbuild/_testinternalcapi.vcxproj.filters +++ b/PCbuild/_testinternalcapi.vcxproj.filters @@ -15,6 +15,9 @@ Source Files + + Source Files + diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 04752a8029acc2..b35aeb58e11460 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -245,6 
+245,8 @@ + + @@ -254,6 +256,7 @@ + @@ -552,12 +555,14 @@ + + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 4ad02778466925..c2430f00cb7e9a 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -645,6 +645,12 @@ Include\internal + + Include\internal + + + Include\internal + Include\internal @@ -1241,6 +1247,9 @@ Source Files + + Source Files + Python @@ -1259,6 +1268,9 @@ Python + + Python + Python diff --git a/Python/lock.c b/Python/lock.c new file mode 100644 index 00000000000000..fce4c4c4363f2d --- /dev/null +++ b/Python/lock.c @@ -0,0 +1,292 @@ +// Lock implementation + +#include "Python.h" + +#include "pycore_lock.h" +#include "pycore_parking_lot.h" + +#ifdef _WIN32 +#include // SwitchToThread() +#elif defined(HAVE_SCHED_H) +#include // sched_yield() +#endif + +// If a thread waits on a lock for longer than TIME_TO_BE_FAIR_NS (1 ms), then +// the unlocking thread directly hands off ownership of the lock. This avoids +// starvation. +static const _PyTime_t TIME_TO_BE_FAIR_NS = 1000*1000; + +// Spin for a bit before parking the thread. This is only enabled for +// `--disable-gil` builds because it is unlikely to be helpful if the GIL is +// enabled. +#if Py_NOGIL +static const int MAX_SPIN_COUNT = 40; +#else +static const int MAX_SPIN_COUNT = 0; +#endif + +struct mutex_entry { + // The time at which the thread should be handed off the lock. Written by + // the waiting thread. + _PyTime_t time_to_be_fair; + + // Set to 1 if the lock was handed off. Written by the unlocking thread. 
+ int handoff; +}; + +static void +_Py_yield(void) +{ +#ifdef _WIN32 + SwitchToThread(); +#elif defined(HAVE_SCHED_H) + sched_yield(); +#endif +} + +void +_PyMutex_LockSlow(PyMutex *m) +{ + _PyMutex_TimedLock(m, -1, _PY_LOCK_DETACH); +} + +PyLockStatus +_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) +{ + uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { + return PY_LOCK_ACQUIRED; + } + } + else if (timeout == 0) { + return PY_LOCK_FAILURE; + } + + _PyTime_t now = _PyTime_GetMonotonicClock(); + _PyTime_t endtime = 0; + if (timeout > 0) { + endtime = _PyTime_Add(now, timeout); + } + + struct mutex_entry entry = { + .time_to_be_fair = now + TIME_TO_BE_FAIR_NS, + .handoff = 0, + }; + + Py_ssize_t spin_count = 0; + for (;;) { + if (!(v & _Py_LOCKED)) { + // The lock is unlocked. Try to grab it. + if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { + return PY_LOCK_ACQUIRED; + } + continue; + } + + if (!(v & _Py_HAS_PARKED) && spin_count < MAX_SPIN_COUNT) { + // Spin for a bit. + _Py_yield(); + spin_count++; + continue; + } + + if (timeout == 0) { + return PY_LOCK_FAILURE; + } + + uint8_t newv = v; + if (!(v & _Py_HAS_PARKED)) { + // We are the first waiter. Set the _Py_HAS_PARKED flag. + newv = v | _Py_HAS_PARKED; + if (!_Py_atomic_compare_exchange_uint8(&m->v, &v, newv)) { + continue; + } + } + + int ret = _PyParkingLot_Park(&m->v, &newv, sizeof(newv), timeout, + &entry, (flags & _PY_LOCK_DETACH) != 0); + if (ret == Py_PARK_OK) { + if (entry.handoff) { + // We own the lock now. 
+ assert(_Py_atomic_load_uint8_relaxed(&m->v) & _Py_LOCKED); + return PY_LOCK_ACQUIRED; + } + } + else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_MAKE_PENDING_CALLS)) { + if (Py_MakePendingCalls() < 0) { + return PY_LOCK_INTR; + } + } + else if (ret == Py_PARK_TIMEOUT) { + assert(timeout >= 0); + return PY_LOCK_FAILURE; + } + + if (timeout > 0) { + timeout = _PyDeadline_Get(endtime); + if (timeout <= 0) { + // Avoid negative values because those mean block forever. + timeout = 0; + } + } + + v = _Py_atomic_load_uint8_relaxed(&m->v); + } +} + +int +_PyMutex_TryUnlock(PyMutex *m) +{ + uint8_t v = _Py_atomic_load_uint8(&m->v); + for (;;) { + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + // error: the mutex is not locked + return -1; + } + else if ((v & _Py_HAS_PARKED)) { + // wake up a single thread + _PyParkingLot_Unpark(&m->v, unpark, { + v = 0; + if (unpark) { + struct mutex_entry *entry = unpark->arg; + _PyTime_t now = _PyTime_GetMonotonicClock(); + int should_be_fair = now > entry->time_to_be_fair; + + entry->handoff = should_be_fair; + if (should_be_fair) { + v |= _Py_LOCKED; + } + if (unpark->more_waiters) { + v |= _Py_HAS_PARKED; + } + } + _Py_atomic_store_uint8(&m->v, v); + }); + return 0; + } + else if (_Py_atomic_compare_exchange_uint8(&m->v, &v, _Py_UNLOCKED)) { + // fast-path: no waiters + return 0; + } + } +} + +void +_PyMutex_UnlockSlow(PyMutex *m) +{ + if (_PyMutex_TryUnlock(m) < 0) { + Py_FatalError("unlocking mutex that is not locked"); + } +} + +// _PyRawMutex stores a linked list of `struct raw_mutex_entry`, one for each +// thread waiting on the mutex, directly in the mutex itself. +struct raw_mutex_entry { + struct raw_mutex_entry *next; + _PySemaphore *sema; +}; + +void +_PyRawMutex_LockSlow(_PyRawMutex *m) +{ + struct raw_mutex_entry waiter; + waiter.sema = _PySemaphore_Alloc(); + + uintptr_t v = _Py_atomic_load_uintptr(&m->v); + for (;;) { + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + // Unlocked: try to grab it (even if it has a waiter). 
+ if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, v|_Py_LOCKED)) { + break; + } + continue; + } + + // Locked: try to add ourselves as a waiter. + waiter.next = (struct raw_mutex_entry *)(v & ~1); + uintptr_t desired = ((uintptr_t)&waiter)|_Py_LOCKED; + if (!_Py_atomic_compare_exchange_uintptr(&m->v, &v, desired)) { + continue; + } + + // Wait for us to be woken up. Note that we still have to lock the + // mutex ourselves: it is NOT handed off to us. + _PySemaphore_Wait(waiter.sema, -1, /*detach=*/0); + } + + _PySemaphore_Free(waiter.sema); +} + +void +_PyRawMutex_UnlockSlow(_PyRawMutex *m) +{ + uintptr_t v = _Py_atomic_load_uintptr(&m->v); + for (;;) { + if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + Py_FatalError("unlocking mutex that is not locked"); + } + + struct raw_mutex_entry *waiter = (struct raw_mutex_entry *)(v & ~1); + if (waiter) { + uintptr_t next_waiter = (uintptr_t)waiter->next; + if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, next_waiter)) { + _PySemaphore_Wakeup(waiter->sema); + return; + } + } + else { + if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, _Py_UNLOCKED)) { + return; + } + } + } +} + +void +_PyEvent_Notify(PyEvent *evt) +{ + uintptr_t v = _Py_atomic_exchange_uint8(&evt->v, _Py_LOCKED); + if (v == _Py_UNLOCKED) { + // no waiters + return; + } + else if (v == _Py_LOCKED) { + // event already set + return; + } + else { + assert(v == _Py_HAS_PARKED); + _PyParkingLot_UnparkAll(&evt->v); + } +} + +void +PyEvent_Wait(PyEvent *evt) +{ + while (!PyEvent_TimedWait(evt, -1)) + ; +} + +int +PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns) +{ + for (;;) { + uint8_t v = _Py_atomic_load_uint8(&evt->v); + if (v == _Py_LOCKED) { + // event already set + return 1; + } + if (v == _Py_UNLOCKED) { + if (!_Py_atomic_compare_exchange_uint8(&evt->v, &v, _Py_HAS_PARKED)) { + continue; + } + } + + uint8_t expected = _Py_HAS_PARKED; + (void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v), + timeout_ns, NULL, 1); + + return 
_Py_atomic_load_uint8(&evt->v) == _Py_LOCKED; + } +} diff --git a/Python/parking_lot.c b/Python/parking_lot.c new file mode 100644 index 00000000000000..8e6995653ce6f3 --- /dev/null +++ b/Python/parking_lot.c @@ -0,0 +1,466 @@ +#include "Python.h" + +#include "pycore_llist.h" +#include "pycore_lock.h" // _PyRawMutex +#include "pycore_parking_lot.h" +#include "pycore_pyerrors.h" // _Py_FatalErrorFormat +#include "pycore_pystate.h" // _PyThreadState_GET + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ + defined(HAVE_SEM_TIMEDWAIT)) +#define USE_SEMAPHORES +#include +#else +#include +#endif + +// A simple, cross-platform binary semaphore that can be used to implement +// wakeup/sleep. +struct _PySemaphore { +#if defined(_WIN32) + HANDLE platform_sem; +#elif defined(USE_SEMAPHORES) + sem_t platform_sem; +#else + PyMUTEX_T mutex; + PyCOND_T cond; + int counter; +#endif +}; + +typedef struct { + // The mutex protects the waiter queue and the num_waiters counter. + _PyRawMutex mutex; + + // Linked list of `struct wait_entry` waiters in this bucket. 
+ struct llist_node root; + size_t num_waiters; +} Bucket; + +struct wait_entry { + struct _PyUnpark unpark; + uintptr_t addr; + _PySemaphore *sema; + struct llist_node node; +}; + +#define MAX_SEMA_DEPTH 3 + +typedef struct { + Py_ssize_t refcount; + + int depth; + _PySemaphore semas[MAX_SEMA_DEPTH]; +} ThreadData; + +#define NUM_BUCKETS 251 + +// Table of waiters (hashed by address) +static Bucket buckets[NUM_BUCKETS]; + +#ifdef HAVE_THREAD_LOCAL +_Py_thread_local ThreadData *thread_data = NULL; +#else +#error "no supported thread-local variable storage classifier" +#endif + +static void +_PySemaphore_Init(_PySemaphore *sema) +{ +#if defined(_WIN32) + sema->platform_sem = CreateSemaphore( + NULL, // attributes + 0, // initial count + 10, // maximum count + NULL // unnamed + ); + if (!sema->platform_sem) { + Py_FatalError("parking_lot: CreateSemaphore failed"); + } +#elif defined(USE_SEMAPHORES) + if (sem_init(&sema->platform_sem, /*pshared=*/0, /*value=*/0) < 0) { + Py_FatalError("parking_lot: sem_init failed"); + } +#else + if (pthread_mutex_init(&sema->mutex, NULL) != 0) { + Py_FatalError("parking_lot: pthread_mutex_init failed"); + } + if (pthread_cond_init(&sema->cond, NULL)) { + Py_FatalError("parking_lot: pthread_cond_init failed"); + } +#endif +} + +static void +_PySemaphore_Destroy(_PySemaphore *sema) +{ +#if defined(_WIN32) + CloseHandle(sema->platform_sem); +#elif defined(USE_SEMAPHORES) + sem_destroy(&sema->platform_sem); +#else + pthread_mutex_destroy(&sema->mutex); + pthread_cond_destroy(&sema->cond); +#endif +} + +static int +_PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) +{ + int res = Py_PARK_INTR; +#if defined(_WIN32) + DWORD wait; + DWORD millis = 0; + if (timeout < 0) { + millis = INFINITE; + } + else { + millis = (DWORD) (timeout / 1000000); + } + wait = WaitForSingleObjectEx(sema->platform_sem, millis, FALSE); + if (wait == WAIT_OBJECT_0) { + res = Py_PARK_OK; + } + else if (wait == WAIT_TIMEOUT) { + res = Py_PARK_TIMEOUT; + 
} +#elif defined(USE_SEMAPHORES) + int err; + if (timeout >= 0) { + struct timespec ts; + + _PyTime_t deadline = _PyTime_Add(_PyTime_GetSystemClock(), timeout); + _PyTime_AsTimespec(deadline, &ts); + + err = sem_timedwait(&sema->platform_sem, &ts); + } + else { + err = sem_wait(&sema->platform_sem); + } + if (err == -1) { + err = errno; + if (err == EINTR) { + res = Py_PARK_INTR; + } + else if (err == ETIMEDOUT) { + res = Py_PARK_TIMEOUT; + } + else { + _Py_FatalErrorFormat(__func__, + "unexpected error from semaphore: %d", + err); + } + } + else { + res = Py_PARK_OK; + } +#else + pthread_mutex_lock(&sema->mutex); + if (sema->counter == 0) { + int err; + if (timeout >= 0) { + struct timespec ts; + + _PyTime_t deadline = _PyTime_Add(_PyTime_GetSystemClock(), timeout); + _PyTime_AsTimespec(deadline, &ts); + + err = pthread_cond_timedwait(&sema->cond, &sema->mutex, &ts); + } + else { + err = pthread_cond_wait(&sema->cond, &sema->mutex); + } + if (err) { + res = Py_PARK_TIMEOUT; + } + } + if (sema->counter > 0) { + sema->counter--; + res = Py_PARK_OK; + } + pthread_mutex_unlock(&sema->mutex); +#endif + return res; +} + +int +_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout, int detach) +{ + PyThreadState *tstate = _PyThreadState_GET(); + int was_attached = 0; + if (tstate) { + was_attached = (tstate->_status.active); + if (was_attached && detach) { + PyEval_ReleaseThread(tstate); + } + } + + int res = _PySemaphore_PlatformWait(sema, timeout); + + if (tstate) { + if (was_attached && detach) { + PyEval_AcquireThread(tstate); + } + } + return res; +} + +void +_PySemaphore_Wakeup(_PySemaphore *sema) +{ +#if defined(_WIN32) + if (!ReleaseSemaphore(sema->platform_sem, 1, NULL)) { + Py_FatalError("parking_lot: ReleaseSemaphore failed"); + } +#elif defined(USE_SEMAPHORES) + int err = sem_post(&sema->platform_sem); + if (err != 0) { + Py_FatalError("parking_lot: sem_post failed"); + } +#else + pthread_mutex_lock(&sema->mutex); + sema->counter++; + 
pthread_cond_signal(&sema->cond); + pthread_mutex_unlock(&sema->mutex); +#endif +} + +_PySemaphore * +_PySemaphore_Alloc(void) +{ + // Make sure we have a valid thread_data. We need to acquire + // some locks before we have a fully initialized PyThreadState. + _PyParkingLot_InitThread(); + + ThreadData *this_thread = thread_data; + if (this_thread->depth >= MAX_SEMA_DEPTH) { + Py_FatalError("_PySemaphore_Alloc(): too many calls"); + } + return &this_thread->semas[this_thread->depth++]; +} + +void +_PySemaphore_Free(_PySemaphore *sema) +{ + ThreadData *this_thread = thread_data; + this_thread->depth--; + if (&this_thread->semas[this_thread->depth] != sema) { + Py_FatalError("_PySemaphore_Free(): mismatch wakeup"); + } + _PyParkingLot_DeinitThread(); +} + +void +_PyParkingLot_InitThread(void) +{ + if (thread_data != NULL) { + thread_data->refcount++; + return; + } + ThreadData *this_thread = PyMem_RawMalloc(sizeof(ThreadData)); + if (this_thread == NULL) { + Py_FatalError("_PyParkingLot_InitThread: unable to allocate thread data"); + } + memset(this_thread, 0, sizeof(*this_thread)); + this_thread->refcount = 1; + this_thread->depth = 0; + for (int i = 0; i < MAX_SEMA_DEPTH; i++) { + _PySemaphore_Init(&this_thread->semas[i]); + } + thread_data = this_thread; +} + +void +_PyParkingLot_DeinitThread(void) +{ + ThreadData *td = thread_data; + if (td == NULL) { + return; + } + + if (--td->refcount != 0) { + assert(td->refcount > 0); + return; + } + + thread_data = NULL; + for (int i = 0; i < MAX_SEMA_DEPTH; i++) { + _PySemaphore_Destroy(&td->semas[i]); + } + + PyMem_RawFree(td); +} + +static void +enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) +{ + if (!bucket->root.next) { + // initialize bucket + llist_init(&bucket->root); + } + llist_insert_tail(&bucket->root, &wait->node); + ++bucket->num_waiters; +} + +static struct wait_entry * +dequeue(Bucket *bucket, const void *address) +{ + struct llist_node *root = &bucket->root; + if (!root->next) { + // 
bucket was not yet initialized + return NULL; + } + + // find the first waiter that is waiting on `address` + struct llist_node *node; + llist_for_each(node, root) { + struct wait_entry *wait = llist_data(node, struct wait_entry, node); + if (wait->addr == (uintptr_t)address) { + llist_remove(node); + --bucket->num_waiters; + return wait; + } + } + return NULL; +} + +static void +dequeue_all(Bucket *bucket, const void *address, struct llist_node *dst) +{ + struct llist_node *root = &bucket->root; + if (!root->next) { + // bucket was not yet initialized + return; + } + + // remove and append all matching waiters to dst + struct llist_node *node; + llist_for_each_safe(node, root) { + struct wait_entry *wait = llist_data(node, struct wait_entry, node); + if (wait->addr == (uintptr_t)address) { + llist_remove(node); + llist_insert_tail(dst, node); + --bucket->num_waiters; + } + } +} + +// Checks that `*addr == *expected` +static int +validate_addr(const void *addr, const void *expected, size_t addr_size) +{ + switch (addr_size) { + case 1: return _Py_atomic_load_uint8(addr) == *(const uint8_t *)expected; + case 2: return _Py_atomic_load_uint16(addr) == *(const uint16_t *)expected; + case 4: return _Py_atomic_load_uint32(addr) == *(const uint32_t *)expected; + case 8: return _Py_atomic_load_uint64(addr) == *(const uint64_t *)expected; + default: Py_UNREACHABLE(); + } +} + +int +_PyParkingLot_Park(const void *addr, const void *expected, size_t size, + _PyTime_t timeout_ns, void *arg, int detach) +{ + struct wait_entry wait; + wait.unpark.arg = arg; + wait.addr = (uintptr_t)addr; + + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + + _PyRawMutex_Lock(&bucket->mutex); + if (!validate_addr(addr, expected, size)) { + _PyRawMutex_Unlock(&bucket->mutex); + return Py_PARK_AGAIN; + } + wait.sema = _PySemaphore_Alloc(); + enqueue(bucket, addr, &wait); + _PyRawMutex_Unlock(&bucket->mutex); + + int res = _PySemaphore_Wait(wait.sema, timeout_ns, detach); + if (res == 
Py_PARK_OK) { + goto done; + } + + // timeout or interrupt + _PyRawMutex_Lock(&bucket->mutex); + if (wait.node.next == NULL) { + _PyRawMutex_Unlock(&bucket->mutex); + // We've been removed the waiter queue. Wait until we process the + // wakeup signal. + do { + res = _PySemaphore_Wait(wait.sema, -1, detach); + } while (res != Py_PARK_OK); + goto done; + } + else { + llist_remove(&wait.node); + --bucket->num_waiters; + } + _PyRawMutex_Unlock(&bucket->mutex); + +done: + _PySemaphore_Free(wait.sema); + return res; + +} + +struct _PyUnpark * +_PyParkingLot_BeginUnpark(const void *addr) +{ + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + _PyRawMutex_Lock(&bucket->mutex); + + struct wait_entry *waiter = dequeue(bucket, addr); + if (!waiter) { + return NULL; + } + + waiter->unpark.more_waiters = (bucket->num_waiters > 0); + return &waiter->unpark; +} + +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - offsetof(type, member))) + +void +_PyParkingLot_FinishUnpark(const void *addr, struct _PyUnpark *unpark) +{ + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + _PyRawMutex_Unlock(&bucket->mutex); + + if (unpark) { + struct wait_entry *waiter; + waiter = container_of(unpark, struct wait_entry, unpark); + + _PySemaphore_Wakeup(waiter->sema); + } +} + +void +_PyParkingLot_UnparkAll(const void *addr) +{ + struct llist_node head = LLIST_INIT(head); + Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + + _PyRawMutex_Lock(&bucket->mutex); + dequeue_all(bucket, addr, &head); + _PyRawMutex_Unlock(&bucket->mutex); + + struct llist_node *node; + llist_for_each_safe(node, &head) { + struct wait_entry *waiter = llist_data(node, struct wait_entry, node); + llist_remove(node); + _PySemaphore_Wakeup(waiter->sema); + } +} + +void +_PyParkingLot_AfterFork(void) +{ + // After a fork only one thread remains. That thread cannot be blocked + // so all entries in the parking lot are for dead threads. 
+ memset(buckets, 0, sizeof(buckets)); +} diff --git a/Python/pystate.c b/Python/pystate.c index b5c4fd7fb50616..b7e8429b1ca403 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -8,6 +8,7 @@ #include "pycore_frame.h" #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_object.h" // _PyType_InitCache() +#include "pycore_parking_lot.h" // _PyParkingLot_AfterFork() #include "pycore_pyerrors.h" // _PyErr_Clear() #include "pycore_pylifecycle.h" // _PyAST_Fini() #include "pycore_pymem.h" // _PyMem_SetDefaultAllocator() @@ -549,6 +550,10 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime) PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc); + // Clears the parking lot. Any waiting threads are dead. This must be + // called before releasing any locks that use the parking lot. + _PyParkingLot_AfterFork(); + /* bpo-42540: id_mutex is freed by _PyInterpreterState_Delete, which does * not force the default allocator. */ reinit_err += _PyThread_at_fork_reinit(&runtime->interpreters.main->id_mutex); diff --git a/Tools/lockbench/lockbench.py b/Tools/lockbench/lockbench.py new file mode 100644 index 00000000000000..9833d703e00cbb --- /dev/null +++ b/Tools/lockbench/lockbench.py @@ -0,0 +1,53 @@ +# Measure the performance of PyMutex and PyThread_type_lock locks +# with short critical sections. +# +# Usage: python Tools/lockbench/lockbench.py [CRITICAL_SECTION_LENGTH] +# +# How to interpret the results: +# +# Acquisitions (kHz): Reports the total number of lock acquisitions in +# thousands of acquisitions per second. This is the most important metric, +# particularly for the 1 thread case because even in multithreaded programs, +# most locks acquisitions are not contended. Values for 2+ threads are +# only meaningful for `--disable-gil` builds, because the GIL prevents most +# situations where there is lock contention with short critical sections. +# +# Fairness: A measure of how evenly the lock acquisitions are distributed. 
+# A fairness of 1.0 means that all threads acquired the lock the same number +# of times. A fairness of 1/N means that only one thread ever acquired the +# lock. +# See https://en.wikipedia.org/wiki/Fairness_measure#Jain's_fairness_index + +from _testinternalcapi import benchmark_locks +import sys + +# Max number of threads to test +MAX_THREADS = 10 + +# How much "work" to do while holding the lock +CRITICAL_SECTION_LENGTH = 1 + + +def jains_fairness(values): + # Jain's fairness index + # See https://en.wikipedia.org/wiki/Fairness_measure + return (sum(values) ** 2) / (len(values) * sum(x ** 2 for x in values)) + +def main(): + print("Lock Type Threads Acquisitions (kHz) Fairness") + for lock_type in ["PyMutex", "PyThread_type_lock"]: + use_pymutex = (lock_type == "PyMutex") + for num_threads in range(1, MAX_THREADS + 1): + acquisitions, thread_iters = benchmark_locks( + num_threads, use_pymutex, CRITICAL_SECTION_LENGTH) + + acquisitions /= 1000 # report in kHz for readability + fairness = jains_fairness(thread_iters) + + print(f"{lock_type: <20}{num_threads: <18}{acquisitions: >5.0f}{fairness: >20.2f}") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + CRITICAL_SECTION_LENGTH = int(sys.argv[1]) + main() From 6db39dc2132f686e37396d47cffd7806a3adc3ec Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:12:32 -0700 Subject: [PATCH 02/28] Add news entry --- .../next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst diff --git a/Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst b/Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst new file mode 100644 index 00000000000000..5cddf9bc239700 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2023-09-12-13-09-36.gh-issue-108724.-yMsC8.rst @@ -0,0 +1 @@ +Add :c:type:`PyMutex` internal-only lightweight locking API. 
From e6247337e32eb87662844502cef14f3440bc26c6 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:12:46 -0700 Subject: [PATCH 03/28] Revert change to Modules/_testcapi/parts.h --- Modules/_testcapi/parts.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/_testcapi/parts.h b/Modules/_testcapi/parts.h index a21bc381d709db..c162dbc65db81a 100644 --- a/Modules/_testcapi/parts.h +++ b/Modules/_testcapi/parts.h @@ -27,7 +27,6 @@ int _PyTestCapi_Init_PyOS(PyObject *module); int _PyTestCapi_Init_Immortal(PyObject *module); int _PyTestCapi_Init_GC(PyObject *mod); - int _PyTestCapi_Init_VectorcallLimited(PyObject *module); int _PyTestCapi_Init_HeaptypeRelative(PyObject *module); From 83e60fbf29f412c876114e7323ff73a01154ae46 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:34:14 -0700 Subject: [PATCH 04/28] Update Tools/c-analyzer/cpython/ignored.tsv --- Python/parking_lot.c | 2 +- Tools/c-analyzer/cpython/ignored.tsv | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 8e6995653ce6f3..a60d954cf95a6c 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -62,7 +62,7 @@ typedef struct { static Bucket buckets[NUM_BUCKETS]; #ifdef HAVE_THREAD_LOCAL -_Py_thread_local ThreadData *thread_data = NULL; +static _Py_thread_local ThreadData *thread_data = NULL; #else #error "no supported thread-local variable storage classifier" #endif diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index d1ac0410619c96..0e0c9980958efb 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -50,6 +50,9 @@ Python/getversion.c - version - Python/bootstrap_hash.c - _Py_HashSecret_Initialized - Python/pyhash.c - _Py_HashSecret - +## thread-safe hashtable (internal locks) +Python/parking_lot.c - buckets - + ################################## ## state tied to Py_Main() @@ -173,6 +176,7 @@ Python/pyfpe.c - 
PyFPE_counter - Python/import.c - pkgcontext - Python/pystate.c - _Py_tss_tstate - +Python/parking_lot.c - thread_data - ##----------------------- ## should be const From 3161e17da49312af3bdfc7eed3302cdfdebcdff7 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 13:42:33 -0700 Subject: [PATCH 05/28] Fix _Py_atomic_store_ptr_release on winarm64 --- Include/cpython/pyatomic_msc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h index c88bb03cc8f94a..287ed43b5714cd 100644 --- a/Include/cpython/pyatomic_msc.h +++ b/Include/cpython/pyatomic_msc.h @@ -906,7 +906,7 @@ _Py_atomic_store_ptr_release(void *obj, void *value) #if defined(_M_X64) || defined(_M_IX86) *(void * volatile *)obj = value; #elif defined(_M_ARM64) - __stlr64(obj, (uintptr_t)value); + __stlr64((unsigned __int64 volatile *)obj, (uintptr_t)value); #else # error "no implementation of _Py_atomic_store_ptr_release" #endif From 3ab7110915e49bc1ed7243bf75603007f19c0254 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 14:17:13 -0700 Subject: [PATCH 06/28] Support pthread_stubs.h and require threads for lock tests --- Lib/test/test_capi/test_lock.py | 10 +++++++--- Python/parking_lot.c | 13 ++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_capi/test_lock.py b/Lib/test/test_capi/test_lock.py index 756fd8a5ce793b..cfef5a26c4eb61 100644 --- a/Lib/test/test_capi/test_lock.py +++ b/Lib/test/test_capi/test_lock.py @@ -1,15 +1,19 @@ import unittest -from test.support import import_helper +from test.support import import_helper, threading_helper # Skip this test if the _testcapi module isn't available. 
_testinternalcapi = import_helper.import_module('_testinternalcapi') -class PyAtomicTests(unittest.TestCase): +# Lock tests require threads +threading_helper.requires_working_threading(module=True) + + +class PyLockTests(unittest.TestCase): pass for name in sorted(dir(_testinternalcapi)): if name.startswith('test_lock_'): - setattr(PyAtomicTests, name, getattr(_testinternalcapi, name)) + setattr(PyLockTests, name, getattr(_testinternalcapi, name)) if __name__ == "__main__": unittest.main() diff --git a/Python/parking_lot.c b/Python/parking_lot.c index a60d954cf95a6c..66f99484ab879d 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -7,14 +7,17 @@ #include "pycore_pystate.h" // _PyThreadState_GET #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include +# include #elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ defined(HAVE_SEM_TIMEDWAIT)) -#define USE_SEMAPHORES -#include +# define USE_SEMAPHORES +# include +#elif defined(HAVE_PTHREAD_H) +# include +#elif defined(HAVE_PTHREAD_STUBS) +# include "cpython/pthread_stubs.h" #else -#include +# error "Require native threads. 
See https://bugs.python.org/issue31370" #endif // A simple, cross-platform binary semaphore that can be used to implement From 153a0b35aa940d87cb76fa1cf1f76f4cdfe226e1 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 14:21:42 -0700 Subject: [PATCH 07/28] Clean-up Windows includes and ifdefs --- Modules/_testinternalcapi/test_lock.c | 5 +++-- Python/lock.c | 5 +++-- Python/parking_lot.c | 13 +++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c index adfb694908c836..fe25417a2f315d 100644 --- a/Modules/_testinternalcapi/test_lock.c +++ b/Modules/_testinternalcapi/test_lock.c @@ -5,7 +5,8 @@ #include "pycore_lock.h" #include "clinic/test_lock.c.h" -#ifdef _WIN32 +#ifdef MS_WINDOWS +#define WIN32_LEAN_AND_MEAN #include #else #include // usleep() @@ -20,7 +21,7 @@ module _testinternalcapi static void pysleep(int ms) { -#ifdef _WIN32 +#ifdef MS_WINDOWS Sleep(ms); #else usleep(ms * 1000); diff --git a/Python/lock.c b/Python/lock.c index fce4c4c4363f2d..b8a1f64a838118 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -5,7 +5,8 @@ #include "pycore_lock.h" #include "pycore_parking_lot.h" -#ifdef _WIN32 +#ifdef MS_WINDOWS +#define WIN32_LEAN_AND_MEAN #include // SwitchToThread() #elif defined(HAVE_SCHED_H) #include // sched_yield() @@ -37,7 +38,7 @@ struct mutex_entry { static void _Py_yield(void) { -#ifdef _WIN32 +#ifdef MS_WINDOWS SwitchToThread(); #elif defined(HAVE_SCHED_H) sched_yield(); diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 66f99484ab879d..4ec6657c033ed5 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -6,7 +6,8 @@ #include "pycore_pyerrors.h" // _Py_FatalErrorFormat #include "pycore_pystate.h" // _PyThreadState_GET -#ifdef _WIN32 +#ifdef MS_WINDOWS +# define WIN32_LEAN_AND_MEAN # include #elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ defined(HAVE_SEM_TIMEDWAIT)) @@ -23,7 +24,7 @@ 
// A simple, cross-platform binary semaphore that can be used to implement // wakeup/sleep. struct _PySemaphore { -#if defined(_WIN32) +#if defined(MS_WINDOWS) HANDLE platform_sem; #elif defined(USE_SEMAPHORES) sem_t platform_sem; @@ -73,7 +74,7 @@ static _Py_thread_local ThreadData *thread_data = NULL; static void _PySemaphore_Init(_PySemaphore *sema) { -#if defined(_WIN32) +#if defined(MS_WINDOWS) sema->platform_sem = CreateSemaphore( NULL, // attributes 0, // initial count @@ -100,7 +101,7 @@ _PySemaphore_Init(_PySemaphore *sema) static void _PySemaphore_Destroy(_PySemaphore *sema) { -#if defined(_WIN32) +#if defined(MS_WINDOWS) CloseHandle(sema->platform_sem); #elif defined(USE_SEMAPHORES) sem_destroy(&sema->platform_sem); @@ -114,7 +115,7 @@ static int _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) { int res = Py_PARK_INTR; -#if defined(_WIN32) +#if defined(MS_WINDOWS) DWORD wait; DWORD millis = 0; if (timeout < 0) { @@ -213,7 +214,7 @@ _PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout, int detach) void _PySemaphore_Wakeup(_PySemaphore *sema) { -#if defined(_WIN32) +#if defined(MS_WINDOWS) if (!ReleaseSemaphore(sema->platform_sem, 1, NULL)) { Py_FatalError("parking_lot: ReleaseSemaphore failed"); } From f963fd873331b6a93e31a923ba860f75294916d5 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 12 Sep 2023 15:22:13 -0700 Subject: [PATCH 08/28] Fix Raspbian build --- configure | 6 +++--- configure.ac | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/configure b/configure index 8326a1db06c2da..1c9a3fe7d0e4fc 100755 --- a/configure +++ b/configure @@ -27760,7 +27760,7 @@ fi # # Avoid #include or #include . The header # requires header which is only written below by AC_OUTPUT below. -# If the check is done after AC_OUTPUT, modifying LIBATOMIC has no effect +# If the check is done after AC_OUTPUT, modifying LIBS has no effect # anymore. 
cannot be included alone, it's designed to be included # by : it expects other includes and macros to be defined. save_CPPFLAGS=$CPPFLAGS @@ -27825,7 +27825,7 @@ printf "%s\n" "$ac_cv_libatomic_needed" >&6; } if test "x$ac_cv_libatomic_needed" = xyes then : - LIBATOMIC=${LIBATOMIC-"-latomic"} + LIBS="${LIBS} -latomic" fi CPPFLAGS=$save_CPPFLAGS @@ -29979,7 +29979,7 @@ fi then : - as_fn_append MODULE_BLOCK "MODULE__TESTCAPI_LDFLAGS=$LIBATOMIC$as_nl" + fi if test "$py_cv_module__testcapi" = yes; then diff --git a/configure.ac b/configure.ac index 843f2b267a5253..fde8cfb97fcfa7 100644 --- a/configure.ac +++ b/configure.ac @@ -6970,7 +6970,7 @@ fi # # Avoid #include or #include . The header # requires header which is only written below by AC_OUTPUT below. -# If the check is done after AC_OUTPUT, modifying LIBATOMIC has no effect +# If the check is done after AC_OUTPUT, modifying LIBS has no effect # anymore. cannot be included alone, it's designed to be included # by : it expects other includes and macros to be defined. _SAVE_VAR([CPPFLAGS]) @@ -7014,7 +7014,7 @@ int main() ]) AS_VAR_IF([ac_cv_libatomic_needed], [yes], - [LIBATOMIC=${LIBATOMIC-"-latomic"}]) + [LIBS="${LIBS} -latomic"]) _RESTORE_VAR([CPPFLAGS]) @@ -7286,10 +7286,7 @@ PY_STDLIB_MOD([_hashlib], [], [test "$ac_cv_working_openssl_hashlib" = yes], [$OPENSSL_INCLUDES], [$OPENSSL_LDFLAGS $OPENSSL_LDFLAGS_RPATH $LIBCRYPTO_LIBS]) dnl test modules -PY_STDLIB_MOD([_testcapi], - [test "$TEST_MODULES" = yes], [] - dnl Modules/_testcapi/pyatomic.c uses header - [], [], [$LIBATOMIC]) +PY_STDLIB_MOD([_testcapi], [test "$TEST_MODULES" = yes]) PY_STDLIB_MOD([_testclinic], [test "$TEST_MODULES" = yes]) PY_STDLIB_MOD([_testclinic_limited], [test "$TEST_MODULES" = yes]) PY_STDLIB_MOD([_testinternalcapi], [test "$TEST_MODULES" = yes]) From 6f49ae75fd2b7c85c4decd067cb3fca95207cf6b Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 13 Sep 2023 10:52:49 -0700 Subject: [PATCH 09/28] Fix wasm tests. 
The lock tests were being picked up twice: once in test_lock.py and once in Test_testinternalcapi from test_misc.py. The Test_testinternalcapi was not skipping tests when the platform doesn't have threads. This moves the tests to test_misc.py, doesn't include them in Test_testinternalcapi, and skips them if the platform doesn't support threads. --- Lib/test/test_capi/test_lock.py | 19 ------------------- Lib/test/test_capi/test_misc.py | 10 +++++++++- 2 files changed, 9 insertions(+), 20 deletions(-) delete mode 100644 Lib/test/test_capi/test_lock.py diff --git a/Lib/test/test_capi/test_lock.py b/Lib/test/test_capi/test_lock.py deleted file mode 100644 index cfef5a26c4eb61..00000000000000 --- a/Lib/test/test_capi/test_lock.py +++ /dev/null @@ -1,19 +0,0 @@ -import unittest -from test.support import import_helper, threading_helper - -# Skip this test if the _testcapi module isn't available. -_testinternalcapi = import_helper.import_module('_testinternalcapi') - -# Lock tests require threads -threading_helper.requires_working_threading(module=True) - - -class PyLockTests(unittest.TestCase): - pass - -for name in sorted(dir(_testinternalcapi)): - if name.startswith('test_lock_'): - setattr(PyLockTests, name, getattr(_testinternalcapi, name)) - -if __name__ == "__main__": - unittest.main() diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 964886ad1ca0d8..6b0237d4971e9f 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2067,7 +2067,15 @@ def test_version_api_data(self): class Test_testinternalcapi(unittest.TestCase): locals().update((name, getattr(_testinternalcapi, name)) for name in dir(_testinternalcapi) - if name.startswith('test_')) + if name.startswith('test_') + and not name.startswith('test_lock_')) + + +@threading_helper.requires_working_threading() +class Test_PyLock(unittest.TestCase): + locals().update((name, getattr(_testinternalcapi, name)) + for name in dir(_testinternalcapi) + if 
name.startswith('test_lock_')) @unittest.skipIf(_testmultiphase is None, "test requires _testmultiphase module") From 779a4019132fda37771de0d60195687f5f854ac3 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Wed, 13 Sep 2023 12:04:40 -0700 Subject: [PATCH 10/28] Fix indentation --- Lib/test/test_capi/test_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index 6b0237d4971e9f..84cbde75f8bb78 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2073,9 +2073,9 @@ class Test_testinternalcapi(unittest.TestCase): @threading_helper.requires_working_threading() class Test_PyLock(unittest.TestCase): - locals().update((name, getattr(_testinternalcapi, name)) - for name in dir(_testinternalcapi) - if name.startswith('test_lock_')) + locals().update((name, getattr(_testinternalcapi, name)) + for name in dir(_testinternalcapi) + if name.startswith('test_lock_')) @unittest.skipIf(_testmultiphase is None, "test requires _testmultiphase module") From 3200ef5ac37f81e44d3deaa566e746b959954b5e Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:15:14 -0700 Subject: [PATCH 11/28] Rename PyEvent_TimedWait and _PyMutex_LockTimed --- Include/internal/pycore_lock.h | 4 ++-- Python/lock.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 36c76358f5bcae..cd660758f8c360 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -94,7 +94,7 @@ typedef enum _PyLockFlags { // Lock a mutex with an optional timeout and additional options. See // _PyLockFlags for details. extern PyLockStatus -_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags); +_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags); // Unlock a mutex, returns 0 if the mutex is not locked (used for improved // error messages). 
@@ -117,7 +117,7 @@ PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt); // Wait for the event to be set, or until the timeout expires. If the event is // already set, then this returns immediately. Returns 1 if the event was set, // and 0 if the timeout expired or thread was interrupted. -PyAPI_FUNC(int) PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns); +PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns); // _PyRawMutex implements a word-sized mutex that that does not depend on the diff --git a/Python/lock.c b/Python/lock.c index b8a1f64a838118..d5ad7a602052f6 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -48,11 +48,11 @@ _Py_yield(void) void _PyMutex_LockSlow(PyMutex *m) { - _PyMutex_TimedLock(m, -1, _PY_LOCK_DETACH); + _PyMutex_LockTimed(m, -1, _PY_LOCK_DETACH); } PyLockStatus -_PyMutex_TimedLock(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) +_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) { uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); if ((v & _Py_LOCKED) == _Py_UNLOCKED) { @@ -265,12 +265,12 @@ _PyEvent_Notify(PyEvent *evt) void PyEvent_Wait(PyEvent *evt) { - while (!PyEvent_TimedWait(evt, -1)) + while (!PyEvent_WaitTimed(evt, -1)) ; } int -PyEvent_TimedWait(PyEvent *evt, _PyTime_t timeout_ns) +PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns) { for (;;) { uint8_t v = _Py_atomic_load_uint8(&evt->v); From 880a2634c6f1d05db84e85c9ed79ffdd690bd6c3 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:22:35 -0700 Subject: [PATCH 12/28] more_waiters -> has_more_waiters --- Include/internal/pycore_parking_lot.h | 4 ++-- Python/lock.c | 2 +- Python/parking_lot.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index 32a289912888f7..e30d4be452be3a 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -60,7 +60,7 @@ struct _PyUnpark { // Are there more threads 
waiting on the address? May be true in cases // where threads are waiting on a different address that maps to the same // internal bucket. - int more_waiters; + int has_more_waiters; }; // Unpark a single thread waiting on `address`. @@ -71,7 +71,7 @@ struct _PyUnpark { // _PyParkingLot_Unpark(address, unpark, { // if (unpark) { // void *arg = unpark->arg; -// int more_waiters = unpark->more_waiters; +// int has_more_waiters = unpark->has_more_waiters; // ... // } // }); diff --git a/Python/lock.c b/Python/lock.c index d5ad7a602052f6..b01665db7d54a9 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -158,7 +158,7 @@ _PyMutex_TryUnlock(PyMutex *m) if (should_be_fair) { v |= _Py_LOCKED; } - if (unpark->more_waiters) { + if (unpark->has_more_waiters) { v |= _Py_HAS_PARKED; } } diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 4ec6657c033ed5..a0542e1973ff0d 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -422,7 +422,7 @@ _PyParkingLot_BeginUnpark(const void *addr) return NULL; } - waiter->unpark.more_waiters = (bucket->num_waiters > 0); + waiter->unpark.has_more_waiters = (bucket->num_waiters > 0); return &waiter->unpark; } From 717b3c9c5a0ca0f9ceb320803b605c2aaa7a8142 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:25:14 -0700 Subject: [PATCH 13/28] Remove _PyMutex_State typedef --- Include/internal/pycore_lock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index cd660758f8c360..61767b87c5494e 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -37,11 +37,11 @@ typedef struct _PyMutex { uint8_t v; } PyMutex; -typedef enum { +enum { _Py_UNLOCKED = 0, _Py_LOCKED = 1, _Py_HAS_PARKED = 2, -} _PyMutex_State; +}; // (private) slow path for locking the mutex PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m); From 23b91b0217913159206c0018bbae976c929cf61a Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 
2023 08:35:39 -0700 Subject: [PATCH 14/28] Update docs --- Include/internal/pycore_parking_lot.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index e30d4be452be3a..d88549aa0dd6ab 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -42,12 +42,24 @@ enum { // with respect to unpark operations. // // The `address_size` argument is the size of the data pointed to by the -// `address` and `expected` pointers (i.e., sizeof(*address)). -// -// `arg`, which can be NULL, is passed to the unpark operation. +// `address` and `expected` pointers (i.e., sizeof(*address)). It must be +// 1, 2, 4, or 8. // // The `timeout_ns` argument specifies the maximum amount of time to wait, with // -1 indicating an infinite wait. +// +// `arg`, which can be NULL, is passed to the unpark operation. +// +// If `detach` is true, then the thread will detach/release the GIL while +// waiting. +// +// Example usage: +// +// if (_Py_atomic_compare_exchange_uint8(address, &expected, new_value)) { +// int res = _PyParkingLot_Park(address, &new_value, sizeof(*address), +// timeout_ns, NULL, 1); +// ... +// } PyAPI_FUNC(int) _PyParkingLot_Park(const void *address, const void *expected, size_t address_size, _PyTime_t timeout_ns, @@ -110,6 +122,8 @@ PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); typedef struct _PySemaphore _PySemaphore; // Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +// If `detach` is true, then the thread will detach/release the GIL while +// sleeping. 
PyAPI_FUNC(int) _PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); From 279c56afee60c1d98476cab179003ff2bbd02c0d Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:50:12 -0700 Subject: [PATCH 15/28] Include cpython/pyatomic.h via pyatomic.h --- Include/Python.h | 2 +- Include/cpython/pyatomic.h | 10 +++------- Include/internal/pycore_parking_lot.h | 1 - Include/pyatomic.h | 16 ++++++++++++++++ Modules/_testcapi/pyatomic.c | 1 - PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 +++ configure | 2 +- configure.ac | 2 +- 9 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 Include/pyatomic.h diff --git a/Include/Python.h b/Include/Python.h index d6429856b4b9f4..7312cc87d5cc33 100644 --- a/Include/Python.h +++ b/Include/Python.h @@ -48,7 +48,7 @@ #include "pytypedefs.h" #include "pybuffer.h" #include "pystats.h" -#include "cpython/pyatomic.h" +#include "pyatomic.h" #include "object.h" #include "objimpl.h" #include "typeslots.h" diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h index 066a969e0e486a..ab182381b39f00 100644 --- a/Include/cpython/pyatomic.h +++ b/Include/cpython/pyatomic.h @@ -83,10 +83,9 @@ // # release // ... 
-#ifndef Py_LIMITED_API -#ifndef Py_ATOMIC_H -#define Py_ATOMIC_H - +#ifndef Py_CPYTHON_ATOMIC_H +# error "this header file must not be included directly" +#endif // --- _Py_atomic_add -------------------------------------------------------- // Atomically adds `value` to `obj` and returns the previous value @@ -502,6 +501,3 @@ static inline void _Py_atomic_fence_release(void); #else # error "no available pyatomic implementation for this platform/compiler" #endif - -#endif /* Py_ATOMIC_H */ -#endif /* Py_LIMITED_API */ diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index d88549aa0dd6ab..d6410cb1e2d4a5 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -19,7 +19,6 @@ extern "C" { #endif #include "pycore_time.h" // _PyTime_t -#include "cpython/pyatomic.h" enum { diff --git a/Include/pyatomic.h b/Include/pyatomic.h new file mode 100644 index 00000000000000..2ce2c81cf5251a --- /dev/null +++ b/Include/pyatomic.h @@ -0,0 +1,16 @@ +#ifndef Py_ATOMIC_H +#define Py_ATOMIC_H +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_LIMITED_API +# define Py_CPYTHON_ATOMIC_H +# include "cpython/pyatomic.h" +# undef Py_CPYTHON_ATOMIC_H +#endif + +#ifdef __cplusplus +} +#endif +#endif /* !Py_ATOMIC_H */ diff --git a/Modules/_testcapi/pyatomic.c b/Modules/_testcapi/pyatomic.c index 15602ce3f4ab3b..f0be2cfccccc98 100644 --- a/Modules/_testcapi/pyatomic.c +++ b/Modules/_testcapi/pyatomic.c @@ -8,7 +8,6 @@ #undef NDEBUG #include "Python.h" -#include "cpython/pyatomic.h" #include "parts.h" // We define atomic bitwise operations on these types diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index b35aeb58e11460..60126ac085e1f2 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -310,6 +310,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index c2430f00cb7e9a..20d3df3de1f31a 100644 --- 
a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -144,6 +144,9 @@ Include + + Include + Include diff --git a/configure b/configure index 1c9a3fe7d0e4fc..91408efb384de6 100755 --- a/configure +++ b/configure @@ -27793,7 +27793,7 @@ typedef intptr_t Py_ssize_t; # error "unable to define Py_ssize_t" #endif -#include "cpython/pyatomic.h" +#include "pyatomic.h" int main() { diff --git a/configure.ac b/configure.ac index fde8cfb97fcfa7..d1b7c7a1817d67 100644 --- a/configure.ac +++ b/configure.ac @@ -6993,7 +6993,7 @@ typedef intptr_t Py_ssize_t; # error "unable to define Py_ssize_t" #endif -#include "cpython/pyatomic.h" +#include "pyatomic.h" int main() { From 8d8035c65fb93634a788f94c23ec107fe137fff8 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 08:57:29 -0700 Subject: [PATCH 16/28] Use compound initializer in test_lock.c --- Modules/_testinternalcapi/test_lock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/_testinternalcapi/test_lock.c b/Modules/_testinternalcapi/test_lock.c index fe25417a2f315d..33b49dacaa946e 100644 --- a/Modules/_testinternalcapi/test_lock.c +++ b/Modules/_testinternalcapi/test_lock.c @@ -31,8 +31,7 @@ pysleep(int ms) static PyObject * test_lock_basic(PyObject *self, PyObject *obj) { - PyMutex m; - memset(&m, 0, sizeof(m)); + PyMutex m = (PyMutex){0}; // uncontended lock and unlock PyMutex_Lock(&m); From 417e75499481ee25b5f35e34ba39d8f071299a73 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 11:58:46 -0400 Subject: [PATCH 17/28] Apply suggestions from code review Co-authored-by: Eric Snow --- Include/internal/pycore_lock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 61767b87c5494e..61a0874731e081 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -26,8 +26,7 @@ extern "C" { // 0b11: locked and has parked threads // // Typical 
initialization: -// PyMutex m; -// memset(&m, 0, sizeof(m)); +// PyMutex m = (PyMutex){0}; // // Typical usage: // PyMutex_Lock(&m); @@ -84,7 +83,7 @@ typedef enum _PyLockFlags { // Do not detach/release the GIL when waiting on the lock. _Py_LOCK_DONT_DETACH = 0, - // Detach/release the GIL when waiting on the lock. + // Detach/release the GIL while waiting on the lock. _PY_LOCK_DETACH = 1, // Handle signals if interrupted while waiting on the lock. From 3ee97b1e75cedd65c01e3e2a1b1d6db4104e22e7 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 14 Sep 2023 10:54:13 -0700 Subject: [PATCH 18/28] Changes from review: * Rename _PY_LOCK_MAKE_PENDING_CALLS to _PY_LOCK_HANDLE_SIGNALS * Use define instead of enum for PyMutex states (e.g., _Py_UNLOCKED) --- Include/internal/pycore_lock.h | 10 ++++------ Python/lock.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 61a0874731e081..c4bb76a40e7b12 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -36,11 +36,9 @@ typedef struct _PyMutex { uint8_t v; } PyMutex; -enum { - _Py_UNLOCKED = 0, - _Py_LOCKED = 1, - _Py_HAS_PARKED = 2, -}; +#define _Py_UNLOCKED 0 +#define _Py_LOCKED 1 +#define _Py_HAS_PARKED 2 // (private) slow path for locking the mutex PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m); @@ -87,7 +85,7 @@ typedef enum _PyLockFlags { _PY_LOCK_DETACH = 1, // Handle signals if interrupted while waiting on the lock. - _PY_LOCK_MAKE_PENDING_CALLS = 2, + _PY_LOCK_HANDLE_SIGNALS = 2, } _PyLockFlags; // Lock a mutex with an optional timeout and additional options. 
See diff --git a/Python/lock.c b/Python/lock.c index b01665db7d54a9..4ec76d11c33602 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -114,7 +114,7 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) return PY_LOCK_ACQUIRED; } } - else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_MAKE_PENDING_CALLS)) { + else if (ret == Py_PARK_INTR && (flags & _PY_LOCK_HANDLE_SIGNALS)) { if (Py_MakePendingCalls() < 0) { return PY_LOCK_INTR; } From 95b0d871dd0d2825945187df79ccc2c15ac40734 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 05:33:40 -0700 Subject: [PATCH 19/28] Remove thread-local data in parking_lot.c Move _PySemaphore definition to its own header to avoid leaking platform details in parking_lot.h --- Include/internal/pycore_parking_lot.h | 27 +---- Include/internal/pycore_semaphore.h | 63 ++++++++++++ Makefile.pre.in | 1 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 6 ++ Python/lock.c | 11 ++- Python/parking_lot.c | 137 +++----------------------- Tools/c-analyzer/cpython/ignored.tsv | 1 - 8 files changed, 93 insertions(+), 154 deletions(-) create mode 100644 Include/internal/pycore_semaphore.h diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index d6410cb1e2d4a5..29285d2b980ec8 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -18,7 +18,7 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif -#include "pycore_time.h" // _PyTime_t +#include "pycore_time.h" // _PyTime_t enum { @@ -108,34 +108,9 @@ _PyParkingLot_FinishUnpark(const void *address, struct _PyUnpark *unpark); // Unparks all threads waiting on `address`. PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address); -// Initialize/deinitialize the thread-local state used by parking lot. -void _PyParkingLot_InitThread(void); -void _PyParkingLot_DeinitThread(void); - // Resets the parking lot state after a fork. Forgets all parked threads. 
PyAPI_FUNC(void) _PyParkingLot_AfterFork(void); - -// The _PySemaphore API a simplified cross-platform semaphore used to implement -// parking lot. It is not intended to be used directly by other modules. -typedef struct _PySemaphore _PySemaphore; - -// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. -// If `detach` is true, then the thread will detach/release the GIL while -// sleeping. -PyAPI_FUNC(int) -_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); - -// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() -// can be called before _PySemaphore_Wait(). -PyAPI_FUNC(void) -_PySemaphore_Wakeup(_PySemaphore *sema); - -// Allocates/releases a semaphore from the thread-local pool. -PyAPI_FUNC(_PySemaphore *) _PySemaphore_Alloc(void); -PyAPI_FUNC(void) _PySemaphore_Free(_PySemaphore *sema); - - #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_semaphore.h b/Include/internal/pycore_semaphore.h new file mode 100644 index 00000000000000..c1df8333629066 --- /dev/null +++ b/Include/internal/pycore_semaphore.h @@ -0,0 +1,63 @@ +// The _PySemaphore API a simplified cross-platform semaphore used to implement +// wakeup/sleep. +#ifndef Py_INTERNAL_SEMAPHORE_H +#define Py_INTERNAL_SEMAPHORE_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include "pycore_time.h" // _PyTime_t + +#ifdef MS_WINDOWS +# define WIN32_LEAN_AND_MEAN +# include +#elif defined(HAVE_PTHREAD_H) +# include +#elif defined(HAVE_PTHREAD_STUBS) +# include "cpython/pthread_stubs.h" +#else +# error "Require native threads. 
See https://bugs.python.org/issue31370" +#endif + +#if defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES+0) != -1 +# define _Py_USE_SEMAPHORES +# include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _PySemaphore { +#if defined(MS_WINDOWS) + HANDLE platform_sem; +#elif defined(_Py_USE_SEMAPHORES) + sem_t platform_sem; +#else + pthread_mutex_t mutex; + pthread_cond_t cond; + int counter; +#endif +} _PySemaphore; + +// Puts the current thread to sleep until _PySemaphore_Wakeup() is called. +// If `detach` is true, then the thread will detach/release the GIL while +// sleeping. +PyAPI_FUNC(int) +_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach); + +// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup() +// can be called before _PySemaphore_Wait(). +PyAPI_FUNC(void) +_PySemaphore_Wakeup(_PySemaphore *sema); + +// Initializes/destroys a semaphore +PyAPI_FUNC(void) _PySemaphore_Init(_PySemaphore *sema); +PyAPI_FUNC(void) _PySemaphore_Destroy(_PySemaphore *sema); + + +#ifdef __cplusplus +} +#endif +#endif /* !Py_INTERNAL_SEMAPHORE_H */ diff --git a/Makefile.pre.in b/Makefile.pre.in index e634d3f30f857e..ce72f9cee86eef 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1810,6 +1810,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_runtime.h \ $(srcdir)/Include/internal/pycore_runtime_init_generated.h \ $(srcdir)/Include/internal/pycore_runtime_init.h \ + $(srcdir)/Include/internal/pycore_semaphore.h \ $(srcdir)/Include/internal/pycore_setobject.h \ $(srcdir)/Include/internal/pycore_signal.h \ $(srcdir)/Include/internal/pycore_sliceobject.h \ diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 60126ac085e1f2..190eaa16daa8af 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -272,6 +272,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 20d3df3de1f31a..f4fddfdd11f4c1 100644 --- 
a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -681,6 +681,9 @@ Include\internal + + Include\internal + Include\internal @@ -726,6 +729,9 @@ Include\internal + + Include\internal + Include\internal diff --git a/Python/lock.c b/Python/lock.c index 4ec76d11c33602..d220990e8719df 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -4,6 +4,7 @@ #include "pycore_lock.h" #include "pycore_parking_lot.h" +#include "pycore_semaphore.h" #ifdef MS_WINDOWS #define WIN32_LEAN_AND_MEAN @@ -185,14 +186,14 @@ _PyMutex_UnlockSlow(PyMutex *m) // thread waiting on the mutex, directly in the mutex itself. struct raw_mutex_entry { struct raw_mutex_entry *next; - _PySemaphore *sema; + _PySemaphore sema; }; void _PyRawMutex_LockSlow(_PyRawMutex *m) { struct raw_mutex_entry waiter; - waiter.sema = _PySemaphore_Alloc(); + _PySemaphore_Init(&waiter.sema); uintptr_t v = _Py_atomic_load_uintptr(&m->v); for (;;) { @@ -213,10 +214,10 @@ _PyRawMutex_LockSlow(_PyRawMutex *m) // Wait for us to be woken up. Note that we still have to lock the // mutex ourselves: it is NOT handed off to us. 
- _PySemaphore_Wait(waiter.sema, -1, /*detach=*/0); + _PySemaphore_Wait(&waiter.sema, -1, /*detach=*/0); } - _PySemaphore_Free(waiter.sema); + _PySemaphore_Destroy(&waiter.sema); } void @@ -232,7 +233,7 @@ _PyRawMutex_UnlockSlow(_PyRawMutex *m) if (waiter) { uintptr_t next_waiter = (uintptr_t)waiter->next; if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, next_waiter)) { - _PySemaphore_Wakeup(waiter->sema); + _PySemaphore_Wakeup(&waiter->sema); return; } } diff --git a/Python/parking_lot.c b/Python/parking_lot.c index a0542e1973ff0d..d90f7affed94ef 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -5,35 +5,8 @@ #include "pycore_parking_lot.h" #include "pycore_pyerrors.h" // _Py_FatalErrorFormat #include "pycore_pystate.h" // _PyThreadState_GET +#include "pycore_semaphore.h" // _PySemaphore -#ifdef MS_WINDOWS -# define WIN32_LEAN_AND_MEAN -# include -#elif (defined(_POSIX_SEMAPHORES) && !defined(HAVE_BROKEN_POSIX_SEMAPHORES) && \ - defined(HAVE_SEM_TIMEDWAIT)) -# define USE_SEMAPHORES -# include -#elif defined(HAVE_PTHREAD_H) -# include -#elif defined(HAVE_PTHREAD_STUBS) -# include "cpython/pthread_stubs.h" -#else -# error "Require native threads. See https://bugs.python.org/issue31370" -#endif - -// A simple, cross-platform binary semaphore that can be used to implement -// wakeup/sleep. -struct _PySemaphore { -#if defined(MS_WINDOWS) - HANDLE platform_sem; -#elif defined(USE_SEMAPHORES) - sem_t platform_sem; -#else - PyMUTEX_T mutex; - PyCOND_T cond; - int counter; -#endif -}; typedef struct { // The mutex protects the waiter queue and the num_waiters counter. 
@@ -47,31 +20,16 @@ typedef struct { struct wait_entry { struct _PyUnpark unpark; uintptr_t addr; - _PySemaphore *sema; + _PySemaphore sema; struct llist_node node; }; -#define MAX_SEMA_DEPTH 3 - -typedef struct { - Py_ssize_t refcount; - - int depth; - _PySemaphore semas[MAX_SEMA_DEPTH]; -} ThreadData; - #define NUM_BUCKETS 251 // Table of waiters (hashed by address) static Bucket buckets[NUM_BUCKETS]; -#ifdef HAVE_THREAD_LOCAL -static _Py_thread_local ThreadData *thread_data = NULL; -#else -#error "no supported thread-local variable storage classifier" -#endif - -static void +void _PySemaphore_Init(_PySemaphore *sema) { #if defined(MS_WINDOWS) @@ -84,7 +42,7 @@ _PySemaphore_Init(_PySemaphore *sema) if (!sema->platform_sem) { Py_FatalError("parking_lot: CreateSemaphore failed"); } -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) if (sem_init(&sema->platform_sem, /*pshared=*/0, /*value=*/0) < 0) { Py_FatalError("parking_lot: sem_init failed"); } @@ -95,15 +53,16 @@ _PySemaphore_Init(_PySemaphore *sema) if (pthread_cond_init(&sema->cond, NULL)) { Py_FatalError("parking_lot: pthread_cond_init failed"); } + sema->counter = 0; #endif } -static void +void _PySemaphore_Destroy(_PySemaphore *sema) { #if defined(MS_WINDOWS) CloseHandle(sema->platform_sem); -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) sem_destroy(&sema->platform_sem); #else pthread_mutex_destroy(&sema->mutex); @@ -131,7 +90,7 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) else if (wait == WAIT_TIMEOUT) { res = Py_PARK_TIMEOUT; } -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) int err; if (timeout >= 0) { struct timespec ts; @@ -218,7 +177,7 @@ _PySemaphore_Wakeup(_PySemaphore *sema) if (!ReleaseSemaphore(sema->platform_sem, 1, NULL)) { Py_FatalError("parking_lot: ReleaseSemaphore failed"); } -#elif defined(USE_SEMAPHORES) +#elif defined(_Py_USE_SEMAPHORES) int err = sem_post(&sema->platform_sem); if (err != 0) { 
Py_FatalError("parking_lot: sem_post failed"); @@ -231,72 +190,6 @@ _PySemaphore_Wakeup(_PySemaphore *sema) #endif } -_PySemaphore * -_PySemaphore_Alloc(void) -{ - // Make sure we have a valid thread_data. We need to acquire - // some locks before we have a fully initialized PyThreadState. - _PyParkingLot_InitThread(); - - ThreadData *this_thread = thread_data; - if (this_thread->depth >= MAX_SEMA_DEPTH) { - Py_FatalError("_PySemaphore_Alloc(): too many calls"); - } - return &this_thread->semas[this_thread->depth++]; -} - -void -_PySemaphore_Free(_PySemaphore *sema) -{ - ThreadData *this_thread = thread_data; - this_thread->depth--; - if (&this_thread->semas[this_thread->depth] != sema) { - Py_FatalError("_PySemaphore_Free(): mismatch wakeup"); - } - _PyParkingLot_DeinitThread(); -} - -void -_PyParkingLot_InitThread(void) -{ - if (thread_data != NULL) { - thread_data->refcount++; - return; - } - ThreadData *this_thread = PyMem_RawMalloc(sizeof(ThreadData)); - if (this_thread == NULL) { - Py_FatalError("_PyParkingLot_InitThread: unable to allocate thread data"); - } - memset(this_thread, 0, sizeof(*this_thread)); - this_thread->refcount = 1; - this_thread->depth = 0; - for (int i = 0; i < MAX_SEMA_DEPTH; i++) { - _PySemaphore_Init(&this_thread->semas[i]); - } - thread_data = this_thread; -} - -void -_PyParkingLot_DeinitThread(void) -{ - ThreadData *td = thread_data; - if (td == NULL) { - return; - } - - if (--td->refcount != 0) { - assert(td->refcount > 0); - return; - } - - thread_data = NULL; - for (int i = 0; i < MAX_SEMA_DEPTH; i++) { - _PySemaphore_Destroy(&td->semas[i]); - } - - PyMem_RawFree(td); -} - static void enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) { @@ -379,11 +272,11 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, _PyRawMutex_Unlock(&bucket->mutex); return Py_PARK_AGAIN; } - wait.sema = _PySemaphore_Alloc(); + _PySemaphore_Init(&wait.sema); enqueue(bucket, addr, &wait); 
_PyRawMutex_Unlock(&bucket->mutex); - int res = _PySemaphore_Wait(wait.sema, timeout_ns, detach); + int res = _PySemaphore_Wait(&wait.sema, timeout_ns, detach); if (res == Py_PARK_OK) { goto done; } @@ -395,7 +288,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, // We've been removed the waiter queue. Wait until we process the // wakeup signal. do { - res = _PySemaphore_Wait(wait.sema, -1, detach); + res = _PySemaphore_Wait(&wait.sema, -1, detach); } while (res != Py_PARK_OK); goto done; } @@ -406,7 +299,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, _PyRawMutex_Unlock(&bucket->mutex); done: - _PySemaphore_Free(wait.sema); + _PySemaphore_Destroy(&wait.sema); return res; } @@ -439,7 +332,7 @@ _PyParkingLot_FinishUnpark(const void *addr, struct _PyUnpark *unpark) struct wait_entry *waiter; waiter = container_of(unpark, struct wait_entry, unpark); - _PySemaphore_Wakeup(waiter->sema); + _PySemaphore_Wakeup(&waiter->sema); } } @@ -457,7 +350,7 @@ _PyParkingLot_UnparkAll(const void *addr) llist_for_each_safe(node, &head) { struct wait_entry *waiter = llist_data(node, struct wait_entry, node); llist_remove(node); - _PySemaphore_Wakeup(waiter->sema); + _PySemaphore_Wakeup(&waiter->sema); } } diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index 0e0c9980958efb..841cfc72fe631f 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -176,7 +176,6 @@ Python/pyfpe.c - PyFPE_counter - Python/import.c - pkgcontext - Python/pystate.c - _Py_tss_tstate - -Python/parking_lot.c - thread_data - ##----------------------- ## should be const From 3d1d2e166be8caf7b52e0dc06aac1276113f5d16 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 06:34:24 -0700 Subject: [PATCH 20/28] Add doc for NUM_BUCKETS --- Python/parking_lot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 
d90f7affed94ef..fdf4a10fa25ff9 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -24,6 +24,10 @@ struct wait_entry { struct llist_node node; }; +// Prime number to avoid correlations with memory addresses. +// We want this to be roughly proportional to the number of CPU cores +// to minimize contention on the bucket locks, but not too big to avoid +// wasting memory. The exact choice does not matter much. #define NUM_BUCKETS 251 // Table of waiters (hashed by address) From e043a8e6e1a41cc0d9c43ed096169b77030fd95b Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 08:50:26 -0700 Subject: [PATCH 21/28] Make use of Py_PARK_INTR more explicit in _PySemaphore_PlatformWait --- Python/parking_lot.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index fdf4a10fa25ff9..1c3bab5331e145 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -77,7 +77,7 @@ _PySemaphore_Destroy(_PySemaphore *sema) static int _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) { - int res = Py_PARK_INTR; + int res; #if defined(MS_WINDOWS) DWORD wait; DWORD millis = 0; @@ -94,6 +94,9 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) else if (wait == WAIT_TIMEOUT) { res = Py_PARK_TIMEOUT; } + else { + res = Py_PARK_INTR; + } #elif defined(_Py_USE_SEMAPHORES) int err; if (timeout >= 0) { @@ -126,8 +129,8 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) } #else pthread_mutex_lock(&sema->mutex); + int err = 0; if (sema->counter == 0) { - int err; if (timeout >= 0) { struct timespec ts; @@ -139,14 +142,17 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) else { err = pthread_cond_wait(&sema->cond, &sema->mutex); } - if (err) { - res = Py_PARK_TIMEOUT; - } } if (sema->counter > 0) { sema->counter--; res = Py_PARK_OK; } + else if (err) { + res = Py_PARK_TIMEOUT; + } + else { + res = Py_PARK_INTR; + } 
pthread_mutex_unlock(&sema->mutex); #endif return res; From 6b1d8f968e870ac4ae35622901272f1535d5b3ce Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 08:59:11 -0700 Subject: [PATCH 22/28] Statically initialize buckets in parking_lot.c --- Python/parking_lot.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 1c3bab5331e145..a9dbca3b57fbeb 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -28,10 +28,23 @@ struct wait_entry { // We want this to be roughly proportional to the number of CPU cores // to minimize contention on the bucket locks, but not too big to avoid // wasting memory. The exact choice does not matter much. -#define NUM_BUCKETS 251 +#define NUM_BUCKETS 257 + +#define BUCKET_INIT(b, i) [i] = { .root = LLIST_INIT(b[i].root) } +#define BUCKET_INIT_2(b, i) BUCKET_INIT(b, i), BUCKET_INIT(b, i+1) +#define BUCKET_INIT_4(b, i) BUCKET_INIT_2(b, i), BUCKET_INIT_2(b, i+2) +#define BUCKET_INIT_8(b, i) BUCKET_INIT_4(b, i), BUCKET_INIT_4(b, i+4) +#define BUCKET_INIT_16(b, i) BUCKET_INIT_8(b, i), BUCKET_INIT_8(b, i+8) +#define BUCKET_INIT_32(b, i) BUCKET_INIT_16(b, i), BUCKET_INIT_16(b, i+16) +#define BUCKET_INIT_64(b, i) BUCKET_INIT_32(b, i), BUCKET_INIT_32(b, i+32) +#define BUCKET_INIT_128(b, i) BUCKET_INIT_64(b, i), BUCKET_INIT_64(b, i+64) +#define BUCKET_INIT_256(b, i) BUCKET_INIT_128(b, i), BUCKET_INIT_128(b, i+128) // Table of waiters (hashed by address) -static Bucket buckets[NUM_BUCKETS]; +static Bucket buckets[NUM_BUCKETS] = { + BUCKET_INIT_256(buckets, 0), + BUCKET_INIT(buckets, 256), +}; void _PySemaphore_Init(_PySemaphore *sema) @@ -203,10 +216,6 @@ _PySemaphore_Wakeup(_PySemaphore *sema) static void enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) { - if (!bucket->root.next) { - // initialize bucket - llist_init(&bucket->root); - } llist_insert_tail(&bucket->root, &wait->node); ++bucket->num_waiters; 
} @@ -214,13 +223,8 @@ enqueue(Bucket *bucket, const void *address, struct wait_entry *wait) static struct wait_entry * dequeue(Bucket *bucket, const void *address) { - struct llist_node *root = &bucket->root; - if (!root->next) { - // bucket was not yet initialized - return NULL; - } - // find the first waiter that is waiting on `address` + struct llist_node *root = &bucket->root; struct llist_node *node; llist_for_each(node, root) { struct wait_entry *wait = llist_data(node, struct wait_entry, node); @@ -236,13 +240,8 @@ dequeue(Bucket *bucket, const void *address) static void dequeue_all(Bucket *bucket, const void *address, struct llist_node *dst) { - struct llist_node *root = &bucket->root; - if (!root->next) { - // bucket was not yet initialized - return; - } - // remove and append all matching waiters to dst + struct llist_node *root = &bucket->root; struct llist_node *node; llist_for_each_safe(node, root) { struct wait_entry *wait = llist_data(node, struct wait_entry, node); @@ -370,4 +369,7 @@ _PyParkingLot_AfterFork(void) // After a fork only one thread remains. That thread cannot be blocked // so all entries in the parking lot are for dead threads. 
memset(buckets, 0, sizeof(buckets)); + for (Py_ssize_t i = 0; i < NUM_BUCKETS; i++) { + llist_init(&buckets[i].root); + } } From 1af0bffdd340c851b4bddc2a41f81fe195462166 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 09:07:00 -0700 Subject: [PATCH 23/28] Rename validate_addr to atomic_memcmp --- Python/parking_lot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index a9dbca3b57fbeb..e45319a0ebe17b 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -253,9 +253,9 @@ dequeue_all(Bucket *bucket, const void *address, struct llist_node *dst) } } -// Checks that `*addr == *expected` +// Checks that `*addr == *expected` (only works for 1, 2, 4, or 8 bytes) static int -validate_addr(const void *addr, const void *expected, size_t addr_size) +atomic_memcmp(const void *addr, const void *expected, size_t addr_size) { switch (addr_size) { case 1: return _Py_atomic_load_uint8(addr) == *(const uint8_t *)expected; @@ -277,7 +277,7 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; _PyRawMutex_Lock(&bucket->mutex); - if (!validate_addr(addr, expected, size)) { + if (!atomic_memcmp(addr, expected, size)) { _PyRawMutex_Unlock(&bucket->mutex); return Py_PARK_AGAIN; } From f2073d00ddbde9de6812cb9a623870280778d34e Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 09:18:57 -0700 Subject: [PATCH 24/28] Add is_unparking to struct wait_entry in parking_lot.c --- Python/parking_lot.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index e45319a0ebe17b..33247fb5b38afe 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -7,6 +7,8 @@ #include "pycore_pystate.h" // _PyThreadState_GET #include "pycore_semaphore.h" // _PySemaphore +#include + typedef struct { // The mutex protects the waiter queue and the 
num_waiters counter. @@ -22,6 +24,7 @@ struct wait_entry { uintptr_t addr; _PySemaphore sema; struct llist_node node; + bool is_unparking; }; // Prime number to avoid correlations with memory addresses. @@ -270,9 +273,11 @@ int _PyParkingLot_Park(const void *addr, const void *expected, size_t size, _PyTime_t timeout_ns, void *arg, int detach) { - struct wait_entry wait; - wait.unpark.arg = arg; - wait.addr = (uintptr_t)addr; + struct wait_entry wait = { + .unpark.arg = arg, + .addr = (uintptr_t)addr, + .is_unparking = false, + }; Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; @@ -292,9 +297,9 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, // timeout or interrupt _PyRawMutex_Lock(&bucket->mutex); - if (wait.node.next == NULL) { + if (wait.is_unparking) { _PyRawMutex_Unlock(&bucket->mutex); - // We've been removed the waiter queue. Wait until we process the + // Another thread has started to unpark us. Wait until we process the // wakeup signal. do { res = _PySemaphore_Wait(&wait.sema, -1, detach); @@ -324,6 +329,7 @@ _PyParkingLot_BeginUnpark(const void *addr) return NULL; } + waiter->is_unparking = true; waiter->unpark.has_more_waiters = (bucket->num_waiters > 0); return &waiter->unpark; } From 8f1645b6818a0028951a97ce87216539c44cf6c1 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 10:00:14 -0700 Subject: [PATCH 25/28] Use callback in _PyParkingLot_Unpark --- Include/internal/pycore_parking_lot.h | 64 ++++++++++----------------- Python/lock.c | 37 +++++++++------- Python/parking_lot.c | 42 +++++++----------- 3 files changed, 60 insertions(+), 83 deletions(-) diff --git a/Include/internal/pycore_parking_lot.h b/Include/internal/pycore_parking_lot.h index 29285d2b980ec8..f444da730055e8 100644 --- a/Include/internal/pycore_parking_lot.h +++ b/Include/internal/pycore_parking_lot.h @@ -37,8 +37,8 @@ enum { // Checks that `*address == *expected` and puts the thread to sleep until an // unpark operation is called on 
the same `address`. Otherwise, the function -// returns `Py_PARK_AGAIN`. The comparison is performed atomically -// with respect to unpark operations. +// returns `Py_PARK_AGAIN`. The comparison behaves like memcmp, but is +// performed atomically with respect to unpark operations. // // The `address_size` argument is the size of the data pointed to by the // `address` and `expected` pointers (i.e., sizeof(*address)). It must be @@ -47,7 +47,7 @@ enum { // The `timeout_ns` argument specifies the maximum amount of time to wait, with // -1 indicating an infinite wait. // -// `arg`, which can be NULL, is passed to the unpark operation. +// `park_arg`, which can be NULL, is passed to the unpark operation. // // If `detach` is true, then the thread will detach/release the GIL while // waiting. @@ -62,48 +62,30 @@ enum { PyAPI_FUNC(int) _PyParkingLot_Park(const void *address, const void *expected, size_t address_size, _PyTime_t timeout_ns, - void *arg, int detach); + void *park_arg, int detach); -struct _PyUnpark { - // The `arg` value passed to _PyParkingLot_Park(). - void *arg; - - // Are there more threads waiting on the address? May be true in cases - // where threads are waiting on a different address that maps to the same - // internal bucket. - int has_more_waiters; -}; - -// Unpark a single thread waiting on `address`. +// Callback for _PyParkingLot_Unpark: // -// The `unpark` is a pointer to a `struct _PyUnpark`. +// `arg` is the data of the same name provided to the _PyParkingLot_Unpark() +// call. +// `park_arg` is the data provided to the _PyParkingLot_Park() call or NULL if +// no waiting thread was found. +// `has_more_waiters` is true if there are more threads waiting on the same +// address. May be true in cases where threads are waiting on a different +// address that maps to the same internal bucket. +typedef void _Py_unpark_fn_t(void *arg, void *park_arg, int has_more_waiters); + +// Unparks a single thread waiting on `address`. 
// -// Usage: -// _PyParkingLot_Unpark(address, unpark, { -// if (unpark) { -// void *arg = unpark->arg; -// int has_more_waiters = unpark->has_more_waiters; -// ... -// } -// }); -#define _PyParkingLot_Unpark(address, unpark, ...) \ - do { \ - struct _PyUnpark *(unpark); \ - unpark = _PyParkingLot_BeginUnpark((address)); \ - __VA_ARGS__ \ - _PyParkingLot_FinishUnpark((address), unpark); \ - } while (0); - -// Implements half of an unpark operation. -// Prefer using the _PyParkingLot_Unpark() macro. -PyAPI_FUNC(struct _PyUnpark *) -_PyParkingLot_BeginUnpark(const void *address); - -// Finishes the unpark operation and wakes up the thread selected by -// _PyParkingLot_BeginUnpark. -// Prefer using the _PyParkingLot_Unpark() macro. +// Note that fn() is called regardless of whether a thread was unparked. If +// no threads are waiting on `address` then the `park_arg` argument to fn() +// will be NULL. +// +// Example usage: +// void callback(void *arg, void *park_arg, int has_more_waiters); +// _PyParkingLot_Unpark(address, &callback, arg); PyAPI_FUNC(void) -_PyParkingLot_FinishUnpark(const void *address, struct _PyUnpark *unpark); +_PyParkingLot_Unpark(const void *address, _Py_unpark_fn_t *fn, void *arg); // Unparks all threads waiting on `address`. 
PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address); diff --git a/Python/lock.c b/Python/lock.c index d220990e8719df..da364f37a6f057 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -137,6 +137,25 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) } } +static void +mutex_unpark(PyMutex *m, struct mutex_entry *entry, int has_more_waiters) +{ + uint8_t v = 0; + if (entry) { + _PyTime_t now = _PyTime_GetMonotonicClock(); + int should_be_fair = now > entry->time_to_be_fair; + + entry->handoff = should_be_fair; + if (should_be_fair) { + v |= _Py_LOCKED; + } + if (has_more_waiters) { + v |= _Py_HAS_PARKED; + } + } + _Py_atomic_store_uint8(&m->v, v); +} + int _PyMutex_TryUnlock(PyMutex *m) { @@ -148,23 +167,7 @@ _PyMutex_TryUnlock(PyMutex *m) } else if ((v & _Py_HAS_PARKED)) { // wake up a single thread - _PyParkingLot_Unpark(&m->v, unpark, { - v = 0; - if (unpark) { - struct mutex_entry *entry = unpark->arg; - _PyTime_t now = _PyTime_GetMonotonicClock(); - int should_be_fair = now > entry->time_to_be_fair; - - entry->handoff = should_be_fair; - if (should_be_fair) { - v |= _Py_LOCKED; - } - if (unpark->has_more_waiters) { - v |= _Py_HAS_PARKED; - } - } - _Py_atomic_store_uint8(&m->v, v); - }); + _PyParkingLot_Unpark(&m->v, (_Py_unpark_fn_t *)mutex_unpark, m); return 0; } else if (_Py_atomic_compare_exchange_uint8(&m->v, &v, _Py_UNLOCKED)) { diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 33247fb5b38afe..170108b0edb6ae 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -20,7 +20,7 @@ typedef struct { } Bucket; struct wait_entry { - struct _PyUnpark unpark; + void *park_arg; uintptr_t addr; _PySemaphore sema; struct llist_node node; @@ -271,10 +271,10 @@ atomic_memcmp(const void *addr, const void *expected, size_t addr_size) int _PyParkingLot_Park(const void *addr, const void *expected, size_t size, - _PyTime_t timeout_ns, void *arg, int detach) + _PyTime_t timeout_ns, void *park_arg, int detach) { struct 
wait_entry wait = { - .unpark.arg = arg, + .park_arg = park_arg, .addr = (uintptr_t)addr, .is_unparking = false, }; @@ -318,35 +318,27 @@ _PyParkingLot_Park(const void *addr, const void *expected, size_t size, } -struct _PyUnpark * -_PyParkingLot_BeginUnpark(const void *addr) +void +_PyParkingLot_Unpark(const void *addr, _Py_unpark_fn_t *fn, void *arg) { Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; - _PyRawMutex_Lock(&bucket->mutex); + // Find the first waiter that is waiting on `addr` + _PyRawMutex_Lock(&bucket->mutex); struct wait_entry *waiter = dequeue(bucket, addr); - if (!waiter) { - return NULL; - } - - waiter->is_unparking = true; - waiter->unpark.has_more_waiters = (bucket->num_waiters > 0); - return &waiter->unpark; -} - -#define container_of(ptr, type, member) \ - ((type *)((char *)(ptr) - offsetof(type, member))) + if (waiter) { + waiter->is_unparking = true; -void -_PyParkingLot_FinishUnpark(const void *addr, struct _PyUnpark *unpark) -{ - Bucket *bucket = &buckets[((uintptr_t)addr) % NUM_BUCKETS]; + int has_more_waiters = (bucket->num_waiters > 0); + fn(arg, waiter->park_arg, has_more_waiters); + } + else { + fn(arg, NULL, 0); + } _PyRawMutex_Unlock(&bucket->mutex); - if (unpark) { - struct wait_entry *waiter; - waiter = container_of(unpark, struct wait_entry, unpark); - + if (waiter) { + // Wakeup the waiter outside of the bucket lock _PySemaphore_Wakeup(&waiter->sema); } } From a0261c16511566c8a80381ee5b50fd825d21460a Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 10:12:28 -0700 Subject: [PATCH 26/28] Simplify _PySemaphore_Wait. Release the GIL if `detach` is `1` and there is a current thread state. It's the caller's responsibility to make sure that if there is a valid thread state (and `detach` is 1), then the current thread holds the GIL. 
--- Python/parking_lot.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/Python/parking_lot.c b/Python/parking_lot.c index 170108b0edb6ae..664e622cc17474 100644 --- a/Python/parking_lot.c +++ b/Python/parking_lot.c @@ -177,21 +177,18 @@ _PySemaphore_PlatformWait(_PySemaphore *sema, _PyTime_t timeout) int _PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout, int detach) { - PyThreadState *tstate = _PyThreadState_GET(); - int was_attached = 0; - if (tstate) { - was_attached = (tstate->_status.active); - if (was_attached && detach) { + PyThreadState *tstate = NULL; + if (detach) { + tstate = _PyThreadState_GET(); + if (tstate) { PyEval_ReleaseThread(tstate); } } int res = _PySemaphore_PlatformWait(sema, timeout); - if (tstate) { - if (was_attached && detach) { - PyEval_AcquireThread(tstate); - } + if (detach && tstate) { + PyEval_AcquireThread(tstate); } return res; } From 6d4565de6c389d1b62a62ca9360904ea650743f6 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 15 Sep 2023 14:51:32 -0700 Subject: [PATCH 27/28] check-c-globals: increase limit for parking_lot.c --- Tools/c-analyzer/cpython/_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index 90334d0e79da80..4523b2ed5b9fdf 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -314,6 +314,7 @@ def clean_lines(text): _abs('Objects/stringlib/unicode_format.h'): (10_000, 400), _abs('Objects/typeobject.c'): (35_000, 200), _abs('Python/compile.c'): (20_000, 500), + _abs('Python/parking_lot.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (500_000, 5000), _abs('Python/pystate.c'): (500_000, 5000), From a8ce6be7026f14f9f8e788bc2380eb58cd8ad7ef Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 18 Sep 2023 06:43:45 -0700 Subject: [PATCH 28/28] Changes from review --- Python/lock.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff 
--git a/Python/lock.c b/Python/lock.c index da364f37a6f057..3dad2aa93b5cc9 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -28,12 +28,12 @@ static const int MAX_SPIN_COUNT = 0; #endif struct mutex_entry { - // The time at which the thread should be handed off the lock. Written by - // the waiting thread. + // The time after which the unlocking thread should hand off lock ownership + // directly to the waiting thread. Written by the waiting thread. _PyTime_t time_to_be_fair; // Set to 1 if the lock was handed off. Written by the unlocking thread. - int handoff; + int handed_off; }; static void @@ -56,7 +56,7 @@ PyLockStatus _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) { uint8_t v = _Py_atomic_load_uint8_relaxed(&m->v); - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { return PY_LOCK_ACQUIRED; } @@ -73,12 +73,12 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) struct mutex_entry entry = { .time_to_be_fair = now + TIME_TO_BE_FAIR_NS, - .handoff = 0, + .handed_off = 0, }; Py_ssize_t spin_count = 0; for (;;) { - if (!(v & _Py_LOCKED)) { + if ((v & _Py_LOCKED) == 0) { // The lock is unlocked. Try to grab it. if (_Py_atomic_compare_exchange_uint8(&m->v, &v, v|_Py_LOCKED)) { return PY_LOCK_ACQUIRED; @@ -109,7 +109,7 @@ _PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout, _PyLockFlags flags) int ret = _PyParkingLot_Park(&m->v, &newv, sizeof(newv), timeout, &entry, (flags & _PY_LOCK_DETACH) != 0); if (ret == Py_PARK_OK) { - if (entry.handoff) { + if (entry.handed_off) { // We own the lock now. 
assert(_Py_atomic_load_uint8_relaxed(&m->v) & _Py_LOCKED); return PY_LOCK_ACQUIRED; @@ -145,7 +145,7 @@ mutex_unpark(PyMutex *m, struct mutex_entry *entry, int has_more_waiters) _PyTime_t now = _PyTime_GetMonotonicClock(); int should_be_fair = now > entry->time_to_be_fair; - entry->handoff = should_be_fair; + entry->handed_off = should_be_fair; if (should_be_fair) { v |= _Py_LOCKED; } @@ -161,7 +161,7 @@ _PyMutex_TryUnlock(PyMutex *m) { uint8_t v = _Py_atomic_load_uint8(&m->v); for (;;) { - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { // error: the mutex is not locked return -1; } @@ -200,7 +200,7 @@ _PyRawMutex_LockSlow(_PyRawMutex *m) uintptr_t v = _Py_atomic_load_uintptr(&m->v); for (;;) { - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { // Unlocked: try to grab it (even if it has a waiter). if (_Py_atomic_compare_exchange_uintptr(&m->v, &v, v|_Py_LOCKED)) { break; @@ -228,7 +228,7 @@ _PyRawMutex_UnlockSlow(_PyRawMutex *m) { uintptr_t v = _Py_atomic_load_uintptr(&m->v); for (;;) { - if ((v & _Py_LOCKED) == _Py_UNLOCKED) { + if ((v & _Py_LOCKED) == 0) { Py_FatalError("unlocking mutex that is not locked"); }