From 5f85268e2b39b1e75cb8ad8bd85dcf3751dff0c7 Mon Sep 17 00:00:00 2001
From: kprotty
Date: Thu, 22 Jul 2021 14:39:48 -0500
Subject: [PATCH] std.Thread.RwLock: more impl

---
 lib/std/Thread/RwLock.zig | 694 ++++++++++++++++++++++++++++++--------
 1 file changed, 550 insertions(+), 144 deletions(-)

diff --git a/lib/std/Thread/RwLock.zig b/lib/std/Thread/RwLock.zig
index 7a6b2b19759c..8e848091516e 100644
--- a/lib/std/Thread/RwLock.zig
+++ b/lib/std/Thread/RwLock.zig
@@ -126,49 +126,21 @@ const SerialRwLockImpl = extern struct {
 /// Reader-Writer Lock implementation from Rust parking_lot:
 /// https://github.com/Amanieu/parking_lot/blob/master/src/raw_rwlock.rs
 const RwLockImpl = extern struct {
-    mutex: std.Thread.Mutex = .{},
-    waiters: ?*Waiter = null,
     state: Atomic(usize) = Atomic(usize).init(UNLOCKED),
+    parker: Parker = .{},
 
     const UNLOCKED: usize = 0;
-    const HAS_PARKED: usize = 1 << 0;
+    const PARKED: usize = 1 << 0;
     const WRITER_PARKED: usize = 1 << 1;
-    const HAS_WRITER: usize = 1 << 2;
+    const WRITER: usize = 1 << 2;
     const READER: usize = 1 << 3;
     const READER_MASK: usize = ~(READER - 1);
-
-    const Token = enum{ reader, writer };
-    const Waiter = struct {
-        address: usize,
-        token: Token,
-        prev: ?*Waiter = null,
-        next: ?*Waiter = null,
-        tail: ?*Waiter = null,
-        left: ?*Waiter = null,
-        right: ?*Waiter = null,
-        event: Atomic(u32) = Atomic(u32).init(0),
-
-        fn wait(self: *Waiter, deadline_ns: anytype) error{TimedOut}!void {
-            while (self.event.load(.Acquire) == 0) {
-                try std.Thread.Futex.wait(&self.event, 0, blk: {
-                    if (@TypeOf(deadline_ns) != u64) break :blk null;
-                    const now_ns = std.time.now();
-                    if (now_ns >= deadline_ns) return error.TimedOut;
-                    break :blk (deadline_ns - now_ns);
-                });
-            }
-        }
-
-        fn notify(self: *Waiter) void {
-            self.event.store(1, .Release);
-            std.Thread.Futex.wake(&self.event, 1);
-        }
-    };
 
     fn tryAcquire(self: *Impl) bool {
+        // Acquire barrier on success to ensure that we see any writes done by previous writer threads.
         return self.state.compareAndSwap(
             UNLOCKED,
-            HAS_WRITER,
+            WRITER,
             .Acquire,
             .Monotonic,
         ) == null;
     }
@@ -183,9 +155,10 @@
     }
 
     fn acquireExclusiveAccess(self: *Impl, timeout_ns: anytype) error{TimedOut}!void {
+        // Acquire barrier on success to ensure that we see any writes done by previous writer threads.
         _ = self.state.tryCompareAndSwap(
             UNLOCKED,
-            HAS_WRITER,
+            WRITER,
             .Acquire,
             .Monotonic,
         ) orelse return;
@@ -201,18 +174,19 @@ const RwLockImpl = extern struct {
             break :blk std.time.now() + timeout_ns;
         };
 
-        // Acquire the HAS_WRITER bit
-        try self.waitWith(.writer, deadline_ns, struct {
-            fn tryAcquireWith(noalias this: *Impl, noalias state_ptr: *usize) bool {
+        // Acquire the WRITER bit
+        try self.acquireByParkingWith(deadline_ns, WRITER, struct {
+            pub fn tryAcquireWith(noalias impl: *Impl, noalias state_ptr: *usize) bool {
                 while (true) {
                     const state = state_ptr.*;
-                    if (state & HAS_WRITER != 0) {
+                    if (state & WRITER != 0) {
                         return false;
                     }
 
-                    state_ptr.* = this.state.tryCompareAndSwap(
+                    // Acquire barrier on success to ensure that we see any writes done by previous writer threads.
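+                    // (On CAS failure, state_ptr is simply refreshed with the newly observed state
+                    // for the next retry, which is why the failure ordering can stay Monotonic.)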
+                    state_ptr.* = impl.state.tryCompareAndSwap(
                         state,
-                        state | HAS_WRITER,
+                        state | WRITER,
                         .Acquire,
                         .Monotonic,
                     ) orelse return true;
@@ -221,16 +195,160 @@ const RwLockImpl = extern struct {
         });
 
         // Wait for the readers to exit
-        var spin: u8 = 10;
+        try self.waitForReaders(deadline_ns);
+    }
+
+    fn release(self: *Impl) void {
+        // Release barrier to ensure that future readers/writers Acquire and see any writes this thread performed.
+        _ = self.state.compareAndSwap(
+            WRITER,
+            UNLOCKED,
+            .Release,
+            .Monotonic,
+        ) orelse return;
+        return self.releaseSlow();
+    }
+
+    fn releaseSlow(self: *Impl) void {
+        @setCold(true);
+
+        // When unparking other threads, we also need to unlock/release the RwLock to allow other readers/writers in.
+        // If we observe that there are no more threads parked on the address, unset the PARKED bit in the process.
+        // The state can only change to PARKED concurrently, so if we change it to UNLOCKED the parking thread can always retry from onValidate().
+        // Release barrier to ensure future readers/writers see the writes done by this thread.
+        self.notifyParkedWith(UNLOCKED, struct {
+            pub fn onNotify(impl: *Impl, has_more_threads: bool) void {
+                const new_state = if (has_more_threads) PARKED else UNLOCKED;
+                impl.state.store(new_state, .Release);
+            }
+        });
+    }
+
+    fn tryAcquireShared(self: *Impl) bool {
+        return self.tryAcquireSharedFast() or self.tryAcquireSharedSlow();
+    }
+
+    fn tryAcquireSharedFast(self: *Impl) bool {
+        const state = self.state.load(.Monotonic);
+        const result = self.tryAcquireSharedUsing(state) catch state;
+        return result == null;
+    }
+
+    fn tryAcquireSharedSlow(self: *Impl) bool {
+        @setCold(true);
+
+        var state = self.state.load(.Monotonic);
+        while (true) {
+            const result = self.tryAcquireSharedUsing(state) catch return false;
+            state = result orelse return true;
+        }
+    }
+
+    inline fn tryAcquireSharedUsing(self: *Impl, state: usize) error{Overflowed}!?usize {
+        // We can't acquire the lock while there's an active writer
+        if (state & WRITER != 0) {
+            return state;
+        }
+
+        // Use hardware lock elision to try and avoid cache conflicts when a reader tries to acquire the lock.
+        // We do this only when the lock is completely uncontended, since LockElision seems to handle conflicts poorly.
+        if (LockElision != void and state == UNLOCKED) {
+            return LockElision.compareAndSwapAcquire(
+                &self.state,
+                UNLOCKED,
+                READER,
+            );
+        }
+
+        var new_state: usize = undefined;
+        if (@addWithOverflow(usize, state, READER, &new_state)) {
+            return error.Overflowed;
+        }
+
+        // Acquire barrier in order to see writes done by any previous lock writers.
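+        // On failure this returns the freshly observed state so the caller can decide whether to retry.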
+        return self.state.tryCompareAndSwap(
+            state,
+            new_state,
+            .Acquire,
+            .Monotonic,
+        );
+    }
+
+    fn acquireShared(self: *Impl) void {
+        return self.acquireSharedAccess({}) catch unreachable;
+    }
+
+    fn timedAcquireShared(self: *Impl, timeout_ns: u64) error{TimedOut}!void {
+        return self.acquireSharedAccess(timeout_ns);
+    }
+
+    fn acquireSharedAccess(self: *Impl, timeout_ns: anytype) error{TimedOut}!void {
+        if (self.tryAcquireSharedFast()) return;
+        return self.acquireSharedAccessSlow(timeout_ns);
+    }
+
+    fn acquireSharedAccessSlow(self: *Impl, timeout_ns: anytype) error{TimedOut}!void {
+        @setCold(true);
+
+        // create a deadline at which the timeout actually expires
+        const deadline_ns = blk: {
+            if (@TypeOf(timeout_ns) != u64) break :blk {};
+            break :blk std.time.now() + timeout_ns;
+        };
+
+        // acquire a reader count
+        try self.acquireByParkingWith(deadline_ns, READER, struct {
+            pub fn tryAcquireWith(noalias impl: *Impl, noalias state_ptr: *usize) bool {
+                var spin: u8 = spin_count;
+                while (true) {
+                    // Try to acquire the lock as a reader
+                    const result = impl.tryAcquireSharedUsing(state_ptr.*) catch unreachable; // reader count overflowed
+                    if (result == null) {
+                        return true;
+                    }
+
+                    // When there's high contention on the reader count,
+                    // it helps to leave some time for other threads to make progress before retrying.
+                    defer state_ptr.* = impl.state.load(.Monotonic);
+                    if (spin > 0) {
+                        spin -= 1;
+                        std.atomic.spinLoopHint();
+                    } else {
+                        std.os.sched_yield() catch {};
+                    }
+                }
+            }
+        });
+    }
+
+    fn releaseShared(self: *Impl) void {
+        // Release barrier to ensure all reads of the lock-protected state
+        // don't get reordered after the read-unlock, to prevent writers from tampering with them.
+        const state = switch (LockElision) {
+            void => self.state.fetchSub(READER, .Release),
+            else => LockElision.fetchSubRelease(&self.state, READER),
+        };
+
+        // Unpark a pending writer waiting for all readers to exit if we're the last reader.
+        if (state & (READER_MASK | WRITER_PARKED) == (READER | WRITER_PARKED)) {
+            self.wakeReaderWaiter();
+        }
+    }
+
+    fn waitForReaders(self: *Impl, deadline_ns: anytype) error{TimedOut}!void {
+        var spin: u8 = spin_count;
         while (true) : (std.atomic.spinLoopHint()) {
+            // Acquire barrier so that the writer's writes done after exiting waitForReaders()
+            // are ensured to happen after any reads done by the previous readers.
             const state = self.state.load(.Acquire);
-            if (state & HAS_WRITER == 0) {
-                unreachable; // HAS_WRITER not set when waiting for readers
+            if (state & WRITER == 0) {
+                unreachable; // WRITER not set when waiting for readers
             }
 
             // Return when there's no readers,
             if (state & READER_MASK == 0) {
-                break;
+                return;
             }
 
             // Spin on the state for a bit in case the readers finish early
@@ -243,7 +361,6 @@ const RwLockImpl = extern struct {
             // There's still readers and this thread has spun for too long.
             // Set the WRITER_PARKED bit to indicate that the writer thread is sleeping.
             if (state & WRITER_PARKED == 0) blk: {
-                self.pending_writer.store(0, .Monotonic);
                 _ = self.state.tryCompareAndSwap(
                     state,
                     state | WRITER_PARKED,
@@ -253,124 +370,354 @@
                 continue;
             }
 
-            // If this writer thread times out while trying to wait for readers to complete,
-            // then it needs to unset the HAS_WRITER and WRITER_PARKED bit to properly indicate so.
-            // It also needs to unpark another thread that was waiting this writer thread.
-            errdefer {
-                state = self.state.fetchSub(HAS_WRITER | WRITER_PARKED, .Monotonic);
-                if (state & HAS_PARKED != 0) {
-                    self.notifyWith(READER);
+            // Park until notified by the last reader finishing up
+            const ParkImpl = struct {
+                impl: *Impl,
+                timeout_state: usize = 0,
+
+                pub fn onValidate(this: @This()) bool {
+                    const current_state = this.impl.state.load(.Monotonic);
+                    return current_state & (READER_MASK | WRITER_PARKED) != 0;
                 }
-            }
 
+                pub fn onTimedOut(this: *@This(), has_more_threads: bool) void {
+                    if (has_more_threads) {
+                        unreachable; // there were multiple threads waiting for the readers to exit
+                    }
+
+                    // This writer thread timed out while trying to wait for readers to complete.
+                    // It needs to unset the WRITER and WRITER_PARKED bits to properly indicate so.
+                    const removed = WRITER | WRITER_PARKED;
+                    this.timeout_state = this.impl.state.fetchSub(removed, .Monotonic);
+                    if (this.timeout_state & removed != removed) {
+                        unreachable; // parked writer timed out without having WRITER | WRITER_PARKED set
+                    }
+                }
+            };
 
-            // Park until notified by the last reader finishing up
-            try self.park(null, deadline_ns, struct {
-                fn onValidate(noalias this: *Impl) bool {
-                    const s = this.state.load(.Monotonic);
-                    return s & (READER_MASK | WRITER_PARKED) != 0;
+            // Park on the state address offset by one instead.
+            // This is the wait queue for the writer rather than for all other parked threads.
+            const address = @ptrToInt(&self.state) + 1;
+            var park_impl = ParkImpl{ .impl = self };
+            self.parker.park(
+                address,
+                WRITER_PARKED,
+                deadline_ns,
+                &park_impl,
+            ) catch {
-                fn onTimedOut(noalias this: *Impl, was_last_thread: bool) void {
-                    _ = this;
-                    _ = was_last_thread;
+                // If we timed out, we also need to unpark another thread that was waiting on this writer thread.
+                // If it unparks the last parked thread, then it must also unset the PARKED bit.
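+                // (WRITER and WRITER_PARKED were already cleared by the fetchSub in onTimedOut above;
+                // the PARKED bit only tracks threads sleeping on the main state address.)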
+                if (park_impl.timeout_state & PARKED != 0) {
+                    self.notifyParkedWith(READER, struct {
+                        pub fn onNotify(impl: *Impl, has_more_threads: bool) void {
+                            if (!has_more_threads) {
+                                _ = impl.state.fetchAnd(~PARKED, .Monotonic);
+                            }
+                        }
+                    });
                 }
-            });
+
+                // The writer waiting on reader threads to exit has now gracefully timed out
+                return error.TimedOut;
+            };
         }
     }
 
-    fn release(self: *Impl) void {
-        
-    }
+    fn wakeReaderWaiter(self: *Impl) void {
+        @setCold(true);
 
-    fn tryAcquireShared(self: *Impl) bool {
-        
-    }
+        const UnparkImpl = struct {
+            impl: *Impl,
+            did_unpark: bool = false,
 
-    fn acquireShared(self: *Impl) void {
-        return self.acquireSharedAccess({}) catch unreachable;
-    }
+            pub fn onFilter(this: *@This(), token: Parker.Token) Parker.Filter {
+                if (token != WRITER_PARKED) {
+                    unreachable; // invalid thread parked on the offset state address
+                }
 
-    fn timedAcquireShared(self: *Impl, timeout_ns: u64) error{TimedOut}!void {
-        return self.acquireSharedAccess(timeout_ns) catch unreachable;
-    }
+                if (this.did_unpark) {
+                    unreachable; // there were multiple threads parked on the offset state address
+                }
 
-    fn acquireSharedAccess(self: *Impl, timeout_ns: anytype) error{TimedOut}!void {
-        
-    }
+                this.did_unpark = true;
+                return .unpark;
+            }
 
-    fn releaseShared(self: *Impl) void {
+            pub fn onUnpark(this: *@This(), has_more_threads: bool) void {
+                if (has_more_threads) {
+                    unreachable; // there were multiple threads parked on the offset state address
+                }
+
+                // clear the WRITER_PARKED bit
+                _ = this.impl.state.fetchAnd(~WRITER_PARKED, .Monotonic);
+            }
+        };
 
+        // Unpark the writer waiting on the readers from the offset state address
+        const address = @ptrToInt(&self.state) + 1;
+        var unpark_impl = UnparkImpl{ .impl = self };
+        self.parker.unpark(address, &unpark_impl);
     }
 
-    fn waitWith(self: *Impl, token: Token, deadline_ns: anytype, callback: anytype) error{TimedOut}!void {
+    fn acquireByParkingWith(
+        self: *Impl,
+        deadline_ns: anytype,
+        comptime token: Parker.Token,
+        comptime TryAcquireWithImpl: type,
+    ) error{TimedOut}!void {
+        var spin: u8 = spin_count;
+        var state = self.state.load(.Monotonic);
+        while (true) {
+            // Try to acquire the lock
+            if (TryAcquireWithImpl.tryAcquireWith(self, &state)) {
+                return;
+            }
+
+            // Spin a bit if there are no threads parked on the lock
+            if ((state & (PARKED | WRITER_PARKED) == 0) and (spin > 0)) {
+                spin -= 1;
+                std.atomic.spinLoopHint();
+                state = self.state.load(.Monotonic);
+                continue;
+            }
+
+            // This thread has spun enough; set the parked bit to indicate that we're sleeping
+            if (state & PARKED == 0) blk: {
+                state = self.state.tryCompareAndSwap(
+                    state,
+                    state | PARKED,
+                    .Monotonic,
+                    .Monotonic,
+                ) orelse break :blk;
+                continue;
+            }
+
+            // Park until woken up by notifyParkedWith() or a release*()
+            const ParkImpl = struct {
+                impl: *Impl,
+
+                pub fn onValidate(this: @This()) bool {
+                    const expected_active = PARKED | WRITER;
+                    const current_state = this.impl.state.load(.Monotonic);
+                    return (current_state & expected_active) == expected_active;
                 }
+
+                pub fn onTimedOut(this: @This(), has_more_threads: bool) void {
+                    // Unset the parked bit if we time out without any other parked threads
+                    if (!has_more_threads) {
+                        _ = this.impl.state.fetchAnd(~PARKED, .Monotonic);
+                    }
+                }
+            };
+
+            // Park on the state address
+            const address = @ptrToInt(&self.state);
+            try self.parker.park(
+                address,
+                token,
+                deadline_ns,
+                ParkImpl{ .impl = self },
+            );
+
+            // Reset the spin count and try to acquire again.
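+            // Note that being unparked doesn't hand the lock off to us: we go around the loop
+            // and race with any newly arriving threads to acquire it.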
+            spin = spin_count;
+            state = self.state.load(.Monotonic);
+        }
     }
 
-    fn notifyWith(self: *Impl, token: Token) void {
+    fn notifyParkedWith(
+        self: *Impl,
+        comptime start_state: Parker.Token,
+        comptime NotifyImpl: type,
+    ) void {
+        const UnparkImpl = struct {
+            impl: *Impl,
+            state: Parker.Token = start_state,
+
+            pub fn onFilter(this: *@This(), token: Parker.Token) Parker.Filter {
+                switch (token) {
+                    READER => {},
+                    WRITER => {},
+                    WRITER_PARKED => unreachable, // waitForReaders() shouldn't be parked on the state address
+                    else => unreachable, // invalid thread parked on the state address
+                }
+
+                // If we're waking up a writer, don't wake up anything else
+                if (this.state & WRITER != 0) {
+                    return .stop;
+                }
+
+                // Otherwise wake all readers and one writer
+                this.state += token;
+                return .unpark;
+            }
+
+            pub fn onUnpark(this: @This(), has_more_threads: bool) void {
+                NotifyImpl.onNotify(this.impl, has_more_threads);
+            }
+        };
+
+        // Unpark threads waiting on the state address
+        const address = @ptrToInt(&self.state);
+        var unpark_impl = UnparkImpl{ .impl = self };
+        self.parker.unpark(address, &unpark_impl);
     }
 
-    fn park(self: *Impl, address: usize, deadline_ns: anytype, callback: anytype) error{TimedOut}!void {
-        var waiter: Waiter = undefined;
-        {
-            const held = self.mutex.acquire();
-            defer held.release();
+    const Parker = struct {
+        mutex: std.Thread.Mutex = .{},
+        waiters: ?*Waiter = null,
 
-            const token = callback.onValidate(self) orelse return;
-            waiter.* = Waiter{
-                .address = address,
-                .token = token,
-                .tail = &waiter,
-            };
+        const Token = usize;
+        const Filter = enum {
+            stop,
+            skip,
+            unpark,
+        };
 
-            var head = self.waiters;
-            var left: ?*Waiter = null;
-            while (head) |h| {
-                if (h.address == address) break;
-                head = h.right;
-                left = h;
+        const Waiter = struct {
+            address: usize,
+            token: Token,
+            prev: ?*Waiter = null,
+            next: ?*Waiter = null,
+            tail: ?*Waiter = null,
+            left: ?*Waiter = null,
+            right: ?*Waiter = null,
+            event: Atomic(u32) = Atomic(u32).init(0),
+
+            fn wait(self: *Waiter, deadline_ns: anytype) error{TimedOut}!void {
+                while (self.event.load(.Acquire) == 0) {
+                    try std.Thread.Futex.wait(&self.event, 0, blk: {
+                        if (@TypeOf(deadline_ns) != u64) break :blk null;
+                        const now_ns = std.time.now();
+                        if (now_ns >= deadline_ns) return error.TimedOut;
+                        break :blk (deadline_ns - now_ns);
+                    });
+                }
             }
 
-            if (head) |h| {
-                waiter.prev = h.tail;
-                h.tail.?.next = &waiter;
-                h.tail = &waiter;
-            } else if (left) |l| {
-                waiter.left = l;
-                l.right = &waiter;
-            } else {
-                self.waiters = &waiter;
+            fn notify(self: *Waiter) void {
+                self.event.store(1, .Release);
+                std.Thread.Futex.wake(&self.event, 1);
             }
-        }
+        };
+
+        const Queue = struct {
+            address: usize,
+            parker: *Parker,
+            head: ?*Waiter,
+            left: ?*Waiter,
+
+            // Find the wait queue associated with the address using the Parker
+            fn fromAddress(parker: *Parker, address: usize) Queue {
+                var head = parker.waiters;
+                var left: ?*Waiter = null;
+
+                // Search the top-layer Waiters representing Queues for unique addresses.
+                //
+                // NOTE: It's not worth using a balanced binary tree to make this lookup O(log(N)) instead of O(N)
+                // given that N = unique `address`es and the RwLock implementation only uses two.
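+                // Here those are the state address itself (readers/writers) and state+1 (the lone parked writer).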
+                while (head) |node| {
+                    if (node.address == address) break;
+                    head = node.right;
+                    left = node;
+                }
 
-        waiter.wait(deadline_ns) catch {
-            const held = self.mutex.acquire();
-            const timed_out = waiter.tail != null;
+                return .{
+                    .address = address,
+                    .parker = parker,
+                    .head = head,
+                    .left = left,
+                };
+            }
+
+            // Same as fromAddress() but can skip the search if the waiter is the head of its wait queue
+            fn fromWaiter(noalias parker: *Parker, noalias waiter: *Waiter) Queue {
+                if (!isInserted(waiter)) {
+                    unreachable; // Trying to find a parker queue from an invalid waiter
+                }
 
-            if (!timed_out) {
-                held.release();
-                waiter.wait({}) catch unreachable;
-                return;
+                const address = waiter.address;
+                if (waiter.prev != null) {
+                    return fromAddress(parker, address);
+                }
+
+                return .{
+                    .address = address,
+                    .parker = parker,
+                    .head = waiter,
+                    .left = waiter.left,
+                };
             }
 
-            var head = &waiter;
-            while (head.prev) |prev|
-                head = prev;
+            fn isEmpty(self: Queue) bool {
+                return self.head == null;
+            }
+
+            fn isInserted(waiter: *const Waiter) bool {
+                return waiter.tail != null;
             }
 
-            if (waiter.prev) |prev| {
-                prev.next = waiter.next;
-                if (waiter.next) |next| {
-                    next.prev = prev;
+            fn insert(noalias self: *Queue, noalias waiter: *Waiter) void {
+                waiter.* = Waiter{
+                    .address = self.address,
+                    .token = waiter.token,
+                    .tail = waiter,
+                };
+
+                // Append to the tail if there's already a queue going on
+                if (self.head) |head| {
+                    const tail = head.tail orelse unreachable; // inserted waiter without a tail
+                    waiter.prev = tail;
+                    tail.next = waiter;
+                    head.tail = waiter;
+                    return;
+                }
+
+                // Insert into the top-level tree if this is the first waiter for this address
+                self.head = waiter;
+                if (self.left) |left| {
+                    waiter.left = left;
+                    left.right = waiter;
                 } else {
-                    head.tail = prev;
+                    self.parker.waiters = waiter;
                 }
-            } else {
+            }
+
+            fn remove(noalias self: *Queue, noalias waiter: *Waiter) void {
+                if (waiter.address != self.address) {
+                    unreachable; // tried to remove a waiter that doesn't belong to this wait queue
+                }
+
+                defer waiter.tail = null;
+                if (!isInserted(waiter)) {
+                    unreachable; // tried to remove a waiter that wasn't inserted
+                }
+
+                // If we're removing a waiter, there must at least be a head waiter
+                const head = self.head orelse unreachable; // remove without a queue head
+
+                // Remove the waiter while there are other waiters in the wait queue for its address
+                if (waiter.prev) |prev| {
+                    prev.next = waiter.next;
+                    if (waiter.next) |next| {
+                        next.prev = prev;
+                    } else {
+                        head.tail = prev;
+                    }
+                    return;
+                }
+
+                // The waiter being removed is the head of the queue.
+                // Prepare the next waiter in line to be the new head
+                self.head = head.next;
                 if (head.next) |new_head| {
                     new_head.prev = null;
                     new_head.tail = head.tail;
                     new_head.left = head.left;
                     new_head.right = head.right;
                 }
+
+                // Update the top-level address links for the new head
                 if (head.left) |left| blk: {
                     left.right = head.next;
                     const right = head.right orelse break :blk;
@@ -379,32 +726,91 @@ const RwLockImpl = extern struct {
-                    self.waiters = head.next;
+                    self.parker.waiters = head.next;
                 }
             }
-
-        const was_last_thread = waiter == head and head.next == null;
-        callback.onTimedOut(self, was_last_thread);
-        held.release();
-        return error.TimedOut;
         };
-    }
 
+        fn park(
+            self: *Parker,
+            address: usize,
+            token: Token,
+            deadline_ns: anytype,
+            callback: anytype,
+        ) error{TimedOut}!void {
+            var waiter: Waiter = undefined;
+            {
+                const held = self.mutex.acquire();
+                defer held.release();
+
+                if (!callback.onValidate()) return;
+                waiter.token = token;
+
+                var queue = Queue.fromAddress(self, address);
+                queue.insert(&waiter);
+            }
 
-    fn unpark(self: *Impl, callback: anytype) void {
-        var unparked: ?*Waiter = null;
-        defer while (unparked) |waiter| {
-            unparked = waiter.next;
-            waiter.notify();
-        };
+            waiter.wait(deadline_ns) catch {
+                const held = self.mutex.acquire();
+                const timed_out = Queue.isInserted(&waiter);
 
-        const held = self.mutex.acquire();
-        defer held.release();
+                if (timed_out) {
+                    var queue = Queue.fromWaiter(self, &waiter);
+                    queue.remove(&waiter);
+
+                    const has_more_threads = !queue.isEmpty();
+                    callback.onTimedOut(has_more_threads);
+
+                    held.release();
+                    return error.TimedOut;
+                }
 
-        defer callback.onUnpark();
-        while (head)
-    }
+                // We were dequeued for unparking after waiter.wait() timed out but before we reacquired the mutex.
+                // Wait for the unparking thread to finish unparking us; otherwise we'd risk invalidating the Waiter memory while it's still in use.
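+                // (The unparking thread still holds a pointer to our stack-allocated Waiter and will
+                // call notify() on it after releasing its mutex, so we can't return before that happens.)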
+                held.release();
+                waiter.wait({}) catch unreachable;
+                return;
+            };
+        }
+
+        fn unpark(self: *Parker, address: usize, callback: anytype) void {
+            // Unpark all waiters outside the mutex lock
+            var unparked: ?*Waiter = null;
+            defer while (unparked) |waiter| {
+                unparked = waiter.next;
+                waiter.notify();
+            };
+
+            const held = self.mutex.acquire();
+            defer held.release();
+
+            var queue = Queue.fromAddress(self, address);
+            defer {
+                const has_more_threads = !queue.isEmpty();
+                callback.onUnpark(has_more_threads);
+            }
+
+            var current = queue.head;
+            while (current) |waiter| {
+                current = waiter.next;
+
+                const filter = callback.onFilter(waiter.token);
+                switch (filter) {
+                    .stop => break,
+                    .skip => continue,
+                    .unpark => {},
+                }
+
+                queue.remove(waiter);
+                waiter.next = unparked;
+                unparked = waiter;
+            }
+        }
+    };
 
     const arch = std.Target.current.cpu.arch;
+
+    const spin_count = if (arch.isX86()) 100 else 10;
+
     const LockElision = switch (arch) {
         .x86_64 => struct {
-            inline fn compareAndSwapAcquire(ptr: *Atomic(usize), cmp: usize, xchg: usize) ?usize {
+            fn compareAndSwapAcquire(ptr: *Atomic(usize), cmp: usize, xchg: usize) ?usize {
                 const prev = asm volatile("xacquire; lock cmpxchgq %[xchg], %[ptr]"
                     : [result] "={rax}" (-> usize),
                       [ptr] "+*m" (&ptr.value)
@@ -416,7 +822,7 @@
                 return prev;
             }
 
-            inline fn fetchSubRelease(ptr: *Atomic(usize), value: usize) usize {
+            fn fetchSubRelease(ptr: *Atomic(usize), value: usize) usize {
                 return asm volatile("xrelease; lock xaddq %[value], %[ptr]"
                     : [result] "=r" (-> usize),
                       [ptr] "+*m" (&ptr.value)
@@ -426,7 +832,7 @@
            }
         },
         .i386 => struct {
-            inline fn compareAndSwapAcquire(ptr: *Atomic(usize), cmp: usize, xchg: usize) ?usize {
+            fn compareAndSwapAcquire(ptr: *Atomic(usize), cmp: usize, xchg: usize) ?usize {
                const prev = asm volatile("xacquire; lock cmpxchgl %[xchg], %[ptr]"
                    : [result] "={rax}" (-> usize),
                      [ptr] "+*m" (&ptr.value)
@@ -438,7 +844,7 @@
                return prev;
            }
 
-            inline fn fetchSubRelease(ptr: *Atomic(usize), value: usize) usize {
+            fn fetchSubRelease(ptr: *Atomic(usize), value: usize) usize {
                return asm volatile("xrelease; lock xaddl %[value], %[ptr]"
                    : [result] "=r" (-> usize),
                      [ptr] "+*m" (&ptr.value)