diff --git a/lib/std/compress/deflate/huffman_code.zig b/lib/std/compress/deflate/huffman_code.zig index 4827feb24562..689ac1441a95 100644 --- a/lib/std/compress/deflate/huffman_code.zig +++ b/lib/std/compress/deflate/huffman_code.zig @@ -93,7 +93,7 @@ pub const HuffmanEncoder = struct { return; } self.lfs = list; - sort.sort(LiteralNode, self.lfs, {}, byFreq); + mem.sort(LiteralNode, self.lfs, {}, byFreq); // Get the number of literals for each bit count var bit_count = self.bitCounts(list, max_bits); @@ -270,7 +270,7 @@ pub const HuffmanEncoder = struct { var chunk = list[list.len - @intCast(u32, bits) ..]; self.lns = chunk; - sort.sort(LiteralNode, self.lns, {}, byLiteral); + mem.sort(LiteralNode, self.lns, {}, byLiteral); for (chunk) |node| { self.codes[node.literal] = HuffCode{ diff --git a/lib/std/compress/zstandard/decode/fse.zig b/lib/std/compress/zstandard/decode/fse.zig index 741fd81cccd8..232af39ccfe5 100644 --- a/lib/std/compress/zstandard/decode/fse.zig +++ b/lib/std/compress/zstandard/decode/fse.zig @@ -107,7 +107,7 @@ fn buildFseTable(values: []const u16, entries: []Table.Fse) !void { position &= entries.len - 1; } } - std.sort.sort(u16, temp_states[0..probability], {}, std.sort.asc(u16)); + std.mem.sort(u16, temp_states[0..probability], {}, std.sort.asc(u16)); for (0..probability) |i| { entries[temp_states[i]] = if (i < double_state_count) Table.Fse{ .symbol = @intCast(u8, symbol), diff --git a/lib/std/compress/zstandard/decode/huffman.zig b/lib/std/compress/zstandard/decode/huffman.zig index 291419826831..f5e977d0dadd 100644 --- a/lib/std/compress/zstandard/decode/huffman.zig +++ b/lib/std/compress/zstandard/decode/huffman.zig @@ -124,7 +124,7 @@ fn assignSymbols(weight_sorted_prefixed_symbols: []LiteralsSection.HuffmanTree.P }; } - std.sort.sort( + std.mem.sort( LiteralsSection.HuffmanTree.PrefixedSymbol, weight_sorted_prefixed_symbols, weights, diff --git a/lib/std/comptime_string_map.zig b/lib/std/comptime_string_map.zig index 7620ec7af8af..e6859c32c15d 100644 --- a/lib/std/comptime_string_map.zig +++ b/lib/std/comptime_string_map.zig @@ -28,7 +28,7 @@ pub fn ComptimeStringMap(comptime V: type, comptime kvs_list: anytype) type { sorted_kvs[i] = .{ .key = kv.@"0", .value = {} }; } } - std.sort.sort(KV, &sorted_kvs, {}, lenAsc); + mem.sort(KV, &sorted_kvs, {}, lenAsc); const min_len = sorted_kvs[0].key.len; const max_len = sorted_kvs[sorted_kvs.len - 1].key.len; var len_indexes: [max_len + 1]usize = undefined; diff --git a/lib/std/debug.zig b/lib/std/debug.zig index ecc1a9f0cf39..005c2b540428 100644 --- a/lib/std/debug.zig +++ b/lib/std/debug.zig @@ -1211,7 +1211,7 @@ fn readMachODebugInfo(allocator: mem.Allocator, macho_file: File) !ModuleDebugIn // Even though lld emits symbols in ascending order, this debug code // should work for programs linked in any valid way. // This sort is so that we can binary search later. 
- std.sort.sort(MachoSymbol, symbols, {}, MachoSymbol.addressLessThan); + mem.sort(MachoSymbol, symbols, {}, MachoSymbol.addressLessThan); return ModuleDebugInfo{ .base_address = undefined, diff --git a/lib/std/enums.zig b/lib/std/enums.zig index aa6edd60b175..757c616b9bcb 100644 --- a/lib/std/enums.zig +++ b/lib/std/enums.zig @@ -1314,7 +1314,7 @@ pub fn EnumIndexer(comptime E: type) type { } }; } - std.sort.sort(EnumField, &fields, {}, ascByValue); + std.mem.sort(EnumField, &fields, {}, ascByValue); const min = fields[0].value; const max = fields[fields.len - 1].value; const fields_len = fields.len; diff --git a/lib/std/http/Headers.zig b/lib/std/http/Headers.zig index 429df9368a15..fb7a9360d823 100644 --- a/lib/std/http/Headers.zig +++ b/lib/std/http/Headers.zig @@ -191,7 +191,7 @@ pub const Headers = struct { /// Sorts the headers in lexicographical order. pub fn sort(headers: *Headers) void { - std.sort.sort(Field, headers.list.items, {}, Field.lessThan); + std.mem.sort(Field, headers.list.items, {}, Field.lessThan); headers.rebuildIndex(); } diff --git a/lib/std/mem.zig b/lib/std/mem.zig index 8cb2c00a3af2..1840a86819eb 100644 --- a/lib/std/mem.zig +++ b/lib/std/mem.zig @@ -566,6 +566,34 @@ test "zeroInit" { }, nested_baz); } +pub fn sort( + comptime T: type, + items: []T, + context: anytype, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + std.sort.block(T, items, context, lessThanFn); +} + +pub fn sortUnstable( + comptime T: type, + items: []T, + context: anytype, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + std.sort.pdq(T, items, context, lessThanFn); +} + +/// TODO: currently this just calls `insertionContext`. The block sort implementation +/// in `std.sort` needs to be adapted to use the sort context. +pub fn sortContext(length: usize, context: anytype) void { + std.sort.insertionContext(0, length, context); +} + +pub fn sortUnstableContext(length: usize, context: anytype) void { + std.sort.pdqContext(0, length, context); +} + /// Compares two slices of numbers lexicographically. O(n). pub fn order(comptime T: type, lhs: []const T, rhs: []const T) math.Order { const n = math.min(lhs.len, rhs.len); diff --git a/lib/std/meta.zig b/lib/std/meta.zig index 8adba2439aa5..d0b07b934fd6 100644 --- a/lib/std/meta.zig +++ b/lib/std/meta.zig @@ -985,7 +985,7 @@ pub fn declList(comptime Namespace: type, comptime Decl: type) []const *const De for (decls, 0..) |decl, i| { array[i] = &@field(Namespace, decl.name); } - std.sort.sort(*const Decl, &array, {}, S.declNameLessThan); + mem.sort(*const Decl, &array, {}, S.declNameLessThan); return &array; } } diff --git a/lib/std/multi_array_list.zig b/lib/std/multi_array_list.zig index 322471bedffe..5cd5f573d965 100644 --- a/lib/std/multi_array_list.zig +++ b/lib/std/multi_array_list.zig @@ -160,7 +160,7 @@ pub fn MultiArrayList(comptime T: type) type { return lhs.alignment > rhs.alignment; } }; - std.sort.sort(Data, &data, {}, Sort.lessThan); + mem.sort(Data, &data, {}, Sort.lessThan); var sizes_bytes: [fields.len]usize = undefined; var field_indexes: [fields.len]usize = undefined; for (data, 0..)
|elem, i| { @@ -469,29 +469,19 @@ pub fn MultiArrayList(comptime T: type) type { /// `ctx` has the following method: /// `fn lessThan(ctx: @TypeOf(ctx), a_index: usize, b_index: usize) bool` pub fn sort(self: Self, ctx: anytype) void { - const SortContext = struct { - sub_ctx: @TypeOf(ctx), - slice: Slice, - - pub fn swap(sc: @This(), a_index: usize, b_index: usize) void { - inline for (fields, 0..) |field_info, i| { + var i: usize = 1; + while (i < self.len) : (i += 1) { + var j: usize = i; + while (j > 0 and ctx.lessThan(j, j - 1)) : (j -= 1) { + inline for (fields, 0..) |field_info, field_index| { if (@sizeOf(field_info.type) != 0) { - const field = @intToEnum(Field, i); - const ptr = sc.slice.items(field); - mem.swap(field_info.type, &ptr[a_index], &ptr[b_index]); + const field = @intToEnum(Field, field_index); + const ptr = self.items(field); + mem.swap(field_info.type, &ptr[j], &ptr[j - 1]); } } } - - pub fn lessThan(sc: @This(), a_index: usize, b_index: usize) bool { - return sc.sub_ctx.lessThan(a_index, b_index); - } - }; - - std.sort.sortContext(self.len, SortContext{ - .sub_ctx = ctx, - .slice = self.slice(), - }); + } } fn capacityInBytes(capacity: usize) usize { diff --git a/lib/std/net.zig b/lib/std/net.zig index 57e50a7349f7..7629ecc8f793 100644 --- a/lib/std/net.zig +++ b/lib/std/net.zig @@ -1082,7 +1082,7 @@ fn linuxLookupName( key |= (MAXADDRS - @intCast(i32, i)) << DAS_ORDER_SHIFT; addr.sortkey = key; } - std.sort.sort(LookupAddr, addrs.items, {}, addrCmpLessThan); + mem.sort(LookupAddr, addrs.items, {}, addrCmpLessThan); } const Policy = struct { diff --git a/lib/std/sort.zig b/lib/std/sort.zig index 3e219b8566b4..bf2bf40f89ed 100644 --- a/lib/std/sort.zig +++ b/lib/std/sort.zig @@ -4,1241 +4,152 @@ const testing = std.testing; const mem = std.mem; const math = std.math; -pub fn binarySearch( - comptime T: type, - key: anytype, - items: []const T, - context: anytype, - comptime compareFn: fn (context: @TypeOf(context), key: @TypeOf(key), mid_item: T) math.Order, -) ?usize { - var left: usize = 0; - var right: usize = items.len; - - while (left < right) { - // Avoid overflowing in the midpoint calculation - const mid = left + (right - left) / 2; - // Compare the key with the midpoint element - switch (compareFn(context, key, items[mid])) { - .eq => return mid, - .gt => left = mid + 1, - .lt => right = mid, - } - } - - return null; -} - -test "binarySearch" { - const S = struct { - fn order_u32(context: void, lhs: u32, rhs: u32) math.Order { - _ = context; - return math.order(lhs, rhs); - } - fn order_i32(context: void, lhs: i32, rhs: i32) math.Order { - _ = context; - return math.order(lhs, rhs); - } - }; - try testing.expectEqual( - @as(?usize, null), - binarySearch(u32, @as(u32, 1), &[_]u32{}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 0), - binarySearch(u32, @as(u32, 1), &[_]u32{1}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, null), - binarySearch(u32, @as(u32, 1), &[_]u32{0}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, null), - binarySearch(u32, @as(u32, 0), &[_]u32{1}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 4), - binarySearch(u32, @as(u32, 5), &[_]u32{ 1, 2, 3, 4, 5 }, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 0), - binarySearch(u32, @as(u32, 2), &[_]u32{ 2, 4, 8, 16, 32, 64 }, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 1), - binarySearch(i32, @as(i32, -4), &[_]i32{ -7, -4, 0, 9, 10 }, {}, S.order_i32), - ); - try testing.expectEqual( - 
@as(?usize, 3), - binarySearch(i32, @as(i32, 98), &[_]i32{ -100, -25, 2, 98, 99, 100 }, {}, S.order_i32), - ); - const R = struct { - b: i32, - e: i32, - - fn r(b: i32, e: i32) @This() { - return @This(){ .b = b, .e = e }; - } - - fn order(context: void, key: i32, mid_item: @This()) math.Order { - _ = context; - - if (key < mid_item.b) { - return .lt; - } - - if (key > mid_item.e) { - return .gt; - } - - return .eq; - } - }; - try testing.expectEqual( - @as(?usize, null), - binarySearch(R, @as(i32, -45), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), - ); - try testing.expectEqual( - @as(?usize, 2), - binarySearch(R, @as(i32, 10), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), - ); - try testing.expectEqual( - @as(?usize, 1), - binarySearch(R, @as(i32, -20), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), - ); -} +pub const block = @import("sort/block.zig").block; +pub const pdq = @import("sort/pdq.zig").pdq; +pub const pdqContext = @import("sort/pdq.zig").pdqContext; /// Stable in-place sort. O(n) best case, O(pow(n, 2)) worst case. /// O(1) memory (no allocator required). /// Sorts in ascending order with respect to the given `lessThan` function. -/// This can be expressed in terms of `insertionSortContext` but the glue -/// code is slightly longer than the direct implementation. -pub fn insertionSort( +pub fn insertion( comptime T: type, items: []T, context: anytype, - comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, ) void { - var i: usize = 1; - while (i < items.len) : (i += 1) { - const x = items[i]; - var j: usize = i; - while (j > 0 and lessThan(context, x, items[j - 1])) : (j -= 1) { - items[j] = items[j - 1]; + const Context = struct { + items: []T, + sub_ctx: @TypeOf(context), + + pub fn lessThan(ctx: @This(), a: usize, b: usize) bool { + return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]); } - items[j] = x; - } + + pub fn swap(ctx: @This(), a: usize, b: usize) void { + return mem.swap(T, &ctx.items[a], &ctx.items[b]); + } + }; + insertionContext(0, items.len, Context{ .items = items, .sub_ctx = context }); } /// Stable in-place sort. O(n) best case, O(pow(n, 2)) worst case. /// O(1) memory (no allocator required). -/// Sorts in ascending order with respect to the given `context.lessThan` function. -pub fn insertionSortContext(len: usize, context: anytype) void { - var i: usize = 1; - while (i < len) : (i += 1) { - var j: usize = i; - while (j > 0 and context.lessThan(j, j - 1)) : (j -= 1) { +/// Sorts in ascending order with respect to the given `lessThan` function. 
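+/// The half-open range `[a, b)` selects the indices to sort; elements are accessed only through `context.lessThan` and `context.swap`.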
+pub fn insertionContext(a: usize, b: usize, context: anytype) void { + var i = a + 1; + while (i < b) : (i += 1) { + var j = i; + while (j > a and context.lessThan(j, j - 1)) : (j -= 1) { context.swap(j, j - 1); } } } -const Range = struct { - start: usize, - end: usize, - - fn init(start: usize, end: usize) Range { - return Range{ - .start = start, - .end = end, - }; - } - - fn length(self: Range) usize { - return self.end - self.start; - } -}; - -const Iterator = struct { - size: usize, - power_of_two: usize, - numerator: usize, - decimal: usize, - denominator: usize, - decimal_step: usize, - numerator_step: usize, - - fn init(size2: usize, min_level: usize) Iterator { - const power_of_two = math.floorPowerOfTwo(usize, size2); - const denominator = power_of_two / min_level; - return Iterator{ - .numerator = 0, - .decimal = 0, - .size = size2, - .power_of_two = power_of_two, - .denominator = denominator, - .decimal_step = size2 / denominator, - .numerator_step = size2 % denominator, - }; - } - - fn begin(self: *Iterator) void { - self.numerator = 0; - self.decimal = 0; - } - - fn nextRange(self: *Iterator) Range { - const start = self.decimal; - - self.decimal += self.decimal_step; - self.numerator += self.numerator_step; - if (self.numerator >= self.denominator) { - self.numerator -= self.denominator; - self.decimal += 1; - } - - return Range{ - .start = start, - .end = self.decimal, - }; - } - - fn finished(self: *Iterator) bool { - return self.decimal >= self.size; - } - - fn nextLevel(self: *Iterator) bool { - self.decimal_step += self.decimal_step; - self.numerator_step += self.numerator_step; - if (self.numerator_step >= self.denominator) { - self.numerator_step -= self.denominator; - self.decimal_step += 1; - } - - return (self.decimal_step < self.size); - } - - fn length(self: *Iterator) usize { - return self.decimal_step; - } -}; - -const Pull = struct { - from: usize, - to: usize, - count: usize, - range: Range, -}; - -/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case. +/// Unstable in-place sort. O(n*log(n)) best case, worst case and average case. /// O(1) memory (no allocator required). /// Sorts in ascending order with respect to the given `lessThan` function. -/// Currently implemented as block sort. 
-pub fn sort( +pub fn heap( comptime T: type, items: []T, context: anytype, - comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, ) void { + const Context = struct { + items: []T, + sub_ctx: @TypeOf(context), - // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c - var cache: [512]T = undefined; - - if (items.len < 4) { - if (items.len == 3) { - // hard coded insertion sort - if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]); - if (lessThan(context, items[2], items[1])) { - mem.swap(T, &items[1], &items[2]); - if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]); - } - } else if (items.len == 2) { - if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]); + pub fn lessThan(ctx: @This(), a: usize, b: usize) bool { + return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]); } - return; - } - - // sort groups of 4-8 items at a time using an unstable sorting network, - // but keep track of the original item orders to force it to be stable - // http://pages.ripco.net/~jgamble/nw.html - var iterator = Iterator.init(items.len, 4); - while (!iterator.finished()) { - var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 }; - const range = iterator.nextRange(); - - const sliced_items = items[range.start..]; - switch (range.length()) { - 8 => { - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - swap(T, sliced_items, context, lessThan, &order, 4, 5); - swap(T, sliced_items, context, lessThan, &order, 6, 7); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 4, 6); - swap(T, sliced_items, context, lessThan, &order, 5, 7); - swap(T, sliced_items, context, lessThan, &order, 1, 2); - swap(T, sliced_items, context, lessThan, &order, 5, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 4); - swap(T, sliced_items, context, lessThan, &order, 3, 7); - swap(T, sliced_items, context, lessThan, &order, 1, 5); - swap(T, sliced_items, context, lessThan, &order, 2, 6); - swap(T, sliced_items, context, lessThan, &order, 1, 4); - swap(T, sliced_items, context, lessThan, &order, 3, 6); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 3, 5); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - }, - 7 => { - swap(T, sliced_items, context, lessThan, &order, 1, 2); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - swap(T, sliced_items, context, lessThan, &order, 5, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 3, 5); - swap(T, sliced_items, context, lessThan, &order, 4, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 4, 5); - swap(T, sliced_items, context, lessThan, &order, 2, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 4); - swap(T, sliced_items, context, lessThan, &order, 1, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 3); - swap(T, sliced_items, context, lessThan, &order, 2, 5); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - }, - 6 => { - swap(T, sliced_items, context, lessThan, 
&order, 1, 2); - swap(T, sliced_items, context, lessThan, &order, 4, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 3, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - }, - 5 => { - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 4); - swap(T, sliced_items, context, lessThan, &order, 0, 3); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 2); - }, - 4 => { - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 2); - }, - else => {}, - } - } - if (items.len < 8) return; - - // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc. - while (true) { - // if every A and B block will fit into the cache, use a special branch - // specifically for merging with the cache - // (we use < rather than <= since the block size might be one more than - // iterator.length()) - if (iterator.length() < cache.len) { - // if four subarrays fit into the cache, it's faster to merge both - // pairs of subarrays into the cache, - // then merge the two merged subarrays from the cache back into the original array - if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) { - iterator.begin(); - while (!iterator.finished()) { - // merge A1 and B1 into the cache - var A1 = iterator.nextRange(); - var B1 = iterator.nextRange(); - var A2 = iterator.nextRange(); - var B2 = iterator.nextRange(); - - if (lessThan(context, items[B1.end - 1], items[A1.start])) { - // the two ranges are in reverse order, so copy them in reverse order into the cache - const a1_items = items[A1.start..A1.end]; - @memcpy(cache[B1.length()..][0..a1_items.len], a1_items); - const b1_items = items[B1.start..B1.end]; - @memcpy(cache[0..b1_items.len], b1_items); - } else if (lessThan(context, items[B1.start], items[A1.end - 1])) { - // these two ranges weren't already in order, so merge them into the cache - mergeInto(T, items, A1, B1, context, lessThan, cache[0..]); - } else { - // if A1, B1, A2, and B2 are all in order, skip doing anything else - if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue; - - // copy A1 and B1 into the cache in the same order - const a1_items = items[A1.start..A1.end]; - @memcpy(cache[0..a1_items.len], a1_items); - const b1_items = items[B1.start..B1.end]; - @memcpy(cache[A1.length()..][0..b1_items.len], b1_items); - } - A1 = Range.init(A1.start, B1.end); - - // merge A2 and B2 into the cache - if (lessThan(context, items[B2.end - 1], 
items[A2.start])) { - // the two ranges are in reverse order, so copy them in reverse order into the cache - const a2_items = items[A2.start..A2.end]; - @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items); - const b2_items = items[B2.start..B2.end]; - @memcpy(cache[A1.length()..][0..b2_items.len], b2_items); - } else if (lessThan(context, items[B2.start], items[A2.end - 1])) { - // these two ranges weren't already in order, so merge them into the cache - mergeInto(T, items, A2, B2, context, lessThan, cache[A1.length()..]); - } else { - // copy A2 and B2 into the cache in the same order - const a2_items = items[A2.start..A2.end]; - @memcpy(cache[A1.length()..][0..a2_items.len], a2_items); - const b2_items = items[B2.start..B2.end]; - @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items); - } - A2 = Range.init(A2.start, B2.end); - - // merge A1 and A2 from the cache into the items - const A3 = Range.init(0, A1.length()); - const B3 = Range.init(A1.length(), A1.length() + A2.length()); - - if (lessThan(context, cache[B3.end - 1], cache[A3.start])) { - // the two ranges are in reverse order, so copy them in reverse order into the items - const a3_items = cache[A3.start..A3.end]; - @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items); - const b3_items = cache[B3.start..B3.end]; - @memcpy(items[A1.start..][0..b3_items.len], b3_items); - } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) { - // these two ranges weren't already in order, so merge them back into the items - mergeInto(T, cache[0..], A3, B3, context, lessThan, items[A1.start..]); - } else { - // copy A3 and B3 into the items in the same order - const a3_items = cache[A3.start..A3.end]; - @memcpy(items[A1.start..][0..a3_items.len], a3_items); - const b3_items = cache[B3.start..B3.end]; - @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items); - } - } - - // we merged two levels at the same time, so we're done with this level already - // (iterator.nextLevel() is called again at the bottom of this outer merge loop) - _ = iterator.nextLevel(); - } else { - iterator.begin(); - while (!iterator.finished()) { - var A = iterator.nextRange(); - var B = iterator.nextRange(); - - if (lessThan(context, items[B.end - 1], items[A.start])) { - // the two ranges are in reverse order, so a simple rotation should fix it - mem.rotate(T, items[A.start..B.end], A.length()); - } else if (lessThan(context, items[B.start], items[A.end - 1])) { - // these two ranges weren't already in order, so we'll need to merge them! - const a_items = items[A.start..A.end]; - @memcpy(cache[0..a_items.len], a_items); - mergeExternal(T, items, A, B, context, lessThan, cache[0..]); - } - } - } - } else { - // this is where the in-place merge logic starts! - // 1. pull out two internal buffers each containing √A unique values - // 1a. adjust block_size and buffer_size if we couldn't find enough unique values - // 2. loop over the A and B subarrays within this level of the merge sort - // 3. break A and B into blocks of size 'block_size' - // 4. "tag" each of the A blocks with values from the first internal buffer - // 5. roll the A blocks through the B blocks and drop/rotate them where they belong - // 6. merge each A block with any B values that follow, using the cache or the second internal buffer - // 7. sort the second internal buffer if it exists - // 8. 
redistribute the two internal buffers back into the items - var block_size: usize = math.sqrt(iterator.length()); - var buffer_size = iterator.length() / block_size + 1; - - // as an optimization, we really only need to pull out the internal buffers once for each level of merges - // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level - var A: Range = undefined; - var B: Range = undefined; - var index: usize = 0; - var last: usize = 0; - var count: usize = 0; - var find: usize = 0; - var start: usize = 0; - var pull_index: usize = 0; - var pull = [_]Pull{ - Pull{ - .from = 0, - .to = 0, - .count = 0, - .range = Range.init(0, 0), - }, - Pull{ - .from = 0, - .to = 0, - .count = 0, - .range = Range.init(0, 0), - }, - }; - - var buffer1 = Range.init(0, 0); - var buffer2 = Range.init(0, 0); - - // find two internal buffers of size 'buffer_size' each - find = buffer_size + buffer_size; - var find_separately = false; - - if (block_size <= cache.len) { - // if every A block fits into the cache then we won't need the second internal buffer, - // so we really only need to find 'buffer_size' unique values - find = buffer_size; - } else if (find > iterator.length()) { - // we can't fit both buffers into the same A or B subarray, so find two buffers separately - find = buffer_size; - find_separately = true; - } - - // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each), - // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values, - // OR if we couldn't find that many unique values, we need the largest possible buffer we can get - - // in the case where it couldn't find a single buffer of at least √A unique values, - // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace) - iterator.begin(); - while (!iterator.finished()) { - A = iterator.nextRange(); - B = iterator.nextRange(); - - // just store information about where the values will be pulled from and to, - // as well as how many values there are, to create the two internal buffers - - // check A for the number of unique values we need to fill an internal buffer - // these values will be pulled out to the start of A - last = A.start; - count = 1; - while (count < find) : ({ - last = index; - count += 1; - }) { - index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), context, lessThan, find - count); - if (index == A.end) break; - } - index = last; - - if (count >= buffer_size) { - // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = A.start, - }; - pull_index = 1; - - if (count == buffer_size + buffer_size) { - // we were able to find a single contiguous section containing 2√A unique values, - // so this section can be used to contain both of the internal buffers we'll need - buffer1 = Range.init(A.start, A.start + buffer_size); - buffer2 = Range.init(A.start + buffer_size, A.start + count); - break; - } else if (find == buffer_size + buffer_size) { - // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, - // so we still need to find a second separate buffer of at least √A unique values - buffer1 = Range.init(A.start, A.start + count); - find = buffer_size; - } else if (block_size <= 
cache.len) { - // we found the first and only internal buffer that we need, so we're done! - buffer1 = Range.init(A.start, A.start + count); - break; - } else if (find_separately) { - // found one buffer, but now find the other one - buffer1 = Range.init(A.start, A.start + count); - find_separately = false; - } else { - // we found a second buffer in an 'A' subarray containing √A unique values, so we're done! - buffer2 = Range.init(A.start, A.start + count); - break; - } - } else if (pull_index == 0 and count > buffer1.length()) { - // keep track of the largest buffer we were able to find - buffer1 = Range.init(A.start, A.start + count); - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = A.start, - }; - } - - // check B for the number of unique values we need to fill an internal buffer - // these values will be pulled out to the end of B - last = B.end - 1; - count = 1; - while (count < find) : ({ - last = index - 1; - count += 1; - }) { - index = findFirstBackward(T, items, items[last], Range.init(B.start, last), context, lessThan, find - count); - if (index == B.start) break; - } - index = last; - if (count >= buffer_size) { - // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffe - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = B.end, - }; - pull_index = 1; - - if (count == buffer_size + buffer_size) { - // we were able to find a single contiguous section containing 2√A unique values, - // so this section can be used to contain both of the internal buffers we'll need - buffer1 = Range.init(B.end - count, B.end - buffer_size); - buffer2 = Range.init(B.end - buffer_size, B.end); - break; - } else if (find == buffer_size + buffer_size) { - // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, - // so we still need to find a second separate buffer of at least √A unique values - buffer1 = Range.init(B.end - count, B.end); - find = buffer_size; - } else if (block_size <= cache.len) { - // we found the first and only internal buffer that we need, so we're done! - buffer1 = Range.init(B.end - count, B.end); - break; - } else if (find_separately) { - // found one buffer, but now find the other one - buffer1 = Range.init(B.end - count, B.end); - find_separately = false; - } else { - // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray, - // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2 - if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count; - - // we found a second buffer in an 'B' subarray containing √A unique values, so we're done! 
- buffer2 = Range.init(B.end - count, B.end); - break; - } - } else if (pull_index == 0 and count > buffer1.length()) { - // keep track of the largest buffer we were able to find - buffer1 = Range.init(B.end - count, B.end); - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = B.end, - }; - } - } - - // pull out the two ranges so we can use them as internal buffers - pull_index = 0; - while (pull_index < 2) : (pull_index += 1) { - const length = pull[pull_index].count; - - if (pull[pull_index].to < pull[pull_index].from) { - // we're pulling the values out to the left, which means the start of an A subarray - index = pull[pull_index].from; - count = 1; - while (count < length) : (count += 1) { - index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), context, lessThan, length - count); - const range = Range.init(index + 1, pull[pull_index].from + 1); - mem.rotate(T, items[range.start..range.end], range.length() - count); - pull[pull_index].from = index + count; - } - } else if (pull[pull_index].to > pull[pull_index].from) { - // we're pulling values out to the right, which means the end of a B subarray - index = pull[pull_index].from + 1; - count = 1; - while (count < length) : (count += 1) { - index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), context, lessThan, length - count); - const range = Range.init(pull[pull_index].from, index - 1); - mem.rotate(T, items[range.start..range.end], count); - pull[pull_index].from = index - 1 - count; - } - } - } - - // adjust block_size and buffer_size based on the values we were able to pull out - buffer_size = buffer1.length(); - block_size = iterator.length() / buffer_size + 1; - - // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks, - // so this was originally here to test the math for adjusting block_size above - // assert((iterator.length() + 1)/block_size <= buffer_size); - - // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort! - iterator.begin(); - while (!iterator.finished()) { - A = iterator.nextRange(); - B = iterator.nextRange(); - - // remove any parts of A or B that are being used by the internal buffers - start = A.start; - if (start == pull[0].range.start) { - if (pull[0].from > pull[0].to) { - A.start += pull[0].count; - - // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge - // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4, - // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal - if (A.length() == 0) continue; - } else if (pull[0].from < pull[0].to) { - B.end -= pull[0].count; - if (B.length() == 0) continue; - } - } - if (start == pull[1].range.start) { - if (pull[1].from > pull[1].to) { - A.start += pull[1].count; - if (A.length() == 0) continue; - } else if (pull[1].from < pull[1].to) { - B.end -= pull[1].count; - if (B.length() == 0) continue; - } - } - - if (lessThan(context, items[B.end - 1], items[A.start])) { - // the two ranges are in reverse order, so a simple rotation should fix it - mem.rotate(T, items[A.start..B.end], A.length()); - } else if (lessThan(context, items[A.end], items[A.end - 1])) { - // these two ranges weren't already in order, so we'll need to merge them! 
- var findA: usize = undefined; - - // break the remainder of A into blocks. firstA is the uneven-sized first A block - var blockA = Range.init(A.start, A.end); - var firstA = Range.init(A.start, A.start + blockA.length() % block_size); - - // swap the first value of each A block with the value in buffer1 - var indexA = buffer1.start; - index = firstA.end; - while (index < blockA.end) : ({ - indexA += 1; - index += block_size; - }) { - mem.swap(T, &items[indexA], &items[index]); - } - - // start rolling the A blocks through the B blocks! - // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well - var lastA = firstA; - var lastB = Range.init(0, 0); - var blockB = Range.init(B.start, B.start + math.min(block_size, B.length())); - blockA.start += firstA.length(); - indexA = buffer1.start; - - // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it - // otherwise, if the second buffer is available, block swap the contents into that - if (lastA.length() <= cache.len) { - const last_a_items = items[lastA.start..lastA.end]; - @memcpy(cache[0..last_a_items.len], last_a_items); - } else if (buffer2.length() > 0) { - blockSwap(T, items, lastA.start, buffer2.start, lastA.length()); - } - - if (blockA.length() > 0) { - while (true) { - // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block, - // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks. - if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) { - // figure out where to split the previous B block, and rotate it at the split - const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan); - const B_remaining = lastB.end - B_split; - - // swap the minimum A block to the beginning of the rolling A blocks - var minA = blockA.start; - findA = minA + block_size; - while (findA < blockA.end) : (findA += block_size) { - if (lessThan(context, items[findA], items[minA])) { - minA = findA; - } - } - blockSwap(T, items, blockA.start, minA, block_size); - - // swap the first item of the previous A block back with its original value, which is stored in buffer1 - mem.swap(T, &items[blockA.start], &items[indexA]); - indexA += 1; - - // locally merge the previous A block with the B values that follow it - // if lastA fits into the external cache we'll use that (with MergeExternal), - // or if the second internal buffer exists we'll use that (with MergeInternal), - // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace) - - if (lastA.length() <= cache.len) { - mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan, cache[0..]); - } else if (buffer2.length() > 0) { - mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan, buffer2); - } else { - mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan); - } - - if (buffer2.length() > 0 or block_size <= cache.len) { - // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway - if (block_size <= cache.len) { - @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]); - } else { - blockSwap(T, items, blockA.start, buffer2.start, block_size); - } - - // this is equivalent to rotating, but faster - // the area 
normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it - // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs - blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining); - } else { - // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation - mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split); - } - - // update the range for the remaining A blocks, and the range remaining from the B block after it was split - lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size); - lastB = Range.init(lastA.end, lastA.end + B_remaining); - - // if there are no more A blocks remaining, this step is finished! - blockA.start += block_size; - if (blockA.length() == 0) break; - } else if (blockB.length() < block_size) { - // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation - // the cache is disabled here since it might contain the contents of the previous A block - mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start); - - lastB = Range.init(blockA.start, blockA.start + blockB.length()); - blockA.start += blockB.length(); - blockA.end += blockB.length(); - blockB.end = blockB.start; - } else { - // roll the leftmost A block to the end by swapping it with the next B block - blockSwap(T, items, blockA.start, blockB.start, block_size); - lastB = Range.init(blockA.start, blockA.start + block_size); - - blockA.start += block_size; - blockA.end += block_size; - blockB.start += block_size; - - if (blockB.end > B.end - block_size) { - blockB.end = B.end; - } else { - blockB.end += block_size; - } - } - } - } - - // merge the last A block with the remaining B values - if (lastA.length() <= cache.len) { - mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan, cache[0..]); - } else if (buffer2.length() > 0) { - mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan, buffer2); - } else { - mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan); - } - } - } - - // when we're finished with this merge step we should have the one - // or two internal buffers left over, where the second buffer is all jumbled up - // insertion sort the second buffer, then redistribute the buffers - // back into the items using the opposite process used for creating the buffer - - // while an unstable sort like quicksort could be applied here, in benchmarks - // it was consistently slightly slower than a simple insertion sort, - // even for tens of millions of items. 
this may be because insertion - // sort is quite fast when the data is already somewhat sorted, like it is here - insertionSort(T, items[buffer2.start..buffer2.end], context, lessThan); - - pull_index = 0; - while (pull_index < 2) : (pull_index += 1) { - var unique = pull[pull_index].count * 2; - if (pull[pull_index].from > pull[pull_index].to) { - // the values were pulled out to the left, so redistribute them back to the right - var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count); - while (buffer.length() > 0) { - index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), context, lessThan, unique); - const amount = index - buffer.end; - mem.rotate(T, items[buffer.start..index], buffer.length()); - buffer.start += (amount + 1); - buffer.end += amount; - unique -= 2; - } - } else if (pull[pull_index].from < pull[pull_index].to) { - // the values were pulled out to the right, so redistribute them back to the left - var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end); - while (buffer.length() > 0) { - index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), context, lessThan, unique); - const amount = buffer.start - index; - mem.rotate(T, items[index..buffer.end], amount); - buffer.start -= amount; - buffer.end -= (amount + 1); - unique -= 2; - } - } - } + pub fn swap(ctx: @This(), a: usize, b: usize) void { + return mem.swap(T, &ctx.items[a], &ctx.items[b]); } - - // double the size of each A and B subarray that will be merged in the next level - if (!iterator.nextLevel()) break; - } -} - -/// TODO currently this just calls `insertionSortContext`. The block sort implementation -/// in this file needs to be adapted to use the sort context. -pub fn sortContext(len: usize, context: anytype) void { - return insertionSortContext(len, context); -} - -// merge operation without a buffer -fn mergeInPlace( - comptime T: type, - items: []T, - A_arg: Range, - B_arg: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, -) void { - if (A_arg.length() == 0 or B_arg.length() == 0) return; - - // this just repeatedly binary searches into B and rotates A into position. - // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here, - // but I decided to stick with this because it had better situational performance - // - // (Hwang and Lin is designed for merging subarrays of very different sizes, - // but WikiSort almost always uses subarrays that are roughly the same size) - // - // normally this is incredibly suboptimal, but this function is only called - // when none of the A or B blocks in any subarray contained 2√A unique values, - // which places a hard limit on the number of times this will ACTUALLY need - // to binary search and rotate. - // - // according to my analysis the worst case is √A rotations performed on √A items - // once the constant factors are removed, which ends up being O(n) - // - // again, this is NOT a general-purpose solution – it only works well in this case! 
- // kind of like how the O(n^2) insertion sort is used in some places - - var A = A_arg; - var B = B_arg; - - while (true) { - // find the first place in B where the first item in A needs to be inserted - const mid = binaryFirst(T, items, items[A.start], B, context, lessThan); - - // rotate A into place - const amount = mid - A.end; - mem.rotate(T, items[A.start..mid], A.length()); - if (B.end == mid) break; - - // calculate the new A and B ranges - B.start = mid; - A = Range.init(A.start + amount, B.start); - A.start = binaryLast(T, items, items[A.start], A, context, lessThan); - if (A.length() == 0) break; - } -} - -// merge operation using an internal buffer -fn mergeInternal( - comptime T: type, - items: []T, - A: Range, - B: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - buffer: Range, -) void { - // whenever we find a value to add to the final array, swap it with the value that's already in that spot - // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order - var A_count: usize = 0; - var B_count: usize = 0; - var insert: usize = 0; - - if (B.length() > 0 and A.length() > 0) { - while (true) { - if (!lessThan(context, items[B.start + B_count], items[buffer.start + A_count])) { - mem.swap(T, &items[A.start + insert], &items[buffer.start + A_count]); - A_count += 1; - insert += 1; - if (A_count >= A.length()) break; - } else { - mem.swap(T, &items[A.start + insert], &items[B.start + B_count]); - B_count += 1; - insert += 1; - if (B_count >= B.length()) break; - } - } - } - - // swap the remainder of A into the final array - blockSwap(T, items, buffer.start + A_count, A.start + insert, A.length() - A_count); -} - -fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void { - var index: usize = 0; - while (index < block_size) : (index += 1) { - mem.swap(T, &items[start1 + index], &items[start2 + index]); - } -} - -// combine a linear search with a binary search to reduce the number of comparisons in situations -// where have some idea as to how many unique values there are and where the next value might be -fn findFirstForward( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.start + skip; - while (lessThan(context, items[index - 1], value)) : (index += skip) { - if (index >= range.end - skip) { - return binaryFirst(T, items, value, Range.init(index, range.end), context, lessThan); - } - } - - return binaryFirst(T, items, value, Range.init(index - skip, index), context, lessThan); -} - -fn findFirstBackward( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.end - skip; - while (index > range.start and !lessThan(context, items[index - 1], value)) : (index -= skip) { - if (index < range.start + skip) { - return binaryFirst(T, items, value, Range.init(range.start, index), context, lessThan); - } - } - - return binaryFirst(T, items, value, Range.init(index, index + skip), context, lessThan); -} - -fn findLastForward( - comptime T: type, - items: []T, - value: T, - range: Range, 
- context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.start + skip; - while (!lessThan(context, value, items[index - 1])) : (index += skip) { - if (index >= range.end - skip) { - return binaryLast(T, items, value, Range.init(index, range.end), context, lessThan); - } - } - - return binaryLast(T, items, value, Range.init(index - skip, index), context, lessThan); -} - -fn findLastBackward( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.end - skip; - while (index > range.start and lessThan(context, value, items[index - 1])) : (index -= skip) { - if (index < range.start + skip) { - return binaryLast(T, items, value, Range.init(range.start, index), context, lessThan); - } - } - - return binaryLast(T, items, value, Range.init(index, index + skip), context, lessThan); + }; + heapContext(0, items.len, Context{ .items = items, .sub_ctx = context }); } -fn binaryFirst( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, -) usize { - var curr = range.start; - var size = range.length(); - if (range.start >= range.end) return range.end; - while (size > 0) { - const offset = size % 2; - - size /= 2; - const mid_item = items[curr + size]; - if (lessThan(context, mid_item, value)) { - curr += size + offset; - } +/// Unstable in-place sort. O(n*log(n)) best case, worst case and average case. +/// O(1) memory (no allocator required). +/// Sorts in ascending order with respect to the given `lessThan` function. +pub fn heapContext(a: usize, b: usize, context: anytype) void { + // build the heap in linear time. + var i = a + (b - a) / 2; + while (i > a) : (i -= 1) { + siftDown(a, i - 1, b, context); + } - return curr; -} - -fn binaryLast( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, -) usize { - var curr = range.start; - var size = range.length(); - if (range.start >= range.end) return range.end; - while (size > 0) { - const offset = size % 2; - size /= 2; - const mid_item = items[curr + size]; - if (!lessThan(context, value, mid_item)) { - curr += size + offset; - } + // pop maximal elements from the heap.
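+    // each pass swaps the current maximum, kept at index `a`, past the end of the shrinking heap.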
+ i = b; + while (i > a) : (i -= 1) { + context.swap(a, i - 1); + siftDown(a, a, i - 1, context); + } - return curr; } -fn mergeInto( - comptime T: type, - from: []T, - A: Range, - B: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - into: []T, -) void { - var A_index: usize = A.start; - var B_index: usize = B.start; - const A_last = A.end; - const B_last = B.end; - var insert_index: usize = 0; - +fn siftDown(a: usize, root: usize, n: usize, context: anytype) void { + var node = root; while (true) { - if (!lessThan(context, from[B_index], from[A_index])) { - into[insert_index] = from[A_index]; - A_index += 1; - insert_index += 1; - if (A_index == A_last) { - // copy the remainder of B into the final array - const from_b = from[B_index..B_last]; - @memcpy(into[insert_index..][0..from_b.len], from_b); - break; - } - } else { - into[insert_index] = from[B_index]; - B_index += 1; - insert_index += 1; - if (B_index == B_last) { - // copy the remainder of A into the final array - const from_a = from[A_index..A_last]; - @memcpy(into[insert_index..][0..from_a.len], from_a); - break; - } - } - } -} - -fn mergeExternal( - comptime T: type, - items: []T, - A: Range, - B: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - cache: []T, -) void { - // A fits into the cache, so use that instead of the internal buffer - var A_index: usize = 0; - var B_index: usize = B.start; - var insert_index: usize = A.start; - const A_last = A.length(); - const B_last = B.end; + // child of `node`, offset so the heap can be rooted at any `a`. + var child = a + 2 * (node - a) + 1; + if (child >= n) break; - if (B.length() > 0 and A.length() > 0) { - while (true) { - if (!lessThan(context, items[B_index], cache[A_index])) { - items[insert_index] = cache[A_index]; - A_index += 1; - insert_index += 1; - if (A_index == A_last) break; - } else { - items[insert_index] = items[B_index]; - B_index += 1; - insert_index += 1; - if (B_index == B_last) break; - } + // choose the greater child. + if (child + 1 < n and context.lessThan(child, child + 1)) { + child += 1; } - } + // stop if the invariant holds at `node`. + if (!context.lessThan(node, child)) break; - // copy the remainder of A into the final array - const cache_a = cache[A_index..A_last]; - @memcpy(items[insert_index..][0..cache_a.len], cache_a); -} + // swap `node` with the greater child, + // move one step down, and continue sifting. + context.swap(node, child); + node = child; } } -fn swap( - comptime T: type, - items: []T, - context: anytype, - comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, - order: *[8]u8, - x: usize, - y: usize, -) void { - if (lessThan(context, items[y], items[x]) or ((order.*)[x] > (order.*)[y] and !lessThan(context, items[x], items[y]))) { - mem.swap(T, &items[x], &items[y]); - mem.swap(u8, &(order.*)[x], &(order.*)[y]); + } } -/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, comptime asc(u8))`. +/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, asc(u8))`. pub fn asc(comptime T: type) fn (void, T, T) bool { - const impl = struct { - fn inner(context: void, a: T, b: T) bool { - _ = context; + return struct { + pub fn inner(_: void, a: T, b: T) bool { return a < b; } - }; - - return impl.inner; + }.inner; } -/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, comptime desc(u8))`. +/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, desc(u8))`. 
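+/// The returned comparator takes a `void` context, so it can be passed directly to `block`, `pdq`, `insertion`, and `heap`.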
pub fn desc(comptime T: type) fn (void, T, T) bool { - const impl = struct { - fn inner(context: void, a: T, b: T) bool { - _ = context; + return struct { + pub fn inner(_: void, a: T, b: T) bool { return a > b; } - }; - - return impl.inner; + }.inner; } +const asc_u8 = asc(u8); +const asc_i32 = asc(i32); +const desc_u8 = desc(u8); +const desc_i32 = desc(i32); + +const sort_funcs = &[_]fn (comptime type, anytype, anytype, comptime anytype) void{ + block, + pdq, + insertion, + heap, +}; + +const IdAndValue = struct { + id: usize, + value: i32, + + fn lessThan(context: void, a: IdAndValue, b: IdAndValue) bool { + _ = context; + return a.value < b.value; + } +}; + test "stable sort" { - try testStableSort(); - comptime try testStableSort(); -} -fn testStableSort() !void { - var expected = [_]IdAndValue{ + const expected = [_]IdAndValue{ IdAndValue{ .id = 0, .value = 0 }, IdAndValue{ .id = 1, .value = 0 }, IdAndValue{ .id = 2, .value = 0 }, @@ -1249,6 +160,7 @@ fn testStableSort() !void { IdAndValue{ .id = 1, .value = 2 }, IdAndValue{ .id = 2, .value = 2 }, }; + var cases = [_][9]IdAndValue{ [_]IdAndValue{ IdAndValue{ .id = 0, .value = 0 }, @@ -1273,26 +185,15 @@ fn testStableSort() !void { IdAndValue{ .id = 2, .value = 0 }, }, }; + for (&cases) |*case| { - insertionSort(IdAndValue, (case.*)[0..], {}, cmpByValue); + block(IdAndValue, (case.*)[0..], {}, IdAndValue.lessThan); for (case.*, 0..) |item, i| { try testing.expect(item.id == expected[i].id); try testing.expect(item.value == expected[i].value); } } } -const IdAndValue = struct { - id: usize, - value: i32, -}; -fn cmpByValue(context: void, a: IdAndValue, b: IdAndValue) bool { - return asc_i32(context, a.value, b.value); -} - -const asc_u8 = asc(u8); -const asc_i32 = asc(i32); -const desc_u8 = desc(u8); -const desc_i32 = desc(i32); test "sort" { const u8cases = [_][]const []const u8{ @@ -1322,14 +223,6 @@ test "sort" { }, }; - for (u8cases) |case| { - var buf: [8]u8 = undefined; - const slice = buf[0..case[0].len]; - @memcpy(slice, case[0]); - sort(u8, slice, {}, asc_u8); - try testing.expect(mem.eql(u8, slice, case[1])); - } - const i32cases = [_][]const []const i32{ &[_][]const i32{ &[_]i32{}, @@ -1357,12 +250,22 @@ test "sort" { }, }; - for (i32cases) |case| { - var buf: [8]i32 = undefined; - const slice = buf[0..case[0].len]; - @memcpy(slice, case[0]); - sort(i32, slice, {}, asc_i32); - try testing.expect(mem.eql(i32, slice, case[1])); + inline for (sort_funcs) |sortFn| { + for (u8cases) |case| { + var buf: [8]u8 = undefined; + const slice = buf[0..case[0].len]; + @memcpy(slice, case[0]); + sortFn(u8, slice, {}, asc_u8); + try testing.expect(mem.eql(u8, slice, case[1])); + } + + for (i32cases) |case| { + var buf: [8]i32 = undefined; + const slice = buf[0..case[0].len]; + @memcpy(slice, case[0]); + sortFn(i32, slice, {}, asc_i32); + try testing.expect(mem.eql(i32, slice, case[1])); + } } } @@ -1394,53 +297,139 @@ test "sort descending" { }, }; - for (rev_cases) |case| { - var buf: [8]i32 = undefined; - const slice = buf[0..case[0].len]; - @memcpy(slice, case[0]); - sort(i32, slice, {}, desc_i32); - try testing.expect(mem.eql(i32, slice, case[1])); + inline for (sort_funcs) |sortFn| { + for (rev_cases) |case| { + var buf: [8]i32 = undefined; + const slice = buf[0..case[0].len]; + @memcpy(slice, case[0]); + sortFn(i32, slice, {}, desc_i32); + try testing.expect(mem.eql(i32, slice, case[1])); + } } } -test "another sort case" { - var arr = [_]i32{ 5, 3, 1, 2, 4 }; - sort(i32, arr[0..], {}, asc_i32); - - try testing.expect(mem.eql(i32, 
&arr, &[_]i32{ 1, 2, 3, 4, 5 })); -} - test "sort fuzz testing" { var prng = std.rand.DefaultPrng.init(0x12345678); const random = prng.random(); const test_case_count = 10; - var i: usize = 0; - while (i < test_case_count) : (i += 1) { - try fuzzTest(random); + + inline for (sort_funcs) |sortFn| { + var i: usize = 0; + while (i < test_case_count) : (i += 1) { + const array_size = random.intRangeLessThan(usize, 0, 1000); + var array = try testing.allocator.alloc(i32, array_size); + defer testing.allocator.free(array); + // populate with random data + for (array) |*item| { + item.* = random.intRangeLessThan(i32, 0, 100); + } + sortFn(i32, array, {}, asc_i32); + try testing.expect(isSorted(i32, array, {}, asc_i32)); + } } } -var fixed_buffer_mem: [100 * 1024]u8 = undefined; +pub fn binarySearch( + comptime T: type, + key: anytype, + items: []const T, + context: anytype, + comptime compareFn: fn (context: @TypeOf(context), key: @TypeOf(key), mid_item: T) math.Order, +) ?usize { + var left: usize = 0; + var right: usize = items.len; -fn fuzzTest(rng: std.rand.Random) !void { - const array_size = rng.intRangeLessThan(usize, 0, 1000); - var array = try testing.allocator.alloc(IdAndValue, array_size); - defer testing.allocator.free(array); - // populate with random data - for (array, 0..) |*item, index| { - item.id = index; - item.value = rng.intRangeLessThan(i32, 0, 100); + while (left < right) { + // Avoid overflowing in the midpoint calculation + const mid = left + (right - left) / 2; + // Compare the key with the midpoint element + switch (compareFn(context, key, items[mid])) { + .eq => return mid, + .gt => left = mid + 1, + .lt => right = mid, + } } - sort(IdAndValue, array, {}, cmpByValue); - var index: usize = 1; - while (index < array.len) : (index += 1) { - if (array[index].value == array[index - 1].value) { - try testing.expect(array[index].id > array[index - 1].id); - } else { - try testing.expect(array[index].value > array[index - 1].value); + return null; +} + +test "binarySearch" { + const S = struct { + fn order_u32(context: void, lhs: u32, rhs: u32) math.Order { + _ = context; + return math.order(lhs, rhs); } - } + fn order_i32(context: void, lhs: i32, rhs: i32) math.Order { + _ = context; + return math.order(lhs, rhs); + } + }; + try testing.expectEqual( + @as(?usize, null), + binarySearch(u32, @as(u32, 1), &[_]u32{}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 0), + binarySearch(u32, @as(u32, 1), &[_]u32{1}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, null), + binarySearch(u32, @as(u32, 1), &[_]u32{0}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, null), + binarySearch(u32, @as(u32, 0), &[_]u32{1}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 4), + binarySearch(u32, @as(u32, 5), &[_]u32{ 1, 2, 3, 4, 5 }, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 0), + binarySearch(u32, @as(u32, 2), &[_]u32{ 2, 4, 8, 16, 32, 64 }, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 1), + binarySearch(i32, @as(i32, -4), &[_]i32{ -7, -4, 0, 9, 10 }, {}, S.order_i32), + ); + try testing.expectEqual( + @as(?usize, 3), + binarySearch(i32, @as(i32, 98), &[_]i32{ -100, -25, 2, 98, 99, 100 }, {}, S.order_i32), + ); + const R = struct { + b: i32, + e: i32, + + fn r(b: i32, e: i32) @This() { + return @This(){ .b = b, .e = e }; + } + + fn order(context: void, key: i32, mid_item: @This()) math.Order { + _ = context; + + if (key < mid_item.b) { + return .lt; + } + + if (key > mid_item.e) { + return 
.gt; + } + + return .eq; + } + }; + try testing.expectEqual( + @as(?usize, null), + binarySearch(R, @as(i32, -45), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), + ); + try testing.expectEqual( + @as(?usize, 2), + binarySearch(R, @as(i32, 10), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), + ); + try testing.expectEqual( + @as(?usize, 1), + binarySearch(R, @as(i32, -20), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), + ); } pub fn argMin( diff --git a/lib/std/sort/block.zig b/lib/std/sort/block.zig new file mode 100644 index 000000000000..6c1be9c6c240 --- /dev/null +++ b/lib/std/sort/block.zig @@ -0,0 +1,1066 @@ +const std = @import("../std.zig"); +const sort = std.sort; +const math = std.math; +const mem = std.mem; + +const Range = struct { + start: usize, + end: usize, + + fn init(start: usize, end: usize) Range { + return Range{ + .start = start, + .end = end, + }; + } + + fn length(self: Range) usize { + return self.end - self.start; + } +}; + +const Iterator = struct { + size: usize, + power_of_two: usize, + numerator: usize, + decimal: usize, + denominator: usize, + decimal_step: usize, + numerator_step: usize, + + fn init(size2: usize, min_level: usize) Iterator { + const power_of_two = math.floorPowerOfTwo(usize, size2); + const denominator = power_of_two / min_level; + return Iterator{ + .numerator = 0, + .decimal = 0, + .size = size2, + .power_of_two = power_of_two, + .denominator = denominator, + .decimal_step = size2 / denominator, + .numerator_step = size2 % denominator, + }; + } + + fn begin(self: *Iterator) void { + self.numerator = 0; + self.decimal = 0; + } + + fn nextRange(self: *Iterator) Range { + const start = self.decimal; + + self.decimal += self.decimal_step; + self.numerator += self.numerator_step; + if (self.numerator >= self.denominator) { + self.numerator -= self.denominator; + self.decimal += 1; + } + + return Range{ + .start = start, + .end = self.decimal, + }; + } + + fn finished(self: *Iterator) bool { + return self.decimal >= self.size; + } + + fn nextLevel(self: *Iterator) bool { + self.decimal_step += self.decimal_step; + self.numerator_step += self.numerator_step; + if (self.numerator_step >= self.denominator) { + self.numerator_step -= self.denominator; + self.decimal_step += 1; + } + + return (self.decimal_step < self.size); + } + + fn length(self: *Iterator) usize { + return self.decimal_step; + } +}; + +const Pull = struct { + from: usize, + to: usize, + count: usize, + range: Range, +}; + +/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case. +/// O(1) memory (no allocator required). +/// Sorts in ascending order with respect to the given `lessThan` function. 
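+///
+/// A minimal usage sketch (hypothetical values; `asc` is the comparator
+/// helper from `std.sort`):
+///
+///     var data = [_]i32{ 3, 1, 2 };
+///     block(i32, &data, {}, std.sort.asc(i32));
+///     // data is now .{ 1, 2, 3 }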
+///
+/// NOTE: the algorithm only works when the comparison is less-than or greater-than
+/// (See https://github.com/ziglang/zig/issues/8289)
+pub fn block(
+    comptime T: type,
+    items: []T,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) void {
+
+    // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c
+    var cache: [512]T = undefined;
+
+    if (items.len < 4) {
+        if (items.len == 3) {
+            // hard coded insertion sort
+            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+            if (lessThan(context, items[2], items[1])) {
+                mem.swap(T, &items[1], &items[2]);
+                if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+            }
+        } else if (items.len == 2) {
+            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+        }
+        return;
+    }
+
+    // sort groups of 4-8 items at a time using an unstable sorting network,
+    // but keep track of the original item orders to force it to be stable
+    // http://pages.ripco.net/~jgamble/nw.html
+    var iterator = Iterator.init(items.len, 4);
+    while (!iterator.finished()) {
+        var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 };
+        const range = iterator.nextRange();
+
+        const sliced_items = items[range.start..];
+        switch (range.length()) {
+            8 => {
+                swap(T, sliced_items, &order, 0, 1, context, lessThan);
+                swap(T, sliced_items, &order, 2, 3, context, lessThan);
+                swap(T, sliced_items, &order, 4, 5, context, lessThan);
+                swap(T, sliced_items, &order, 6, 7, context, lessThan);
+                swap(T, sliced_items, &order, 0, 2, context, lessThan);
+                swap(T, sliced_items, &order, 1, 3, context, lessThan);
+                swap(T, sliced_items, &order, 4, 6, context, lessThan);
+                swap(T, sliced_items, &order, 5, 7, context, lessThan);
+                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(T, sliced_items, &order, 5, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 4, context, lessThan);
+                swap(T, sliced_items, &order, 3, 7, context, lessThan);
+                swap(T, sliced_items, &order, 1, 5, context, lessThan);
+                swap(T, sliced_items, &order, 2, 6, context, lessThan);
+                swap(T, sliced_items, &order, 1, 4, context, lessThan);
+                swap(T, sliced_items, &order, 3, 6, context, lessThan);
+                swap(T, sliced_items, &order, 2, 4, context, lessThan);
+                swap(T, sliced_items, &order, 3, 5, context, lessThan);
+                swap(T, sliced_items, &order, 3, 4, context, lessThan);
+            },
+            7 => {
+                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(T, sliced_items, &order, 3, 4, context, lessThan);
+                swap(T, sliced_items, &order, 5, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 2, context, lessThan);
+                swap(T, sliced_items, &order, 3, 5, context, lessThan);
+                swap(T, sliced_items, &order, 4, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 1, context, lessThan);
+                swap(T, sliced_items, &order, 4, 5, context, lessThan);
+                swap(T, sliced_items, &order, 2, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 4, context, lessThan);
+                swap(T, sliced_items, &order, 1, 5, context, lessThan);
+                swap(T, sliced_items, &order, 0, 3, context, lessThan);
+                swap(T, sliced_items, &order, 2, 5, context, lessThan);
+                swap(T, sliced_items, &order, 1, 3, context, lessThan);
+                swap(T, sliced_items, &order, 2, 4, context, lessThan);
+                swap(T, sliced_items, &order, 2, 3, context, lessThan);
+            },
+            6 => {
+                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(T, sliced_items, &order, 4, 5, context, lessThan);
+                swap(T, sliced_items, &order, 0, 2, context, lessThan);
+ swap(T, sliced_items, &order, 3, 5, context, lessThan); + swap(T, sliced_items, &order, 0, 1, context, lessThan); + swap(T, sliced_items, &order, 3, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 5, context, lessThan); + swap(T, sliced_items, &order, 0, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 4, context, lessThan); + swap(T, sliced_items, &order, 1, 3, context, lessThan); + swap(T, sliced_items, &order, 2, 3, context, lessThan); + }, + 5 => { + swap(T, sliced_items, &order, 0, 1, context, lessThan); + swap(T, sliced_items, &order, 3, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 4, context, lessThan); + swap(T, sliced_items, &order, 0, 3, context, lessThan); + swap(T, sliced_items, &order, 0, 2, context, lessThan); + swap(T, sliced_items, &order, 1, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 2, context, lessThan); + }, + 4 => { + swap(T, sliced_items, &order, 0, 1, context, lessThan); + swap(T, sliced_items, &order, 2, 3, context, lessThan); + swap(T, sliced_items, &order, 0, 2, context, lessThan); + swap(T, sliced_items, &order, 1, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 2, context, lessThan); + }, + else => {}, + } + } + if (items.len < 8) return; + + // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc. + while (true) { + // if every A and B block will fit into the cache, use a special branch + // specifically for merging with the cache + // (we use < rather than <= since the block size might be one more than + // iterator.length()) + if (iterator.length() < cache.len) { + // if four subarrays fit into the cache, it's faster to merge both + // pairs of subarrays into the cache, + // then merge the two merged subarrays from the cache back into the original array + if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) { + iterator.begin(); + while (!iterator.finished()) { + // merge A1 and B1 into the cache + var A1 = iterator.nextRange(); + var B1 = iterator.nextRange(); + var A2 = iterator.nextRange(); + var B2 = iterator.nextRange(); + + if (lessThan(context, items[B1.end - 1], items[A1.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the cache + const a1_items = items[A1.start..A1.end]; + @memcpy(cache[B1.length()..][0..a1_items.len], a1_items); + const b1_items = items[B1.start..B1.end]; + @memcpy(cache[0..b1_items.len], b1_items); + } else if (lessThan(context, items[B1.start], items[A1.end - 1])) { + // these two ranges weren't already in order, so merge them into the cache + mergeInto(T, items, A1, B1, cache[0..], context, lessThan); + } else { + // if A1, B1, A2, and B2 are all in order, skip doing anything else + if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue; + + // copy A1 and B1 into the cache in the same order + const a1_items = items[A1.start..A1.end]; + @memcpy(cache[0..a1_items.len], a1_items); + const b1_items = items[B1.start..B1.end]; + @memcpy(cache[A1.length()..][0..b1_items.len], b1_items); + } + A1 = Range.init(A1.start, B1.end); + + // merge A2 and B2 into the cache + if (lessThan(context, items[B2.end - 1], items[A2.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the cache + const a2_items = 
items[A2.start..A2.end]; + @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items); + const b2_items = items[B2.start..B2.end]; + @memcpy(cache[A1.length()..][0..b2_items.len], b2_items); + } else if (lessThan(context, items[B2.start], items[A2.end - 1])) { + // these two ranges weren't already in order, so merge them into the cache + mergeInto(T, items, A2, B2, cache[A1.length()..], context, lessThan); + } else { + // copy A2 and B2 into the cache in the same order + const a2_items = items[A2.start..A2.end]; + @memcpy(cache[A1.length()..][0..a2_items.len], a2_items); + const b2_items = items[B2.start..B2.end]; + @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items); + } + A2 = Range.init(A2.start, B2.end); + + // merge A1 and A2 from the cache into the items + const A3 = Range.init(0, A1.length()); + const B3 = Range.init(A1.length(), A1.length() + A2.length()); + + if (lessThan(context, cache[B3.end - 1], cache[A3.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the items + const a3_items = cache[A3.start..A3.end]; + @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items); + const b3_items = cache[B3.start..B3.end]; + @memcpy(items[A1.start..][0..b3_items.len], b3_items); + } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) { + // these two ranges weren't already in order, so merge them back into the items + mergeInto(T, cache[0..], A3, B3, items[A1.start..], context, lessThan); + } else { + // copy A3 and B3 into the items in the same order + const a3_items = cache[A3.start..A3.end]; + @memcpy(items[A1.start..][0..a3_items.len], a3_items); + const b3_items = cache[B3.start..B3.end]; + @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items); + } + } + + // we merged two levels at the same time, so we're done with this level already + // (iterator.nextLevel() is called again at the bottom of this outer merge loop) + _ = iterator.nextLevel(); + } else { + iterator.begin(); + while (!iterator.finished()) { + var A = iterator.nextRange(); + var B = iterator.nextRange(); + + if (lessThan(context, items[B.end - 1], items[A.start])) { + // the two ranges are in reverse order, so a simple rotation should fix it + mem.rotate(T, items[A.start..B.end], A.length()); + } else if (lessThan(context, items[B.start], items[A.end - 1])) { + // these two ranges weren't already in order, so we'll need to merge them! + const a_items = items[A.start..A.end]; + @memcpy(cache[0..a_items.len], a_items); + mergeExternal(T, items, A, B, cache[0..], context, lessThan); + } + } + } + } else { + // this is where the in-place merge logic starts! + // 1. pull out two internal buffers each containing √A unique values + // 1a. adjust block_size and buffer_size if we couldn't find enough unique values + // 2. loop over the A and B subarrays within this level of the merge sort + // 3. break A and B into blocks of size 'block_size' + // 4. "tag" each of the A blocks with values from the first internal buffer + // 5. roll the A blocks through the B blocks and drop/rotate them where they belong + // 6. merge each A block with any B values that follow, using the cache or the second internal buffer + // 7. sort the second internal buffer if it exists + // 8. 
redistribute the two internal buffers back into the items + var block_size: usize = math.sqrt(iterator.length()); + var buffer_size = iterator.length() / block_size + 1; + + // as an optimization, we really only need to pull out the internal buffers once for each level of merges + // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level + var A: Range = undefined; + var B: Range = undefined; + var index: usize = 0; + var last: usize = 0; + var count: usize = 0; + var find: usize = 0; + var start: usize = 0; + var pull_index: usize = 0; + var pull = [_]Pull{ + Pull{ + .from = 0, + .to = 0, + .count = 0, + .range = Range.init(0, 0), + }, + Pull{ + .from = 0, + .to = 0, + .count = 0, + .range = Range.init(0, 0), + }, + }; + + var buffer1 = Range.init(0, 0); + var buffer2 = Range.init(0, 0); + + // find two internal buffers of size 'buffer_size' each + find = buffer_size + buffer_size; + var find_separately = false; + + if (block_size <= cache.len) { + // if every A block fits into the cache then we won't need the second internal buffer, + // so we really only need to find 'buffer_size' unique values + find = buffer_size; + } else if (find > iterator.length()) { + // we can't fit both buffers into the same A or B subarray, so find two buffers separately + find = buffer_size; + find_separately = true; + } + + // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each), + // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values, + // OR if we couldn't find that many unique values, we need the largest possible buffer we can get + + // in the case where it couldn't find a single buffer of at least √A unique values, + // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace) + iterator.begin(); + while (!iterator.finished()) { + A = iterator.nextRange(); + B = iterator.nextRange(); + + // just store information about where the values will be pulled from and to, + // as well as how many values there are, to create the two internal buffers + + // check A for the number of unique values we need to fill an internal buffer + // these values will be pulled out to the start of A + last = A.start; + count = 1; + while (count < find) : ({ + last = index; + count += 1; + }) { + index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), find - count, context, lessThan); + if (index == A.end) break; + } + index = last; + + if (count >= buffer_size) { + // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer + pull[pull_index] = Pull{ + .range = Range.init(A.start, B.end), + .count = count, + .from = index, + .to = A.start, + }; + pull_index = 1; + + if (count == buffer_size + buffer_size) { + // we were able to find a single contiguous section containing 2√A unique values, + // so this section can be used to contain both of the internal buffers we'll need + buffer1 = Range.init(A.start, A.start + buffer_size); + buffer2 = Range.init(A.start + buffer_size, A.start + count); + break; + } else if (find == buffer_size + buffer_size) { + // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, + // so we still need to find a second separate buffer of at least √A unique values + buffer1 = Range.init(A.start, A.start + count); + find = buffer_size; + } else if (block_size <= 
cache.len) {
+                        // we found the first and only internal buffer that we need, so we're done!
+                        buffer1 = Range.init(A.start, A.start + count);
+                        break;
+                    } else if (find_separately) {
+                        // found one buffer, but now find the other one
+                        buffer1 = Range.init(A.start, A.start + count);
+                        find_separately = false;
+                    } else {
+                        // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
+                        buffer2 = Range.init(A.start, A.start + count);
+                        break;
+                    }
+                } else if (pull_index == 0 and count > buffer1.length()) {
+                    // keep track of the largest buffer we were able to find
+                    buffer1 = Range.init(A.start, A.start + count);
+                    pull[pull_index] = Pull{
+                        .range = Range.init(A.start, B.end),
+                        .count = count,
+                        .from = index,
+                        .to = A.start,
+                    };
+                }
+
+                // check B for the number of unique values we need to fill an internal buffer
+                // these values will be pulled out to the end of B
+                last = B.end - 1;
+                count = 1;
+                while (count < find) : ({
+                    last = index - 1;
+                    count += 1;
+                }) {
+                    index = findFirstBackward(T, items, items[last], Range.init(B.start, last), find - count, context, lessThan);
+                    if (index == B.start) break;
+                }
+                index = last;
+
+                if (count >= buffer_size) {
+                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
+                    pull[pull_index] = Pull{
+                        .range = Range.init(A.start, B.end),
+                        .count = count,
+                        .from = index,
+                        .to = B.end,
+                    };
+                    pull_index = 1;
+
+                    if (count == buffer_size + buffer_size) {
+                        // we were able to find a single contiguous section containing 2√A unique values,
+                        // so this section can be used to contain both of the internal buffers we'll need
+                        buffer1 = Range.init(B.end - count, B.end - buffer_size);
+                        buffer2 = Range.init(B.end - buffer_size, B.end);
+                        break;
+                    } else if (find == buffer_size + buffer_size) {
+                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
+                        // so we still need to find a second separate buffer of at least √A unique values
+                        buffer1 = Range.init(B.end - count, B.end);
+                        find = buffer_size;
+                    } else if (block_size <= cache.len) {
+                        // we found the first and only internal buffer that we need, so we're done!
+                        buffer1 = Range.init(B.end - count, B.end);
+                        break;
+                    } else if (find_separately) {
+                        // found one buffer, but now find the other one
+                        buffer1 = Range.init(B.end - count, B.end);
+                        find_separately = false;
+                    } else {
+                        // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
+                        // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
+                        if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;
+
+                        // we found a second buffer in a 'B' subarray containing √A unique values, so we're done!
+ buffer2 = Range.init(B.end - count, B.end); + break; + } + } else if (pull_index == 0 and count > buffer1.length()) { + // keep track of the largest buffer we were able to find + buffer1 = Range.init(B.end - count, B.end); + pull[pull_index] = Pull{ + .range = Range.init(A.start, B.end), + .count = count, + .from = index, + .to = B.end, + }; + } + } + + // pull out the two ranges so we can use them as internal buffers + pull_index = 0; + while (pull_index < 2) : (pull_index += 1) { + const length = pull[pull_index].count; + + if (pull[pull_index].to < pull[pull_index].from) { + // we're pulling the values out to the left, which means the start of an A subarray + index = pull[pull_index].from; + count = 1; + while (count < length) : (count += 1) { + index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), length - count, context, lessThan); + const range = Range.init(index + 1, pull[pull_index].from + 1); + mem.rotate(T, items[range.start..range.end], range.length() - count); + pull[pull_index].from = index + count; + } + } else if (pull[pull_index].to > pull[pull_index].from) { + // we're pulling values out to the right, which means the end of a B subarray + index = pull[pull_index].from + 1; + count = 1; + while (count < length) : (count += 1) { + index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), length - count, context, lessThan); + const range = Range.init(pull[pull_index].from, index - 1); + mem.rotate(T, items[range.start..range.end], count); + pull[pull_index].from = index - 1 - count; + } + } + } + + // adjust block_size and buffer_size based on the values we were able to pull out + buffer_size = buffer1.length(); + block_size = iterator.length() / buffer_size + 1; + + // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks, + // so this was originally here to test the math for adjusting block_size above + // assert((iterator.length() + 1)/block_size <= buffer_size); + + // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort! + iterator.begin(); + while (!iterator.finished()) { + A = iterator.nextRange(); + B = iterator.nextRange(); + + // remove any parts of A or B that are being used by the internal buffers + start = A.start; + if (start == pull[0].range.start) { + if (pull[0].from > pull[0].to) { + A.start += pull[0].count; + + // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge + // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4, + // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal + if (A.length() == 0) continue; + } else if (pull[0].from < pull[0].to) { + B.end -= pull[0].count; + if (B.length() == 0) continue; + } + } + if (start == pull[1].range.start) { + if (pull[1].from > pull[1].to) { + A.start += pull[1].count; + if (A.length() == 0) continue; + } else if (pull[1].from < pull[1].to) { + B.end -= pull[1].count; + if (B.length() == 0) continue; + } + } + + if (lessThan(context, items[B.end - 1], items[A.start])) { + // the two ranges are in reverse order, so a simple rotation should fix it + mem.rotate(T, items[A.start..B.end], A.length()); + } else if (lessThan(context, items[A.end], items[A.end - 1])) { + // these two ranges weren't already in order, so we'll need to merge them! 
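+                // (the code below carries out steps 3-5 of the outline above: break the
+                // remainder of A into block_size-sized blocks, tag each A block's first
+                // value using buffer1, then roll the A blocks through the B blocks and
+                // merge each one as it drops into place.)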
+ var findA: usize = undefined; + + // break the remainder of A into blocks. firstA is the uneven-sized first A block + var blockA = Range.init(A.start, A.end); + var firstA = Range.init(A.start, A.start + blockA.length() % block_size); + + // swap the first value of each A block with the value in buffer1 + var indexA = buffer1.start; + index = firstA.end; + while (index < blockA.end) : ({ + indexA += 1; + index += block_size; + }) { + mem.swap(T, &items[indexA], &items[index]); + } + + // start rolling the A blocks through the B blocks! + // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well + var lastA = firstA; + var lastB = Range.init(0, 0); + var blockB = Range.init(B.start, B.start + math.min(block_size, B.length())); + blockA.start += firstA.length(); + indexA = buffer1.start; + + // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it + // otherwise, if the second buffer is available, block swap the contents into that + if (lastA.length() <= cache.len) { + const last_a_items = items[lastA.start..lastA.end]; + @memcpy(cache[0..last_a_items.len], last_a_items); + } else if (buffer2.length() > 0) { + blockSwap(T, items, lastA.start, buffer2.start, lastA.length()); + } + + if (blockA.length() > 0) { + while (true) { + // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block, + // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks. + if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) { + // figure out where to split the previous B block, and rotate it at the split + const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan); + const B_remaining = lastB.end - B_split; + + // swap the minimum A block to the beginning of the rolling A blocks + var minA = blockA.start; + findA = minA + block_size; + while (findA < blockA.end) : (findA += block_size) { + if (lessThan(context, items[findA], items[minA])) { + minA = findA; + } + } + blockSwap(T, items, blockA.start, minA, block_size); + + // swap the first item of the previous A block back with its original value, which is stored in buffer1 + mem.swap(T, &items[blockA.start], &items[indexA]); + indexA += 1; + + // locally merge the previous A block with the B values that follow it + // if lastA fits into the external cache we'll use that (with MergeExternal), + // or if the second internal buffer exists we'll use that (with MergeInternal), + // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace) + + if (lastA.length() <= cache.len) { + mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), cache[0..], context, lessThan); + } else if (buffer2.length() > 0) { + mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), buffer2, context, lessThan); + } else { + mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan); + } + + if (buffer2.length() > 0 or block_size <= cache.len) { + // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway + if (block_size <= cache.len) { + @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]); + } else { + blockSwap(T, items, blockA.start, buffer2.start, block_size); + } + + // this is equivalent to rotating, but faster + // the area 
normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it + // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs + blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining); + } else { + // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation + mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split); + } + + // update the range for the remaining A blocks, and the range remaining from the B block after it was split + lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size); + lastB = Range.init(lastA.end, lastA.end + B_remaining); + + // if there are no more A blocks remaining, this step is finished! + blockA.start += block_size; + if (blockA.length() == 0) break; + } else if (blockB.length() < block_size) { + // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation + // the cache is disabled here since it might contain the contents of the previous A block + mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start); + + lastB = Range.init(blockA.start, blockA.start + blockB.length()); + blockA.start += blockB.length(); + blockA.end += blockB.length(); + blockB.end = blockB.start; + } else { + // roll the leftmost A block to the end by swapping it with the next B block + blockSwap(T, items, blockA.start, blockB.start, block_size); + lastB = Range.init(blockA.start, blockA.start + block_size); + + blockA.start += block_size; + blockA.end += block_size; + blockB.start += block_size; + + if (blockB.end > B.end - block_size) { + blockB.end = B.end; + } else { + blockB.end += block_size; + } + } + } + } + + // merge the last A block with the remaining B values + if (lastA.length() <= cache.len) { + mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), cache[0..], context, lessThan); + } else if (buffer2.length() > 0) { + mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), buffer2, context, lessThan); + } else { + mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan); + } + } + } + + // when we're finished with this merge step we should have the one + // or two internal buffers left over, where the second buffer is all jumbled up + // insertion sort the second buffer, then redistribute the buffers + // back into the items using the opposite process used for creating the buffer + + // while an unstable sort like quicksort could be applied here, in benchmarks + // it was consistently slightly slower than a simple insertion sort, + // even for tens of millions of items. 
this may be because insertion + // sort is quite fast when the data is already somewhat sorted, like it is here + sort.insertion(T, items[buffer2.start..buffer2.end], context, lessThan); + + pull_index = 0; + while (pull_index < 2) : (pull_index += 1) { + var unique = pull[pull_index].count * 2; + if (pull[pull_index].from > pull[pull_index].to) { + // the values were pulled out to the left, so redistribute them back to the right + var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count); + while (buffer.length() > 0) { + index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), unique, context, lessThan); + const amount = index - buffer.end; + mem.rotate(T, items[buffer.start..index], buffer.length()); + buffer.start += (amount + 1); + buffer.end += amount; + unique -= 2; + } + } else if (pull[pull_index].from < pull[pull_index].to) { + // the values were pulled out to the right, so redistribute them back to the left + var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end); + while (buffer.length() > 0) { + index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), unique, context, lessThan); + const amount = buffer.start - index; + mem.rotate(T, items[index..buffer.end], amount); + buffer.start -= amount; + buffer.end -= (amount + 1); + unique -= 2; + } + } + } + } + + // double the size of each A and B subarray that will be merged in the next level + if (!iterator.nextLevel()) break; + } +} +// merge operation without a buffer +fn mergeInPlace( + comptime T: type, + items: []T, + A_arg: Range, + B_arg: Range, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + if (A_arg.length() == 0 or B_arg.length() == 0) return; + + // this just repeatedly binary searches into B and rotates A into position. + // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here, + // but I decided to stick with this because it had better situational performance + // + // (Hwang and Lin is designed for merging subarrays of very different sizes, + // but WikiSort almost always uses subarrays that are roughly the same size) + // + // normally this is incredibly suboptimal, but this function is only called + // when none of the A or B blocks in any subarray contained 2√A unique values, + // which places a hard limit on the number of times this will ACTUALLY need + // to binary search and rotate. + // + // according to my analysis the worst case is √A rotations performed on √A items + // once the constant factors are removed, which ends up being O(n) + // + // again, this is NOT a general-purpose solution – it only works well in this case! 
+    // kind of like how the O(n^2) insertion sort is used in some places
+
+    var A = A_arg;
+    var B = B_arg;
+
+    while (true) {
+        // find the first place in B where the first item in A needs to be inserted
+        const mid = binaryFirst(T, items, items[A.start], B, context, lessThan);
+
+        // rotate A into place
+        const amount = mid - A.end;
+        mem.rotate(T, items[A.start..mid], A.length());
+        if (B.end == mid) break;
+
+        // calculate the new A and B ranges
+        B.start = mid;
+        A = Range.init(A.start + amount, B.start);
+        A.start = binaryLast(T, items, items[A.start], A, context, lessThan);
+        if (A.length() == 0) break;
+    }
+}
+
+// merge operation using an internal buffer
+fn mergeInternal(
+    comptime T: type,
+    items: []T,
+    A: Range,
+    B: Range,
+    buffer: Range,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) void {
+    // whenever we find a value to add to the final array, swap it with the value that's already in that spot
+    // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order
+    var A_count: usize = 0;
+    var B_count: usize = 0;
+    var insert: usize = 0;
+
+    if (B.length() > 0 and A.length() > 0) {
+        while (true) {
+            if (!lessThan(context, items[B.start + B_count], items[buffer.start + A_count])) {
+                mem.swap(T, &items[A.start + insert], &items[buffer.start + A_count]);
+                A_count += 1;
+                insert += 1;
+                if (A_count >= A.length()) break;
+            } else {
+                mem.swap(T, &items[A.start + insert], &items[B.start + B_count]);
+                B_count += 1;
+                insert += 1;
+                if (B_count >= B.length()) break;
+            }
+        }
+    }
+
+    // swap the remainder of A into the final array
+    blockSwap(T, items, buffer.start + A_count, A.start + insert, A.length() - A_count);
+}
+
+fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void {
+    var index: usize = 0;
+    while (index < block_size) : (index += 1) {
+        mem.swap(T, &items[start1 + index], &items[start2 + index]);
+    }
+}
+
+// combine a linear search with a binary search to reduce the number of comparisons in situations
+// where we have some idea as to how many unique values there are and where the next value might be
+fn findFirstForward(
+    comptime T: type,
+    items: []T,
+    value: T,
+    range: Range,
+    unique: usize,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) usize {
+    if (range.length() == 0) return range.start;
+    const skip = math.max(range.length() / unique, @as(usize, 1));
+
+    var index = range.start + skip;
+    while (lessThan(context, items[index - 1], value)) : (index += skip) {
+        if (index >= range.end - skip) {
+            return binaryFirst(T, items, value, Range.init(index, range.end), context, lessThan);
+        }
+    }
+
+    return binaryFirst(T, items, value, Range.init(index - skip, index), context, lessThan);
+}
+
+fn findFirstBackward(
+    comptime T: type,
+    items: []T,
+    value: T,
+    range: Range,
+    unique: usize,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) usize {
+    if (range.length() == 0) return range.start;
+    const skip = math.max(range.length() / unique, @as(usize, 1));
+
+    var index = range.end - skip;
+    while (index > range.start and !lessThan(context, items[index - 1], value)) : (index -= skip) {
+        if (index < range.start + skip) {
+            return binaryFirst(T, items, value, Range.init(range.start, index), context, lessThan);
+        }
+    }
+
+    return binaryFirst(T, items, value, Range.init(index, index + skip), context, lessThan);
+}
+
+fn findLastForward(
+    comptime T: type,
+    items:
[]T, + value: T, + range: Range, + unique: usize, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + if (range.length() == 0) return range.start; + const skip = math.max(range.length() / unique, @as(usize, 1)); + + var index = range.start + skip; + while (!lessThan(context, value, items[index - 1])) : (index += skip) { + if (index >= range.end - skip) { + return binaryLast(T, items, value, Range.init(index, range.end), context, lessThan); + } + } + + return binaryLast(T, items, value, Range.init(index - skip, index), context, lessThan); +} + +fn findLastBackward( + comptime T: type, + items: []T, + value: T, + range: Range, + unique: usize, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + if (range.length() == 0) return range.start; + const skip = math.max(range.length() / unique, @as(usize, 1)); + + var index = range.end - skip; + while (index > range.start and lessThan(context, value, items[index - 1])) : (index -= skip) { + if (index < range.start + skip) { + return binaryLast(T, items, value, Range.init(range.start, index), context, lessThan); + } + } + + return binaryLast(T, items, value, Range.init(index, index + skip), context, lessThan); +} + +fn binaryFirst( + comptime T: type, + items: []T, + value: T, + range: Range, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + var curr = range.start; + var size = range.length(); + if (range.start >= range.end) return range.end; + while (size > 0) { + const offset = size % 2; + + size /= 2; + const mid_item = items[curr + size]; + if (lessThan(context, mid_item, value)) { + curr += size + offset; + } + } + return curr; +} + +fn binaryLast( + comptime T: type, + items: []T, + value: T, + range: Range, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + var curr = range.start; + var size = range.length(); + if (range.start >= range.end) return range.end; + while (size > 0) { + const offset = size % 2; + + size /= 2; + const mid_item = items[curr + size]; + if (!lessThan(context, value, mid_item)) { + curr += size + offset; + } + } + return curr; +} + +fn mergeInto( + comptime T: type, + from: []T, + A: Range, + B: Range, + into: []T, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + var A_index: usize = A.start; + var B_index: usize = B.start; + const A_last = A.end; + const B_last = B.end; + var insert_index: usize = 0; + + while (true) { + if (!lessThan(context, from[B_index], from[A_index])) { + into[insert_index] = from[A_index]; + A_index += 1; + insert_index += 1; + if (A_index == A_last) { + // copy the remainder of B into the final array + const from_b = from[B_index..B_last]; + @memcpy(into[insert_index..][0..from_b.len], from_b); + break; + } + } else { + into[insert_index] = from[B_index]; + B_index += 1; + insert_index += 1; + if (B_index == B_last) { + // copy the remainder of A into the final array + const from_a = from[A_index..A_last]; + @memcpy(into[insert_index..][0..from_a.len], from_a); + break; + } + } + } +} + +fn mergeExternal( + comptime T: type, + items: []T, + A: Range, + B: Range, + cache: []T, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + // A fits into the cache, so use that instead of the internal buffer + var A_index: usize = 0; + var B_index: usize = B.start; + var insert_index: usize = A.start; + const A_last = A.length(); + 
const B_last = B.end;
+
+    if (B.length() > 0 and A.length() > 0) {
+        while (true) {
+            if (!lessThan(context, items[B_index], cache[A_index])) {
+                items[insert_index] = cache[A_index];
+                A_index += 1;
+                insert_index += 1;
+                if (A_index == A_last) break;
+            } else {
+                items[insert_index] = items[B_index];
+                B_index += 1;
+                insert_index += 1;
+                if (B_index == B_last) break;
+            }
+        }
+    }
+
+    // copy the remainder of A into the final array
+    const cache_a = cache[A_index..A_last];
+    @memcpy(items[insert_index..][0..cache_a.len], cache_a);
+}
+
+fn swap(
+    comptime T: type,
+    items: []T,
+    order: *[8]u8,
+    x: usize,
+    y: usize,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) void {
+    if (lessThan(context, items[y], items[x]) or ((order.*)[x] > (order.*)[y] and !lessThan(context, items[x], items[y]))) {
+        mem.swap(T, &items[x], &items[y]);
+        mem.swap(u8, &(order.*)[x], &(order.*)[y]);
+    }
+}
diff --git a/lib/std/sort/pdq.zig b/lib/std/sort/pdq.zig
new file mode 100644
index 000000000000..11d330047ae7
--- /dev/null
+++ b/lib/std/sort/pdq.zig
@@ -0,0 +1,335 @@
+const std = @import("../std.zig");
+const sort = std.sort;
+const mem = std.mem;
+const math = std.math;
+const testing = std.testing;
+
+/// Unstable in-place sort. O(n) best case, O(n*log(n)) worst case and average case.
+/// O(log(n)) memory (no allocator required).
+///
+/// Sorts in ascending order with respect to the given `lessThan` function.
+pub fn pdq(
+    comptime T: type,
+    items: []T,
+    context: anytype,
+    comptime lessThanFn: fn (context: @TypeOf(context), lhs: T, rhs: T) bool,
+) void {
+    const Context = struct {
+        items: []T,
+        sub_ctx: @TypeOf(context),
+
+        pub fn lessThan(ctx: @This(), a: usize, b: usize) bool {
+            return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]);
+        }
+
+        pub fn swap(ctx: @This(), a: usize, b: usize) void {
+            return mem.swap(T, &ctx.items[a], &ctx.items[b]);
+        }
+    };
+    pdqContext(0, items.len, Context{ .items = items, .sub_ctx = context });
+}
+
+const Hint = enum {
+    increasing,
+    decreasing,
+    unknown,
+};
+
+/// Unstable in-place sort. O(n) best case, O(n*log(n)) worst case and average case.
+/// O(log(n)) memory (no allocator required).
+///
+/// Sorts in ascending order with respect to the given `lessThan` function.
+pub fn pdqContext(a: usize, b: usize, context: anytype) void {
+    // slices of up to this length get sorted using insertion sort.
+    const max_insertion = 24;
+    // number of allowed imbalanced partitions before switching to heap sort.
+    const max_limit = std.math.floorPowerOfTwo(usize, b) + 1;
+
+    // set upper bound on stack memory usage.
+    const Range = struct { a: usize, b: usize, limit: usize };
+    const stack_size = math.log2(math.maxInt(usize) + 1);
+    var stack: [stack_size]Range = undefined;
+    var range = Range{ .a = a, .b = b, .limit = max_limit };
+    var top: usize = 0;
+
+    while (true) {
+        var was_balanced = true;
+        var was_partitioned = true;
+
+        while (true) {
+            const len = range.b - range.a;
+
+            // very short slices get sorted using insertion sort.
+            if (len <= max_insertion) {
+                break sort.insertionContext(range.a, range.b, context);
+            }
+
+            // if too many bad pivot choices were made, simply fall back to heapsort in order to
+            // guarantee O(n*log(n)) worst-case.
+            if (range.limit == 0) {
+                break sort.heapContext(range.a, range.b, context);
+            }
+
+            // if the last partitioning was imbalanced, try breaking patterns in the slice by shuffling
+            // some elements around. Hopefully we'll choose a better pivot this time.
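+            // (`breakPatterns`, defined below, swaps the three elements around the
+            // midpoint with pseudo-random positions to defeat adversarial inputs.)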
+            if (!was_balanced) {
+                breakPatterns(range.a, range.b, context);
+                range.limit -= 1;
+            }
+
+            // choose a pivot and try guessing whether the slice is already sorted.
+            var pivot: usize = 0;
+            var hint = chosePivot(range.a, range.b, &pivot, context);
+
+            if (hint == .decreasing) {
+                // The maximum number of swaps was performed, so items are likely
+                // in reverse order. Reverse it to make sorting faster.
+                reverseRange(range.a, range.b, context);
+                pivot = (range.b - 1) - (pivot - range.a);
+                hint = .increasing;
+            }
+
+            // if the last partitioning was decently balanced and didn't shuffle elements, and if pivot
+            // selection predicts the slice is likely already sorted...
+            if (was_balanced and was_partitioned and hint == .increasing) {
+                // try identifying several out-of-order elements and shifting them to correct
+                // positions. If the slice ends up being completely sorted, we're done.
+                if (partialInsertionSort(range.a, range.b, context)) break;
+            }
+
+            // if the chosen pivot is equal to the predecessor, then it's the smallest element in the
+            // slice. Partition the slice into elements equal to and elements greater than the pivot.
+            // This case is usually hit when the slice contains many duplicate elements.
+            if (range.a > 0 and !context.lessThan(range.a - 1, pivot)) {
+                range.a = partitionEqual(range.a, range.b, pivot, context);
+                continue;
+            }
+
+            // partition the slice.
+            var mid = pivot;
+            was_partitioned = partition(range.a, range.b, &mid, context);
+
+            const left_len = mid - range.a;
+            const right_len = range.b - mid;
+            const balanced_threshold = len / 8;
+            if (left_len < right_len) {
+                was_balanced = left_len >= balanced_threshold;
+                stack[top] = .{ .a = range.a, .b = mid, .limit = range.limit };
+                top += 1;
+                range.a = mid + 1;
+            } else {
+                was_balanced = right_len >= balanced_threshold;
+                stack[top] = .{ .a = mid + 1, .b = range.b, .limit = range.limit };
+                top += 1;
+                range.b = mid;
+            }
+        }
+
+        top = math.sub(usize, top, 1) catch break;
+        range = stack[top];
+    }
+}
+
+/// partitions `items[a..b]` into elements smaller than `items[pivot]`,
+/// followed by elements greater than or equal to `items[pivot]`.
+///
+/// sets the new pivot.
+/// returns `true` if already partitioned.
+fn partition(a: usize, b: usize, pivot: *usize, context: anytype) bool {
+    // move pivot to the first place
+    context.swap(a, pivot.*);
+
+    var i = a + 1;
+    var j = b - 1;
+
+    while (i <= j and context.lessThan(i, a)) i += 1;
+    while (i <= j and !context.lessThan(j, a)) j -= 1;
+
+    // check if items are already partitioned (no item to swap)
+    if (i > j) {
+        // put pivot back to the middle
+        context.swap(j, a);
+        pivot.* = j;
+        return true;
+    }
+
+    context.swap(i, j);
+    i += 1;
+    j -= 1;
+
+    while (true) {
+        while (i <= j and context.lessThan(i, a)) i += 1;
+        while (i <= j and !context.lessThan(j, a)) j -= 1;
+        if (i > j) break;
+
+        context.swap(i, j);
+        i += 1;
+        j -= 1;
+    }
+
+    // TODO: Enable the BlockQuicksort optimization
+
+    context.swap(j, a);
+    pivot.* = j;
+    return false;
+}
+
+/// partitions items into elements equal to `items[pivot]`
+/// followed by elements greater than `items[pivot]`.
+///
+/// it is assumed that `items[a..b]` does not contain elements smaller than `items[pivot]`.
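+///
+/// e.g. (hypothetical contents, pivot value 1): `{ 1, 3, 1, 2, 1 }` is rearranged
+/// into `{ 1, 1, 1, ... }` with the greater elements after, and the returned index
+/// is that of the first element greater than the pivot.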
+fn partitionEqual(a: usize, b: usize, pivot: usize, context: anytype) usize {
+    // move pivot to the first place
+    context.swap(a, pivot);
+
+    var i = a + 1;
+    var j = b - 1;
+
+    while (true) {
+        while (i <= j and !context.lessThan(a, i)) i += 1;
+        while (i <= j and context.lessThan(a, j)) j -= 1;
+        if (i > j) break;
+
+        context.swap(i, j);
+        i += 1;
+        j -= 1;
+    }
+
+    return i;
+}
+
+/// partially sorts a slice by shifting several out-of-order elements around.
+///
+/// returns `true` if the slice is sorted at the end. This function is `O(n)` worst-case.
+fn partialInsertionSort(a: usize, b: usize, context: anytype) bool {
+    @setCold(true);
+
+    // maximum number of adjacent out-of-order pairs that will get shifted
+    const max_steps = 5;
+    // if the slice is shorter than this, don't shift any elements
+    const shortest_shifting = 50;
+
+    var i = a + 1;
+    for (0..max_steps) |_| {
+        // find the next pair of adjacent out-of-order elements.
+        while (i < b and !context.lessThan(i, i - 1)) i += 1;
+
+        // are we done?
+        if (i == b) return true;
+
+        // don't shift elements on short arrays; that has a performance cost.
+        if (b - a < shortest_shifting) return false;
+
+        // swap the found pair of elements. This puts them in correct order.
+        context.swap(i, i - 1);
+
+        // shift the smaller element to the left.
+        if (i - a >= 2) {
+            var j = i - 1;
+            while (j >= 1) : (j -= 1) {
+                if (!context.lessThan(j, j - 1)) break;
+                context.swap(j, j - 1);
+            }
+        }
+
+        // shift the greater element to the right.
+        if (b - i >= 2) {
+            var j = i + 1;
+            while (j < b) : (j += 1) {
+                if (!context.lessThan(j, j - 1)) break;
+                context.swap(j, j - 1);
+            }
+        }
+    }
+
+    return false;
+}
+
+fn breakPatterns(a: usize, b: usize, context: anytype) void {
+    @setCold(true);
+
+    const len = b - a;
+    if (len < 8) return;
+
+    var rand: u64 = len;
+    const modulus = math.ceilPowerOfTwoAssert(u64, len);
+
+    var i = a + (len / 4) * 2 - 1;
+    while (i <= a + (len / 4) * 2 + 1) : (i += 1) {
+        var other: usize = xorshift64(&rand) & (modulus - 1);
+        if (other >= len) other -= len;
+        context.swap(i, a + other);
+    }
+}
+
+fn xorshift64(a: *u64) u64 {
+    var x = a.*;
+    x ^= x << 13;
+    x ^= x >> 7;
+    x ^= x << 17;
+    a.* = x;
+    return x;
+}
+
+/// chooses a pivot in `items[a..b]`.
+/// returns a `Hint` indicating whether `items[a..b]` appears to be already sorted.
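+///
+/// (the candidates are the quartile points of `items[a..b]`, refined with
+/// Tukey's ninther for slices of at least 50 elements; zero swaps during
+/// median selection hints `.increasing`, the maximum of 12 hints `.decreasing`.)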
+fn chosePivot(a: usize, b: usize, pivot: *usize, context: anytype) Hint { + // minimum length for using the Tukey's ninther method + const shortest_ninther = 50; + // max_swaps is the maximum number of swaps allowed in this function + const max_swaps = 4 * 3; + + var len = b - a; + var i = a + len / 4 * 1; + var j = a + len / 4 * 2; + var k = a + len / 4 * 3; + var swaps: usize = 0; + + if (len >= 8) { + if (len >= shortest_ninther) { + // find medians in the neighborhoods of `i`, `j` and `k` + i = sort3(i - 1, i, i + 1, &swaps, context); + j = sort3(j - 1, j, j + 1, &swaps, context); + k = sort3(k - 1, k, k + 1, &swaps, context); + } + + // find the median among `i`, `j` and `k` + j = sort3(i, j, k, &swaps, context); + } + + pivot.* = j; + return switch (swaps) { + 0 => .increasing, + max_swaps => .decreasing, + else => .unknown, + }; +} + +fn sort3(a: usize, b: usize, c: usize, swaps: *usize, context: anytype) usize { + if (context.lessThan(b, a)) { + swaps.* += 1; + context.swap(b, a); + } + + if (context.lessThan(c, b)) { + swaps.* += 1; + context.swap(c, b); + } + + if (context.lessThan(b, a)) { + swaps.* += 1; + context.swap(b, a); + } + + return b; +} + +fn reverseRange(a: usize, b: usize, context: anytype) void { + var i = a; + var j = b - 1; + while (i < j) { + context.swap(i, j); + i += 1; + j -= 1; + } +} diff --git a/src/Compilation.zig b/src/Compilation.zig index b48580032981..cc2e2a916b0d 100644 --- a/src/Compilation.zig +++ b/src/Compilation.zig @@ -672,7 +672,7 @@ fn addPackageTableToCacheHash( } } // Sort the slice by package name - std.sort.sort(Package.Table.KV, packages, {}, struct { + mem.sort(Package.Table.KV, packages, {}, struct { fn lessThan(_: void, lhs: Package.Table.KV, rhs: Package.Table.KV) bool { return std.mem.lessThan(u8, lhs.key, rhs.key); } diff --git a/src/Package.zig b/src/Package.zig index f28aac885d57..cde3f38e2886 100644 --- a/src/Package.zig +++ b/src/Package.zig @@ -672,7 +672,7 @@ fn computePackageHash( } } - std.sort.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan); + mem.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan); var hasher = Manifest.Hash.init(.{}); var any_failures = false; diff --git a/src/RangeSet.zig b/src/RangeSet.zig index 7e501f984be9..aa051ff424fb 100644 --- a/src/RangeSet.zig +++ b/src/RangeSet.zig @@ -60,7 +60,7 @@ pub fn spans(self: *RangeSet, first: Value, last: Value, ty: Type) !bool { if (self.ranges.items.len == 0) return false; - std.sort.sort(Range, self.ranges.items, LessThanContext{ + std.mem.sort(Range, self.ranges.items, LessThanContext{ .ty = ty, .module = self.module, }, lessThan); diff --git a/src/Sema.zig b/src/Sema.zig index 6a1c889a6eee..bf49378726b9 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -30945,7 +30945,7 @@ fn resolveStructLayout(sema: *Sema, ty: Type) CompileError!void { ctx.struct_obj.fields.values()[b].ty.abiAlignment(target); } }; - std.sort.sort(u32, optimized_order, AlignSortContext{ + mem.sort(u32, optimized_order, AlignSortContext{ .struct_obj = struct_obj, .sema = sema, }, AlignSortContext.lessThan); diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index e83524237971..55a9694fd354 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2176,7 +2176,7 @@ fn computeFrameLayout(self: *Self) !FrameLayout { } }; const sort_context = SortContext{ .frame_align = frame_align }; - std.sort.sort(FrameIndex, stack_frame_order, sort_context, SortContext.lessThan); + mem.sort(FrameIndex, stack_frame_order, sort_context, 
SortContext.lessThan); } const call_frame_align = frame_align[@enumToInt(FrameIndex.call_frame)]; diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 6ed0aeeff409..625a5283b98a 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -770,7 +770,7 @@ const mnemonic_to_encodings_map = init: { @setEvalBranchQuota(30_000); const encodings = @import("encodings.zig"); var entries = encodings.table; - std.sort.sort(encodings.Entry, &entries, {}, struct { + std.mem.sort(encodings.Entry, &entries, {}, struct { fn lessThan(_: void, lhs: encodings.Entry, rhs: encodings.Entry) bool { return @enumToInt(lhs[0]) < @enumToInt(rhs[0]); } diff --git a/src/codegen/c/type.zig b/src/codegen/c/type.zig index 892914ea3d1a..8494ae7353ab 100644 --- a/src/codegen/c/type.zig +++ b/src/codegen/c/type.zig @@ -1292,7 +1292,7 @@ pub const CType = extern union { fn sortFields(self: *@This(), fields_len: usize) []Payload.Fields.Field { const Field = Payload.Fields.Field; const slice = self.storage.anon.fields[0..fields_len]; - std.sort.sort(Field, slice, {}, struct { + mem.sort(Field, slice, {}, struct { fn before(_: void, lhs: Field, rhs: Field) bool { return lhs.alignas.@"align" > rhs.alignas.@"align"; } diff --git a/src/link/Coff.zig b/src/link/Coff.zig index 81e8c57bddfb..01f18a73b3e4 100644 --- a/src/link/Coff.zig +++ b/src/link/Coff.zig @@ -1837,7 +1837,7 @@ fn writeBaseRelocations(self: *Coff) !void { pages.appendAssumeCapacity(page.*); } } - std.sort.sort(u32, pages.items, {}, std.sort.asc(u32)); + mem.sort(u32, pages.items, {}, std.sort.asc(u32)); var buffer = std.ArrayList(u8).init(gpa); defer buffer.deinit(); diff --git a/src/link/MachO/Object.zig b/src/link/MachO/Object.zig index 7cc6f78c7def..b218fdbd2dfe 100644 --- a/src/link/MachO/Object.zig +++ b/src/link/MachO/Object.zig @@ -209,7 +209,7 @@ pub fn parse(self: *Object, allocator: Allocator, cpu_arch: std.Target.Cpu.Arch) // afterwards by address in each group. Normally, dysymtab should // be enough to guarantee the sort, but turns out not every compiler // is kind enough to specify the symbols in the correct order. 
-    sort.sort(SymbolAtIndex, sorted_all_syms.items, self, SymbolAtIndex.lessThan);
+    mem.sort(SymbolAtIndex, sorted_all_syms.items, self, SymbolAtIndex.lessThan);
     var prev_sect_id: u8 = 0;
     var section_index_lookup: ?Entry = null;
@@ -462,7 +462,7 @@ pub fn splitRegularSections(self: *Object, zld: *Zld, object_id: u32) !void {
         sorted_sections[id] = .{ .header = sect, .id = @intCast(u8, id) };
     }
-    std.sort.sort(SortedSection, sorted_sections, {}, sectionLessThanByAddress);
+    mem.sort(SortedSection, sorted_sections, {}, sectionLessThanByAddress);
     var sect_sym_index: u32 = 0;
     for (sorted_sections) |section| {
@@ -663,7 +663,7 @@ fn parseRelocs(self: *Object, gpa: Allocator, sect_id: u8) !void {
     if (self.getSourceRelocs(section)) |relocs| {
         try self.relocations.ensureUnusedCapacity(gpa, relocs.len);
         self.relocations.appendUnalignedSliceAssumeCapacity(relocs);
-        std.sort.sort(macho.relocation_info, self.relocations.items[start..], {}, relocGreaterThan);
+        mem.sort(macho.relocation_info, self.relocations.items[start..], {}, relocGreaterThan);
     }
     self.section_relocs_lookup.items[sect_id] = start;
 }
@@ -901,7 +901,7 @@ pub fn parseDataInCode(self: *Object, gpa: Allocator) !void {
     const dice = @ptrCast([*]align(1) const macho.data_in_code_entry, self.contents.ptr + cmd.dataoff)[0..ndice];
     try self.data_in_code.ensureTotalCapacityPrecise(gpa, dice.len);
     self.data_in_code.appendUnalignedSliceAssumeCapacity(dice);
-    std.sort.sort(macho.data_in_code_entry, self.data_in_code.items, {}, diceLessThan);
+    mem.sort(macho.data_in_code_entry, self.data_in_code.items, {}, diceLessThan);
 }
 fn diceLessThan(ctx: void, lhs: macho.data_in_code_entry, rhs: macho.data_in_code_entry) bool {
diff --git a/src/link/MachO/UnwindInfo.zig b/src/link/MachO/UnwindInfo.zig
index 0071657f8b94..8d2a36be9dfb 100644
--- a/src/link/MachO/UnwindInfo.zig
+++ b/src/link/MachO/UnwindInfo.zig
@@ -411,7 +411,7 @@ pub fn collect(info: *UnwindInfo, zld: *Zld) !void {
     }
     var slice = common_encodings_counts.values();
-    std.sort.sort(CommonEncWithCount, slice, {}, CommonEncWithCount.greaterThan);
+    mem.sort(CommonEncWithCount, slice, {}, CommonEncWithCount.greaterThan);
     var i: u7 = 0;
     while (i < slice.len) : (i += 1) {
diff --git a/src/link/MachO/dyld_info/Rebase.zig b/src/link/MachO/dyld_info/Rebase.zig
index 1d7a0c94c0c8..5b386a81368b 100644
--- a/src/link/MachO/dyld_info/Rebase.zig
+++ b/src/link/MachO/dyld_info/Rebase.zig
@@ -39,7 +39,7 @@ pub fn finalize(rebase: *Rebase, gpa: Allocator) !void {
     const writer = rebase.buffer.writer(gpa);
-    std.sort.sort(Entry, rebase.entries.items, {}, Entry.lessThan);
+    std.mem.sort(Entry, rebase.entries.items, {}, Entry.lessThan);
     try setTypePointer(writer);
diff --git a/src/link/MachO/dyld_info/bind.zig b/src/link/MachO/dyld_info/bind.zig
index 98a693920a6f..14ce1587aa79 100644
--- a/src/link/MachO/dyld_info/bind.zig
+++ b/src/link/MachO/dyld_info/bind.zig
@@ -47,7 +47,7 @@ pub fn Bind(comptime Ctx: type, comptime Target: type) type {
     const writer = self.buffer.writer(gpa);
-    std.sort.sort(Entry, self.entries.items, ctx, Entry.lessThan);
+    std.mem.sort(Entry, self.entries.items, ctx, Entry.lessThan);
     var start: usize = 0;
     var seg_id: ?u8 = null;
diff --git a/src/link/MachO/zld.zig b/src/link/MachO/zld.zig
index 7e6870ecbc23..b151aee19b18 100644
--- a/src/link/MachO/zld.zig
+++ b/src/link/MachO/zld.zig
@@ -1441,7 +1441,7 @@ pub const Zld = struct {
             }
         }
-        std.sort.sort(Section, sections.items, {}, SortSection.lessThan);
+        mem.sort(Section, sections.items, {}, SortSection.lessThan);
         self.sections.shrinkRetainingCapacity(0);
         for (sections.items) |out| {
@@ -2237,7 +2237,7 @@ pub const Zld = struct {
             }
         }
-        std.sort.sort(u64, addresses.items, {}, asc_u64);
+        mem.sort(u64, addresses.items, {}, asc_u64);
         var offsets = std.ArrayList(u32).init(gpa);
         defer offsets.deinit();
diff --git a/src/link/Wasm.zig b/src/link/Wasm.zig
index cd9c44d656e3..5dfc91d4ce80 100644
--- a/src/link/Wasm.zig
+++ b/src/link/Wasm.zig
@@ -2143,7 +2143,7 @@ fn sortDataSegments(wasm: *Wasm) !void {
         }
     };
-    std.sort.sort([]const u8, keys, {}, SortContext.sort);
+    mem.sort([]const u8, keys, {}, SortContext.sort);
     for (keys) |key| {
         const segment_index = wasm.data_segments.get(key).?;
         new_mapping.putAssumeCapacity(key, segment_index);
@@ -2187,7 +2187,7 @@ fn setupInitFunctions(wasm: *Wasm) !void {
     }
     // sort the initfunctions based on their priority
-    std.sort.sort(InitFuncLoc, wasm.init_funcs.items, {}, InitFuncLoc.lessThan);
+    mem.sort(InitFuncLoc, wasm.init_funcs.items, {}, InitFuncLoc.lessThan);
 }
 /// Generates an atom containing the global error set' size.
@@ -3687,7 +3687,7 @@ fn writeToFile(
         }
     }.sort;
-    std.sort.sort(*Atom, sorted_atoms.items, wasm, atom_sort_fn);
+    mem.sort(*Atom, sorted_atoms.items, wasm, atom_sort_fn);
     for (sorted_atoms.items) |sorted_atom| {
         try leb.writeULEB128(binary_writer, sorted_atom.size);
@@ -4050,8 +4050,8 @@ fn emitNameSection(wasm: *Wasm, binary_bytes: *std.ArrayList(u8), arena: std.mem
         data_segment_index += 1;
     }
-    std.sort.sort(Name, funcs.values(), {}, Name.lessThan);
-    std.sort.sort(Name, globals.items, {}, Name.lessThan);
+    mem.sort(Name, funcs.values(), {}, Name.lessThan);
+    mem.sort(Name, globals.items, {}, Name.lessThan);
     const header_offset = try reserveCustomSectionHeader(binary_bytes);
     const writer = binary_bytes.writer();
diff --git a/src/objcopy.zig b/src/objcopy.zig
index 12129aba9c1c..c5d0e8dcb38e 100644
--- a/src/objcopy.zig
+++ b/src/objcopy.zig
@@ -402,7 +402,7 @@ const BinaryElfOutput = struct {
         }
     }
-    std.sort.sort(*BinaryElfSegment, self.segments.items, {}, segmentSortCompare);
+    mem.sort(*BinaryElfSegment, self.segments.items, {}, segmentSortCompare);
     for (self.segments.items, 0..) |firstSegment, i| {
         if (firstSegment.firstSection) |firstSection| {
@@ -427,7 +427,7 @@ const BinaryElfOutput = struct {
         }
     }
-    std.sort.sort(*BinaryElfSection, self.sections.items, {}, sectionSortCompare);
+    mem.sort(*BinaryElfSection, self.sections.items, {}, sectionSortCompare);
     return self;
 }
diff --git a/test/src/Cases.zig b/test/src/Cases.zig
index 0451079a0e41..63dd2fd3da48 100644
--- a/test/src/Cases.zig
+++ b/test/src/Cases.zig
@@ -607,7 +607,7 @@ fn sortTestFilenames(filenames: [][]const u8) void {
             };
         }
     };
-    std.sort.sort([]const u8, filenames, Context{}, Context.lessThan);
+    std.mem.sort([]const u8, filenames, Context{}, Context.lessThan);
 }
 /// Iterates a set of filenames extracting batches that are either incremental
diff --git a/tools/gen_stubs.zig b/tools/gen_stubs.zig
index bc2637e197d8..95787b719a7d 100644
--- a/tools/gen_stubs.zig
+++ b/tools/gen_stubs.zig
@@ -437,7 +437,7 @@ fn parseElf(parse: Parse, comptime is_64: bool, comptime endian: builtin.Endian)
     const dynstr = elf_bytes[dynstr_offset..];
     // Sort the list by address, ascending.
-    std.sort.sort(Sym, @alignCast(8, dyn_syms), {}, S.symbolAddrLessThan);
+    mem.sort(Sym, @alignCast(8, dyn_syms), {}, S.symbolAddrLessThan);
     for (dyn_syms) |sym| {
         const this_section = s(sym.st_shndx);
diff --git a/tools/generate_JSONTestSuite.zig b/tools/generate_JSONTestSuite.zig
index b8550959c7a6..2229cf4012fb 100644
--- a/tools/generate_JSONTestSuite.zig
+++ b/tools/generate_JSONTestSuite.zig
@@ -23,7 +23,7 @@ pub fn main() !void {
     while (try it.next()) |entry| {
         try names.append(try allocator.dupe(u8, entry.name));
     }
-    std.sort.sort([]const u8, names.items, {}, (struct {
+    std.mem.sort([]const u8, names.items, {}, (struct {
         fn lessThan(_: void, a: []const u8, b: []const u8) bool {
             return std.mem.lessThan(u8, a, b);
         }
diff --git a/tools/process_headers.zig b/tools/process_headers.zig
index a6550a2573b3..0321c0e0eb5a 100644
--- a/tools/process_headers.zig
+++ b/tools/process_headers.zig
@@ -460,7 +460,7 @@ pub fn main() !void {
             try contents_list.append(contents);
         }
     }
-    std.sort.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan);
+    std.mem.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan);
     const best_contents = contents_list.popOrNull().?;
     if (best_contents.hit_count > 1) {
         // worth it to make it generic
diff --git a/tools/update-linux-headers.zig b/tools/update-linux-headers.zig
index 38fbab66458b..0f31e5e893e3 100644
--- a/tools/update-linux-headers.zig
+++ b/tools/update-linux-headers.zig
@@ -260,7 +260,7 @@ pub fn main() !void {
            try contents_list.append(contents);
        }
    }
-    std.sort.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan);
+    std.mem.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan);
    const best_contents = contents_list.popOrNull().?;
    if (best_contents.hit_count > 1) {
        // worth it to make it generic
diff --git a/tools/update_clang_options.zig b/tools/update_clang_options.zig
index 682ec7e152e8..feefeb0a83a6 100644
--- a/tools/update_clang_options.zig
+++ b/tools/update_clang_options.zig
@@ -646,7 +646,7 @@ pub fn main() anyerror!void {
     }
     // Some options have multiple matches. As an example, "-Wl,foo" matches both
     // "W" and "Wl,". So we sort this list in order of descending priority.
-    std.sort.sort(*json.ObjectMap, all_objects.items, {}, objectLessThan);
+    std.mem.sort(*json.ObjectMap, all_objects.items, {}, objectLessThan);
     var buffered_stdout = std.io.bufferedWriter(std.io.getStdOut().writer());
     const stdout = buffered_stdout.writer();
diff --git a/tools/update_cpu_features.zig b/tools/update_cpu_features.zig
index 53bb365f4183..d5c3d48852b2 100644
--- a/tools/update_cpu_features.zig
+++ b/tools/update_cpu_features.zig
@@ -1187,8 +1187,8 @@ fn processOneTarget(job: Job) anyerror!void {
     for (llvm_target.extra_cpus) |extra_cpu| {
         try all_cpus.append(extra_cpu);
     }
-    std.sort.sort(Feature, all_features.items, {}, featureLessThan);
-    std.sort.sort(Cpu, all_cpus.items, {}, cpuLessThan);
+    mem.sort(Feature, all_features.items, {}, featureLessThan);
+    mem.sort(Cpu, all_cpus.items, {}, cpuLessThan);
     const target_sub_path = try fs.path.join(arena, &.{ "lib", "std", "target" });
     var target_dir = try job.zig_src_dir.makeOpenPath(target_sub_path, .{});
@@ -1283,7 +1283,7 @@ fn processOneTarget(job: Job) anyerror!void {
             try dependencies.append(key.*);
         }
     }
-    std.sort.sort([]const u8, dependencies.items, {}, asciiLessThan);
+    mem.sort([]const u8, dependencies.items, {}, asciiLessThan);
     if (dependencies.items.len == 0) {
         try w.writeAll(
@@ -1328,7 +1328,7 @@ fn processOneTarget(job: Job) anyerror!void {
             try cpu_features.append(key.*);
         }
     }
-    std.sort.sort([]const u8, cpu_features.items, {}, asciiLessThan);
+    mem.sort([]const u8, cpu_features.items, {}, asciiLessThan);
     if (cpu.llvm_name) |llvm_name| {
         try w.print(
             \\ pub const {} = CpuModel{{
diff --git a/tools/update_spirv_features.zig b/tools/update_spirv_features.zig
index 8d398f58de5b..44d8b6a44517 100644
--- a/tools/update_spirv_features.zig
+++ b/tools/update_spirv_features.zig
@@ -303,7 +303,7 @@ fn gatherVersions(allocator: Allocator, registry: g.CoreRegistry) ![]const Versi
         }
     }
-    std.sort.sort(Version, versions.items, {}, Version.lessThan);
+    std.mem.sort(Version, versions.items, {}, Version.lessThan);
     return versions.items;
 }
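
Editor's note on `chosePivot` in the pdq code above (not part of the patch): because `sort3` counts every swap it performs, a total of zero swaps after pivot selection means the sampled elements were already ascending, while hitting `max_swaps` (all twelve, only reachable once the ninther path has run all four `sort3` calls) means they were strictly descending, so the caller can treat the slice as likely sorted or likely reversed (and reverse it with `reverseRange`) instead of partitioning blindly. A standalone sketch of that swap-counting idea on a plain slice; `Hint3` and `median3` are illustrative names invented here:

const std = @import("std");

const Hint3 = enum { increasing, decreasing, unknown };

// Sorts items[a], items[b], items[c] in place while counting swaps; after the
// call items[b] holds the median, mirroring `sort3` above returning `b`.
fn median3(items: []u32, a: usize, b: usize, c: usize) Hint3 {
    var swaps: usize = 0;
    if (items[b] < items[a]) {
        std.mem.swap(u32, &items[a], &items[b]);
        swaps += 1;
    }
    if (items[c] < items[b]) {
        std.mem.swap(u32, &items[b], &items[c]);
        swaps += 1;
    }
    if (items[b] < items[a]) {
        std.mem.swap(u32, &items[a], &items[b]);
        swaps += 1;
    }
    // 0 swaps: the triple was already ascending; 3 swaps: strictly descending.
    return switch (swaps) {
        0 => .increasing,
        3 => .decreasing,
        else => .unknown,
    };
}

test "swap count hints at pre-existing order" {
    var asc = [_]u32{ 1, 2, 3 };
    try std.testing.expectEqual(Hint3.increasing, median3(&asc, 0, 1, 2));

    var desc = [_]u32{ 3, 2, 1 };
    try std.testing.expectEqual(Hint3.decreasing, median3(&desc, 0, 1, 2));

    var mixed = [_]u32{ 2, 3, 1 };
    try std.testing.expectEqual(Hint3.unknown, median3(&mixed, 0, 1, 2));
}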
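
On the call sites themselves: every hunk in this patch keeps the same four-argument shape (element type, slice, context, comparator) and only changes the namespace, so the migration is mechanical. A minimal sketch of both context styles as they appear above, assuming the `std.mem.sort` wrapper this patch introduces; `ByDistance` is a hypothetical context type invented for the example:

const std = @import("std");

// Hypothetical context type, used only to illustrate the stateful form.
const ByDistance = struct {
    origin: i32,

    // The context value is forwarded as the comparator's first argument.
    fn lessThan(ctx: ByDistance, lhs: i32, rhs: i32) bool {
        const dl = if (lhs > ctx.origin) lhs - ctx.origin else ctx.origin - lhs;
        const dr = if (rhs > ctx.origin) rhs - ctx.origin else ctx.origin - rhs;
        return dl < dr;
    }
};

test "mem.sort call patterns" {
    // Stateless comparator: a void context (`{}`), as in most hunks above.
    var pages = [_]u32{ 30, 10, 20 };
    std.mem.sort(u32, &pages, {}, std.sort.asc(u32));
    const expected_pages = [_]u32{ 10, 20, 30 };
    try std.testing.expectEqualSlices(u32, &expected_pages, &pages);

    // Stateful comparator, as in the RangeSet.zig and Sema.zig hunks:
    // here, sort by absolute distance from an origin.
    var values = [_]i32{ -7, 2, 5, -1 };
    std.mem.sort(i32, &values, ByDistance{ .origin = 0 }, ByDistance.lessThan);
    const expected_values = [_]i32{ -1, 2, 5, -7 };
    try std.testing.expectEqualSlices(i32, &expected_values, &values);
}

Since `std.mem.sort` is stable, elements that compare equal (for example, two values equidistant from the origin) keep their original relative order; `std.mem.sortUnstable` offers the same signature without that guarantee.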