From ea5585387edeb3bae12bf1bad3e09da99deb7809 Mon Sep 17 00:00:00 2001
From: Robert Knight
Date: Tue, 2 Apr 2024 18:47:32 +0100
Subject: [PATCH] Implement blocked copy / transpose

Add an alternative strategy for copying elements from a transposed tensor
into a contiguous buffer, using blocking, and enable it to be used via
`Tensor::copy_from`.

The existing naive copy implementation performs well except when the strides
of the source view lead to a significant rate of cache conflicts. This
typically happens when the last stride is a multiple of the cache line size,
and especially when it is a power of 2. To handle this, detect the case and
switch to an alternative copying procedure which uses blocking and tiling.

Using the `bench_transpose` benchmark in `src/ops/layout.rs`, this avoids the
significant increase in overhead, versus a simple memory copy, that occurs
when the source stride is a power of 2.
---
 rten-tensor/src/tensor.rs    |  38 ++++++-
 rten-tensor/src/transpose.rs | 210 +++++++++++++++++++++++++++--------
 2 files changed, 199 insertions(+), 49 deletions(-)

diff --git a/rten-tensor/src/tensor.rs b/rten-tensor/src/tensor.rs
index 8274c32e..e330be35 100644
--- a/rten-tensor/src/tensor.rs
+++ b/rten-tensor/src/tensor.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 use std::marker::PhantomData;
+use std::mem::MaybeUninit;
 use std::ops::{Index, IndexMut, Range};
 
 use crate::errors::{DimensionError, FromDataError, SliceError};
@@ -11,7 +12,7 @@ use crate::layout::{
     AsIndex, BroadcastLayout, DynLayout, IntoLayout, Layout, MatrixLayout, MutLayout, NdLayout,
     OverlapPolicy, ResizeLayout,
 };
-use crate::transpose::contiguous_data;
+use crate::transpose::{contiguous_data, copy_contiguous};
 use crate::{IntoSliceItems, RandomSource, SliceItem};
 
 /// The base type for multi-dimensional arrays. This consists of storage for
@@ -151,7 +152,7 @@ pub trait AsView: Layout {
     /// data or changing the iteration order.
     ///
     /// If the tensor is contiguous, this has the effect of flattening the
-    /// tensor into a single vector.
+    /// tensor into a vector.
     fn merge_axes(&mut self)
     where
         Self::Layout: ResizeLayout;
@@ -375,6 +376,11 @@ impl<T, S: AsRef<[T]>, L: MutLayout> TensorBase<T, S, L> {
             .expect("invalid layout");
         Some(layout)
     }
+
+    /// Return a raw pointer to the tensor's underlying data.
+    pub fn data_ptr(&self) -> *const T {
+        self.data.as_ref().as_ptr()
+    }
 }
 
 impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
@@ -420,8 +426,27 @@ impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
         L: Clone,
     {
         assert!(self.shape() == other.shape());
-        for (out, x) in self.iter_mut().zip(other.iter()) {
-            *out = x.clone();
+
+        if let Some(dest) = self.data_mut() {
+            if let Some(src) = other.data() {
+                dest.clone_from_slice(src);
+            } else {
+                // Drop all the existing values. This should be compiled away for
+                // `Copy` types.
+                let uninit_dest: &mut [MaybeUninit<T>] = unsafe { std::mem::transmute(dest) };
+                for x in &mut *uninit_dest {
+                    // Safety: All elements were initialized at the start of this
+                    // block, and we haven't written to the slice yet.
+                    unsafe { x.assume_init_drop() }
+                }
+
+                // Copy source into destination in contiguous order.
+                copy_contiguous(other.as_dyn(), uninit_dest);
+            }
+        } else {
+            for (out, x) in self.iter_mut().zip(other.iter()) {
+                *out = x.clone();
+            }
         }
     }
 
@@ -430,6 +455,11 @@ impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
         self.layout.is_contiguous().then_some(self.data.as_mut())
     }
 
+    /// Return a raw pointer to the tensor's underlying data.
+    pub fn data_mut_ptr(&mut self) -> *mut T {
+        self.data.as_mut().as_mut_ptr()
+    }
+
     /// Replace all elements of this tensor with `value`.
     pub fn fill(&mut self, value: T)
     where
diff --git a/rten-tensor/src/transpose.rs b/rten-tensor/src/transpose.rs
index cdaa71b6..1551e2a0 100644
--- a/rten-tensor/src/transpose.rs
+++ b/rten-tensor/src/transpose.rs
@@ -1,41 +1,89 @@
+use std::mem::MaybeUninit;
+use std::ops::Range;
+
 use crate::{AsView, Layout};
-use crate::{NdTensorView, TensorView};
+use crate::{Matrix, MatrixLayout, MatrixMut, NdTensorView, NdTensorViewMut, TensorView};
 
-/// Call `f` with every element in `x` in logical order.
-///
-/// This is equivalent to `x.iter().for_each(f)` but is faster that Rust's
-/// standard iteration protocol when `x` is non-contiguous and has <= 4
-/// dimensions.
-fn fast_for_each_element<T, F: FnMut(&T)>(mut x: TensorView<T>, mut f: F) {
-    // Merge axes to increase the chance that we can use the fast path and
-    // also maximize the iteration count of the innermost loops.
-    x.merge_axes();
+/// Iterator returned by [range_chunks].
+pub struct RangeChunks {
+    remainder: Range<usize>,
+    chunk_size: usize,
+}
 
-    if x.ndim() > 4 {
-        x.iter().for_each(f)
-    } else {
-        while x.ndim() < 4 {
-            x.insert_axis(0);
+impl Iterator for RangeChunks {
+    type Item = Range<usize>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if !self.remainder.is_empty() {
+            let start = self.remainder.start;
+            let end = (start + self.chunk_size).min(self.remainder.end);
+            self.remainder.start += self.chunk_size;
+            Some(start..end)
+        } else {
+            None
         }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.remainder.len().div_ceil(self.chunk_size);
+        (len, Some(len))
+    }
+}
 
-        let x_data = x.non_contiguous_data();
-        let x: NdTensorView<T, 4> = x.nd_view();
-        let shape = x.shape();
-        let strides = x.strides();
+impl ExactSizeIterator for RangeChunks {}
 
-        assert!(x_data.len() >= x.layout().min_data_len());
+impl std::iter::FusedIterator for RangeChunks {}
 
-        for i0 in 0..shape[0] {
-            for i1 in 0..shape[1] {
-                for i2 in 0..shape[2] {
-                    for i3 in 0..shape[3] {
-                        let offset =
-                            i0 * strides[0] + i1 * strides[1] + i2 * strides[2] + i3 * strides[3];
+/// Return an iterator over sub-ranges of `range`. If `range.len()` is not a
+/// multiple of `chunk_size` then the final chunk will be shorter.
+#[inline]
+pub fn range_chunks(range: Range<usize>, chunk_size: usize) -> RangeChunks {
+    RangeChunks {
+        remainder: range,
+        chunk_size,
+    }
+}
+
+/// Tile size for blocked copy. A tile should fit in registers for 32-bit
+/// values.
+const TILE_SIZE: usize = 4;
+
+/// Block size for blocked copy. A source and dest block should fit in the cache
+/// for 32-bit values.
+const BLOCK_SIZE: usize = 64;
+
+/// Copy elements from `src` into `dest`.
+///
+/// `src` and `dest` must have the same shape but can (should) have different
+/// strides. This function uses blocking to avoid the cache conflicts that can
+/// arise in a naive copy if `src` is transposed.
+fn copy_blocked<T: Clone>(src: Matrix<T>, mut dest: MatrixMut<MaybeUninit<T>>) {
+    // Ensure src and dest have same index range.
+    assert!(src.shape() == dest.shape());
 
-                        // Safety: We checked data length > max offset produced
-                        // by layout.
-                        let elt = unsafe { x_data.get_unchecked(offset) };
-                        f(elt)
+    // Ensure tiles are always full.
+    assert!(dest.rows() % TILE_SIZE == 0);
+    assert!(dest.cols() % TILE_SIZE == 0);
+
+    for row_block in range_chunks(0..dest.rows(), BLOCK_SIZE) {
+        for col_block in range_chunks(0..dest.cols(), BLOCK_SIZE) {
+            for row_tile in range_chunks(row_block.clone(), TILE_SIZE) {
+                for col_tile in range_chunks(col_block.clone(), TILE_SIZE) {
+                    debug_assert!(row_tile.len() == TILE_SIZE);
+                    debug_assert!(col_tile.len() == TILE_SIZE);
+
+                    for y in 0..TILE_SIZE {
+                        for x in 0..TILE_SIZE {
+                            // Safety: Max values of `idx` are in-bounds for
+                            // `src` and `dest`.
+                            unsafe {
+                                let idx = [row_tile.start + y, col_tile.start + x];
+                                let src_el = src.get_unchecked(idx).clone();
+                                dest.get_unchecked_mut(idx).write(src_el);
+                            }
+                        }
                     }
                 }
             }
@@ -46,29 +94,87 @@ fn fast_for_each_element<T, F: FnMut(&T)>(mut x: TensorView<T>, mut f: F) {
 /// Return the elements of `src` as a contiguous vector, in the same order they
 /// would be yielded by `src.iter()`.
 ///
+/// This function assumes that the caller has already checked if `src` is
+/// contiguous and used more efficient methods to copy the data in that case.
+///
 /// This is equivalent to `src.iter().cloned().collect::<Vec<T>>()` but
 /// faster.
 pub fn contiguous_data<T: Clone>(src: TensorView<T>) -> Vec<T> {
     let src_len = src.len();
+    let mut result = Vec::with_capacity(src_len);
+    copy_contiguous(src, &mut result.spare_capacity_mut()[..src_len]);
 
-    // This is equivalent to `x.iter().cloned().collect::<Vec<T>>()` but uses a
-    // faster iteration method that is optimized for tensors with few (<= 4)
-    // dimensions.
-    let mut data = Vec::with_capacity(src.len());
-    let ptr: *mut T = data.as_mut_ptr();
+    // Safety: `copy_contiguous` initialized `src_len` elements of result.
+    unsafe { result.set_len(src_len) };
 
-    let mut offset = 0;
-    fast_for_each_element(src, |elt| {
-        // Safety: `fast_for_each_element` calls fn `self.len()` times,
-        // matching the buffer capacity.
-        unsafe { *ptr.add(offset) = elt.clone() };
-        offset += 1;
-    });
+    result
+}
 
-    // Safety: Length here matches capacity passed to `Vec::with_capacity`.
-    unsafe { data.set_len(src_len) }
+/// Copy elements of `src` into `dest` in contiguous order.
+///
+/// Returns `dest` as an initialized slice.
+pub fn copy_contiguous<'a, T: Clone>(
+    src: TensorView<T>,
+    dest: &'a mut [MaybeUninit<T>],
+) -> &'a [T] {
+    assert!(dest.len() == src.len());
 
-    data
+    // Merge axes to increase the chance that we can use the fast path and
+    // also maximize the iteration count of the innermost loops.
+    let mut src = src.clone();
+    src.merge_axes();
+
+    if src.ndim() > 4 {
+        for (dst, src) in dest.iter_mut().zip(src.iter()) {
+            dst.write(src.clone());
+        }
+        // Safety: Loop above initialized all elements of `dest`.
+        return unsafe { std::mem::transmute(dest) };
+    }
+
+    while src.ndim() < 4 {
+        src.insert_axis(0);
+    }
+
+    let src: NdTensorView<T, 4> = src.nd_view();
+
+    // As a heuristic, use a blocked copy if the source stride is likely to lead
+    // to cache conflicts. Otherwise a simple direct copy is probably going to
+    // be faster. With a better optimized blocked copy path, we might be able to
+    // use it all the time.
+    let use_blocked_copy = src.stride(3).count_ones() == 1
+        && src.stride(3) >= 32
+        && src.size(2) % TILE_SIZE == 0
+        && src.size(3) % TILE_SIZE == 0;
+
+    if use_blocked_copy {
+        let mut dest = NdTensorViewMut::from_data(src.shape(), dest);
+        for i0 in 0..src.size(0) {
+            for i1 in 0..src.size(1) {
+                let src = src.slice::<2, _>([i0, i1]);
+                let dest = dest.slice_mut::<2, _>([i0, i1]);
+                copy_blocked(src, dest);
+            }
+        }
+    } else {
+        let mut dest_offset = 0;
+        for i0 in 0..src.size(0) {
+            for i1 in 0..src.size(1) {
+                for i2 in 0..src.size(2) {
+                    for i3 in 0..src.size(3) {
+                        unsafe {
+                            let elt = src.get_unchecked([i0, i1, i2, i3]).clone();
+                            dest.get_unchecked_mut(dest_offset).write(elt);
+                            dest_offset += 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Safety: Loop above initialized all elements of `dest`.
+    unsafe { std::mem::transmute(dest) }
 }
 
 #[cfg(test)]
@@ -87,5 +193,19 @@ mod tests {
         let x = Tensor::from_data(&[1, 1, 1, 2, 2], vec![1, 2, 3, 4]);
         assert_eq!(contiguous_data(x.view()), [1, 2, 3, 4]);
        assert_eq!(contiguous_data(x.transposed()), [1, 3, 2, 4]);
+
+        // Transposed matrices of varying sizes. This includes:
+        //
+        // - Zero
+        // - Powers of 2
+        // - Non-powers of 2
+        // - Values above and below threshold for using blocked copy
+        for size in [0usize, 2, 4, 8, 15, 16, 32, 64, 65, 68] {
+            let x = Tensor::<i32>::arange(0, (size * size) as i32, None);
+            let x = x.reshaped([size, size]);
+            let transposed = contiguous_data(x.transposed().as_dyn());
+            let expected = x.transposed().iter().copied().collect::<Vec<_>>();
+            assert_eq!(transposed, expected);
+        }
     }
 }
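
Note (illustrative, not part of the patch): the sketch below shows the same blocking/tiling idea as `copy_blocked`, but applied to a plain row-major matrix in a `Vec<i32>` instead of rten-tensor's view types. The names `TILE`, `BLOCK` and `transpose_blocked` are hypothetical, chosen only for this example; like the patch's heuristic, it assumes the matrix side length is a multiple of the tile size (and `BLOCK` is a multiple of `TILE`) so every tile is full.

// Mirrors TILE_SIZE / BLOCK_SIZE from the patch, for a standalone example.
const TILE: usize = 4;
const BLOCK: usize = 64;

/// Copy the transpose of `src` (an `n x n` row-major matrix) into `dest`,
/// walking the output in blocks and tiles so that the strided reads from
/// `src` stay within a small working set.
fn transpose_blocked(src: &[i32], dest: &mut [i32], n: usize) {
    assert!(src.len() == n * n && dest.len() == n * n);
    assert!(n % TILE == 0 && BLOCK % TILE == 0);
    for rb in (0..n).step_by(BLOCK) {
        for cb in (0..n).step_by(BLOCK) {
            for rt in (rb..(rb + BLOCK).min(n)).step_by(TILE) {
                for ct in (cb..(cb + BLOCK).min(n)).step_by(TILE) {
                    for y in 0..TILE {
                        for x in 0..TILE {
                            // dest is written in contiguous order within a tile;
                            // src is read at the transposed (strided) offset.
                            dest[(rt + y) * n + (ct + x)] = src[(ct + x) * n + (rt + y)];
                        }
                    }
                }
            }
        }
    }
}

fn main() {
    let n = 64;
    let src: Vec<i32> = (0..(n * n) as i32).collect();
    let mut dest = vec![0; n * n];
    transpose_blocked(&src, &mut dest, n);

    // Verify against a naive transpose.
    let mut expected = vec![0; n * n];
    for i in 0..n {
        for j in 0..n {
            expected[i * n + j] = src[j * n + i];
        }
    }
    assert_eq!(dest, expected);
    println!("ok");
}

The point of the nested block/tile loops is the same as in the patch: when the source stride is a power of 2, consecutive strided reads map to a small set of cache sets, so a naive row-by-row transpose thrashes the cache, whereas small tiles reuse each fetched cache line before moving on.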