From ea5585387edeb3bae12bf1bad3e09da99deb7809 Mon Sep 17 00:00:00 2001
From: Robert Knight
Date: Tue, 2 Apr 2024 18:47:32 +0100
Subject: [PATCH] Implement blocked copy / transpose

Add an alternative strategy for copying elements from a transposed tensor
into a contiguous buffer, using blocking, and enable it to be used via
`Tensor::copy_from`.

The existing naive copy implementation performs well except when the strides
of the source view lead to a significant rate of cache conflicts. This
typically happens when the last stride is a multiple of the cache line size,
and especially when it is a power of 2. To handle this, detect the case and
switch to an alternative copying procedure which uses blocking and tiling.

Using the `bench_transpose` benchmark in `src/ops/layout.rs`, this avoids the
significant increase in overhead, versus a simple memory copy, that occurs
when the source stride is a power of 2.
---
 rten-tensor/src/tensor.rs    |  38 ++++++-
 rten-tensor/src/transpose.rs | 210 +++++++++++++++++++++++++++--------
 2 files changed, 199 insertions(+), 49 deletions(-)

diff --git a/rten-tensor/src/tensor.rs b/rten-tensor/src/tensor.rs
index 8274c32e..e330be35 100644
--- a/rten-tensor/src/tensor.rs
+++ b/rten-tensor/src/tensor.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 use std::marker::PhantomData;
+use std::mem::MaybeUninit;
 use std::ops::{Index, IndexMut, Range};
 
 use crate::errors::{DimensionError, FromDataError, SliceError};
@@ -11,7 +12,7 @@ use crate::layout::{
     AsIndex, BroadcastLayout, DynLayout, IntoLayout, Layout, MatrixLayout, MutLayout, NdLayout,
     OverlapPolicy, ResizeLayout,
 };
-use crate::transpose::contiguous_data;
+use crate::transpose::{contiguous_data, copy_contiguous};
 use crate::{IntoSliceItems, RandomSource, SliceItem};
 
 /// The base type for multi-dimensional arrays. This consists of storage for
@@ -151,7 +152,7 @@ pub trait AsView: Layout {
     /// data or changing the iteration order.
     ///
     /// If the tensor is contiguous, this has the effect of flattening the
-    /// tensor into a single vector.
+    /// tensor into a vector.
     fn merge_axes(&mut self)
     where
         Self::Layout: ResizeLayout;
@@ -375,6 +376,11 @@ impl<T, S: AsRef<[T]>, L: MutLayout> TensorBase<T, S, L> {
             .expect("invalid layout");
         Some(layout)
     }
+
+    /// Return a raw pointer to the tensor's underlying data.
+    pub fn data_ptr(&self) -> *const T {
+        self.data.as_ref().as_ptr()
+    }
 }
 
 impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
@@ -420,8 +426,27 @@ impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
         L: Clone,
     {
         assert!(self.shape() == other.shape());
-        for (out, x) in self.iter_mut().zip(other.iter()) {
-            *out = x.clone();
+
+        if let Some(dest) = self.data_mut() {
+            if let Some(src) = other.data() {
+                dest.clone_from_slice(src);
+            } else {
+                // Drop all the existing values. This should be compiled away for
+                // `Copy` types.
+                let uninit_dest: &mut [MaybeUninit<T>] = unsafe { std::mem::transmute(dest) };
+                for x in &mut *uninit_dest {
+                    // Safety: All elements were initialized at the start of this
+                    // block, and we haven't written to the slice yet.
+                    unsafe { x.assume_init_drop() }
+                }
+
+                // Copy source into destination in contiguous order.
+                copy_contiguous(other.as_dyn(), uninit_dest);
+            }
+        } else {
+            for (out, x) in self.iter_mut().zip(other.iter()) {
+                *out = x.clone();
+            }
         }
     }
 
@@ -430,6 +455,11 @@ impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
         self.layout.is_contiguous().then_some(self.data.as_mut())
     }
 
+    /// Return a raw pointer to the tensor's underlying data.
+    pub fn data_mut_ptr(&mut self) -> *mut T {
+        self.data.as_mut().as_mut_ptr()
+    }
+
     /// Replace all elements of this tensor with `value`.
     pub fn fill(&mut self, value: T)
     where
diff --git a/rten-tensor/src/transpose.rs b/rten-tensor/src/transpose.rs
index cdaa71b6..1551e2a0 100644
--- a/rten-tensor/src/transpose.rs
+++ b/rten-tensor/src/transpose.rs
@@ -1,41 +1,89 @@
+use std::mem::MaybeUninit;
+use std::ops::Range;
+
 use crate::{AsView, Layout};
-use crate::{NdTensorView, TensorView};
+use crate::{Matrix, MatrixLayout, MatrixMut, NdTensorView, NdTensorViewMut, TensorView};
 
-/// Call `f` with every element in `x` in logical order.
-///
-/// This is equivalent to `x.iter().for_each(f)` but is faster that Rust's
-/// standard iteration protocol when `x` is non-contiguous and has <= 4
-/// dimensions.
-fn fast_for_each_element<T, F: FnMut(&T)>(mut x: TensorView<T>, mut f: F) {
-    // Merge axes to increase the chance that we can use the fast path and
-    // also maximize the iteration count of the innermost loops.
-    x.merge_axes();
+/// Iterator returned by [range_chunks].
+pub struct RangeChunks {
+    remainder: Range<usize>,
+    chunk_size: usize,
+}
 
-    if x.ndim() > 4 {
-        x.iter().for_each(f)
-    } else {
-        while x.ndim() < 4 {
-            x.insert_axis(0);
+impl Iterator for RangeChunks {
+    type Item = Range<usize>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if !self.remainder.is_empty() {
+            let start = self.remainder.start;
+            let end = (start + self.chunk_size).min(self.remainder.end);
+            self.remainder.start += self.chunk_size;
+            Some(start..end)
+        } else {
+            None
         }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.remainder.len().div_ceil(self.chunk_size);
+        (len, Some(len))
+    }
+}
 
-        let x_data = x.non_contiguous_data();
-        let x: NdTensorView<T, 4> = x.nd_view();
-        let shape = x.shape();
-        let strides = x.strides();
+impl ExactSizeIterator for RangeChunks {}
 
-        assert!(x_data.len() >= x.layout().min_data_len());
+impl std::iter::FusedIterator for RangeChunks {}
 
-        for i0 in 0..shape[0] {
-            for i1 in 0..shape[1] {
-                for i2 in 0..shape[2] {
-                    for i3 in 0..shape[3] {
-                        let offset =
-                            i0 * strides[0] + i1 * strides[1] + i2 * strides[2] + i3 * strides[3];
+/// Return an iterator over sub-ranges of `range`. If `range.len()` is not a
+/// multiple of `chunk_size` then the final chunk will be shorter.
+#[inline]
+pub fn range_chunks(range: Range<usize>, chunk_size: usize) -> RangeChunks {
+    RangeChunks {
+        remainder: range,
+        chunk_size,
+    }
+}
+
+/// Tile size for blocked copy. A tile should fit in registers for 32-bit
+/// values.
+const TILE_SIZE: usize = 4;
+
+/// Block size for blocked copy. A source and dest block should fit in the cache
+/// for 32-bit values.
+const BLOCK_SIZE: usize = 64;
+
+/// Copy elements from `src` into `dest`.
+///
+/// `src` and `dest` must have the same shape but can (should) have different
+/// strides. This function uses blocking to avoid the cache conflicts that can
+/// arise in a naive copy if `src` is transposed.
+fn copy_blocked<T: Clone>(src: Matrix<T>, mut dest: MatrixMut<MaybeUninit<T>>) {
+    // Ensure src and dest have same index range.
+    assert!(src.shape() == dest.shape());
 
-                        // Safety: We checked data length > max offset produced
-                        // by layout.
-                        let elt = unsafe { x_data.get_unchecked(offset) };
-                        f(elt)
+    // Ensure tiles are always full.
+    assert!(dest.rows() % TILE_SIZE == 0);
+    assert!(dest.cols() % TILE_SIZE == 0);
+
+    for row_block in range_chunks(0..dest.rows(), BLOCK_SIZE) {
+        for col_block in range_chunks(0..dest.cols(), BLOCK_SIZE) {
+            for row_tile in range_chunks(row_block.clone(), TILE_SIZE) {
+                for col_tile in range_chunks(col_block.clone(), TILE_SIZE) {
+                    debug_assert!(row_tile.len() == TILE_SIZE);
+                    debug_assert!(col_tile.len() == TILE_SIZE);
+
+                    for y in 0..TILE_SIZE {
+                        for x in 0..TILE_SIZE {
+                            // Safety: Max values of `idx` are in-bounds for
+                            // `src` and `dest`.
+                            unsafe {
+                                let idx = [row_tile.start + y, col_tile.start + x];
+                                let src_el = src.get_unchecked(idx).clone();
+                                dest.get_unchecked_mut(idx).write(src_el);
+                            }
+                        }
                     }
                 }
             }
@@ -46,29 +94,87 @@ fn fast_for_each_element<T, F: FnMut(&T)>(mut x: TensorView<T>, mut f: F) {
 /// Return the elements of `src` as a contiguous vector, in the same order they
 /// would be yielded by `src.iter()`.
 ///
+/// This function assumes that the caller has already checked if `src` is
+/// contiguous and used more efficient methods to copy the data in that case.
+///
 /// This is equivalent to `src.iter().cloned().collect::<Vec<T>>()` but
 /// faster.
 pub fn contiguous_data<T: Clone>(src: TensorView<T>) -> Vec<T> {
     let src_len = src.len();
+    let mut result = Vec::with_capacity(src_len);
+    copy_contiguous(src, &mut result.spare_capacity_mut()[..src_len]);
 
-    // This is equivalent to `x.iter().cloned().collect::<Vec<T>>()` but uses a
-    // faster iteration method that is optimized for tensors with few (<= 4)
-    // dimensions.
-    let mut data = Vec::with_capacity(src.len());
-    let ptr: *mut T = data.as_mut_ptr();
+    // Safety: `copy_contiguous` initialized `src_len` elements of result.
+    unsafe { result.set_len(src_len) };
 
-    let mut offset = 0;
-    fast_for_each_element(src, |elt| {
-        // Safety: `fast_for_each_element` calls fn `self.len()` times,
-        // matching the buffer capacity.
-        unsafe { *ptr.add(offset) = elt.clone() };
-        offset += 1;
-    });
+    result
+}
 
-    // Safety: Length here matches capacity passed to `Vec::with_capacity`.
-    unsafe { data.set_len(src_len) }
+/// Copy elements of `src` into `dest` in contiguous order.
+///
+/// Returns `dest` as an initialized slice.
+pub fn copy_contiguous<'a, T: Clone>(
+    src: TensorView<T>,
+    dest: &'a mut [MaybeUninit<T>],
+) -> &'a [T] {
+    assert!(dest.len() == src.len());
 
-    data
+    // Merge axes to increase the chance that we can use the fast path and
+    // also maximize the iteration count of the innermost loops.
+    let mut src = src.clone();
+    src.merge_axes();
+
+    if src.ndim() > 4 {
+        for (dst, src) in dest.iter_mut().zip(src.iter()) {
+            dst.write(src.clone());
+        }
+        // Safety: Loop above initialized all elements of `dest`.
+        return unsafe { std::mem::transmute(dest) };
+    }
+
+    while src.ndim() < 4 {
+        src.insert_axis(0);
+    }
+
+    let src: NdTensorView<T, 4> = src.nd_view();
+
+    // As a heuristic, use a blocked copy if the source stride is likely to lead
+    // to cache conflicts. Otherwise a simple direct copy is probably going to
+    // be faster. With a better optimized blocked copy path, we might be able to
+    // use it all the time.
+    let use_blocked_copy = src.stride(3).count_ones() == 1
+        && src.stride(3) >= 32
+        && src.size(2) % TILE_SIZE == 0
+        && src.size(3) % TILE_SIZE == 0;
+
+    if use_blocked_copy {
+        let mut dest = NdTensorViewMut::from_data(src.shape(), dest);
+        for i0 in 0..src.size(0) {
+            for i1 in 0..src.size(1) {
+                let src = src.slice::<2, _>([i0, i1]);
+                let dest = dest.slice_mut::<2, _>([i0, i1]);
+                copy_blocked(src, dest);
+            }
+        }
+    } else {
+        let mut dest_offset = 0;
+        for i0 in 0..src.size(0) {
+            for i1 in 0..src.size(1) {
+                for i2 in 0..src.size(2) {
+                    for i3 in 0..src.size(3) {
+                        unsafe {
+                            let elt = src.get_unchecked([i0, i1, i2, i3]).clone();
+                            dest.get_unchecked_mut(dest_offset).write(elt);
+                            dest_offset += 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Safety: Loop above initialized all elements of `dest`.
+    unsafe { std::mem::transmute(dest) }
 }
 
 #[cfg(test)]
@@ -87,5 +193,19 @@ mod tests {
         let x = Tensor::from_data(&[1, 1, 1, 2, 2], vec![1, 2, 3, 4]);
         assert_eq!(contiguous_data(x.view()), [1, 2, 3, 4]);
        assert_eq!(contiguous_data(x.transposed()), [1, 3, 2, 4]);
+
+        // Transposed matrices of varying sizes. This includes:
+        //
+        // - Zero
+        // - Powers of 2
+        // - Non-powers of 2
+        // - Values above and below threshold for using blocked copy
+        for size in [0usize, 2, 4, 8, 15, 16, 32, 64, 65, 68] {
+            let x = Tensor::<i32>::arange(0, (size * size) as i32, None);
+            let x = x.reshaped([size, size]);
+            let transposed = contiguous_data(x.transposed().as_dyn());
+            let expected = x.transposed().iter().copied().collect::<Vec<_>>();
+            assert_eq!(transposed, expected);
+        }
     }
 }
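
Note (illustrative, not part of the patch): the sketch below shows the same blocking/tiling idea as `copy_blocked`, but applied to a plain row-major matrix in a `Vec<i32>` instead of rten-tensor's view types. The names `TILE`, `BLOCK` and `transpose_blocked` are hypothetical, chosen only for this example; like the patch's heuristic, it assumes the matrix side length is a multiple of the tile size (and `BLOCK` is a multiple of `TILE`) so every tile is full.

// Mirrors TILE_SIZE / BLOCK_SIZE from the patch, for a standalone example.
const TILE: usize = 4;
const BLOCK: usize = 64;

/// Copy the transpose of `src` (an `n x n` row-major matrix) into `dest`,
/// walking the output in blocks and tiles so that the strided reads from
/// `src` stay within a small working set.
fn transpose_blocked(src: &[i32], dest: &mut [i32], n: usize) {
    assert!(src.len() == n * n && dest.len() == n * n);
    assert!(n % TILE == 0 && BLOCK % TILE == 0);
    for rb in (0..n).step_by(BLOCK) {
        for cb in (0..n).step_by(BLOCK) {
            for rt in (rb..(rb + BLOCK).min(n)).step_by(TILE) {
                for ct in (cb..(cb + BLOCK).min(n)).step_by(TILE) {
                    for y in 0..TILE {
                        for x in 0..TILE {
                            // dest is written in contiguous order within a tile;
                            // src is read at the transposed (strided) offset.
                            dest[(rt + y) * n + (ct + x)] = src[(ct + x) * n + (rt + y)];
                        }
                    }
                }
            }
        }
    }
}

fn main() {
    let n = 64;
    let src: Vec<i32> = (0..(n * n) as i32).collect();
    let mut dest = vec![0; n * n];
    transpose_blocked(&src, &mut dest, n);

    // Verify against a naive transpose.
    let mut expected = vec![0; n * n];
    for i in 0..n {
        for j in 0..n {
            expected[i * n + j] = src[j * n + i];
        }
    }
    assert_eq!(dest, expected);
    println!("ok");
}

The point of the nested block/tile loops is the same as in the patch: when the source stride is a power of 2, consecutive strided reads map to a small set of cache sets, so a naive row-by-row transpose thrashes the cache, whereas small tiles reuse each fetched cache line before moving on.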