Implement blocked copy / transpose
Add an alternative strategy for copying elements from a transposed tensor into a
contiguous buffer, using blocking, and enable it to be used via
`Tensor::copy_from`.

The existing naive copy implementation performs well except when the strides of
the source view lead to a significant rate of cache conflicts. This typically
happens when the last stride is a multiple of the cache line size, and
especially when it is a power of 2. To improve this, detect such strides and
switch to an alternative copying procedure that uses blocking and tiling.

Using the `bench_transpose` benchmark in `src/ops/layout.rs`, this avoids the
significant increase in overhead, relative to a simple memory copy, that
previously occurred when the source stride is a power of 2.
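
For intuition, the blocked strategy boils down to the following self-contained
sketch (plain Rust over flat slices, not the crate's actual implementation;
`BLOCK` and `TILE` mirror the `BLOCK_SIZE` / `TILE_SIZE` constants added in
`rten-tensor/src/transpose.rs` below):

    // Sketch only: copy a row-major `rows x cols` matrix `src` into `dest` as
    // its `cols x rows` transpose, visiting both buffers block by block and
    // tile by tile so that each block stays cache-resident even when the row
    // length is a power of two.
    const BLOCK: usize = 64;
    const TILE: usize = 4;

    fn transpose_blocked(src: &[f32], dest: &mut [f32], rows: usize, cols: usize) {
        assert_eq!(src.len(), rows * cols);
        assert_eq!(dest.len(), rows * cols);
        for row_block in (0..rows).step_by(BLOCK) {
            for col_block in (0..cols).step_by(BLOCK) {
                for row_tile in (row_block..(row_block + BLOCK).min(rows)).step_by(TILE) {
                    for col_tile in (col_block..(col_block + BLOCK).min(cols)).step_by(TILE) {
                        for r in row_tile..(row_tile + TILE).min(rows) {
                            for c in col_tile..(col_tile + TILE).min(cols) {
                                // `dest` is indexed as the transpose of `src`.
                                dest[c * rows + r] = src[r * cols + c];
                            }
                        }
                    }
                }
            }
        }
    }

Without the blocking, strided accesses with a large power-of-2 stride tend to
map to a small number of cache sets, so lines are evicted before they are
reused; the tiles keep reuse local.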
robertknight committed Apr 5, 2024
1 parent 1aed5a4 commit ea55853
Showing 2 changed files with 199 additions and 49 deletions.
38 changes: 34 additions & 4 deletions rten-tensor/src/tensor.rs
@@ -1,5 +1,6 @@
use std::borrow::Cow;
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::ops::{Index, IndexMut, Range};

use crate::errors::{DimensionError, FromDataError, SliceError};
@@ -11,7 +12,7 @@ use crate::layout::{
AsIndex, BroadcastLayout, DynLayout, IntoLayout, Layout, MatrixLayout, MutLayout, NdLayout,
OverlapPolicy, ResizeLayout,
};
use crate::transpose::contiguous_data;
use crate::transpose::{contiguous_data, copy_contiguous};
use crate::{IntoSliceItems, RandomSource, SliceItem};

/// The base type for multi-dimensional arrays. This consists of storage for
@@ -151,7 +152,7 @@ pub trait AsView: Layout {
/// data or changing the iteration order.
///
/// If the tensor is contiguous, this has the effect of flattening the
/// tensor into a single vector.
/// tensor into a vector.
fn merge_axes(&mut self)
where
Self::Layout: ResizeLayout;
@@ -375,6 +376,11 @@ impl<T, S: AsRef<[T]>, L: MutLayout> TensorBase<T, S, L> {
.expect("invalid layout");
Some(layout)
}

/// Return a raw pointer to the tensor's underlying data.
pub fn data_ptr(&self) -> *const T {
self.data.as_ref().as_ptr()
}
}

impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
@@ -420,8 +426,27 @@ impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
L: Clone,
{
assert!(self.shape() == other.shape());
for (out, x) in self.iter_mut().zip(other.iter()) {
*out = x.clone();

if let Some(dest) = self.data_mut() {
if let Some(src) = other.data() {
dest.clone_from_slice(src);
} else {
// Drop all the existing values. This should be compiled away for
// `Copy` types.
let uninit_dest: &mut [MaybeUninit<T>] = unsafe { std::mem::transmute(dest) };
for x in &mut *uninit_dest {
// Safety: All elements were initialized at the start of this
// block, and we haven't written to the slice yet.
unsafe { x.assume_init_drop() }
}

// Copy source into destination in contiguous order.
copy_contiguous(other.as_dyn(), uninit_dest);
}
} else {
for (out, x) in self.iter_mut().zip(other.iter()) {
*out = x.clone();
}
}
}

@@ -430,6 +455,11 @@ impl<T, S: AsRef<[T]> + AsMut<[T]>, L: MutLayout> TensorBase<T, S, L> {
self.layout.is_contiguous().then_some(self.data.as_mut())
}

/// Return a raw pointer to the tensor's underlying data.
pub fn data_mut_ptr(&mut self) -> *mut T {
self.data.as_mut().as_mut_ptr()
}

/// Replace all elements of this tensor with `value`.
pub fn fill(&mut self, value: T)
where
210 changes: 165 additions & 45 deletions rten-tensor/src/transpose.rs
@@ -1,41 +1,89 @@
use std::mem::MaybeUninit;
use std::ops::Range;

use crate::{AsView, Layout};
use crate::{NdTensorView, TensorView};
use crate::{Matrix, MatrixLayout, MatrixMut, NdTensorView, NdTensorViewMut, TensorView};

/// Call `f` with every element in `x` in logical order.
///
/// This is equivalent to `x.iter().for_each(f)` but is faster than Rust's
/// standard iteration protocol when `x` is non-contiguous and has <= 4
/// dimensions.
fn fast_for_each_element<T, F: FnMut(&T)>(mut x: TensorView<T>, mut f: F) {
// Merge axes to increase the chance that we can use the fast path and
// also maximize the iteration count of the innermost loops.
x.merge_axes();
/// Iterator returned by [range_chunks].
pub struct RangeChunks {
remainder: Range<usize>,
chunk_size: usize,
}

if x.ndim() > 4 {
x.iter().for_each(f)
} else {
while x.ndim() < 4 {
x.insert_axis(0);
impl Iterator for RangeChunks {
type Item = Range<usize>;

#[inline]
fn next(&mut self) -> Option<Self::Item> {
if !self.remainder.is_empty() {
let start = self.remainder.start;
let end = (start + self.chunk_size).min(self.remainder.end);
self.remainder.start += self.chunk_size;
Some(start..end)
} else {
None
}
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let len = self.remainder.len().div_ceil(self.chunk_size);
(len, Some(len))
}
}

let x_data = x.non_contiguous_data();
let x: NdTensorView<T, 4> = x.nd_view();
let shape = x.shape();
let strides = x.strides();
impl ExactSizeIterator for RangeChunks {}

assert!(x_data.len() >= x.layout().min_data_len());
impl std::iter::FusedIterator for RangeChunks {}

for i0 in 0..shape[0] {
for i1 in 0..shape[1] {
for i2 in 0..shape[2] {
for i3 in 0..shape[3] {
let offset =
i0 * strides[0] + i1 * strides[1] + i2 * strides[2] + i3 * strides[3];
/// Return an iterator over sub-ranges of `range`. If `range.len()` is not a
/// multiple of `chunk_size` then the final chunk will be shorter.
#[inline]
pub fn range_chunks(range: Range<usize>, chunk_size: usize) -> RangeChunks {
RangeChunks {
remainder: range,
chunk_size,
}
}

/// Tile size for blocked copy. A tile should fit in registers for 32-bit
/// values.
const TILE_SIZE: usize = 4;

/// Block size for blocked copy. A source and dest block should fit in the cache
/// for 32-bit values.
const BLOCK_SIZE: usize = 64;

/// Copy elements from `src` into `dest`.
///
/// `src` and `dest` must have the same shape but can (should) have different
/// strides. This function uses blocking to avoid the cache conflicts that can
/// arise in a naive copy if `src` is transposed.
fn copy_blocked<T: Clone>(src: Matrix<T>, mut dest: MatrixMut<MaybeUninit<T>>) {
// Ensure src and dest have same index range.
assert!(src.shape() == dest.shape());

// Safety: We checked data length > max offset produced
// by layout.
let elt = unsafe { x_data.get_unchecked(offset) };
f(elt)
// Ensure tiles are always full.
assert!(dest.rows() % TILE_SIZE == 0);
assert!(dest.cols() % TILE_SIZE == 0);

for row_block in range_chunks(0..dest.rows(), BLOCK_SIZE) {
for col_block in range_chunks(0..dest.cols(), BLOCK_SIZE) {
for row_tile in range_chunks(row_block.clone(), TILE_SIZE) {
for col_tile in range_chunks(col_block.clone(), TILE_SIZE) {
debug_assert!(row_tile.len() == TILE_SIZE);
debug_assert!(col_tile.len() == TILE_SIZE);

for y in 0..TILE_SIZE {
for x in 0..TILE_SIZE {
// Safety: Max values of `idx` are in-bounds for
// `src` and `dest`.
unsafe {
let idx = [row_tile.start + y, col_tile.start + x];
let src_el = src.get_unchecked(idx).clone();
dest.get_unchecked_mut(idx).write(src_el);
}
}
}
}
}
@@ -46,29 +94,87 @@ fn fast_for_each_element<T, F: FnMut(&T)>(mut x: TensorView<T>, mut f: F) {
/// Return the elements of `src` as a contiguous vector, in the same order they
/// would be yielded by `src.iter()`.
///
/// This function assumes that the caller has already checked if `src` is
/// contiguous and used more efficient methods to copy the data in that case.
///
/// This is equivalent to `src.iter().cloned().collect::<Vec<_>>()` but
/// faster.
pub fn contiguous_data<T: Clone>(src: TensorView<T>) -> Vec<T> {
let src_len = src.len();
let mut result = Vec::with_capacity(src_len);
copy_contiguous(src, &mut result.spare_capacity_mut()[..src_len]);

// This is equivalent to `x.iter().cloned().collect::<Vec<_>>()` but uses a
// faster iteration method that is optimized for tensors with few (<= 4)
// dimensions.
let mut data = Vec::with_capacity(src.len());
let ptr: *mut T = data.as_mut_ptr();
// Safety: `copy_contiguous` initialized `src_len` elements of result.
unsafe { result.set_len(src_len) };

let mut offset = 0;
fast_for_each_element(src, |elt| {
// Safety: `fast_for_each_element` calls fn `self.len()` times,
// matching the buffer capacity.
unsafe { *ptr.add(offset) = elt.clone() };
offset += 1;
});
result
}

// Safety: Length here matches capacity passed to `Vec::with_capacity`.
unsafe { data.set_len(src_len) }
/// Copy elements of `src` into `dest` in contiguous order.
///
/// Returns `dest` as an initialized slice.
pub fn copy_contiguous<'a, T: Clone>(
src: TensorView<T>,
dest: &'a mut [MaybeUninit<T>],
) -> &'a [T] {
assert!(dest.len() == src.len());

data
// Merge axes to increase the chance that we can use the fast path and
// also maximize the iteration count of the innermost loops.
let mut src = src.clone();
src.merge_axes();

if src.ndim() > 4 {
for (dst, src) in dest.iter_mut().zip(src.iter()) {
dst.write(src.clone());
}
// Safety: Loop above initialized all elements of `dest`.
return unsafe { std::mem::transmute(dest) };
}

while src.ndim() < 4 {
src.insert_axis(0);
}

let src: NdTensorView<T, 4> = src.nd_view();

// As a heuristic, use a blocked copy if the source stride is likely to lead
// to cache conflicts. Otherwise a simple direct copy is probably going to
// be faster. With a better optimized blocked copy path, we might be able to
// use it all the time.
let use_blocked_copy = src.stride(3).count_ones() == 1
&& src.stride(3) >= 32
&& src.size(2) % TILE_SIZE == 0
&& src.size(3) % TILE_SIZE == 0;

if use_blocked_copy {
let mut dest = NdTensorViewMut::from_data(src.shape(), dest);
for i0 in 0..src.size(0) {
for i1 in 0..src.size(1) {
let src = src.slice::<2, _>([i0, i1]);
let dest = dest.slice_mut::<2, _>([i0, i1]);
copy_blocked(src, dest);
}
}
} else {
let mut dest_offset = 0;
for i0 in 0..src.size(0) {
for i1 in 0..src.size(1) {
for i2 in 0..src.size(2) {
for i3 in 0..src.size(3) {
unsafe {
let elt = src.get_unchecked([i0, i1, i2, i3]).clone();
dest.get_unchecked_mut(dest_offset).write(elt);
dest_offset += 1;
}
}
}
}
}
}

// Safety: Loop above initialized all elements of `dest`.
unsafe { std::mem::transmute(dest) }
}

#[cfg(test)]
@@ -87,5 +193,19 @@ mod tests {
let x = Tensor::from_data(&[1, 1, 1, 2, 2], vec![1, 2, 3, 4]);
assert_eq!(contiguous_data(x.view()), [1, 2, 3, 4]);
assert_eq!(contiguous_data(x.transposed()), [1, 3, 2, 4]);

// Transposed matrices of varying sizes. This includes:
//
// - Zero
// - Powers of 2
// - Non-powers of 2
// - Values above and below threshold for using blocked copy
for size in [0usize, 2, 4, 8, 15, 16, 32, 64, 65, 68] {
let x = Tensor::<i32>::arange(0, (size * size) as i32, None);
let x = x.reshaped([size, size]);
let transposed = contiguous_data(x.transposed().as_dyn());
let expected = x.transposed().iter().copied().collect::<Vec<_>>();
assert_eq!(transposed, expected);
}
}
}
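
As a usage note, the `range_chunks` helper introduced in this diff behaves as
follows (a sketch based on the implementation above; whether the helper is
re-exported from the crate root is an assumption):

    // `range_chunks(0..10, 4)` yields 4-element sub-ranges, with the final
    // chunk shortened because 10 is not a multiple of 4.
    let chunks: Vec<std::ops::Range<usize>> = range_chunks(0..10, 4).collect();
    assert_eq!(chunks, vec![0..4, 4..8, 8..10]);
    // `ExactSizeIterator` is implemented, so the chunk count is known up front.
    assert_eq!(range_chunks(0..10, 4).len(), 3);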
