From d9d6c0274df362a6c172b17b54c4be08d5977a18 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Fri, 10 May 2024 18:06:50 +0100 Subject: [PATCH] Optimize `TensorBase::copy_from` for non-contiguous `self` Replace the slow iterator-based fallback in `TensorBase::copy_from` with a much faster version which uses nested loops over the innermost 4 dimensions. This reduces time in the `Pad` operator from ~4.5ms to ~0.8ms in Piper TTS models. That operator used the fallback because it creates a non-contiguous view of the non-padded region into which it copies the source. --- rten-tensor/src/{transpose.rs => copy.rs} | 83 +++++++++++++++++++---- rten-tensor/src/lib.rs | 2 +- rten-tensor/src/tensor.rs | 12 ++-- 3 files changed, 77 insertions(+), 20 deletions(-) rename rten-tensor/src/{transpose.rs => copy.rs} (72%) diff --git a/rten-tensor/src/transpose.rs b/rten-tensor/src/copy.rs similarity index 72% rename from rten-tensor/src/transpose.rs rename to rten-tensor/src/copy.rs index b905a389..da749bae 100644 --- a/rten-tensor/src/transpose.rs +++ b/rten-tensor/src/copy.rs @@ -2,7 +2,9 @@ use std::mem::MaybeUninit; use std::ops::Range; use crate::{AsView, Layout}; -use crate::{Matrix, MatrixLayout, MatrixMut, NdTensorView, NdTensorViewMut, TensorView}; +use crate::{ + Matrix, MatrixLayout, MatrixMut, NdTensorView, NdTensorViewMut, TensorView, TensorViewMut, +}; /// Iterator returned by [range_chunks]. pub struct RangeChunks { @@ -91,10 +93,11 @@ fn copy_blocked(src: Matrix, mut dest: MatrixMut>) { } } -/// Copy elements of `src` into `dest` in contiguous order. +/// Copy elements of `src` into a contiguous destination slice with the same +/// length. /// /// Returns `dest` as an initialized slice. -pub fn copy_contiguous<'a, T: Clone>( +pub fn copy_into_slice<'a, T: Clone>( src: TensorView<T>, dest: &'a mut [MaybeUninit<T>], ) -> &'a [T] { @@ -160,9 +163,50 @@ pub fn copy_contiguous<'a, T: Clone>( } } +/// Clone elements of `src` into `dest`.
+/// +/// This is functionally equivalent to: +/// +/// ```text +/// src.iter().zip(dest.iter_mut()).for_each(|(x, y)| *y = x.clone()) +/// ``` +/// +/// But more efficient, especially when `src` or `dest` are not contiguous. +pub fn copy_into<T: Clone>(mut src: TensorView<T>, mut dest: TensorViewMut<T>) { + assert!(src.shape() == dest.shape()); + + while src.ndim() < 4 { + src.insert_axis(0); + dest.insert_axis(0); + } + + // Efficiency could be improved here by sorting dims so that those with + // the smallest stride are innermost. Also it could use the blocked copy + // that `copy_into_slice` uses to avoid cache conflicts when inputs are + // transposed. + + src.inner_iter::<4>() + .zip(dest.inner_iter_mut::<4>()) + .for_each(|(src, mut dest)| { + for i0 in 0..src.size(0) { + for i1 in 0..src.size(1) { + for i2 in 0..src.size(2) { + for i3 in 0..src.size(3) { + unsafe { + *dest.get_unchecked_mut([i0, i1, i2, i3]) = + src.get_unchecked([i0, i1, i2, i3]).clone(); + } + } + } + } + } + }); +} + #[cfg(test)] mod tests { - use super::copy_contiguous; + use super::{copy_into, copy_into_slice}; + use crate::rng::XorShiftRng; use crate::{AsView, Layout, Tensor, TensorView}; /// Return the elements of `src` as a contiguous vector, in the same order they @@ -173,10 +217,10 @@ mod tests { /// /// This is equivalent to `src.iter().cloned().collect::<Vec<_>>()` but /// faster. - fn contiguous_data<T: Clone>(src: TensorView<T>) -> Vec<T> { + fn copy_into_vec<T: Clone>(src: TensorView<T>) -> Vec<T> { let src_len = src.len(); let mut result = Vec::with_capacity(src_len); - copy_contiguous(src, &mut result.spare_capacity_mut()[..src_len]); + copy_into_slice(src, &mut result.spare_capacity_mut()[..src_len]); // Safety: `copy_contiguous` initialized `src_len` elements of result.
unsafe { result.set_len(src_len) }; @@ -185,16 +229,31 @@ } #[test] - fn test_contiguous_data() { + fn test_copy_into() { + let mut rng = XorShiftRng::new(1234); + for ndim in 0..5 { + let shape: Vec<_> = (0..ndim).map(|d| d + 1).collect(); + let src = Tensor::rand(&shape, &mut rng); + let src = src.transposed(); + + let mut dest = Tensor::zeros(src.shape()); + copy_into(src.view(), dest.view_mut()); + + assert_eq!(dest, src); + } + } + + #[test] + fn test_copy_into_slice() { // <= 4 dims let x = Tensor::from_data(&[2, 2], vec![1, 2, 3, 4]); - assert_eq!(contiguous_data(x.view()), [1, 2, 3, 4]); - assert_eq!(contiguous_data(x.transposed()), [1, 3, 2, 4]); + assert_eq!(copy_into_vec(x.view()), [1, 2, 3, 4]); + assert_eq!(copy_into_vec(x.transposed()), [1, 3, 2, 4]); // > 4 dims let x = Tensor::from_data(&[1, 1, 1, 2, 2], vec![1, 2, 3, 4]); - assert_eq!(contiguous_data(x.view()), [1, 2, 3, 4]); - assert_eq!(contiguous_data(x.transposed()), [1, 3, 2, 4]); + assert_eq!(copy_into_vec(x.view()), [1, 2, 3, 4]); + assert_eq!(copy_into_vec(x.transposed()), [1, 3, 2, 4]); // Transposed matrices of varying sizes. This includes: // // @@ -205,7 +264,7 @@ for size in [0usize, 2, 4, 8, 15, 16, 32, 64, 65, 68] { let x = Tensor::<i32>::arange(0, (size * size) as i32, None); let x = x.reshaped([size, size]); - let transposed = contiguous_data(x.transposed().as_dyn()); + let transposed = copy_into_vec(x.transposed().as_dyn()); let expected = x.transposed().iter().copied().collect::<Vec<_>>(); assert_eq!(transposed, expected); } diff --git a/rten-tensor/src/lib.rs b/rten-tensor/src/lib.rs index 1946e0e0..a91472fe 100644 --- a/rten-tensor/src/lib.rs +++ b/rten-tensor/src/lib.rs @@ -40,6 +40,7 @@ //! } //! ``` +mod copy; mod errors; mod index_iterator; mod iterators; @@ -49,7 +50,6 @@ mod overlap; mod slice_range; mod storage; mod tensor; -mod transpose; /// Trait for sources of random data for tensors, for use with [Tensor::rand].
pub trait RandomSource<T> { diff --git a/rten-tensor/src/tensor.rs b/rten-tensor/src/tensor.rs index c274f31e..85a018e8 100644 --- a/rten-tensor/src/tensor.rs +++ b/rten-tensor/src/tensor.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::mem::MaybeUninit; use std::ops::{Index, IndexMut, Range}; +use crate::copy::{copy_into, copy_into_slice}; use crate::errors::{DimensionError, FromDataError, SliceError}; use crate::iterators::{ AxisChunks, AxisChunksMut, AxisIter, AxisIterMut, BroadcastIter, InnerIter, InnerIterDyn, @@ -12,7 +13,6 @@ use crate::layout::{ OverlapPolicy, ResizeLayout, }; use crate::storage::{CowData, IntoStorage, Storage, StorageMut, ViewData, ViewMutData}; -use crate::transpose::copy_contiguous; use crate::{Alloc, GlobalAlloc, IntoSliceItems, RandomSource, SliceItem}; /// The base type for multi-dimensional arrays. This consists of storage for @@ -479,12 +479,10 @@ impl TensorBase { } // Copy source into destination in contiguous order. - copy_contiguous(other.as_dyn(), uninit_dest); + copy_into_slice(other.as_dyn(), uninit_dest); } } else { - for (out, x) in self.iter_mut().zip(other.iter()) { - *out = x.clone(); - } + copy_into(other.as_dyn(), self.as_dyn_mut()); } } @@ -954,7 +952,7 @@ where let data: &[MaybeUninit<T>] = unsafe { std::mem::transmute(data) }; self.data.as_mut().clone_from_slice(data); } else { - copy_contiguous(other.as_dyn(), self.data.as_mut()); + copy_into_slice(other.as_dyn(), self.data.as_mut()); } unsafe { self.assume_init() } } @@ -1398,7 +1396,7 @@ impl, L: MutLayout + Clone> AsView for TensorBase if let Some(data) = self.data() { buf.extend_from_slice(data); } else { - copy_contiguous(self.as_dyn(), &mut buf.spare_capacity_mut()[..len]); + copy_into_slice(self.as_dyn(), &mut buf.spare_capacity_mut()[..len]); // Safety: We initialized `len` elements. unsafe { buf.set_len(len) }