Skip to content

Commit

Permalink
Merge pull request #53 from charles-r-earp/conv-direct
Browse files Browse the repository at this point in the history
Conv direct
  • Loading branch information
charles-r-earp authored Feb 1, 2024
2 parents 0c384c1 + dfaaa30 commit 16b044b
Show file tree
Hide file tree
Showing 16 changed files with 7,291 additions and 4,058 deletions.
7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,16 @@ half = { version = "=2.1.0", features = ["num-traits", "bytemuck", "serde"] }
http = { version = "0.2.4", optional = true }
paste = "1.0.7"
dry = "0.1.1"
crunchy = { version = "0.2.2", optional = true }
crunchy = { version = "0.2.2" }
crossbeam-channel = { workspace = true, optional = true }
parking_lot = { workspace = true, optional = true }
rayon.workspace = true
once_cell = { version = "1.17.1", optional = true, features = ["std"] }
num-traits = "0.2.15"
smallvec = { version = "1.11.1", optional = true }
matrixmultiply_mt = { version = "0.2.1", optional = true }
matrixmultiply = { version = "0.3.8", optional = true }
wide = "0.7.13"

[dev-dependencies]
approx = "0.4.0"
Expand All @@ -82,7 +85,7 @@ libtest-mimic = "0.6.0"

[features]
default = ["device"]
device = ["krnl/device", "dep:crunchy", "dep:once_cell"]
device = ["krnl/device", "dep:once_cell"]
dataset = ["dep:rand"]
iris = []
mnist = ["dataset", "dep:dirs", "dep:flate2", "dep:downloader", "dep:byteorder", "dep:http"]
Expand Down
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,19 @@ See the [Neural Network MNIST](examples/neural-network-mnist) example.

| | `autograph` | `tch` |
|:------------------|:--------------------------|:-------------------------------- |
| **`bf16_host`** | `591.07 ms` (✅ **1.00x**) | `76.58 ms` (🚀 **7.72x faster**) |
| **`f32_host`** | `16.60 ms` (✅ **1.00x**) | `3.18 ms` (🚀 **5.22x faster**) |
| **`bf16_device`** | `7.44 ms` (✅ **1.00x**) | `21.09 ms` (❌ *2.84x slower*) |
| **`f32_device`** | `1.56 ms` (✅ **1.00x**) | `3.94 ms` (❌ *2.52x slower*) |
| **`bf16_host`** | `494.98 ms` (✅ **1.00x**) | `78.29 ms` (🚀 **6.32x faster**) |
| **`f32_host`** | `7.21 ms` (✅ **1.00x**) | `3.15 ms` (🚀 **2.28x faster**) |
| **`bf16_device`** | `10.12 ms` (✅ **1.00x**) | `17.65 ms` (❌ *1.74x slower*) |
| **`f32_device`** | `1.71 ms` (✅ **1.00x**) | `1.19 ms` (**1.43x faster**) |

## LeNet5(inference, batch_size = 1,000)

| | `autograph` | `tch` |
|:------------------|:--------------------------|:---------------------------------- |
| **`bf16_host`** | `2.14 s` (✅ **1.00x**) | `196.52 ms` (🚀 **10.87x faster**) |
| **`f32_host`** | `104.09 ms` (✅ **1.00x**) | `9.15 ms` (🚀 **11.38x faster**) |
| **`bf16_device`** | `4.31 ms` (✅ **1.00x**) | `48.74 ms` (❌ *11.31x slower*) |
| **`f32_device`** | `4.34 ms` (✅ **1.00x**) | `1.85 ms` (🚀 **2.35x faster**) |
| | `autograph` | `tch` |
|:------------------|:-------------------------|:--------------------------------- |
| **`bf16_host`** | `1.82 s` (✅ **1.00x**) | `197.40 ms` (🚀 **9.23x faster**) |
| **`f32_host`** | `16.96 ms` (✅ **1.00x**) | `9.49 ms` (**1.79x faster**) |
| **`bf16_device`** | `4.61 ms` (✅ **1.00x**) | `48.71 ms` (❌ *10.57x slower*) |
| **`f32_device`** | `4.60 ms` (✅ **1.00x**) | `1.84 ms` (🚀 **2.49x faster**) |

See the [Neural Network](benches/neural-network-benches) benchmark.

Expand Down
3 changes: 2 additions & 1 deletion benches/neural-network-benches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ autobenches = false
autograph = { workspace = true, default-features = false, features = ["neural-network"] }
tch = { version = "0.12.0", optional = true }
criterion = { version = "0.4.0", default-features = false }
anyhow.workspace = true
anyhow = { workspace = true }
bytemuck = { workspace = true, optional = true }


[dev-dependencies]
num-format.workspace = true

Expand Down
4 changes: 3 additions & 1 deletion benches/neural-network-benches/benches/benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use num_format::{Locale, ToFormattedString};
use std::str::FromStr;

pub fn criterion_benchmark(c: &mut Criterion) {
let device_index = {
let device_index = if cfg!(feature = "device") {
let krnl_device = std::env::var("KRNL_DEVICE");
println!("KRNL_DEVICE = {krnl_device:?}");
let device_index = if let Ok(krnl_device) = krnl_device.as_ref() {
Expand All @@ -17,6 +17,8 @@ pub fn criterion_benchmark(c: &mut Criterion) {
};
println!("testing device {device_index}");
device_index
} else {
0
};

#[cfg_attr(not(feature = "cuda"), allow(unused))]
Expand Down
2 changes: 1 addition & 1 deletion benches/neural-network-benches/src/tch_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub struct LeNet5Classifier {
impl LeNet5Classifier {
pub fn new(device: Device, kind: Kind) -> Result<Self> {
let mut var_store = VarStore::new(device);
let model = Lenet5::new(&var_store);
let model = LeNet5::new(&var_store);
var_store.set_kind(kind);
Ok(Self {
device,
Expand Down
80 changes: 41 additions & 39 deletions examples/neural-network-mnist/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use autograph::{
optimizer::{Optimizer, SGD},
},
},
ndarray::{ArcArray, ArcArray1, Axis, Dimension, Ix4},
ndarray::{self, ArcArray, ArcArray1, Axis, Dimension, Ix4},
tensor::{CowTensor, ScalarTensor, Tensor, Tensor1, Tensor4},
};
use clap::{Parser, ValueEnum};
Expand Down Expand Up @@ -179,12 +179,11 @@ fn main() -> Result<()> {
let start = Instant::now();
for epoch in 1..=options.epochs {
let epoch_start = Instant::now();
let train_iter = batches(
let train_iter = shuffled_batches(
train_images.clone(),
train_classes.clone(),
device.clone(),
options.train_batch_size,
true,
);
let train_stats = train(
&mut model,
Expand All @@ -202,7 +201,6 @@ fn main() -> Result<()> {
test_classes.clone(),
device.clone(),
options.test_batch_size,
false,
);
let test_stats = test(&model, image_scale, test_iter)?;
let test_count = test_stats.count;
Expand All @@ -223,43 +221,47 @@ fn batches(
classes: ArcArray1<u8>,
device: Device,
batch_size: usize,
shuffle: bool,
) -> impl Iterator<Item = Result<(Tensor4<u8>, Tensor1<u8>)>> {
let (sender, receiver) = crossbeam_channel::bounded(0);
std::thread::spawn(move || {
let (count, depth, height, width) = images.dim();
if shuffle {
let mut index_iter = sample(&mut thread_rng(), count, count).into_iter();
for _ in 0..count / batch_size {
let mut output_images =
Vec::<u8>::with_capacity(batch_size * depth * height * width);
let mut output_classes = Vec::<u8>::with_capacity(batch_size);
for index in index_iter.by_ref().take(batch_size) {
output_images
.extend_from_slice(images.index_axis(Axis(0), index).as_slice().unwrap());
output_classes.push(classes[index]);
}
let images = Tensor::from(output_images)
.into_shape([batch_size, depth, height, width])
.unwrap()
.into_device(device.clone());
let classes = Tensor::from(output_classes).into_device(device.clone());
let result = images.and_then(|images| Ok((images, classes?)));
sender.send(result).unwrap();
}
} else {
for (images, classes) in images
.axis_chunks_iter(Axis(0), batch_size)
.zip(classes.axis_chunks_iter(Axis(0), batch_size))
{
let images = CowTensor::from(images).to_device(device.clone());
let classes = CowTensor::from(classes).to_device(device.clone());
let result = images.and_then(|images| Ok((images, classes?)));
sender.send(result).unwrap();
}
let (count, _inputs, _height, _width) = images.dim();
(0..count).step_by(batch_size).map(move |index| {
let end = (index + batch_size).min(count);
let images = images.slice_axis(
Axis(0),
ndarray::Slice::new(index as isize, Some(end as isize), 1),
);
let classes = classes.slice_axis(
Axis(0),
ndarray::Slice::new(index as isize, Some(end as isize), 1),
);
let images = CowTensor::from(images).to_device(device.clone());
let classes = CowTensor::from(classes).to_device(device.clone());
images.and_then(|images| Ok((images, classes?)))
})
}

fn shuffled_batches(
images: ArcArray<u8, Ix4>,
classes: ArcArray1<u8>,
device: Device,
batch_size: usize,
) -> impl Iterator<Item = Result<(Tensor4<u8>, Tensor1<u8>)>> {
let (count, inputs, height, width) = images.dim();
let mut index_iter = sample(&mut thread_rng(), count, count).into_iter();
(0..count).step_by(batch_size).map(move |index| {
let batch_size = (index..count).take(batch_size).len();
let mut output_images = Vec::<u8>::with_capacity(batch_size * inputs * height * width);
let mut output_classes = Vec::<u8>::with_capacity(batch_size);
for index in index_iter.by_ref().take(batch_size) {
output_images.extend_from_slice(images.index_axis(Axis(0), index).as_slice().unwrap());
output_classes.push(classes[index]);
}
});
receiver.into_iter()
let images = Tensor::from(output_images)
.into_shape([batch_size, inputs, height, width])
.unwrap()
.into_device(device.clone());
let classes = Tensor::from(output_classes).into_device(device.clone());
images.and_then(|images| Ok((images, classes?)))
})
}

#[derive(Default)]
Expand Down
Loading

0 comments on commit 16b044b

Please sign in to comment.