Skip to content

Commit

Permalink
Merge pull request #53 from charles-r-earp/conv-direct
Browse files Browse the repository at this point in the history
Conv direct
  • Loading branch information
charles-r-earp authored Feb 1, 2024
2 parents 0c384c1 + dfaaa30 commit 16b044b
Show file tree
Hide file tree
Showing 16 changed files with 7,291 additions and 4,058 deletions.
7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,16 @@ half = { version = "=2.1.0", features = ["num-traits", "bytemuck", "serde"] }
http = { version = "0.2.4", optional = true }
paste = "1.0.7"
dry = "0.1.1"
crunchy = { version = "0.2.2", optional = true }
crunchy = { version = "0.2.2" }
crossbeam-channel = { workspace = true, optional = true }
parking_lot = { workspace = true, optional = true }
rayon.workspace = true
once_cell = { version = "1.17.1", optional = true, features = ["std"] }
num-traits = "0.2.15"
smallvec = { version = "1.11.1", optional = true }
matrixmultiply_mt = { version = "0.2.1", optional = true }
matrixmultiply = { version = "0.3.8", optional = true }
wide = "0.7.13"

[dev-dependencies]
approx = "0.4.0"
Expand All @@ -82,7 +85,7 @@ libtest-mimic = "0.6.0"

[features]
default = ["device"]
device = ["krnl/device", "dep:crunchy", "dep:once_cell"]
device = ["krnl/device", "dep:once_cell"]
dataset = ["dep:rand"]
iris = []
mnist = ["dataset", "dep:dirs", "dep:flate2", "dep:downloader", "dep:byteorder", "dep:http"]
Expand Down
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,19 @@ See the [Neural Network MNIST](examples/neural-network-mnist) example.

| | `autograph` | `tch` |
|:------------------|:--------------------------|:-------------------------------- |
| **`bf16_host`** | `591.07 ms` (✅ **1.00x**) | `76.58 ms` (🚀 **7.72x faster**) |
| **`f32_host`** | `16.60 ms` (✅ **1.00x**) | `3.18 ms` (🚀 **5.22x faster**) |
| **`bf16_device`** | `7.44 ms` (✅ **1.00x**) | `21.09 ms` (❌ *2.84x slower*) |
| **`f32_device`** | `1.56 ms` (✅ **1.00x**) | `3.94 ms` (❌ *2.52x slower*) |
| **`bf16_host`** | `494.98 ms` (✅ **1.00x**) | `78.29 ms` (🚀 **6.32x faster**) |
| **`f32_host`** | `7.21 ms` (✅ **1.00x**) | `3.15 ms` (🚀 **2.28x faster**) |
| **`bf16_device`** | `10.12 ms` (✅ **1.00x**) | `17.65 ms` (❌ *1.74x slower*) |
| **`f32_device`** | `1.71 ms` (✅ **1.00x**) | `1.19 ms` (**1.43x faster**) |

## LeNet5(inference, batch_size = 1,000)

| | `autograph` | `tch` |
|:------------------|:--------------------------|:---------------------------------- |
| **`bf16_host`** | `2.14 s` (✅ **1.00x**) | `196.52 ms` (🚀 **10.87x faster**) |
| **`f32_host`** | `104.09 ms` (✅ **1.00x**) | `9.15 ms` (🚀 **11.38x faster**) |
| **`bf16_device`** | `4.31 ms` (✅ **1.00x**) | `48.74 ms` (❌ *11.31x slower*) |
| **`f32_device`** | `4.34 ms` (✅ **1.00x**) | `1.85 ms` (🚀 **2.35x faster**) |
| | `autograph` | `tch` |
|:------------------|:-------------------------|:--------------------------------- |
| **`bf16_host`** | `1.82 s` (✅ **1.00x**) | `197.40 ms` (🚀 **9.23x faster**) |
| **`f32_host`** | `16.96 ms` (✅ **1.00x**) | `9.49 ms` (**1.79x faster**) |
| **`bf16_device`** | `4.61 ms` (✅ **1.00x**) | `48.71 ms` (❌ *10.57x slower*) |
| **`f32_device`** | `4.60 ms` (✅ **1.00x**) | `1.84 ms` (🚀 **2.49x faster**) |

See the [Neural Network](benches/neural-network-benches) benchmark.

Expand Down
3 changes: 2 additions & 1 deletion benches/neural-network-benches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ autobenches = false
autograph = { workspace = true, default-features = false, features = ["neural-network"] }
tch = { version = "0.12.0", optional = true }
criterion = { version = "0.4.0", default-features = false }
anyhow.workspace = true
anyhow = { workspace = true }
bytemuck = { workspace = true, optional = true }


[dev-dependencies]
num-format.workspace = true

Expand Down
4 changes: 3 additions & 1 deletion benches/neural-network-benches/benches/benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use num_format::{Locale, ToFormattedString};
use std::str::FromStr;

pub fn criterion_benchmark(c: &mut Criterion) {
let device_index = {
let device_index = if cfg!(feature = "device") {
let krnl_device = std::env::var("KRNL_DEVICE");
println!("KRNL_DEVICE = {krnl_device:?}");
let device_index = if let Ok(krnl_device) = krnl_device.as_ref() {
Expand All @@ -17,6 +17,8 @@ pub fn criterion_benchmark(c: &mut Criterion) {
};
println!("testing device {device_index}");
device_index
} else {
0
};

#[cfg_attr(not(feature = "cuda"), allow(unused))]
Expand Down
2 changes: 1 addition & 1 deletion benches/neural-network-benches/src/tch_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub struct LeNet5Classifier {
impl LeNet5Classifier {
pub fn new(device: Device, kind: Kind) -> Result<Self> {
let mut var_store = VarStore::new(device);
let model = Lenet5::new(&var_store);
let model = LeNet5::new(&var_store);
var_store.set_kind(kind);
Ok(Self {
device,
Expand Down
80 changes: 41 additions & 39 deletions examples/neural-network-mnist/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use autograph::{
optimizer::{Optimizer, SGD},
},
},
ndarray::{ArcArray, ArcArray1, Axis, Dimension, Ix4},
ndarray::{self, ArcArray, ArcArray1, Axis, Dimension, Ix4},
tensor::{CowTensor, ScalarTensor, Tensor, Tensor1, Tensor4},
};
use clap::{Parser, ValueEnum};
Expand Down Expand Up @@ -179,12 +179,11 @@ fn main() -> Result<()> {
let start = Instant::now();
for epoch in 1..=options.epochs {
let epoch_start = Instant::now();
let train_iter = batches(
let train_iter = shuffled_batches(
train_images.clone(),
train_classes.clone(),
device.clone(),
options.train_batch_size,
true,
);
let train_stats = train(
&mut model,
Expand All @@ -202,7 +201,6 @@ fn main() -> Result<()> {
test_classes.clone(),
device.clone(),
options.test_batch_size,
false,
);
let test_stats = test(&model, image_scale, test_iter)?;
let test_count = test_stats.count;
Expand All @@ -223,43 +221,47 @@ fn batches(
classes: ArcArray1<u8>,
device: Device,
batch_size: usize,
shuffle: bool,
) -> impl Iterator<Item = Result<(Tensor4<u8>, Tensor1<u8>)>> {
let (sender, receiver) = crossbeam_channel::bounded(0);
std::thread::spawn(move || {
let (count, depth, height, width) = images.dim();
if shuffle {
let mut index_iter = sample(&mut thread_rng(), count, count).into_iter();
for _ in 0..count / batch_size {
let mut output_images =
Vec::<u8>::with_capacity(batch_size * depth * height * width);
let mut output_classes = Vec::<u8>::with_capacity(batch_size);
for index in index_iter.by_ref().take(batch_size) {
output_images
.extend_from_slice(images.index_axis(Axis(0), index).as_slice().unwrap());
output_classes.push(classes[index]);
}
let images = Tensor::from(output_images)
.into_shape([batch_size, depth, height, width])
.unwrap()
.into_device(device.clone());
let classes = Tensor::from(output_classes).into_device(device.clone());
let result = images.and_then(|images| Ok((images, classes?)));
sender.send(result).unwrap();
}
} else {
for (images, classes) in images
.axis_chunks_iter(Axis(0), batch_size)
.zip(classes.axis_chunks_iter(Axis(0), batch_size))
{
let images = CowTensor::from(images).to_device(device.clone());
let classes = CowTensor::from(classes).to_device(device.clone());
let result = images.and_then(|images| Ok((images, classes?)));
sender.send(result).unwrap();
}
let (count, _inputs, _height, _width) = images.dim();
(0..count).step_by(batch_size).map(move |index| {
let end = (index + batch_size).min(count);
let images = images.slice_axis(
Axis(0),
ndarray::Slice::new(index as isize, Some(end as isize), 1),
);
let classes = classes.slice_axis(
Axis(0),
ndarray::Slice::new(index as isize, Some(end as isize), 1),
);
let images = CowTensor::from(images).to_device(device.clone());
let classes = CowTensor::from(classes).to_device(device.clone());
images.and_then(|images| Ok((images, classes?)))
})
}

fn shuffled_batches(
images: ArcArray<u8, Ix4>,
classes: ArcArray1<u8>,
device: Device,
batch_size: usize,
) -> impl Iterator<Item = Result<(Tensor4<u8>, Tensor1<u8>)>> {
let (count, inputs, height, width) = images.dim();
let mut index_iter = sample(&mut thread_rng(), count, count).into_iter();
(0..count).step_by(batch_size).map(move |index| {
let batch_size = (index..count).take(batch_size).len();
let mut output_images = Vec::<u8>::with_capacity(batch_size * inputs * height * width);
let mut output_classes = Vec::<u8>::with_capacity(batch_size);
for index in index_iter.by_ref().take(batch_size) {
output_images.extend_from_slice(images.index_axis(Axis(0), index).as_slice().unwrap());
output_classes.push(classes[index]);
}
});
receiver.into_iter()
let images = Tensor::from(output_images)
.into_shape([batch_size, inputs, height, width])
.unwrap()
.into_device(device.clone());
let classes = Tensor::from(output_classes).into_device(device.clone());
images.and_then(|images| Ok((images, classes?)))
})
}

#[derive(Default)]
Expand Down
Loading

0 comments on commit 16b044b

Please sign in to comment.