From e09eb6498e54bd059cb8099d7801947cb7958fca Mon Sep 17 00:00:00 2001
From: yvt <i@yvt.jp>
Date: Fri, 9 Nov 2018 01:45:55 +0900
Subject: [PATCH] yFFT: Fix SEGV caused by the alignment issue of lookup tables

Tested with: rustc 1.32.0-nightly (25a42b2ce 2018-11-07)

jemalloc was removed from the standard library recently:
<https://github.com/rust-lang/rust/pull/55238>
As a result, we started seeing cases where allocated regions are aligned
less strictly than before. Some code stopped working because it
erroneously relied on jemalloc's larger alignment values.

This commit fixes the issue by introducing `AlignedVec`, a wrapper around
`Vec` that exposes only the portion of the underlying `Vec` that meets the
alignment required by SIMD operations, and by modifying the code in
question to use `AlignedVec` in place of `Vec`.
---
 src/aligned.rs                      | 74 +++++++++++++++++++++++++++++
 src/kernel/x86/x86avxf32realfft.rs  |  3 +-
 src/kernel/x86/x86sse1realfft.rs    |  9 ++--
 src/kernel/x86/x86sse3f32realfft.rs |  3 +-
 src/lib.rs                          |  1 +
 5 files changed, 84 insertions(+), 6 deletions(-)
 create mode 100644 src/aligned.rs
diff --git a/src/aligned.rs b/src/aligned.rs
new file mode 100644
index 0000000..f7e9dd4
--- /dev/null
+++ b/src/aligned.rs
@@ -0,0 +1,74 @@
+//
+// Copyright 2018 yvt, all rights reserved.
+//
+// This source code is a part of Nightingales.
+//
+use std::{
+    fmt,
+    mem::size_of,
+    ops::{Deref, DerefMut},
+};
+
+/// The alignment, in bytes, that `AlignedVec` guarantees for its first element.
+const ALIGN: usize = 32; // power of two: `ptr_lsbs` masks with `ALIGN - 1`
+
+fn ptr_lsbs(x: usize) -> usize {
+    x & (ALIGN - 1) // `x` modulo `ALIGN`; zero means `x` is `ALIGN`-aligned
+}
+
+/// Offers a subset of `Vec`'s interface over a buffer whose first visible element
+/// sits at an `ALIGN`-byte-aligned address, which is convenient for SIMD operations.
+pub struct AlignedVec<T> {
+    storage: Vec<T>, // `offset` padding elements followed by the contents
+    offset: usize, // number of leading padding elements in `storage`
+}
+
+impl<T: Copy + Default> AlignedVec<T> {
+    pub fn with_capacity(i: usize) -> Self { // reserves room for `i` elements plus padding
+        debug_assert!(size_of::<T>() <= ALIGN); // checked only with debug assertions on
+        debug_assert!(ALIGN % size_of::<T>() == 0); // padding is added in whole elements
+
+        let mut storage: Vec<T> = Vec::with_capacity(i + ALIGN / size_of::<T>() - 1);
+        let mut offset = 0; // padding elements pushed so far
+
+        // Increase the padding until the storage is aligned
+        while ptr_lsbs(storage.as_ptr().wrapping_add(offset) as _) != 0 {
+            storage.push(T::default());
+            offset += 1;
+
+            debug_assert!(offset < ALIGN / size_of::<T>()); // fits the reserved slack
+        }
+
+        Self { storage, offset }
+    }
+
+    pub fn push(&mut self, x: T) {
+        if self.storage.len() >= self.storage.capacity() {
+            panic!("collection is full"); // growing could reallocate and break the alignment
+        }
+        self.storage.push(x);
+    }
+}
+
+impl<T: fmt::Debug> fmt::Debug for AlignedVec<T> {
+    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+        fmt.debug_struct("AlignedVec")
+            .field("offset", &self.offset)
+            .field("entries", &&self[..]) // visible elements only, padding excluded
+            .finish()
+    }
+}
+
+impl<T> Deref for AlignedVec<T> {
+    type Target = [T];
+
+    fn deref(&self) -> &Self::Target {
+        &self.storage[self.offset..] // skip the leading padding elements
+    }
+}
+
+impl<T> DerefMut for AlignedVec<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.storage[self.offset..] // skip the leading padding elements
+    }
+}
diff --git a/src/kernel/x86/x86avxf32realfft.rs b/src/kernel/x86/x86avxf32realfft.rs
index 085ec7a..5fc13e0 100644
--- a/src/kernel/x86/x86avxf32realfft.rs
+++ b/src/kernel/x86/x86avxf32realfft.rs
@@ -12,6 +12,7 @@ use std::f32;
 use std::mem;
 use std::ptr::{read_unaligned, write_unaligned};
 
+use aligned::AlignedVec;
 use simdutils::{avx_f32x8_bitxor, avx_f32x8_complex_mul_riri};
 use Num;
 
@@ -39,7 +40,7 @@ where
 #[derive(Debug)]
 struct AvxF32RealFFTPrePostProcessKernel {
     len: usize,
-    table: [Vec<f32>; 2],
+    table: [AlignedVec<f32>; 2],
     inverse: bool,
 }
 
diff --git a/src/kernel/x86/x86sse1realfft.rs b/src/kernel/x86/x86sse1realfft.rs
index 5d54481..a1a3083 100644
--- a/src/kernel/x86/x86sse1realfft.rs
+++ b/src/kernel/x86/x86sse1realfft.rs
@@ -12,6 +12,7 @@ use std::f32;
 use std::mem;
 use std::ptr::{read_unaligned, write_unaligned};
 
+use aligned::AlignedVec;
 use simdutils::{f32x4_bitxor, f32x4_complex_mul_rrii};
 use {mul_pos_i, Complex, Num};
 
@@ -34,10 +35,10 @@ where
     })
 }
 
-pub(super) fn new_real_fft_coef_table(len: usize, inverse: bool) -> [Vec<f32>; 2] {
+pub(super) fn new_real_fft_coef_table(len: usize, inverse: bool) -> [AlignedVec<f32>; 2] {
     assert!(len % 2 == 0);
-    let mut table_a = Vec::with_capacity(len);
-    let mut table_b = Vec::with_capacity(len);
+    let mut table_a = AlignedVec::with_capacity(len);
+    let mut table_b = AlignedVec::with_capacity(len);
     for i in 0..(len / 2) {
         let c = Complex::new(0f32, (i as f32) * -f32::consts::PI / (len / 2) as f32).exp();
 
@@ -61,7 +62,7 @@ pub(super) fn new_real_fft_coef_table(len: usize, inverse: bool) -> [Vec<f32>; 2
 #[derive(Debug)]
 struct SseRealFFTPrePostProcessKernel {
     len: usize,
-    table: [Vec<f32>; 2],
+    table: [AlignedVec<f32>; 2],
     inverse: bool,
 }
 
diff --git a/src/kernel/x86/x86sse3f32realfft.rs b/src/kernel/x86/x86sse3f32realfft.rs
index c4c9627..b857f38 100644
--- a/src/kernel/x86/x86sse3f32realfft.rs
+++ b/src/kernel/x86/x86sse3f32realfft.rs
@@ -12,6 +12,7 @@ use std::f32;
 use std::mem;
 use std::ptr::{read_unaligned, write_unaligned};
 
+use aligned::AlignedVec;
 use simdutils::{f32x4_bitxor, sse3_f32x4_complex_mul_riri};
 use Num;
 
@@ -39,7 +40,7 @@ where
 #[derive(Debug)]
 struct Sse3F32RealFFTPrePostProcessKernel {
     len: usize,
-    table: [Vec<f32>; 2],
+    table: [AlignedVec<f32>; 2],
     inverse: bool,
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index f3bd6aa..4538660 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -42,6 +42,7 @@ use num_complex::Complex;
 
 #[macro_use]
 mod simdutils;
+mod aligned;
 mod env;
 mod kernel;
 mod setup;