Rust by Example

50 SIMD

Rust provides experimental support for SIMD vectors. These SIMD vectors are exposed as structs (f32x4, u8x16, etc.) that implement basic operations (+, -, *, etc.) using SIMD instructions under the hood.

// simd.rs
#![feature(core)]

use std::simd::f32x4;

fn main() {
    // build two simd vectors, four `f32` values each
    let lhs = f32x4(1.0, 2.0, 3.0, 4.0);
    let rhs = f32x4(4.0, 3.0, 2.0, 1.0);

    // multiply them and, since a simd vector is a struct like any other,
    // destructure the product right in the `let` pattern
    let f32x4(p0, p1, p2, p3) = lhs * rhs;

    // print the four components as a tuple
    let components = (p0, p1, p2, p3);
    println!("{:?}", components);
}
$ rustc simd.rs && ./simd
(4, 6, 6, 4)

Here's a more complex example that adds one Vec<f32> to another element-wise, using the f32x4 type to operate on chunks of 4 elements at a time.

// simd_add.rs
#![feature(test)]
#![feature(core)]

use std::simd::f32x4;

// Panics unless the two named collections report the same `len()`.
macro_rules! assert_equal_len {
    ($lhs:ident, $rhs:ident) => {{
        let (lhs_len, rhs_len) = ($lhs.len(), $rhs.len());

        // each length is wrapped in a 1-tuple, so `{:?}` renders it as `(n,)`
        assert!(lhs_len == rhs_len,
                "add_assign: dimension mismatch: {:?} += {:?}",
                (lhs_len,),
                (rhs_len,));
    }};
}

// element-wise addition: adds every element of `ys` to the element of `xs`
// at the same position; panics when the two lengths differ
fn add_assign(xs: &mut Vec<f32>, ys: &Vec<f32>) {
    assert_equal_len!(xs, ys);

    // walk both vectors in lockstep, accumulating into `xs` in place
    xs.iter_mut()
        .zip(ys.iter())
        .for_each(|(acc, &term)| *acc += term);
}

// simd accelerated addition: adds every element of `ys` to the element of
// `xs` at the same position, four elements at a time where possible;
// panics when the two lengths differ
fn simd_add_assign(xs: &mut Vec<f32>, ys: &Vec<f32>) {
    assert_equal_len!(xs, ys);

    let size = xs.len();
    let chunks = size / 4;

    // pointer to the start of the vector data
    let p_x: *mut f32 = xs.as_mut_ptr();
    let p_y: *const f32 = ys.as_ptr();

    // sum excess elements that don't fit in the simd vector
    for i in (4 * chunks)..size {
        // dereferencing a raw pointer requires an unsafe block
        //
        // SAFETY: `i < size`, and both vectors hold exactly `size` elements
        // (checked by the assertion above), so both accesses are in bounds.
        unsafe {
            // offset by i elements
            *p_x.add(i) += *p_y.add(i);
        }
    }

    // treat f32 vector as a simd f32x4 vector
    let simd_p_x = p_x as *mut f32x4;
    let simd_p_y = p_y as *const f32x4;

    // sum "simd vector"
    for i in 0..chunks {
        // SAFETY: assuming `f32x4` is laid out as four consecutive `f32`s,
        // chunk `i` spans elements `4 * i .. 4 * i + 4`, which all lie below
        // `4 * chunks <= size`, so every access is in bounds. The buffers are
        // only guaranteed to be aligned for `f32`, not for `f32x4`, so the
        // vectors are moved with unaligned reads and writes rather than by
        // dereferencing the `f32x4` pointers directly.
        unsafe {
            let dst = simd_p_x.add(i);
            let sum = dst.read_unaligned() + simd_p_y.add(i).read_unaligned();
            dst.write_unaligned(sum);
        }
    }
}

mod bench {
    extern crate test;
    use self::test::Bencher;
    use std::iter;
    static BENCH_SIZE: usize = 10_000;

    macro_rules! bench {
        ($name:ident, $func:ident) => {
            #[bench]
            fn $name(b: &mut Bencher) {
                let mut x: Vec<_> = iter::repeat(1.0f32)
                                        .take(BENCH_SIZE)
                                        .collect();
                let y: Vec<_> = iter::repeat(1.0f32)
                                        .take(BENCH_SIZE)
                                        .collect();

                b.iter(|| {
                    super::$func(&mut x, &y);
                })
            }
        }
    }

    bench!(vanilla, add_assign);
    bench!(simd, simd_add_assign);
}

And here's the result of the benchmark:

$ rustc -O --test simd_add.rs && ./simd_add --bench
running 4 tests
test test::simd ... ignored
test test::vanilla ... ignored
test bench::simd    ... bench:      1852 ns/iter (+/- 17)
test bench::vanilla ... bench:      8346 ns/iter (+/- 103)

test result: ok. 0 passed; 0 failed; 2 ignored; 2 measured