use super::Cpu; use core::arch::wasm32::*; pub struct CurrentCpu {} const STEP: usize = 16; const EPR: usize = 4; const ARR: usize = STEP / EPR; impl Cpu for CurrentCpu { type Unit = v128; type Array = [v128; ARR]; const STEP: usize = STEP; const EPR: usize = EPR; fn n() -> usize { ARR } unsafe fn zero() -> Self::Unit { f32x4_splat(0.0) } unsafe fn zero_array() -> Self::Array { [Self::zero(); ARR] } unsafe fn from_f32(v: f32) -> Self::Unit { f32x4_splat(v) } unsafe fn load(mem_addr: *const f32) -> Self::Unit { v128_load(mem_addr as *mut v128) } unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit { f32x4_add(a, b) } unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit { f32x4_add(f32x4_mul(b, c), a) } unsafe fn vec_store(mem_addr: *mut f32, a: Self::Unit) { v128_store(mem_addr as *mut v128, a); } unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) { for i in 0..ARR / 2 { x[2 * i] = f32x4_add(x[2 * i], x[2 * i + 1]); } for i in 0..ARR / 4 { x[4 * i] = f32x4_add(x[4 * i], x[4 * i + 2]); } for i in 0..ARR / 8 { x[8 * i] = f32x4_add(x[8 * i], x[8 * i + 4]); } *y = f32x4_extract_lane::<0>(x[0]) + f32x4_extract_lane::<1>(x[0]) + f32x4_extract_lane::<2>(x[0]) + f32x4_extract_lane::<3>(x[0]); } }