42#ifndef OJPH_SIMD_VSX_H
43#define OJPH_SIMD_VSX_H
45#if !defined(__powerpc64__) && !defined(__PPC64__)
46 #error "this header is for 64-bit POWER targets only"
48#if !defined(__LITTLE_ENDIAN__) && \
49 !(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
50 #error "this header assumes a little-endian target (ppc64le)"
62typedef __vector
unsigned char v128_t;
78{
return vec_xl(0, (
const unsigned char *)p); }
81{ vec_xst(a, 0, (
unsigned char *)p); }
// Writes one 32-bit lane of a vector to memory.
//   dst  - destination address; exactly 4 bytes are written
//   vec  - source vector, reinterpreted as four signed 32-bit lanes
//   lane - index of the lane to store
// The lane is first copied into a scalar and then written with memcpy, so
// the store does not go through a typed pointer dereference of `dst`.
// Wrapped in do/while(0) so the macro behaves as a single statement.
#define vsx_v128_store32_lane(dst, vec, lane)                              \
  do {                                                                     \
    vsx_v_i32 lanes_ = (vsx_v_i32)(vec);                                   \
    int word_ = lanes_[(lane)];                                            \
    std::memcpy((dst), &word_, 4);                                         \
  } while (0)
93 signed char c0,
signed char c1,
signed char c2,
signed char c3,
94 signed char c4,
signed char c5,
signed char c6,
signed char c7,
95 signed char c8,
signed char c9,
signed char c10,
signed char c11,
96 signed char c12,
signed char c13,
signed char c14,
signed char c15)
97{
vsx_v_i8 v = {c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15};
100 short c3,
short c4,
short c5,
104 unsigned short c2,
unsigned short c3,
105 unsigned short c4,
unsigned short c5,
106 unsigned short c6,
unsigned short c7)
111 unsigned int c2,
unsigned int c3)
116 unsigned long long c1)
120{
return (
v128_t)vec_splats(x); }
122{
return (
v128_t)vec_splats(x); }
124{
return (
v128_t)vec_splats(x); }
126{
return (
v128_t)vec_splats(x); }
128{
return (
v128_t)vec_splats((
signed long long)x); }
130{
return (
v128_t)vec_splats(x); }
// Lane accessors. Each macro reinterprets the vector `v` as the lane-typed
// vector named in the macro (u8x16 -> vsx_v_u8, u16x8 -> vsx_v_u16, ...) and
// reads element `lane` with the vector subscript operator. The result has
// the element type of that lane-typed vector.
#define vsx_u8x16_extract_lane(v, lane)  (((vsx_v_u8)(v))[(lane)])
#define vsx_u16x8_extract_lane(v, lane)  (((vsx_v_u16)(v))[(lane)])
#define vsx_i32x4_extract_lane(v, lane)  (((vsx_v_i32)(v))[(lane)])
#define vsx_u32x4_extract_lane(v, lane)  (((vsx_v_u32)(v))[(lane)])
#define vsx_i64x2_extract_lane(v, lane)  (((vsx_v_i64)(v))[(lane)])
148{
return vec_and(a, b); }
150{
return vec_or(a, b); }
152{
return vec_xor(a, b); }
155{
return vec_andc(a, b); }
201 vec_splats((
unsigned long long)n)); }
207 vec_splats((
unsigned long long)n)); }
215 vec_splats((
unsigned long long)n)); }
264 return (
v128_t)__builtin_convertvector(
265 __builtin_shufflevector(v, v, 0, 1),
vsx_v_i64);
270 return (
v128_t)__builtin_convertvector(
271 __builtin_shufflevector(v, v, 2, 3),
vsx_v_i64);
// Byte shuffle of two vectors. `x` and `y` are viewed as sixteen unsigned
// bytes each and handed to __builtin_shufflevector together with the sixteen
// selector arguments s0..s15; the result is cast back to v128_t.
// The selectors are forwarded unchanged to the builtin, which expects
// compile-time constant indices into the concatenation of `x` and `y`.
#define vsx_i8x16_shuffle(x, y, s0, s1, s2, s3, s4, s5, s6, s7,            \
                          s8, s9, s10, s11, s12, s13, s14, s15)            \
  ((v128_t)__builtin_shufflevector(                                        \
      (vsx_v_u8)(x), (vsx_v_u8)(y),                                        \
      (s0), (s1), (s2), (s3), (s4), (s5), (s6), (s7),                      \
      (s8), (s9), (s10), (s11), (s12), (s13), (s14), (s15)))
// 16-bit lane shuffle of two vectors. `x` and `y` are viewed as eight signed
// 16-bit lanes each and passed to __builtin_shufflevector with the eight
// selector arguments s0..s7; the result is cast back to v128_t.
// The selectors are forwarded unchanged to the builtin, which expects
// compile-time constant indices into the concatenation of `x` and `y`.
#define vsx_i16x8_shuffle(x, y, s0, s1, s2, s3, s4, s5, s6, s7)            \
  ((v128_t)__builtin_shufflevector(                                        \
      (vsx_v_i16)(x), (vsx_v_i16)(y),                                      \
      (s0), (s1), (s2), (s3), (s4), (s5), (s6), (s7)))
284#define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3) \
285 ((v128_t)__builtin_shufflevector((vsx_v_i32)(a), (vsx_v_i32)(b), \
// 64-bit lane shuffle of two vectors. `x` and `y` are viewed as two signed
// 64-bit lanes each and passed to __builtin_shufflevector with the two
// selector arguments s0 and s1; the result is cast back to v128_t.
// The selectors are forwarded unchanged to the builtin, which expects
// compile-time constant indices into the concatenation of `x` and `y`.
#define vsx_i64x2_shuffle(x, y, s0, s1)                                    \
  ((v128_t)__builtin_shufflevector(                                        \
      (vsx_v_i64)(x), (vsx_v_i64)(y), (s0), (s1)))
295 v128_t r = vec_perm(a, a, idx);
297 vec_splats((
unsigned char)15));
298 return vec_andc(r, oob);
308#if defined(__POWER10_VECTOR__)
309 return (
int)vec_extractm(a);
311 const vsx_v_u8 perm = { 120, 112, 104, 96, 88, 80, 72, 64,
312 56, 48, 40, 32, 24, 16, 8, 0 };
static v128_t vsx_i8x16_splat(signed char x)
__vector unsigned int vsx_v_u32
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_make(int a, int b, int c, int d)
static v128_t vsx_i16x8_sub(v128_t a, v128_t b)
static v128_t vsx_u16x8_shr(v128_t a, int n)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static int vsx_i8x16_bitmask(v128_t a)
__vector signed int vsx_v_i32
static v128_t vsx_i8x16_swizzle(v128_t a, v128_t idx)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i8x16_const(signed char c0, signed char c1, signed char c2, signed char c3, signed char c4, signed char c5, signed char c6, signed char c7, signed char c8, signed char c9, signed char c10, signed char c11, signed char c12, signed char c13, signed char c14, signed char c15)
static v128_t vsx_i64x2_lt(v128_t a, v128_t b)
static v128_t vsx_f32x4_convert_i32x4(v128_t a)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
static v128_t vsx_v128_xor(v128_t a, v128_t b)
__vector signed char vsx_v_i8
static v128_t vsx_i16x8_mul(v128_t a, v128_t b)
static v128_t vsx_u32x4_shr(v128_t a, int n)
__vector signed long long vsx_v_i64
static v128_t vsx_i16x8_splat(short x)
static v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1, unsigned int c2, unsigned int c3)
static v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
static v128_t vsx_u8x16_min(v128_t a, v128_t b)
static v128_t vsx_f32x4_ge(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_high_i32x4(v128_t a)
static v128_t vsx_f32x4_splat(float x)
__vector signed short vsx_v_i16
static v128_t vsx_u64x2_const(unsigned long long c0, unsigned long long c1)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i16x8_max(v128_t a, v128_t b)
static v128_t vsx_i64x2_const(long long c0, long long c1)
static v128_t vsx_i8x16_abs(v128_t a)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_lt(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_low_i32x4(v128_t a)
__vector unsigned char v128_t
__vector unsigned char vsx_v_u8
static v128_t vsx_i8x16_add(v128_t a, v128_t b)
static v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
static v128_t vsx_i16x8_add(v128_t a, v128_t b)
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_u64x2_shr(v128_t a, int n)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_i32x4_mul(v128_t a, v128_t b)
static v128_t vsx_i16x8_eq(v128_t a, v128_t b)
static v128_t vsx_v128_or(v128_t a, v128_t b)
static v128_t vsx_i32x4_lt(v128_t a, v128_t b)
__vector unsigned short vsx_v_u16
static v128_t vsx_i8x16_eq(v128_t a, v128_t b)
static v128_t vsx_i64x2_mul(v128_t a, v128_t b)
static v128_t vsx_i64x2_splat(long long x)
__vector unsigned long long vsx_v_u64
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i16x8_const(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
static v128_t vsx_i32x4_shr(v128_t a, int n)
static v128_t vsx_i16x8_shl(v128_t a, int n)
static v128_t vsx_u32x4_splat(unsigned int x)
static v128_t vsx_i32x4_trunc_sat_f32x4(v128_t a)
static v128_t vsx_i8x16_gt(v128_t a, v128_t b)
static v128_t vsx_i32x4_eq(v128_t a, v128_t b)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)