39#include "ojph_simd_vsx.h"
57 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
74 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
91 for (; width > 0; width -= 4,
92 sp = (
const char*)sp + 32,
93 dpl = (
char*)dpl + 16,
94 dph = (
char*)dph + 16)
110 for (; width > 0; width -= 4,
112 spl = (
const char*)spl + 16,
113 sph = (
const char*)sph + 16)
128 for (; width > 0; width -= 4, p += 4)
138 ui32 repeat,
bool synthesis)
146 float* dst = aug->
f32;
147 const float* src1 = sig->
f32, * src2 = other->
f32;
149 for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
168 ui32 width,
bool even)
174 float* dpl = even ? ldst->
f32 : hdst->
f32;
175 float* dph = even ? hdst->
f32 : ldst->
f32;
176 float* sp = src->
f32;
182 float* hp = hdst->
f32, * lp = ldst->
f32;
183 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
184 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
186 for (
ui32 j = num_steps; j > 0; --j)
193 lp[l_width] = lp[l_width - 1];
195 const float* sp = lp;
197 int i = (int)h_width;
201 for (; i > 0; i -= 4, sp += 4, dp += 4)
212 for (; i > 0; i -= 4, sp += 4, dp += 4)
223 float* t = lp; lp = hp; hp = t;
225 ui32 w = l_width; l_width = h_width; h_width = w;
229 float K = atk->
get_K();
230 float K_inv = 1.0f / K;
237 ldst->
f32[0] = src->
f32[0];
239 hdst->
f32[0] = src->
f32[0] * 2.0f;
246 ui32 width,
bool even)
251 float* oth = hsrc->
f32, * aug = lsrc->
f32;
252 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
253 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
256 float K = atk->
get_K();
257 float K_inv = 1.0f / K;
264 for (
ui32 j = 0; j < num_steps; ++j)
271 oth[oth_width] = oth[oth_width - 1];
273 const float* sp = oth;
275 int i = (int)aug_width;
279 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
290 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
301 float* t = aug; aug = oth; oth = t;
303 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
308 float* dp = dst->
f32;
309 float* spl = even ? lsrc->
f32 : hsrc->
f32;
310 float* sph = even ? hsrc->
f32 : lsrc->
f32;
317 dst->
f32[0] = lsrc->
f32[0];
319 dst->
f32[0] = hsrc->
f32[0] * 0.5f;
326 ui32 repeat,
bool synthesis)
335 const si32* src1 = sig->
i32, * src2 = other->
i32;
343 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
355 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
367 else if (a == -1 && b == 1 && e == 1)
371 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
382 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
397 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
409 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
425 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
438 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
456 ui32 repeat,
bool synthesis)
465 const si64* src1 = sig->
i64, * src2 = other->
i64;
473 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
485 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
497 else if (a == -1 && b == 1 && e == 1)
501 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
512 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
527 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
539 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
555 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
568 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
586 ui32 repeat,
bool synthesis)
610 ui32 width,
bool even)
616 float* dpl = even ? ldst->
f32 : hdst->
f32;
617 float* dph = even ? hdst->
f32 : ldst->
f32;
618 float* sp = src->
f32;
624 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
625 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
627 for (
ui32 j = num_steps; j > 0; --j)
639 lp[l_width] = lp[l_width - 1];
645 int i = (int)h_width;
648 for (; i > 0; i -= 4, sp += 4, dp += 4)
662 for (; i > 0; i -= 4, sp += 4, dp += 4)
675 else if (a == -1 && b == 1 && e == 1)
677 int i = (int)h_width;
679 for (; i > 0; i -= 4, sp += 4, dp += 4)
690 for (; i > 0; i -= 4, sp += 4, dp += 4)
703 int i = (int)h_width;
705 for (; i > 0; i -= 4, sp += 4, dp += 4)
717 for (; i > 0; i -= 4, sp += 4, dp += 4)
731 int i = (int)h_width;
733 for (; i > 0; i -= 4, sp += 4, dp += 4)
746 for (; i > 0; i -= 4, sp += 4, dp += 4)
761 si32* t = lp; lp = hp; hp = t;
763 ui32 w = l_width; l_width = h_width; h_width = w;
768 ldst->
i32[0] = src->
i32[0];
770 hdst->
i32[0] = src->
i32[0] << 1;
778 ui32 width,
bool even)
784 void* dpl = even ? ldst->
p : hdst->
p;
785 void* dph = even ? hdst->
p : ldst->
p;
786 const void* sp = src->
p;
792 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
793 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
795 for (
ui32 j = num_steps; j > 0; --j)
807 lp[l_width] = lp[l_width - 1];
813 int i = (int)h_width;
816 for (; i > 0; i -= 2, sp += 2, dp += 2)
830 for (; i > 0; i -= 2, sp += 2, dp += 2)
843 else if (a == -1 && b == 1 && e == 1)
845 int i = (int)h_width;
847 for (; i > 0; i -= 2, sp += 2, dp += 2)
858 for (; i > 0; i -= 2, sp += 2, dp += 2)
871 int i = (int)h_width;
873 for (; i > 0; i -= 2, sp += 2, dp += 2)
885 for (; i > 0; i -= 2, sp += 2, dp += 2)
899 int i = (int)h_width;
901 for (; i > 0; i -= 2, sp += 2, dp += 2)
914 for (; i > 0; i -= 2, sp += 2, dp += 2)
929 si64* t = lp; lp = hp; hp = t;
931 ui32 w = l_width; l_width = h_width; h_width = w;
936 ldst->
i64[0] = src->
i64[0];
938 hdst->
i64[0] = src->
i64[0] << 1;
945 ui32 width,
bool even)
965 ui32 width,
bool even)
971 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
972 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
974 for (
ui32 j = 0; j < num_steps; ++j)
985 oth[oth_width] = oth[oth_width - 1];
987 const si32* sp = oth;
991 int i = (int)aug_width;
994 for (; i > 0; i -= 4, sp += 4, dp += 4)
1008 for (; i > 0; i -= 4, sp += 4, dp += 4)
1021 else if (a == -1 && b == 1 && e == 1)
1023 int i = (int)aug_width;
1025 for (; i > 0; i -= 4, sp += 4, dp += 4)
1036 for (; i > 0; i -= 4, sp += 4, dp += 4)
1049 int i = (int)aug_width;
1051 for (; i > 0; i -= 4, sp += 4, dp += 4)
1063 for (; i > 0; i -= 4, sp += 4, dp += 4)
1077 int i = (int)aug_width;
1079 for (; i > 0; i -= 4, sp += 4, dp += 4)
1092 for (; i > 0; i -= 4, sp += 4, dp += 4)
1107 si32* t = aug; aug = oth; oth = t;
1109 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1114 float* dp = dst->
f32;
1115 float* spl = even ? lsrc->
f32 : hsrc->
f32;
1116 float* sph = even ? hsrc->
f32 : lsrc->
f32;
1123 dst->
i32[0] = lsrc->
i32[0];
1125 dst->
i32[0] = hsrc->
i32[0] >> 1;
1132 ui32 width,
bool even)
1138 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
1139 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
1141 for (
ui32 j = 0; j < num_steps; ++j)
1152 oth[oth_width] = oth[oth_width - 1];
1154 const si64* sp = oth;
1158 int i = (int)aug_width;
1161 for (; i > 0; i -= 2, sp += 2, dp += 2)
1175 for (; i > 0; i -= 2, sp += 2, dp += 2)
1188 else if (a == -1 && b == 1 && e == 1)
1190 int i = (int)aug_width;
1192 for (; i > 0; i -= 2, sp += 2, dp += 2)
1203 for (; i > 0; i -= 2, sp += 2, dp += 2)
1216 int i = (int)aug_width;
1218 for (; i > 0; i -= 2, sp += 2, dp += 2)
1230 for (; i > 0; i -= 2, sp += 2, dp += 2)
1244 int i = (int)aug_width;
1246 for (; i > 0; i -= 2, sp += 2, dp += 2)
1259 for (; i > 0; i -= 2, sp += 2, dp += 2)
1274 si64* t = aug; aug = oth; oth = t;
1276 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1282 const void* spl = even ? lsrc->
p : hsrc->
p;
1283 const void* sph = even ? hsrc->
p : lsrc->
p;
1290 dst->
i64[0] = lsrc->
i64[0];
1292 dst->
i64[0] = hsrc->
i64[0] >> 1;
1299 ui32 width,
bool even)
void vsx_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
static void vsx_multiply_const(float *p, float f, int width)
static void vsx_rev_horz_ana64(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void vsx_rev_horz_syn64(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void vsx_deinterleave32(float *dpl, float *dph, float *sp, int width)
void vsx_rev_vert_step32(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void vsx_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
static void vsx_rev_horz_ana32(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
static void vsx_interleave32(float *dp, float *spl, float *sph, int width)
void vsx_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void vsx_rev_horz_syn32(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void vsx_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void vsx_rev_vert_step64(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void vsx_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void vsx_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void vsx_deinterleave64(void *dpl, void *dph, const void *sp, int width)
void vsx_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
static void vsx_interleave64(void *dp, const void *spl, const void *sph, int width)
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_splat(float x)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
__vector unsigned char v128_t
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_i32x4_mul(v128_t a, v128_t b)
#define vsx_i64x2_shuffle(a, b, c0, c1)
static v128_t vsx_i64x2_mul(v128_t a, v128_t b)
#define vsx_i32x4_shuffle(a, b, c0, c1, c2, c3)
static v128_t vsx_i64x2_splat(long long x)
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i32x4_shr(v128_t a, int n)
ui32 get_num_steps() const
const lifting_step * get_step(ui32 s) const