39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
/**
 * Arithmetic (sign-propagating) right shift of the two signed 64-bit lanes
 * of `a`, built from SSE2 primitives.
 *
 * @param a    two packed signed 64-bit integers
 * @param amt  shift count in bits (0..63)
 * @param m    mask with only bit (63 - amt) set in each lane; callers in this
 *             file build it as _mm_set1_epi64x(1LL << (63 - amt))
 * @return     each lane of `a` shifted right by `amt` with sign extension
 */
static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
{
  // The logical shift zero-fills and leaves the old sign bit at (63 - amt).
  __m128i x = _mm_srli_epi64(a, amt);
  // (x ^ m) - m sign-extends from that bit: when the bit is set, the xor
  // clears it and the subtraction borrows through all higher bits, setting
  // them; when it is clear, the xor sets it and the subtraction removes it.
  x = _mm_xor_si128(x, m);
  __m128i result = _mm_sub_epi64(x, m);
  return result;
}
/**
 * Splits an interleaved run of 32-bit lanes into its even-indexed and
 * odd-indexed halves.
 *
 * Eight source lanes are consumed per iteration; lanes 0,2,4,6 go to `dpl`
 * and lanes 1,3,5,7 go to `dph`. The shuffles only move lanes, they do not
 * interpret them as floating-point values.
 *
 * @param dpl    destination for even-indexed lanes (16-byte aligned)
 * @param dph    destination for odd-indexed lanes (16-byte aligned)
 * @param sp     interleaved source (16-byte aligned)
 * @param width  number of source lanes; the loop works in whole groups of
 *               eight, so the buffers must be readable/writable up to the
 *               next multiple of eight lanes
 */
void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
{
  for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
  {
    __m128 a = _mm_load_ps(sp);
    __m128 b = _mm_load_ps(sp + 4);
    // c = {a0, a2, b0, b2}: the even-indexed lanes of the eight inputs
    __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
    // d = {a1, a3, b1, b3}: the odd-indexed lanes
    __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
    _mm_store_ps(dpl, c);
    _mm_store_ps(dph, d);
  }
}
/**
 * Merges two runs of 32-bit lanes into one interleaved run:
 * dp = {spl[0], sph[0], spl[1], sph[1], ...}.
 *
 * Four lanes are taken from each source and eight lanes are written per
 * iteration. The unpack operations only move lanes.
 *
 * @param dp     interleaved destination (16-byte aligned)
 * @param spl    source of the even-indexed output lanes (16-byte aligned)
 * @param sph    source of the odd-indexed output lanes (16-byte aligned)
 * @param width  number of output lanes; processed in whole groups of eight,
 *               so buffers must extend to the next multiple of eight lanes
 */
void sse2_interleave32(float* dp, float* spl, float* sph, int width)
{
  for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
  {
    __m128 a = _mm_load_ps(spl);
    __m128 b = _mm_load_ps(sph);
    // c = {a0, b0, a1, b1}, d = {a2, b2, a3, b3}
    __m128 c = _mm_unpacklo_ps(a, b);
    __m128 d = _mm_unpackhi_ps(a, b);
    _mm_store_ps(dp, c);
    _mm_store_ps(dp + 4, d);
  }
}
/**
 * Splits an interleaved run of 64-bit elements into its even-indexed and
 * odd-indexed halves.
 *
 * Four source elements (32 bytes) are consumed per iteration; two elements
 * (16 bytes) are written to each destination.
 *
 * @param dpl    destination for even-indexed elements (16-byte aligned)
 * @param dph    destination for odd-indexed elements (16-byte aligned)
 * @param sp     interleaved source (16-byte aligned)
 * @param width  number of source elements; processed in whole groups of
 *               four, so buffers must extend to the next multiple of four
 */
void sse2_deinterleave64(void* dpl, void* dph, const void* sp, int width)
{
  for (; width > 0; width -= 4,
       sp = (const char*)sp + 32,
       dpl = (char*)dpl + 16,
       dph = (char*)dph + 16)
  {
    __m128i a = _mm_load_si128((const __m128i*)sp);
    __m128i b = _mm_load_si128((const __m128i*)((const char*)sp + 16));
    // c = {a0, b0}: source elements 0 and 2; d = {a1, b1}: elements 1 and 3
    __m128i c = _mm_unpacklo_epi64(a, b);
    __m128i d = _mm_unpackhi_epi64(a, b);
    _mm_store_si128((__m128i*)dpl, c);
    _mm_store_si128((__m128i*)dph, d);
  }
}
/**
 * Merges two runs of 64-bit elements into one interleaved run:
 * dp = {spl[0], sph[0], spl[1], sph[1], ...}.
 *
 * Two elements (16 bytes) are read from each source and four elements
 * (32 bytes) are written per iteration.
 *
 * @param dp     interleaved destination (16-byte aligned)
 * @param spl    source of the even-indexed output elements (16-byte aligned)
 * @param sph    source of the odd-indexed output elements (16-byte aligned)
 * @param width  number of output elements; processed in whole groups of
 *               four, so buffers must extend to the next multiple of four
 */
void sse2_interleave64(void* dp, const void* spl, const void* sph, int width)
{
  for (; width > 0; width -= 4,
       dp = (char*)dp + 32,
       spl = (const char*)spl + 16,
       sph = (const char*)sph + 16)
  {
    __m128i a = _mm_load_si128((const __m128i*)spl);
    __m128i b = _mm_load_si128((const __m128i*)sph);
    // c = {a0, b0}, d = {a1, b1}
    __m128i c = _mm_unpacklo_epi64(a, b);
    __m128i d = _mm_unpackhi_epi64(a, b);
    _mm_store_si128((__m128i*)dp, c);
    _mm_store_si128((__m128i*)((char*)dp + 16), d);
  }
}
// Reversible (integer) vertical lifting step on 32-bit samples using SSE2.
// Updates aug->i32 in place from sig->i32 and other->i32 with the
// coefficients a, b, e of step `s`; the general form (see the scalar loops at
// the end) is  dst[k] -= / += (b + a * (src1[k] + src2[k])) >> e.
// The vector loops handle four samples per pass with aligned loads/stores.
// NOTE(review): this listing looks line-filtered -- the leading integers
// appear to be original line numbers, and the braces, the declaration of the
// vector-loop counter `i`, and most branch conditions are not shown. The
// comments below describe each visible loop on its own; which loop of a pair
// is chosen is presumably decided by `synthesis` -- confirm against the full
// file.
138 void sse2_rev_vert_step32(
const lifting_step* s,
const line_buf* sig,
139 const line_buf* other,
const line_buf* aug,
140 ui32 repeat,
bool synthesis)
// Coefficients of this step; b is broadcast to all four 32-bit lanes.
142 const si32 a = s->rev.Aatk;
143 const si32 b = s->rev.Batk;
144 const ui8 e = s->rev.Eatk;
145 __m128i vb = _mm_set1_epi32(b);
// dst is written even though `aug` is a pointer to const line_buf: only the
// descriptor is const, not the sample buffer it points at.
147 si32* dst = aug->i32;
148 const si32* src1 = sig->i32, * src2 = other->i32;
// dst -= (b + (src1 + src2)) >> e   (no multiply by a)
156 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
158 __m128i s1 = _mm_load_si128((__m128i*)src1);
159 __m128i s2 = _mm_load_si128((__m128i*)src2);
160 __m128i d = _mm_load_si128((__m128i*)dst);
161 __m128i t = _mm_add_epi32(s1, s2);
162 __m128i v = _mm_add_epi32(vb, t);
163 __m128i w = _mm_srai_epi32(v, e);
164 d = _mm_sub_epi32(d, w);
165 _mm_store_si128((__m128i*)dst, d);
// dst += (b + (src1 + src2)) >> e
168 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
170 __m128i s1 = _mm_load_si128((__m128i*)src1);
171 __m128i s2 = _mm_load_si128((__m128i*)src2);
172 __m128i d = _mm_load_si128((__m128i*)dst);
173 __m128i t = _mm_add_epi32(s1, s2);
174 __m128i v = _mm_add_epi32(vb, t);
175 __m128i w = _mm_srai_epi32(v, e);
176 d = _mm_add_epi32(d, w);
177 _mm_store_si128((__m128i*)dst, d);
// Special case a == -1, b == 1, e == 1: the two loops below shift the plain
// sum (src1 + src2) without adding b, and the add/sub roles are swapped
// relative to the loops above.
180 else if (a == -1 && b == 1 && e == 1)
// dst += (src1 + src2) >> e
184 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
186 __m128i s1 = _mm_load_si128((__m128i*)src1);
187 __m128i s2 = _mm_load_si128((__m128i*)src2);
188 __m128i d = _mm_load_si128((__m128i*)dst);
189 __m128i t = _mm_add_epi32(s1, s2);
190 __m128i w = _mm_srai_epi32(t, e);
191 d = _mm_add_epi32(d, w);
192 _mm_store_si128((__m128i*)dst, d);
// dst -= (src1 + src2) >> e
195 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
197 __m128i s1 = _mm_load_si128((__m128i*)src1);
198 __m128i s2 = _mm_load_si128((__m128i*)src2);
199 __m128i d = _mm_load_si128((__m128i*)dst);
200 __m128i t = _mm_add_epi32(s1, s2);
201 __m128i w = _mm_srai_epi32(t, e);
202 d = _mm_sub_epi32(d, w);
203 _mm_store_si128((__m128i*)dst, d);
// dst -= (b - (src1 + src2)) >> e   (sum is subtracted from b)
210 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
212 __m128i s1 = _mm_load_si128((__m128i*)src1);
213 __m128i s2 = _mm_load_si128((__m128i*)src2);
214 __m128i d = _mm_load_si128((__m128i*)dst);
215 __m128i t = _mm_add_epi32(s1, s2);
216 __m128i v = _mm_sub_epi32(vb, t);
217 __m128i w = _mm_srai_epi32(v, e);
218 d = _mm_sub_epi32(d, w);
219 _mm_store_si128((__m128i*)dst, d);
// dst += (b - (src1 + src2)) >> e
222 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
224 __m128i s1 = _mm_load_si128((__m128i*)src1);
225 __m128i s2 = _mm_load_si128((__m128i*)src2);
226 __m128i d = _mm_load_si128((__m128i*)dst);
227 __m128i t = _mm_add_epi32(s1, s2);
228 __m128i v = _mm_sub_epi32(vb, t);
229 __m128i w = _mm_srai_epi32(v, e);
230 d = _mm_add_epi32(d, w);
231 _mm_store_si128((__m128i*)dst, d);
// Scalar form with an explicit multiply by a, one sample per iteration for
// exactly `repeat` samples: first loop subtracts, second loop adds.
239 for (
ui32 i = repeat; i > 0; --i)
240 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
242 for (
ui32 i = repeat; i > 0; --i)
243 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
// Reversible (integer) vertical lifting step on 64-bit samples using SSE2.
// Same structure as the 32-bit variant but on aug->i64 / sig->i64 /
// other->i64, two samples per vector pass. The arithmetic right shift goes
// through sse2_mm_srai_epi64 with the mask `ve`.
// General form (scalar loops at the end):
//   dst[k] -= / += (b + a * (src1[k] + src2[k])) >> e.
// NOTE(review): this listing looks line-filtered -- the leading integers
// appear to be original line numbers, and the braces, the declaration of the
// vector-loop counter `i`, and most branch conditions are not shown. Which
// loop of each pair runs is presumably decided by `synthesis` -- confirm
// against the full file.
249 void sse2_rev_vert_step64(
const lifting_step* s,
const line_buf* sig,
250 const line_buf* other,
const line_buf* aug,
251 ui32 repeat,
bool synthesis)
// Coefficients widened to 64 bits; b is broadcast to both lanes.
253 const si64 a = s->rev.Aatk;
254 const si64 b = s->rev.Batk;
255 const ui8 e = s->rev.Eatk;
256 __m128i vb = _mm_set1_epi64x(b);
// Mask with bit (63 - e) set, as required by sse2_mm_srai_epi64 for a shift
// by e.
257 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
// dst is written through the const line_buf descriptor `aug`.
259 si64* dst = aug->i64;
260 const si64* src1 = sig->i64, * src2 = other->i64;
// dst -= (b + (src1 + src2)) >> e   (no multiply by a)
268 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
270 __m128i s1 = _mm_load_si128((__m128i*)src1);
271 __m128i s2 = _mm_load_si128((__m128i*)src2);
272 __m128i d = _mm_load_si128((__m128i*)dst);
273 __m128i t = _mm_add_epi64(s1, s2);
274 __m128i v = _mm_add_epi64(vb, t);
275 __m128i w = sse2_mm_srai_epi64(v, e, ve);
276 d = _mm_sub_epi64(d, w);
277 _mm_store_si128((__m128i*)dst, d);
// dst += (b + (src1 + src2)) >> e
280 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
282 __m128i s1 = _mm_load_si128((__m128i*)src1);
283 __m128i s2 = _mm_load_si128((__m128i*)src2);
284 __m128i d = _mm_load_si128((__m128i*)dst);
285 __m128i t = _mm_add_epi64(s1, s2);
286 __m128i v = _mm_add_epi64(vb, t);
287 __m128i w = sse2_mm_srai_epi64(v, e, ve);
288 d = _mm_add_epi64(d, w);
289 _mm_store_si128((__m128i*)dst, d);
// Special case a == -1, b == 1, e == 1: the loops below shift the plain sum
// without adding b, with add/sub swapped relative to the loops above.
292 else if (a == -1 && b == 1 && e == 1)
// dst += (src1 + src2) >> e
296 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
298 __m128i s1 = _mm_load_si128((__m128i*)src1);
299 __m128i s2 = _mm_load_si128((__m128i*)src2);
300 __m128i d = _mm_load_si128((__m128i*)dst);
301 __m128i t = _mm_add_epi64(s1, s2);
302 __m128i w = sse2_mm_srai_epi64(t, e, ve);
303 d = _mm_add_epi64(d, w);
304 _mm_store_si128((__m128i*)dst, d);
// dst -= (src1 + src2) >> e
307 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
309 __m128i s1 = _mm_load_si128((__m128i*)src1);
310 __m128i s2 = _mm_load_si128((__m128i*)src2);
311 __m128i d = _mm_load_si128((__m128i*)dst);
312 __m128i t = _mm_add_epi64(s1, s2);
313 __m128i w = sse2_mm_srai_epi64(t, e, ve);
314 d = _mm_sub_epi64(d, w);
315 _mm_store_si128((__m128i*)dst, d);
// dst -= (b - (src1 + src2)) >> e   (sum is subtracted from b)
322 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
324 __m128i s1 = _mm_load_si128((__m128i*)src1);
325 __m128i s2 = _mm_load_si128((__m128i*)src2);
326 __m128i d = _mm_load_si128((__m128i*)dst);
327 __m128i t = _mm_add_epi64(s1, s2);
328 __m128i v = _mm_sub_epi64(vb, t);
329 __m128i w = sse2_mm_srai_epi64(v, e, ve);
330 d = _mm_sub_epi64(d, w);
331 _mm_store_si128((__m128i*)dst, d);
// dst += (b - (src1 + src2)) >> e
334 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
336 __m128i s1 = _mm_load_si128((__m128i*)src1);
337 __m128i s2 = _mm_load_si128((__m128i*)src2);
338 __m128i d = _mm_load_si128((__m128i*)dst);
339 __m128i t = _mm_add_epi64(s1, s2);
340 __m128i v = _mm_sub_epi64(vb, t);
341 __m128i w = sse2_mm_srai_epi64(v, e, ve);
342 d = _mm_add_epi64(d, w);
343 _mm_store_si128((__m128i*)dst, d);
// Scalar form with an explicit multiply by a, one sample per iteration for
// exactly `repeat` samples: first loop subtracts, second loop adds.
349 for (
ui32 i = repeat; i > 0; --i)
350 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
352 for (
ui32 i = repeat; i > 0; --i)
353 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
359 const line_buf* other,
const line_buf* aug,
360 ui32 repeat,
bool synthesis)
// Fragment of a dispatcher: the start of its signature and the conditions
// that choose between the two calls are not visible in this listing. Both
// calls forward the same six arguments unchanged, to the 32-bit and to the
// 64-bit vertical lifting step respectively.
369 sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
376 sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
// Reversible (integer) horizontal analysis on one line of 32-bit samples
// using SSE2: `src` is deinterleaved into ldst/hdst, then the lifting steps
// of `atk` are applied in place, walking the steps from the last to the
// first.
// NOTE(review): this listing looks line-filtered -- the leading integers
// appear to be original line numbers; braces, the declarations of `w`, `s`,
// `a`, `sp`, `dp`, and most branch conditions are not shown. Choice between
// the "sp + 1" and "sp - 1" loop of each pair presumably depends on `even`
// -- confirm against the full file.
382 void sse2_rev_horz_ana32(
const param_atk* atk,
const line_buf* ldst,
383 const line_buf* hdst,
const line_buf* src,
384 ui32 width,
bool even)
// Deinterleave src into the two destination lines; `even` decides which
// destination receives the even-indexed samples. The f32 views are passed to
// a lane-shuffling helper; the same buffers are read as i32 right after.
390 float* dpl = even ? ldst->f32 : hdst->f32;
391 float* dph = even ? hdst->f32 : ldst->f32;
392 float* sp = src->f32;
394 sse2_deinterleave32(dpl, dph, sp, w);
// Working pointers and lengths: l_width gets the extra sample of an
// odd-length line when `even` is true, h_width when it is false.
397 si32* hp = hdst->i32, * lp = ldst->i32;
398 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
399 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
400 ui32 num_steps = atk->get_num_steps();
// One pass per lifting step, j counting down from num_steps to 1.
401 for (
ui32 j = num_steps; j > 0; --j)
406 const si32 b = s->rev.Batk;
407 const ui8 e = s->rev.Eatk;
408 __m128i vb = _mm_set1_epi32(b);
// Extend the line by one sample on the right by repeating the last sample,
// so reads one past the end are defined.
412 lp[l_width] = lp[l_width - 1];
418 int i = (int)h_width;
// dp += (b + (sp[k] + sp[k+1])) >> e; the neighbour load is unaligned.
421 for (; i > 0; i -= 4, sp += 4, dp += 4)
423 __m128i s1 = _mm_load_si128((__m128i*)sp);
424 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
425 __m128i d = _mm_load_si128((__m128i*)dp);
426 __m128i t = _mm_add_epi32(s1, s2);
427 __m128i v = _mm_add_epi32(vb, t);
428 __m128i w = _mm_srai_epi32(v, e);
429 d = _mm_add_epi32(d, w);
430 _mm_store_si128((__m128i*)dp, d);
// dp += (b + (sp[k] + sp[k-1])) >> e
435 for (; i > 0; i -= 4, sp += 4, dp += 4)
437 __m128i s1 = _mm_load_si128((__m128i*)sp);
438 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
439 __m128i d = _mm_load_si128((__m128i*)dp);
440 __m128i t = _mm_add_epi32(s1, s2);
441 __m128i v = _mm_add_epi32(vb, t);
442 __m128i w = _mm_srai_epi32(v, e);
443 d = _mm_add_epi32(d, w);
444 _mm_store_si128((__m128i*)dp, d);
// Special case a == -1, b == 1, e == 1: subtract the shifted plain sum
// instead of adding a value that includes b.
448 else if (a == -1 && b == 1 && e == 1)
450 int i = (int)h_width;
// dp -= (sp[k] + sp[k+1]) >> e
452 for (; i > 0; i -= 4, sp += 4, dp += 4)
454 __m128i s1 = _mm_load_si128((__m128i*)sp);
455 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
456 __m128i d = _mm_load_si128((__m128i*)dp);
457 __m128i t = _mm_add_epi32(s1, s2);
458 __m128i w = _mm_srai_epi32(t, e);
459 d = _mm_sub_epi32(d, w);
460 _mm_store_si128((__m128i*)dp, d);
// dp -= (sp[k] + sp[k-1]) >> e
463 for (; i > 0; i -= 4, sp += 4, dp += 4)
465 __m128i s1 = _mm_load_si128((__m128i*)sp);
466 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
467 __m128i d = _mm_load_si128((__m128i*)dp);
468 __m128i t = _mm_add_epi32(s1, s2);
469 __m128i w = _mm_srai_epi32(t, e);
470 d = _mm_sub_epi32(d, w);
471 _mm_store_si128((__m128i*)dp, d);
476 int i = (int)h_width;
// dp += (b - (sp[k] + sp[k+1])) >> e   (sum is subtracted from b)
478 for (; i > 0; i -= 4, sp += 4, dp += 4)
480 __m128i s1 = _mm_load_si128((__m128i*)sp);
481 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
482 __m128i d = _mm_load_si128((__m128i*)dp);
483 __m128i t = _mm_add_epi32(s1, s2);
484 __m128i v = _mm_sub_epi32(vb, t);
485 __m128i w = _mm_srai_epi32(v, e);
486 d = _mm_add_epi32(d, w);
487 _mm_store_si128((__m128i*)dp, d);
// dp += (b - (sp[k] + sp[k-1])) >> e
490 for (; i > 0; i -= 4, sp += 4, dp += 4)
492 __m128i s1 = _mm_load_si128((__m128i*)sp);
493 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
494 __m128i d = _mm_load_si128((__m128i*)dp);
495 __m128i t = _mm_add_epi32(s1, s2);
496 __m128i v = _mm_sub_epi32(vb, t);
497 __m128i w = _mm_srai_epi32(v, e);
498 d = _mm_add_epi32(d, w);
499 _mm_store_si128((__m128i*)dp, d);
// Scalar form with an explicit multiply by a, one sample per iteration for
// h_width samples; the two loops differ only in which neighbour is paired
// with sp[0].
506 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
507 *dp += (b + a * (sp[0] + sp[1])) >> e;
509 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
510 *dp += (b + a * (sp[-1] + sp[0])) >> e;
// Swap the roles (and lengths) of the two lines for the next step.
514 si32* t = lp; lp = hp; hp = t;
516 ui32 w = l_width; l_width = h_width; h_width = w;
// Single-sample handling: copy the sample to ldst, or store it doubled in
// hdst. NOTE(review): presumably the width <= 1 path, selected by `even` --
// the guarding conditions are not visible here.
521 ldst->i32[0] = src->i32[0];
523 hdst->i32[0] = src->i32[0] << 1;
// Reversible (integer) horizontal analysis on one line of 64-bit samples
// using SSE2: `src` is deinterleaved into ldst/hdst, then the lifting steps
// of `atk` are applied in place, walking the steps from the last to the
// first. Two samples are handled per vector pass, and the arithmetic shift
// goes through sse2_mm_srai_epi64 with the mask `ve`.
// NOTE(review): this listing looks line-filtered -- the leading integers
// appear to be original line numbers; braces, the declarations of `w`, `s`,
// `a`, `sp`, `dp`, and most branch conditions are not shown. Choice between
// the "sp + 1" and "sp - 1" loop of each pair presumably depends on `even`
// -- confirm against the full file.
529 void sse2_rev_horz_ana64(
const param_atk* atk,
const line_buf* ldst,
530 const line_buf* hdst,
const line_buf* src,
531 ui32 width,
bool even)
// Deinterleave src into the two destination lines; `even` decides which
// destination receives the even-indexed samples.
537 void* dpl = even ? ldst->p : hdst->p;
538 void* dph = even ? hdst->p : ldst->p;
539 const void* sp = src->p;
541 sse2_deinterleave64(dpl, dph, sp, w);
// Working pointers and lengths: l_width gets the extra sample of an
// odd-length line when `even` is true, h_width when it is false.
544 si64* hp = hdst->i64, * lp = ldst->i64;
545 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
546 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
547 ui32 num_steps = atk->get_num_steps();
// One pass per lifting step, j counting down from num_steps to 1.
548 for (
ui32 j = num_steps; j > 0; --j)
// b is held as a 32-bit value here and widened when broadcast to the two
// 64-bit lanes; ve is the bit-(63 - e) mask sse2_mm_srai_epi64 needs.
553 const si32 b = s->rev.Batk;
554 const ui8 e = s->rev.Eatk;
555 __m128i vb = _mm_set1_epi64x(b);
556 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
// Extend the line by one sample on the right by repeating the last sample.
560 lp[l_width] = lp[l_width - 1];
566 int i = (int)h_width;
// dp += (b + (sp[k] + sp[k+1])) >> e; the neighbour load is unaligned.
569 for (; i > 0; i -= 2, sp += 2, dp += 2)
571 __m128i s1 = _mm_load_si128((__m128i*)sp);
572 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
573 __m128i d = _mm_load_si128((__m128i*)dp);
574 __m128i t = _mm_add_epi64(s1, s2);
575 __m128i v = _mm_add_epi64(vb, t);
576 __m128i w = sse2_mm_srai_epi64(v, e, ve);
577 d = _mm_add_epi64(d, w);
578 _mm_store_si128((__m128i*)dp, d);
// dp += (b + (sp[k] + sp[k-1])) >> e
583 for (; i > 0; i -= 2, sp += 2, dp += 2)
585 __m128i s1 = _mm_load_si128((__m128i*)sp);
586 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
587 __m128i d = _mm_load_si128((__m128i*)dp);
588 __m128i t = _mm_add_epi64(s1, s2);
589 __m128i v = _mm_add_epi64(vb, t);
590 __m128i w = sse2_mm_srai_epi64(v, e, ve);
591 d = _mm_add_epi64(d, w);
592 _mm_store_si128((__m128i*)dp, d);
// Special case a == -1, b == 1, e == 1: subtract the shifted plain sum
// instead of adding a value that includes b.
596 else if (a == -1 && b == 1 && e == 1)
598 int i = (int)h_width;
// dp -= (sp[k] + sp[k+1]) >> e
600 for (; i > 0; i -= 2, sp += 2, dp += 2)
602 __m128i s1 = _mm_load_si128((__m128i*)sp);
603 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
604 __m128i d = _mm_load_si128((__m128i*)dp);
605 __m128i t = _mm_add_epi64(s1, s2);
606 __m128i w = sse2_mm_srai_epi64(t, e, ve);
607 d = _mm_sub_epi64(d, w);
608 _mm_store_si128((__m128i*)dp, d);
// dp -= (sp[k] + sp[k-1]) >> e
611 for (; i > 0; i -= 2, sp += 2, dp += 2)
613 __m128i s1 = _mm_load_si128((__m128i*)sp);
614 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
615 __m128i d = _mm_load_si128((__m128i*)dp);
616 __m128i t = _mm_add_epi64(s1, s2);
617 __m128i w = sse2_mm_srai_epi64(t, e, ve);
618 d = _mm_sub_epi64(d, w);
619 _mm_store_si128((__m128i*)dp, d);
624 int i = (int)h_width;
// dp += (b - (sp[k] + sp[k+1])) >> e   (sum is subtracted from b)
626 for (; i > 0; i -= 2, sp += 2, dp += 2)
628 __m128i s1 = _mm_load_si128((__m128i*)sp);
629 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
630 __m128i d = _mm_load_si128((__m128i*)dp);
631 __m128i t = _mm_add_epi64(s1, s2);
632 __m128i v = _mm_sub_epi64(vb, t);
633 __m128i w = sse2_mm_srai_epi64(v, e, ve);
634 d = _mm_add_epi64(d, w);
635 _mm_store_si128((__m128i*)dp, d);
// dp += (b - (sp[k] + sp[k-1])) >> e
638 for (; i > 0; i -= 2, sp += 2, dp += 2)
640 __m128i s1 = _mm_load_si128((__m128i*)sp);
641 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
642 __m128i d = _mm_load_si128((__m128i*)dp);
643 __m128i t = _mm_add_epi64(s1, s2);
644 __m128i v = _mm_sub_epi64(vb, t);
645 __m128i w = sse2_mm_srai_epi64(v, e, ve);
646 d = _mm_add_epi64(d, w);
647 _mm_store_si128((__m128i*)dp, d);
// Scalar form with an explicit multiply by a, one sample per iteration for
// h_width samples; the two loops differ only in which neighbour is paired
// with sp[0].
654 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
655 *dp += (b + a * (sp[0] + sp[1])) >> e;
657 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
658 *dp += (b + a * (sp[-1] + sp[0])) >> e;
// Swap the roles (and lengths) of the two lines for the next step.
662 si64* t = lp; lp = hp; hp = t;
664 ui32 w = l_width; l_width = h_width; h_width = w;
// Single-sample handling: copy the sample to ldst, or store it doubled in
// hdst. NOTE(review): presumably the width <= 1 path, selected by `even` --
// the guarding conditions are not visible here.
669 ldst->i64[0] = src->i64[0];
671 hdst->i64[0] = src->i64[0] << 1;
677 const line_buf* hdst,
const line_buf* src,
678 ui32 width,
bool even)
// Fragment of a dispatcher: the start of its signature and the conditions
// that choose between the two calls are not visible in this listing. Both
// calls forward the same six arguments unchanged, to the 32-bit and to the
// 64-bit horizontal analysis respectively.
684 sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
691 sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
// Reversible (integer) horizontal synthesis on one line of 32-bit samples
// using SSE2: the lifting steps of `atk` are applied in place on lsrc/hsrc,
// walking the steps from the first to the last and subtracting what
// analysis added, then the two lines are interleaved into `dst`.
// NOTE(review): this listing looks line-filtered -- the leading integers
// appear to be original line numbers; braces, the declarations of `s`, `a`,
// `dp`, `w`, and most branch conditions are not shown. Choice between the
// "sp - 1" and "sp + 1" loop of each pair presumably depends on `even` --
// confirm against the full file.
696 void sse2_rev_horz_syn32(
const param_atk* atk,
const line_buf* dst,
697 const line_buf* lsrc,
const line_buf* hsrc,
698 ui32 width,
bool even)
// `aug` is the line being updated in a step and `oth` the line it is
// updated from; aug_width gets the extra sample of an odd-length line when
// `even` is true, oth_width when it is false.
703 si32* oth = hsrc->i32, * aug = lsrc->i32;
704 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
705 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
706 ui32 num_steps = atk->get_num_steps();
// One pass per lifting step, j counting up from 0.
707 for (
ui32 j = 0; j < num_steps; ++j)
711 const si32 b = s->rev.Batk;
712 const ui8 e = s->rev.Eatk;
713 __m128i vb = _mm_set1_epi32(b);
// Extend `oth` by one sample on the right by repeating its last sample.
717 oth[oth_width] = oth[oth_width - 1];
719 const si32* sp = oth;
723 int i = (int)aug_width;
// dp -= (b + (sp[k] + sp[k-1])) >> e; the neighbour load is unaligned.
726 for (; i > 0; i -= 4, sp += 4, dp += 4)
728 __m128i s1 = _mm_load_si128((__m128i*)sp);
729 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
730 __m128i d = _mm_load_si128((__m128i*)dp);
731 __m128i t = _mm_add_epi32(s1, s2);
732 __m128i v = _mm_add_epi32(vb, t);
733 __m128i w = _mm_srai_epi32(v, e);
734 d = _mm_sub_epi32(d, w);
735 _mm_store_si128((__m128i*)dp, d);
// dp -= (b + (sp[k] + sp[k+1])) >> e
740 for (; i > 0; i -= 4, sp += 4, dp += 4)
742 __m128i s1 = _mm_load_si128((__m128i*)sp);
743 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
744 __m128i d = _mm_load_si128((__m128i*)dp);
745 __m128i t = _mm_add_epi32(s1, s2);
746 __m128i v = _mm_add_epi32(vb, t);
747 __m128i w = _mm_srai_epi32(v, e);
748 d = _mm_sub_epi32(d, w);
749 _mm_store_si128((__m128i*)dp, d);
// Special case a == -1, b == 1, e == 1: add the shifted plain sum instead
// of subtracting a value that includes b.
753 else if (a == -1 && b == 1 && e == 1)
755 int i = (int)aug_width;
// dp += (sp[k] + sp[k-1]) >> e
757 for (; i > 0; i -= 4, sp += 4, dp += 4)
759 __m128i s1 = _mm_load_si128((__m128i*)sp);
760 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
761 __m128i d = _mm_load_si128((__m128i*)dp);
762 __m128i t = _mm_add_epi32(s1, s2);
763 __m128i w = _mm_srai_epi32(t, e);
764 d = _mm_add_epi32(d, w);
765 _mm_store_si128((__m128i*)dp, d);
// dp += (sp[k] + sp[k+1]) >> e
768 for (; i > 0; i -= 4, sp += 4, dp += 4)
770 __m128i s1 = _mm_load_si128((__m128i*)sp);
771 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
772 __m128i d = _mm_load_si128((__m128i*)dp);
773 __m128i t = _mm_add_epi32(s1, s2);
774 __m128i w = _mm_srai_epi32(t, e);
775 d = _mm_add_epi32(d, w);
776 _mm_store_si128((__m128i*)dp, d);
781 int i = (int)aug_width;
// dp -= (b - (sp[k] + sp[k-1])) >> e   (sum is subtracted from b)
783 for (; i > 0; i -= 4, sp += 4, dp += 4)
785 __m128i s1 = _mm_load_si128((__m128i*)sp);
786 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
787 __m128i d = _mm_load_si128((__m128i*)dp);
788 __m128i t = _mm_add_epi32(s1, s2);
789 __m128i v = _mm_sub_epi32(vb, t);
790 __m128i w = _mm_srai_epi32(v, e);
791 d = _mm_sub_epi32(d, w);
792 _mm_store_si128((__m128i*)dp, d);
// dp -= (b - (sp[k] + sp[k+1])) >> e
795 for (; i > 0; i -= 4, sp += 4, dp += 4)
797 __m128i s1 = _mm_load_si128((__m128i*)sp);
798 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
799 __m128i d = _mm_load_si128((__m128i*)dp);
800 __m128i t = _mm_add_epi32(s1, s2);
801 __m128i v = _mm_sub_epi32(vb, t);
802 __m128i w = _mm_srai_epi32(v, e);
803 d = _mm_sub_epi32(d, w);
804 _mm_store_si128((__m128i*)dp, d);
// Scalar form with an explicit multiply by a, one sample per iteration for
// aug_width samples; the two loops differ only in which neighbour is paired
// with sp[0].
813 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
814 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
816 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
817 *dp -= (b + a * (sp[0] + sp[1])) >> e;
// Swap the roles (and lengths) of the two lines for the next step.
821 si32* t = aug; aug = oth; oth = t;
823 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// Interleave the two source lines into dst; `even` decides which source
// supplies the even-indexed output samples. The f32 views are passed to a
// lane-moving helper.
828 float* dp = dst->f32;
829 float* spl = even ? lsrc->f32 : hsrc->f32;
830 float* sph = even ? hsrc->f32 : lsrc->f32;
832 sse2_interleave32(dp, spl, sph, w);
// Single-sample handling: take the sample from lsrc, or the halved sample
// from hsrc. NOTE(review): presumably the width <= 1 path, selected by
// `even` -- the guarding conditions are not visible here.
837 dst->i32[0] = lsrc->i32[0];
839 dst->i32[0] = hsrc->i32[0] >> 1;
// Reversible (integer) horizontal synthesis on one line of 64-bit samples
// using SSE2: the lifting steps of `atk` are applied in place on lsrc/hsrc,
// walking the steps from the first to the last, then the two lines are
// interleaved into `dst`. Two samples are handled per vector pass, and the
// arithmetic shift goes through sse2_mm_srai_epi64 with the mask `ve`.
// NOTE(review): this listing looks line-filtered -- the leading integers
// appear to be original line numbers; braces, the declarations of `s`, `a`,
// `dp`, `w`, and most branch conditions are not shown. Choice between the
// "sp - 1" and "sp + 1" loop of each pair presumably depends on `even` --
// confirm against the full file.
844 void sse2_rev_horz_syn64(
const param_atk* atk,
const line_buf* dst,
845 const line_buf* lsrc,
const line_buf* hsrc,
846 ui32 width,
bool even)
// `aug` is the line being updated in a step and `oth` the line it is
// updated from; aug_width gets the extra sample of an odd-length line when
// `even` is true, oth_width when it is false.
851 si64* oth = hsrc->i64, * aug = lsrc->i64;
852 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
853 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
854 ui32 num_steps = atk->get_num_steps();
// One pass per lifting step, j counting up from 0.
855 for (
ui32 j = 0; j < num_steps; ++j)
// b is held as a 32-bit value here and widened when broadcast to the two
// 64-bit lanes; ve is the bit-(63 - e) mask sse2_mm_srai_epi64 needs.
859 const si32 b = s->rev.Batk;
860 const ui8 e = s->rev.Eatk;
861 __m128i vb = _mm_set1_epi64x(b);
862 __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
// Extend `oth` by one sample on the right by repeating its last sample.
866 oth[oth_width] = oth[oth_width - 1];
868 const si64* sp = oth;
872 int i = (int)aug_width;
// dp -= (b + (sp[k] + sp[k-1])) >> e; the neighbour load is unaligned.
875 for (; i > 0; i -= 2, sp += 2, dp += 2)
877 __m128i s1 = _mm_load_si128((__m128i*)sp);
878 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
879 __m128i d = _mm_load_si128((__m128i*)dp);
880 __m128i t = _mm_add_epi64(s1, s2);
881 __m128i v = _mm_add_epi64(vb, t);
882 __m128i w = sse2_mm_srai_epi64(v, e, ve);
883 d = _mm_sub_epi64(d, w);
884 _mm_store_si128((__m128i*)dp, d);
// dp -= (b + (sp[k] + sp[k+1])) >> e
889 for (; i > 0; i -= 2, sp += 2, dp += 2)
891 __m128i s1 = _mm_load_si128((__m128i*)sp);
892 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
893 __m128i d = _mm_load_si128((__m128i*)dp);
894 __m128i t = _mm_add_epi64(s1, s2);
895 __m128i v = _mm_add_epi64(vb, t);
896 __m128i w = sse2_mm_srai_epi64(v, e, ve);
897 d = _mm_sub_epi64(d, w);
898 _mm_store_si128((__m128i*)dp, d);
// Special case a == -1, b == 1, e == 1: add the shifted plain sum instead
// of subtracting a value that includes b.
902 else if (a == -1 && b == 1 && e == 1)
904 int i = (int)aug_width;
// dp += (sp[k] + sp[k-1]) >> e
906 for (; i > 0; i -= 2, sp += 2, dp += 2)
908 __m128i s1 = _mm_load_si128((__m128i*)sp);
909 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
910 __m128i d = _mm_load_si128((__m128i*)dp);
911 __m128i t = _mm_add_epi64(s1, s2);
912 __m128i w = sse2_mm_srai_epi64(t, e, ve);
913 d = _mm_add_epi64(d, w);
914 _mm_store_si128((__m128i*)dp, d);
// dp += (sp[k] + sp[k+1]) >> e
917 for (; i > 0; i -= 2, sp += 2, dp += 2)
919 __m128i s1 = _mm_load_si128((__m128i*)sp);
920 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
921 __m128i d = _mm_load_si128((__m128i*)dp);
922 __m128i t = _mm_add_epi64(s1, s2);
923 __m128i w = sse2_mm_srai_epi64(t, e, ve);
924 d = _mm_add_epi64(d, w);
925 _mm_store_si128((__m128i*)dp, d);
930 int i = (int)aug_width;
// dp -= (b - (sp[k] + sp[k-1])) >> e   (sum is subtracted from b)
932 for (; i > 0; i -= 2, sp += 2, dp += 2)
934 __m128i s1 = _mm_load_si128((__m128i*)sp);
935 __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
936 __m128i d = _mm_load_si128((__m128i*)dp);
937 __m128i t = _mm_add_epi64(s1, s2);
938 __m128i v = _mm_sub_epi64(vb, t);
939 __m128i w = sse2_mm_srai_epi64(v, e, ve);
940 d = _mm_sub_epi64(d, w);
941 _mm_store_si128((__m128i*)dp, d);
// dp -= (b - (sp[k] + sp[k+1])) >> e
944 for (; i > 0; i -= 2, sp += 2, dp += 2)
946 __m128i s1 = _mm_load_si128((__m128i*)sp);
947 __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
948 __m128i d = _mm_load_si128((__m128i*)dp);
949 __m128i t = _mm_add_epi64(s1, s2);
950 __m128i v = _mm_sub_epi64(vb, t);
951 __m128i w = sse2_mm_srai_epi64(v, e, ve);
952 d = _mm_sub_epi64(d, w);
953 _mm_store_si128((__m128i*)dp, d);
// Scalar form with an explicit multiply by a, one sample per iteration for
// aug_width samples; the two loops differ only in which neighbour is paired
// with sp[0].
960 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
961 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
963 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
964 *dp -= (b + a * (sp[0] + sp[1])) >> e;
// Swap the roles (and lengths) of the two lines for the next step.
968 si64* t = aug; aug = oth; oth = t;
970 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// Interleave the two source lines into the destination; `even` decides
// which source supplies the even-indexed output samples.
976 const void* spl = even ? lsrc->p : hsrc->p;
977 const void* sph = even ? hsrc->p : lsrc->p;
979 sse2_interleave64(dp, spl, sph, w);
// Single-sample handling: take the sample from lsrc, or the halved sample
// from hsrc. NOTE(review): presumably the width <= 1 path, selected by
// `even` -- the guarding conditions are not visible here.
984 dst->i64[0] = lsrc->i64[0];
986 dst->i64[0] = hsrc->i64[0] >> 1;
992 const line_buf* lsrc,
const line_buf* hsrc,
993 ui32 width,
bool even)
// Fragment of a dispatcher: the start of its signature and the conditions
// that choose between the two calls are not visible in this listing. Both
// calls forward the same six arguments unchanged, to the 32-bit and to the
// 64-bit horizontal synthesis respectively.
999 sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1006 sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
void sse2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void sse2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)