57 for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
59 v128_t a = wasm_v128_load(sp);
60 v128_t b = wasm_v128_load(sp + 4);
61 v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2);
62 v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3);
65 wasm_v128_store(dpl, c);
66 wasm_v128_store(dph, d);
74 for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
76 v128_t a = wasm_v128_load(spl);
77 v128_t b = wasm_v128_load(sph);
78 v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1);
79 v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3);
82 wasm_v128_store(dp, c);
83 wasm_v128_store(dp + 4, d);
168 ui32 width,
bool even)
174 float* dpl = even ? ldst->
f32 : hdst->
f32;
175 float* dph = even ? hdst->
f32 : ldst->
f32;
176 float* sp = src->
f32;
182 float* hp = hdst->
f32, * lp = ldst->
f32;
183 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
184 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
186 for (
ui32 j = num_steps; j > 0; --j)
193 lp[l_width] = lp[l_width - 1];
195 const float* sp = lp;
197 int i = (int)h_width;
198 v128_t f = wasm_f32x4_splat(a);
201 for (; i > 0; i -= 4, sp += 4, dp += 4)
203 v128_t m = wasm_v128_load(sp);
204 v128_t n = wasm_v128_load(sp + 1);
205 v128_t p = wasm_v128_load(dp);
206 p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
207 wasm_v128_store(dp, p);
212 for (; i > 0; i -= 4, sp += 4, dp += 4)
214 v128_t m = wasm_v128_load(sp);
215 v128_t n = wasm_v128_load(sp - 1);
216 v128_t p = wasm_v128_load(dp);
217 p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
218 wasm_v128_store(dp, p);
223 float* t = lp; lp = hp; hp = t;
225 ui32 w = l_width; l_width = h_width; h_width = w;
229 float K = atk->
get_K();
230 float K_inv = 1.0f / K;
237 ldst->
f32[0] = src->
f32[0];
239 hdst->
f32[0] = src->
f32[0] * 2.0f;
246 ui32 width,
bool even)
251 float* oth = hsrc->
f32, * aug = lsrc->
f32;
252 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
253 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
256 float K = atk->
get_K();
257 float K_inv = 1.0f / K;
264 for (
ui32 j = 0; j < num_steps; ++j)
271 oth[oth_width] = oth[oth_width - 1];
273 const float* sp = oth;
275 int i = (int)aug_width;
276 v128_t f = wasm_f32x4_splat(a);
279 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
281 v128_t m = wasm_v128_load(sp);
282 v128_t n = wasm_v128_load(sp - 1);
283 v128_t p = wasm_v128_load(dp);
284 p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
285 wasm_v128_store(dp, p);
290 for ( ; i > 0; i -= 4, sp += 4, dp += 4)
292 v128_t m = wasm_v128_load(sp);
293 v128_t n = wasm_v128_load(sp + 1);
294 v128_t p = wasm_v128_load(dp);
295 p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
296 wasm_v128_store(dp, p);
301 float* t = aug; aug = oth; oth = t;
303 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
308 float* dp = dst->
f32;
309 float* spl = even ? lsrc->
f32 : hsrc->
f32;
310 float* sph = even ? hsrc->
f32 : lsrc->
f32;
317 dst->
f32[0] = lsrc->
f32[0];
319 dst->
f32[0] = hsrc->
f32[0] * 0.5f;
326 ui32 repeat,
bool synthesis)
331 v128_t va = wasm_i32x4_splat(a);
332 v128_t vb = wasm_i32x4_splat(b);
335 const si32* src1 = sig->
i32, * src2 = other->
i32;
343 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
348 v128_t t = wasm_i32x4_add(s1, s2);
349 v128_t v = wasm_i32x4_add(vb, t);
350 v128_t w = wasm_i32x4_shr(v, e);
351 d = wasm_i32x4_sub(d, w);
352 wasm_v128_store((
v128_t*)dst, d);
355 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
360 v128_t t = wasm_i32x4_add(s1, s2);
361 v128_t v = wasm_i32x4_add(vb, t);
362 v128_t w = wasm_i32x4_shr(v, e);
363 d = wasm_i32x4_add(d, w);
364 wasm_v128_store((
v128_t*)dst, d);
367 else if (a == -1 && b == 1 && e == 1)
371 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
376 v128_t t = wasm_i32x4_add(s1, s2);
377 v128_t w = wasm_i32x4_shr(t, e);
378 d = wasm_i32x4_add(d, w);
379 wasm_v128_store((
v128_t*)dst, d);
382 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
387 v128_t t = wasm_i32x4_add(s1, s2);
388 v128_t w = wasm_i32x4_shr(t, e);
389 d = wasm_i32x4_sub(d, w);
390 wasm_v128_store((
v128_t*)dst, d);
397 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
402 v128_t t = wasm_i32x4_add(s1, s2);
403 v128_t v = wasm_i32x4_sub(vb, t);
404 v128_t w = wasm_i32x4_shr(v, e);
405 d = wasm_i32x4_sub(d, w);
406 wasm_v128_store((
v128_t*)dst, d);
409 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
414 v128_t t = wasm_i32x4_add(s1, s2);
415 v128_t v = wasm_i32x4_sub(vb, t);
416 v128_t w = wasm_i32x4_shr(v, e);
417 d = wasm_i32x4_add(d, w);
418 wasm_v128_store((
v128_t*)dst, d);
425 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
430 v128_t t = wasm_i32x4_add(s1, s2);
431 v128_t u = wasm_i32x4_mul(va, t);
432 v128_t v = wasm_i32x4_add(vb, u);
433 v128_t w = wasm_i32x4_shr(v, e);
434 d = wasm_i32x4_sub(d, w);
435 wasm_v128_store((
v128_t*)dst, d);
438 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
443 v128_t t = wasm_i32x4_add(s1, s2);
444 v128_t u = wasm_i32x4_mul(va, t);
445 v128_t v = wasm_i32x4_add(vb, u);
446 v128_t w = wasm_i32x4_shr(v, e);
447 d = wasm_i32x4_add(d, w);
448 wasm_v128_store((
v128_t*)dst, d);
456 ui32 repeat,
bool synthesis)
461 v128_t va = wasm_i64x2_splat(a);
462 v128_t vb = wasm_i64x2_splat(b);
465 const si64* src1 = sig->
i64, * src2 = other->
i64;
473 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
478 v128_t t = wasm_i64x2_add(s1, s2);
479 v128_t v = wasm_i64x2_add(vb, t);
480 v128_t w = wasm_i64x2_shr(v, e);
481 d = wasm_i64x2_sub(d, w);
482 wasm_v128_store((
v128_t*)dst, d);
485 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
490 v128_t t = wasm_i64x2_add(s1, s2);
491 v128_t v = wasm_i64x2_add(vb, t);
492 v128_t w = wasm_i64x2_shr(v, e);
493 d = wasm_i64x2_add(d, w);
494 wasm_v128_store((
v128_t*)dst, d);
497 else if (a == -1 && b == 1 && e == 1)
501 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
506 v128_t t = wasm_i64x2_add(s1, s2);
507 v128_t w = wasm_i64x2_shr(t, e);
508 d = wasm_i64x2_add(d, w);
509 wasm_v128_store((
v128_t*)dst, d);
512 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
517 v128_t t = wasm_i64x2_add(s1, s2);
518 v128_t w = wasm_i64x2_shr(t, e);
519 d = wasm_i64x2_sub(d, w);
520 wasm_v128_store((
v128_t*)dst, d);
527 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
532 v128_t t = wasm_i64x2_add(s1, s2);
533 v128_t v = wasm_i64x2_sub(vb, t);
534 v128_t w = wasm_i64x2_shr(v, e);
535 d = wasm_i64x2_sub(d, w);
536 wasm_v128_store((
v128_t*)dst, d);
539 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
544 v128_t t = wasm_i64x2_add(s1, s2);
545 v128_t v = wasm_i64x2_sub(vb, t);
546 v128_t w = wasm_i64x2_shr(v, e);
547 d = wasm_i64x2_add(d, w);
548 wasm_v128_store((
v128_t*)dst, d);
555 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
560 v128_t t = wasm_i64x2_add(s1, s2);
561 v128_t u = wasm_i64x2_mul(va, t);
562 v128_t v = wasm_i64x2_add(vb, u);
563 v128_t w = wasm_i64x2_shr(v, e);
564 d = wasm_i64x2_sub(d, w);
565 wasm_v128_store((
v128_t*)dst, d);
568 for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
573 v128_t t = wasm_i64x2_add(s1, s2);
574 v128_t u = wasm_i64x2_mul(va, t);
575 v128_t v = wasm_i64x2_add(vb, u);
576 v128_t w = wasm_i64x2_shr(v, e);
577 d = wasm_i64x2_add(d, w);
578 wasm_v128_store((
v128_t*)dst, d);
610 ui32 width,
bool even)
616 float* dpl = even ? ldst->
f32 : hdst->
f32;
617 float* dph = even ? hdst->
f32 : ldst->
f32;
618 float* sp = src->
f32;
624 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
625 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
627 for (
ui32 j = num_steps; j > 0; --j)
634 v128_t va = wasm_i32x4_splat(a);
635 v128_t vb = wasm_i32x4_splat(b);
639 lp[l_width] = lp[l_width - 1];
645 int i = (int)h_width;
648 for (; i > 0; i -= 4, sp += 4, dp += 4)
653 v128_t t = wasm_i32x4_add(s1, s2);
654 v128_t v = wasm_i32x4_add(vb, t);
655 v128_t w = wasm_i32x4_shr(v, e);
656 d = wasm_i32x4_add(d, w);
657 wasm_v128_store((
v128_t*)dp, d);
662 for (; i > 0; i -= 4, sp += 4, dp += 4)
667 v128_t t = wasm_i32x4_add(s1, s2);
668 v128_t v = wasm_i32x4_add(vb, t);
669 v128_t w = wasm_i32x4_shr(v, e);
670 d = wasm_i32x4_add(d, w);
671 wasm_v128_store((
v128_t*)dp, d);
675 else if (a == -1 && b == 1 && e == 1)
677 int i = (int)h_width;
679 for (; i > 0; i -= 4, sp += 4, dp += 4)
684 v128_t t = wasm_i32x4_add(s1, s2);
685 v128_t w = wasm_i32x4_shr(t, e);
686 d = wasm_i32x4_sub(d, w);
687 wasm_v128_store((
v128_t*)dp, d);
690 for (; i > 0; i -= 4, sp += 4, dp += 4)
695 v128_t t = wasm_i32x4_add(s1, s2);
696 v128_t w = wasm_i32x4_shr(t, e);
697 d = wasm_i32x4_sub(d, w);
698 wasm_v128_store((
v128_t*)dp, d);
703 int i = (int)h_width;
705 for (; i > 0; i -= 4, sp += 4, dp += 4)
710 v128_t t = wasm_i32x4_add(s1, s2);
711 v128_t v = wasm_i32x4_sub(vb, t);
712 v128_t w = wasm_i32x4_shr(v, e);
713 d = wasm_i32x4_add(d, w);
714 wasm_v128_store((
v128_t*)dp, d);
717 for (; i > 0; i -= 4, sp += 4, dp += 4)
722 v128_t t = wasm_i32x4_add(s1, s2);
723 v128_t v = wasm_i32x4_sub(vb, t);
724 v128_t w = wasm_i32x4_shr(v, e);
725 d = wasm_i32x4_add(d, w);
726 wasm_v128_store((
v128_t*)dp, d);
731 int i = (int)h_width;
733 for (; i > 0; i -= 4, sp += 4, dp += 4)
738 v128_t t = wasm_i32x4_add(s1, s2);
739 v128_t u = wasm_i32x4_mul(va, t);
740 v128_t v = wasm_i32x4_add(vb, u);
741 v128_t w = wasm_i32x4_shr(v, e);
742 d = wasm_i32x4_add(d, w);
743 wasm_v128_store((
v128_t*)dp, d);
746 for (; i > 0; i -= 4, sp += 4, dp += 4)
751 v128_t t = wasm_i32x4_add(s1, s2);
752 v128_t u = wasm_i32x4_mul(va, t);
753 v128_t v = wasm_i32x4_add(vb, u);
754 v128_t w = wasm_i32x4_shr(v, e);
755 d = wasm_i32x4_add(d, w);
756 wasm_v128_store((
v128_t*)dp, d);
761 si32* t = lp; lp = hp; hp = t;
763 ui32 w = l_width; l_width = h_width; h_width = w;
768 ldst->
i32[0] = src->
i32[0];
770 hdst->
i32[0] = src->
i32[0] << 1;
778 ui32 width,
bool even)
784 void* dpl = even ? ldst->
p : hdst->
p;
785 void* dph = even ? hdst->
p : ldst->
p;
786 const void* sp = src->
p;
792 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
793 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
795 for (
ui32 j = num_steps; j > 0; --j)
802 v128_t va = wasm_i64x2_splat(a);
803 v128_t vb = wasm_i64x2_splat(b);
807 lp[l_width] = lp[l_width - 1];
813 int i = (int)h_width;
816 for (; i > 0; i -= 2, sp += 2, dp += 2)
821 v128_t t = wasm_i64x2_add(s1, s2);
822 v128_t v = wasm_i64x2_add(vb, t);
823 v128_t w = wasm_i64x2_shr(v, e);
824 d = wasm_i64x2_add(d, w);
825 wasm_v128_store((
v128_t*)dp, d);
830 for (; i > 0; i -= 2, sp += 2, dp += 2)
835 v128_t t = wasm_i64x2_add(s1, s2);
836 v128_t v = wasm_i64x2_add(vb, t);
837 v128_t w = wasm_i64x2_shr(v, e);
838 d = wasm_i64x2_add(d, w);
839 wasm_v128_store((
v128_t*)dp, d);
843 else if (a == -1 && b == 1 && e == 1)
845 int i = (int)h_width;
847 for (; i > 0; i -= 2, sp += 2, dp += 2)
852 v128_t t = wasm_i64x2_add(s1, s2);
853 v128_t w = wasm_i64x2_shr(t, e);
854 d = wasm_i64x2_sub(d, w);
855 wasm_v128_store((
v128_t*)dp, d);
858 for (; i > 0; i -= 2, sp += 2, dp += 2)
863 v128_t t = wasm_i64x2_add(s1, s2);
864 v128_t w = wasm_i64x2_shr(t, e);
865 d = wasm_i64x2_sub(d, w);
866 wasm_v128_store((
v128_t*)dp, d);
871 int i = (int)h_width;
873 for (; i > 0; i -= 2, sp += 2, dp += 2)
878 v128_t t = wasm_i64x2_add(s1, s2);
879 v128_t v = wasm_i64x2_sub(vb, t);
880 v128_t w = wasm_i64x2_shr(v, e);
881 d = wasm_i64x2_add(d, w);
882 wasm_v128_store((
v128_t*)dp, d);
885 for (; i > 0; i -= 2, sp += 2, dp += 2)
890 v128_t t = wasm_i64x2_add(s1, s2);
891 v128_t v = wasm_i64x2_sub(vb, t);
892 v128_t w = wasm_i64x2_shr(v, e);
893 d = wasm_i64x2_add(d, w);
894 wasm_v128_store((
v128_t*)dp, d);
899 int i = (int)h_width;
901 for (; i > 0; i -= 2, sp += 2, dp += 2)
906 v128_t t = wasm_i64x2_add(s1, s2);
907 v128_t u = wasm_i64x2_mul(va, t);
908 v128_t v = wasm_i64x2_add(vb, u);
909 v128_t w = wasm_i64x2_shr(v, e);
910 d = wasm_i64x2_add(d, w);
911 wasm_v128_store((
v128_t*)dp, d);
914 for (; i > 0; i -= 2, sp += 2, dp += 2)
919 v128_t t = wasm_i64x2_add(s1, s2);
920 v128_t u = wasm_i64x2_mul(va, t);
921 v128_t v = wasm_i64x2_add(vb, u);
922 v128_t w = wasm_i64x2_shr(v, e);
923 d = wasm_i64x2_add(d, w);
924 wasm_v128_store((
v128_t*)dp, d);
929 si64* t = lp; lp = hp; hp = t;
931 ui32 w = l_width; l_width = h_width; h_width = w;
936 ldst->
i64[0] = src->
i64[0];
938 hdst->
i64[0] = src->
i64[0] << 1;
965 ui32 width,
bool even)
971 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
972 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
974 for (
ui32 j = 0; j < num_steps; ++j)
980 v128_t va = wasm_i32x4_splat(a);
981 v128_t vb = wasm_i32x4_splat(b);
985 oth[oth_width] = oth[oth_width - 1];
987 const si32* sp = oth;
991 int i = (int)aug_width;
994 for (; i > 0; i -= 4, sp += 4, dp += 4)
999 v128_t t = wasm_i32x4_add(s1, s2);
1000 v128_t v = wasm_i32x4_add(vb, t);
1001 v128_t w = wasm_i32x4_shr(v, e);
1002 d = wasm_i32x4_sub(d, w);
1003 wasm_v128_store((
v128_t*)dp, d);
1008 for (; i > 0; i -= 4, sp += 4, dp += 4)
1013 v128_t t = wasm_i32x4_add(s1, s2);
1014 v128_t v = wasm_i32x4_add(vb, t);
1015 v128_t w = wasm_i32x4_shr(v, e);
1016 d = wasm_i32x4_sub(d, w);
1017 wasm_v128_store((
v128_t*)dp, d);
1021 else if (a == -1 && b == 1 && e == 1)
1023 int i = (int)aug_width;
1025 for (; i > 0; i -= 4, sp += 4, dp += 4)
1030 v128_t t = wasm_i32x4_add(s1, s2);
1031 v128_t w = wasm_i32x4_shr(t, e);
1032 d = wasm_i32x4_add(d, w);
1033 wasm_v128_store((
v128_t*)dp, d);
1036 for (; i > 0; i -= 4, sp += 4, dp += 4)
1041 v128_t t = wasm_i32x4_add(s1, s2);
1042 v128_t w = wasm_i32x4_shr(t, e);
1043 d = wasm_i32x4_add(d, w);
1044 wasm_v128_store((
v128_t*)dp, d);
1049 int i = (int)aug_width;
1051 for (; i > 0; i -= 4, sp += 4, dp += 4)
1056 v128_t t = wasm_i32x4_add(s1, s2);
1057 v128_t v = wasm_i32x4_sub(vb, t);
1058 v128_t w = wasm_i32x4_shr(v, e);
1059 d = wasm_i32x4_sub(d, w);
1060 wasm_v128_store((
v128_t*)dp, d);
1063 for (; i > 0; i -= 4, sp += 4, dp += 4)
1068 v128_t t = wasm_i32x4_add(s1, s2);
1069 v128_t v = wasm_i32x4_sub(vb, t);
1070 v128_t w = wasm_i32x4_shr(v, e);
1071 d = wasm_i32x4_sub(d, w);
1072 wasm_v128_store((
v128_t*)dp, d);
1077 int i = (int)aug_width;
1079 for (; i > 0; i -= 4, sp += 4, dp += 4)
1084 v128_t t = wasm_i32x4_add(s1, s2);
1085 v128_t u = wasm_i32x4_mul(va, t);
1086 v128_t v = wasm_i32x4_add(vb, u);
1087 v128_t w = wasm_i32x4_shr(v, e);
1088 d = wasm_i32x4_sub(d, w);
1089 wasm_v128_store((
v128_t*)dp, d);
1092 for (; i > 0; i -= 4, sp += 4, dp += 4)
1097 v128_t t = wasm_i32x4_add(s1, s2);
1098 v128_t u = wasm_i32x4_mul(va, t);
1099 v128_t v = wasm_i32x4_add(vb, u);
1100 v128_t w = wasm_i32x4_shr(v, e);
1101 d = wasm_i32x4_sub(d, w);
1102 wasm_v128_store((
v128_t*)dp, d);
1107 si32* t = aug; aug = oth; oth = t;
1109 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1114 float* dp = dst->
f32;
1115 float* spl = even ? lsrc->
f32 : hsrc->
f32;
1116 float* sph = even ? hsrc->
f32 : lsrc->
f32;
1123 dst->
i32[0] = lsrc->
i32[0];
1125 dst->
i32[0] = hsrc->
i32[0] >> 1;
1132 ui32 width,
bool even)
1138 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
1139 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
1141 for (
ui32 j = 0; j < num_steps; ++j)
1147 v128_t va = wasm_i64x2_splat(a);
1148 v128_t vb = wasm_i64x2_splat(b);
1152 oth[oth_width] = oth[oth_width - 1];
1154 const si64* sp = oth;
1158 int i = (int)aug_width;
1161 for (; i > 0; i -= 2, sp += 2, dp += 2)
1166 v128_t t = wasm_i64x2_add(s1, s2);
1167 v128_t v = wasm_i64x2_add(vb, t);
1168 v128_t w = wasm_i64x2_shr(v, e);
1169 d = wasm_i64x2_sub(d, w);
1170 wasm_v128_store((
v128_t*)dp, d);
1175 for (; i > 0; i -= 2, sp += 2, dp += 2)
1180 v128_t t = wasm_i64x2_add(s1, s2);
1181 v128_t v = wasm_i64x2_add(vb, t);
1182 v128_t w = wasm_i64x2_shr(v, e);
1183 d = wasm_i64x2_sub(d, w);
1184 wasm_v128_store((
v128_t*)dp, d);
1188 else if (a == -1 && b == 1 && e == 1)
1190 int i = (int)aug_width;
1192 for (; i > 0; i -= 2, sp += 2, dp += 2)
1197 v128_t t = wasm_i64x2_add(s1, s2);
1198 v128_t w = wasm_i64x2_shr(t, e);
1199 d = wasm_i64x2_add(d, w);
1200 wasm_v128_store((
v128_t*)dp, d);
1203 for (; i > 0; i -= 2, sp += 2, dp += 2)
1208 v128_t t = wasm_i64x2_add(s1, s2);
1209 v128_t w = wasm_i64x2_shr(t, e);
1210 d = wasm_i64x2_add(d, w);
1211 wasm_v128_store((
v128_t*)dp, d);
1216 int i = (int)aug_width;
1218 for (; i > 0; i -= 2, sp += 2, dp += 2)
1223 v128_t t = wasm_i64x2_add(s1, s2);
1224 v128_t v = wasm_i64x2_sub(vb, t);
1225 v128_t w = wasm_i64x2_shr(v, e);
1226 d = wasm_i64x2_sub(d, w);
1227 wasm_v128_store((
v128_t*)dp, d);
1230 for (; i > 0; i -= 2, sp += 2, dp += 2)
1235 v128_t t = wasm_i64x2_add(s1, s2);
1236 v128_t v = wasm_i64x2_sub(vb, t);
1237 v128_t w = wasm_i64x2_shr(v, e);
1238 d = wasm_i64x2_sub(d, w);
1239 wasm_v128_store((
v128_t*)dp, d);
1244 int i = (int)aug_width;
1246 for (; i > 0; i -= 2, sp += 2, dp += 2)
1251 v128_t t = wasm_i64x2_add(s1, s2);
1252 v128_t u = wasm_i64x2_mul(va, t);
1253 v128_t v = wasm_i64x2_add(vb, u);
1254 v128_t w = wasm_i64x2_shr(v, e);
1255 d = wasm_i64x2_sub(d, w);
1256 wasm_v128_store((
v128_t*)dp, d);
1259 for (; i > 0; i -= 2, sp += 2, dp += 2)
1264 v128_t t = wasm_i64x2_add(s1, s2);
1265 v128_t u = wasm_i64x2_mul(va, t);
1266 v128_t v = wasm_i64x2_add(vb, u);
1267 v128_t w = wasm_i64x2_shr(v, e);
1268 d = wasm_i64x2_sub(d, w);
1269 wasm_v128_store((
v128_t*)dp, d);
1274 si64* t = aug; aug = oth; oth = t;
1276 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1282 const void* spl = even ? lsrc->
p : hsrc->
p;
1283 const void* sph = even ? hsrc->
p : lsrc->
p;
1290 dst->
i64[0] = lsrc->
i64[0];
1292 dst->
i64[0] = hsrc->
i64[0] >> 1;