39#if defined(OJPH_ARCH_X86_64)
//////////////////////////////////////////////////////////////////////////
// De-interleaves a row of 32-bit samples: the even-indexed samples of sp
// are written to dpl and the odd-indexed samples to dph.
//
//   dpl    destination for sp[0], sp[2], sp[4], ...
//   dph    destination for sp[1], sp[3], sp[5], ...
//   sp     interleaved source row
//   width  number of interleaved samples to process
//
// While more than 16 samples remain, 32 are consumed per iteration with
// 512-bit operations; whatever is left is handled by a single 256-bit
// pass that consumes 16.  The amount of data touched is therefore `width`
// rounded up to a multiple of 16, and all three buffers must extend that
// far.  Only aligned loads/stores are used: the pointers must be 64-byte
// aligned when width > 16, and 32-byte aligned otherwise.
void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width)
{
  // Lane selectors for permutex2var over the 32-lane pair {a, b}:
  // 0x00..0x0F address a, 0x10..0x1F address b.
  const __m512i idx1 = _mm512_set_epi32(
    0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10,
    0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
  );
  const __m512i idx2 = _mm512_set_epi32(
    0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11,
    0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
  );

  // 32 interleaved samples in, 16 + 16 out
  for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16)
  {
    __m512 a = _mm512_load_ps(sp);
    __m512 b = _mm512_load_ps(sp + 16);
    __m512 c = _mm512_permutex2var_ps(a, idx1, b);
    __m512 d = _mm512_permutex2var_ps(a, idx2, b);
    _mm512_store_ps(dpl, c);
    _mm512_store_ps(dph, d);
  }

  // Tail: 16 interleaved samples in, 8 + 8 out.  width <= 16 here, so
  // this loop body executes at most once.
  for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
  {
    __m256 a = _mm256_load_ps(sp);
    __m256 b = _mm256_load_ps(sp + 8);
    // c = {a.low128, b.low128}, d = {a.high128, b.high128}
    __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
    __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
    // per 128-bit lane: pick elements 0 and 2 (even) / 1 and 3 (odd)
    __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
    __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
    _mm256_store_ps(dpl, e);
    _mm256_store_ps(dph, f);
  }
}
//////////////////////////////////////////////////////////////////////////
// Interleaves two rows of 32-bit samples into one:
//   dp[2k] = spl[k],  dp[2k + 1] = sph[k]
//
//   dp     destination row (interleaved)
//   spl    source of the even-indexed output samples
//   sph    source of the odd-indexed output samples
//   width  number of interleaved (output) samples to produce
//
// While more than 16 output samples remain, 32 are produced per iteration
// with 512-bit operations; the remainder is produced by a single 256-bit
// pass that writes 16.  The amount of data touched is `width` rounded up
// to a multiple of 16, so the buffers must extend that far.  Only aligned
// loads/stores are used: the pointers must be 64-byte aligned when
// width > 16, and 32-byte aligned otherwise.
void avx512_interleave32(float* dp, float* spl, float* sph, int width)
{
  // Lane selectors for permutex2var over the 32-lane pair {a, b}:
  // 0x00..0x0F address a (from spl), 0x10..0x1F address b (from sph).
  const __m512i idx1 = _mm512_set_epi32(
    0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4,
    0x13, 0x3, 0x12, 0x2, 0x11, 0x1, 0x10, 0x0
  );
  const __m512i idx2 = _mm512_set_epi32(
    0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC,
    0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8
  );

  // 16 + 16 samples in, 32 interleaved samples out
  for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16)
  {
    __m512 a = _mm512_load_ps(spl);
    __m512 b = _mm512_load_ps(sph);
    __m512 c = _mm512_permutex2var_ps(a, idx1, b);
    __m512 d = _mm512_permutex2var_ps(a, idx2, b);
    _mm512_store_ps(dp, c);
    _mm512_store_ps(dp + 16, d);
  }

  // Tail: 8 + 8 samples in, 16 out.  width <= 16 here, so this loop body
  // executes at most once.
  for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
  {
    __m256 a = _mm256_load_ps(spl);
    __m256 b = _mm256_load_ps(sph);
    // unpack interleaves within each 128-bit lane ...
    __m256 c = _mm256_unpacklo_ps(a, b);
    __m256 d = _mm256_unpackhi_ps(a, b);
    // ... and the lane permutes put the four pieces in output order
    __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
    __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
    _mm256_store_ps(dp, e);
    _mm256_store_ps(dp + 8, f);
  }
}
//////////////////////////////////////////////////////////////////////////
// De-interleaves a row of 64-bit samples: the even-indexed samples of sp
// are written to dpl and the odd-indexed samples to dph.  The samples are
// only moved, never interpreted, which is why the buffers are passed as
// untyped pointers and advanced in bytes.
//
//   dpl    destination for samples 0, 2, 4, ... of sp
//   dph    destination for samples 1, 3, 5, ... of sp
//   sp     interleaved source row
//   width  number of interleaved 64-bit samples to process
//
// While more than 8 samples remain, 16 are consumed per iteration with
// 512-bit operations; the remainder is handled by a single 256-bit pass
// that consumes 8.  The amount of data touched is `width` rounded up to a
// multiple of 8 samples, so the buffers must extend that far.  Only
// aligned loads/stores are used: the pointers must be 64-byte aligned
// when width > 8, and 32-byte aligned otherwise.
static void avx512_deinterleave64(void* dpl, void* dph, const void* sp,
                                  int width)
{
  // Lane selectors for permutex2var over the 16-lane pair {a, b}:
  // 0x0..0x7 address a, 0x8..0xF address b.
  const __m512i idx1 = _mm512_set_epi64(
    0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
  );
  const __m512i idx2 = _mm512_set_epi64(
    0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
  );

  // 16 samples (128 bytes) in, 8 + 8 samples (64 + 64 bytes) out
  for (; width > 8; width -= 16,
         sp = (const char*)sp + 128,
         dpl = (char*)dpl + 64,
         dph = (char*)dph + 64)
  {
    __m512i a = _mm512_load_si512(sp);
    __m512i b = _mm512_load_si512((const char*)sp + 64);
    __m512i c = _mm512_permutex2var_epi64(a, idx1, b);
    __m512i d = _mm512_permutex2var_epi64(a, idx2, b);
    _mm512_store_si512(dpl, c);
    _mm512_store_si512(dph, d);
  }

  // Tail: 8 samples (64 bytes) in, 4 + 4 samples out.  width <= 8 here,
  // so this loop body executes at most once.
  for (; width > 0; width -= 8,
         sp = (const char*)sp + 64,
         dpl = (char*)dpl + 32,
         dph = (char*)dph + 32)
  {
    __m256i a = _mm256_load_si256((const __m256i*)sp);
    __m256i b = _mm256_load_si256((const __m256i*)((const char*)sp + 32));
    // c = {a.low128, b.low128}, d = {a.high128, b.high128}
    __m256i c = _mm256_permute2f128_si256(a, b, (2 << 4) | (0));
    __m256i d = _mm256_permute2f128_si256(a, b, (3 << 4) | (1));
    // per 128-bit lane: low halves are the even samples, high the odd
    __m256i e = _mm256_unpacklo_epi64(c, d);
    __m256i f = _mm256_unpackhi_epi64(c, d);
    _mm256_store_si256((__m256i*)dpl, e);
    _mm256_store_si256((__m256i*)dph, f);
  }
}
//////////////////////////////////////////////////////////////////////////
// Interleaves two rows of 64-bit samples into one: output sample 2k is
// sample k of spl and output sample 2k + 1 is sample k of sph.  The
// samples are only moved, never interpreted, which is why the buffers are
// passed as untyped pointers and advanced in bytes.
//
//   dp     destination row (interleaved)
//   spl    source of the even-indexed output samples
//   sph    source of the odd-indexed output samples
//   width  number of interleaved (output) 64-bit samples to produce
//
// While more than 8 output samples remain, 16 are produced per iteration
// with 512-bit operations; the remainder is produced by a single 256-bit
// pass that writes 8.  The amount of data touched is `width` rounded up
// to a multiple of 8 samples, so the buffers must extend that far.  Only
// aligned loads/stores are used: the pointers must be 64-byte aligned
// when width > 8, and 32-byte aligned otherwise.
static void avx512_interleave64(void* dp, const void* spl,
                                const void* sph, int width)
{
  // Lane selectors for permutex2var over the 16-lane pair {a, b}:
  // 0x0..0x7 address a (from spl), 0x8..0xF address b (from sph).
  const __m512i idx1 = _mm512_set_epi64(
    0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
  );
  const __m512i idx2 = _mm512_set_epi64(
    0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
  );

  // 8 + 8 samples (64 + 64 bytes) in, 16 samples (128 bytes) out
  for (; width > 8; width -= 16,
         dp = (char*)dp + 128,
         spl = (const char*)spl + 64,
         sph = (const char*)sph + 64)
  {
    __m512i a = _mm512_load_si512(spl);
    __m512i b = _mm512_load_si512(sph);
    __m512i c = _mm512_permutex2var_epi64(a, idx1, b);
    __m512i d = _mm512_permutex2var_epi64(a, idx2, b);
    _mm512_store_si512(dp, c);
    _mm512_store_si512((char*)dp + 64, d);
  }

  // Tail: 4 + 4 samples in, 8 samples (64 bytes) out.  width <= 8 here,
  // so this loop body executes at most once; dp is still advanced so the
  // loop matches the other kernels in this file.
  for (; width > 0; width -= 8,
         dp = (char*)dp + 64,
         spl = (const char*)spl + 32,
         sph = (const char*)sph + 32)
  {
    __m256i a = _mm256_load_si256((const __m256i*)spl);
    __m256i b = _mm256_load_si256((const __m256i*)sph);
    // unpack interleaves within each 128-bit lane ...
    __m256i c = _mm256_unpacklo_epi64(a, b);
    __m256i d = _mm256_unpackhi_epi64(a, b);
    // ... and the lane permutes put the four pieces in output order
    __m256i e = _mm256_permute2f128_si256(c, d, (2 << 4) | (0));
    __m256i f = _mm256_permute2f128_si256(c, d, (3 << 4) | (1));
    _mm256_store_si256((__m256i*)dp, e);
    _mm256_store_si256((__m256i*)((char*)dp + 32), f);
  }
}
//////////////////////////////////////////////////////////////////////////
// Multiplies the floats at p by f, in place.
//
//   p      buffer to scale; must be 64-byte aligned (aligned 512-bit
//          loads/stores are used)
//   f      scale factor
//   width  number of floats to scale
//
// 16 floats are processed per iteration, so the number of floats actually
// read and written is `width` rounded up to a multiple of 16; the buffer
// must extend that far.  Nothing is touched when width <= 0.
static inline void avx512_multiply_const(float* p, float f, int width)
{
  const __m512 factor = _mm512_set1_ps(f);
  for (; width > 0; width -= 16, p += 16)
  {
    __m512 s = _mm512_load_ps(p);
    _mm512_store_ps(p, _mm512_mul_ps(factor, s));
  }
}
// NOTE(review): the start of this definition (return type, name, leading
// parameters `s` and `sig`), its braces and the declaration of `i` are
// not visible in this view.  Judging by the parameter list this is
// presumably avx512_irv_vert_step -- confirm against the full file.
221 const line_buf* other,
const line_buf* aug,
222 ui32 repeat,
bool synthesis)
// Lifting coefficient of this step.  Lines between this initialization
// and the broadcast below are not visible, so `a` may be adjusted (e.g.
// depending on `synthesis`) before it is used -- TODO confirm.
224 float a = s->irv.Aatk;
228 __m512 factor = _mm512_set1_ps(a);
230 float* dst = aug->f32;
231 const float* src1 = sig->f32, * src2 = other->f32;
// dst[k] += a * (src1[k] + src2[k]), 16 floats per iteration.  All three
// rows are accessed with aligned 512-bit loads/stores.
233 for ( ; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
235 __m512 s1 = _mm512_load_ps(src1);
236 __m512 s2 = _mm512_load_ps(src2);
237 __m512 d = _mm512_load_ps(dst);
238 d = _mm512_add_ps(d, _mm512_mul_ps(factor, _mm512_add_ps(s1, s2)));
239 _mm512_store_ps(dst, d);
// Scales the f32 samples of `aug` by K in place.  avx512_multiply_const
// works on whole groups of 16 floats, so up to `repeat` rounded up to a
// multiple of 16 samples are touched.
// NOTE(review): the enclosing definition's header is not visible in this
// view; presumably avx512_irv_vert_times_K -- confirm.
246 avx512_multiply_const(aug->f32, K, (
int)repeat);
// NOTE(review): the start of this definition (name and the leading
// parameters `atk`, `ldst`), its braces, the if/else lines that select
// between the statements below, and the declarations of `w`, `a` and `dp`
// are not visible in this view.  Presumably avx512_irv_horz_ana --
// confirm against the full file.  Comments describe visible statements
// only.
251 const line_buf* hdst,
const line_buf* src,
252 ui32 width,
bool even)
// De-interleave src: with `even` set, samples 0, 2, 4, ... go to ldst and
// samples 1, 3, 5, ... to hdst; otherwise the two destinations swap.
258 float* dpl = even ? ldst->f32 : hdst->f32;
259 float* dph = even ? hdst->f32 : ldst->f32;
260 float* sp = src->f32;
262 avx512_deinterleave32(dpl, dph, sp, w);
266 float* hp = hdst->f32, * lp = ldst->f32;
// even:  l_width = ceil(width / 2),  h_width = floor(width / 2)
// !even: l_width = floor(width / 2), h_width = ceil(width / 2)
267 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
268 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
269 ui32 num_steps = atk->get_num_steps();
// j runs from num_steps down to 1
270 for (
ui32 j = num_steps; j > 0; --j)
// Copy the last lp sample one slot past the end so that the sp[k + 1]
// reads below have a value at index l_width (boundary extension).
277 lp[l_width] = lp[l_width - 1];
279 const float* sp = lp;
281 int i = (int)h_width;
282 __m512 f = _mm512_set1_ps(a);
// dp[k] += a * (sp[k] + sp[k + 1]); the sp + 1 load is unaligned
285 for (; i > 0; i -= 16, sp += 16, dp += 16)
287 __m512 m = _mm512_load_ps(sp);
288 __m512 n = _mm512_loadu_ps(sp + 1);
289 __m512 p = _mm512_load_ps(dp);
290 p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
291 _mm512_store_ps(dp, p);
// dp[k] += a * (sp[k] + sp[k - 1]); the sp - 1 load is unaligned
296 for (; i > 0; i -= 16, sp += 16, dp += 16)
298 __m512 m = _mm512_load_ps(sp);
299 __m512 n = _mm512_loadu_ps(sp - 1);
300 __m512 p = _mm512_load_ps(dp);
301 p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
302 _mm512_store_ps(dp, p);
// Exchange the two bands (pointers and widths) -- presumably at the end
// of each step so the next step updates the other band.
307 float* t = lp; lp = hp; hp = t;
309 ui32 w = l_width; l_width = h_width; h_width = w;
// Final scaling: lp by 1 / K, hp by K
313 float K = atk->get_K();
314 float K_inv = 1.0f / K;
315 avx512_multiply_const(lp, K_inv, (
int)l_width);
316 avx512_multiply_const(hp, K, (
int)h_width);
// Single-sample copies: src[0] goes unchanged to ldst, or doubled to
// hdst.  NOTE(review): the conditions choosing between these two
// statements are not visible here.
321 ldst->f32[0] = src->f32[0];
323 hdst->f32[0] = src->f32[0] * 2.0f;
// NOTE(review): the start of this definition (name and the leading
// parameters `atk`, `dst`), its braces, the if/else lines that select
// between the statements below, and the declarations of `w`, `a` and `dp`
// (inside the step loop) are not visible in this view.  Presumably
// avx512_irv_horz_syn -- confirm against the full file.  Comments
// describe visible statements only.
329 const line_buf* lsrc,
const line_buf* hsrc,
330 ui32 width,
bool even)
335 float* oth = hsrc->f32, * aug = lsrc->f32;
// even:  aug_width = ceil(width / 2),  oth_width = floor(width / 2)
// !even: aug_width = floor(width / 2), oth_width = ceil(width / 2)
336 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
337 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
// Initial scaling: aug (lsrc) by K, oth (hsrc) by 1 / K
340 float K = atk->get_K();
341 float K_inv = 1.0f / K;
342 avx512_multiply_const(aug, K, (
int)aug_width);
343 avx512_multiply_const(oth, K_inv, (
int)oth_width);
347 ui32 num_steps = atk->get_num_steps();
// j runs from 0 up to num_steps - 1
348 for (
ui32 j = 0; j < num_steps; ++j)
// Copy the last oth sample one slot past the end so that the sp[k + 1]
// reads below have a value at index oth_width (boundary extension).
355 oth[oth_width] = oth[oth_width - 1];
357 const float* sp = oth;
359 int i = (int)aug_width;
360 __m512 f = _mm512_set1_ps(a);
// dp[k] -= a * (sp[k] + sp[k - 1]); the sp - 1 load is unaligned
363 for (; i > 0; i -= 16, sp += 16, dp += 16)
365 __m512 m = _mm512_load_ps(sp);
366 __m512 n = _mm512_loadu_ps(sp - 1);
367 __m512 p = _mm512_load_ps(dp);
368 p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
369 _mm512_store_ps(dp, p);
// dp[k] -= a * (sp[k] + sp[k + 1]); the sp + 1 load is unaligned
374 for (; i > 0; i -= 16, sp += 16, dp += 16)
376 __m512 m = _mm512_load_ps(sp);
377 __m512 n = _mm512_loadu_ps(sp + 1);
378 __m512 p = _mm512_load_ps(dp);
379 p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
380 _mm512_store_ps(dp, p);
// Exchange the two bands (pointers and widths) -- presumably at the end
// of each step so the next step updates the other band.
385 float* t = aug; aug = oth; oth = t;
387 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// Interleave the two bands into dst: with `even` set, lsrc supplies the
// even-indexed output samples and hsrc the odd-indexed ones; otherwise
// the two sources swap.
392 float* dp = dst->f32;
393 float* spl = even ? lsrc->f32 : hsrc->f32;
394 float* sph = even ? hsrc->f32 : lsrc->f32;
396 avx512_interleave32(dp, spl, sph, w);
// Single-sample copies: dst[0] is lsrc[0] unchanged, or hsrc[0] halved.
// NOTE(review): the conditions choosing between these two statements are
// not visible here.
401 dst->f32[0] = lsrc->f32[0];
403 dst->f32[0] = hsrc->f32[0] * 0.5f;
// Reversible (integer) vertical lifting step on 32-bit samples.  Every
// loop below updates the `aug` row from the `sig` and `other` rows, 16
// samples per iteration with aligned 512-bit loads/stores, and is a form
// of
//   dst[k] -= (b + a * (src1[k] + src2[k])) >> e     (first of each pair)
//   dst[k] += (b + a * (src1[k] + src2[k])) >> e     (second of each pair)
// where a, b, e come from s->rev and >> is an arithmetic shift.
// NOTE(review): braces, the declaration of `i`, and most of the if/else
// lines that select a loop are not visible in this view; the pairs are
// presumably chosen by the value of `a`, and within a pair by
// `synthesis` -- confirm against the full file.
409 void avx512_rev_vert_step32(
const lifting_step* s,
const line_buf* sig,
410 const line_buf* other,
const line_buf* aug,
411 ui32 repeat,
bool synthesis)
413 const si32 a = s->rev.Aatk;
414 const si32 b = s->rev.Batk;
415 const ui8 e = s->rev.Eatk;
416 __m512i va = _mm512_set1_epi32(a);
417 __m512i vb = _mm512_set1_epi32(b);
419 si32* dst = aug->i32;
420 const si32* src1 = sig->i32, * src2 = other->i32;
// No multiply: dst[k] -= (b + (src1[k] + src2[k])) >> e
428 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
430 __m512i s1 = _mm512_load_si512((__m512i*)src1);
431 __m512i s2 = _mm512_load_si512((__m512i*)src2);
432 __m512i d = _mm512_load_si512((__m512i*)dst);
433 __m512i t = _mm512_add_epi32(s1, s2);
434 __m512i v = _mm512_add_epi32(vb, t);
435 __m512i w = _mm512_srai_epi32(v, e);
436 d = _mm512_sub_epi32(d, w);
437 _mm512_store_si512((__m512i*)dst, d);
// No multiply: dst[k] += (b + (src1[k] + src2[k])) >> e
440 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
442 __m512i s1 = _mm512_load_si512((__m512i*)src1);
443 __m512i s2 = _mm512_load_si512((__m512i*)src2);
444 __m512i d = _mm512_load_si512((__m512i*)dst);
445 __m512i t = _mm512_add_epi32(s1, s2);
446 __m512i v = _mm512_add_epi32(vb, t);
447 __m512i w = _mm512_srai_epi32(v, e);
448 d = _mm512_add_epi32(d, w);
449 _mm512_store_si512((__m512i*)dst, d);
// Special case a = -1, b = 1, e = 1.  Here (1 - t) >> 1 == -(t >> 1), so
// the offset and the negation are dropped and the add/subtract is
// flipped instead.
452 else if (a == -1 && b == 1 && e == 1)
// dst[k] += (src1[k] + src2[k]) >> e
456 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
458 __m512i s1 = _mm512_load_si512((__m512i*)src1);
459 __m512i s2 = _mm512_load_si512((__m512i*)src2);
460 __m512i d = _mm512_load_si512((__m512i*)dst);
461 __m512i t = _mm512_add_epi32(s1, s2);
462 __m512i w = _mm512_srai_epi32(t, e);
463 d = _mm512_add_epi32(d, w);
464 _mm512_store_si512((__m512i*)dst, d);
// dst[k] -= (src1[k] + src2[k]) >> e
467 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
469 __m512i s1 = _mm512_load_si512((__m512i*)src1);
470 __m512i s2 = _mm512_load_si512((__m512i*)src2);
471 __m512i d = _mm512_load_si512((__m512i*)dst);
472 __m512i t = _mm512_add_epi32(s1, s2);
473 __m512i w = _mm512_srai_epi32(t, e);
474 d = _mm512_sub_epi32(d, w);
475 _mm512_store_si512((__m512i*)dst, d);
// Negated sum without multiply: dst[k] -= (b - (src1[k] + src2[k])) >> e
482 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
484 __m512i s1 = _mm512_load_si512((__m512i*)src1);
485 __m512i s2 = _mm512_load_si512((__m512i*)src2);
486 __m512i d = _mm512_load_si512((__m512i*)dst);
487 __m512i t = _mm512_add_epi32(s1, s2);
488 __m512i v = _mm512_sub_epi32(vb, t);
489 __m512i w = _mm512_srai_epi32(v, e);
490 d = _mm512_sub_epi32(d, w);
491 _mm512_store_si512((__m512i*)dst, d);
// Negated sum without multiply: dst[k] += (b - (src1[k] + src2[k])) >> e
494 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
496 __m512i s1 = _mm512_load_si512((__m512i*)src1);
497 __m512i s2 = _mm512_load_si512((__m512i*)src2);
498 __m512i d = _mm512_load_si512((__m512i*)dst);
499 __m512i t = _mm512_add_epi32(s1, s2);
500 __m512i v = _mm512_sub_epi32(vb, t);
501 __m512i w = _mm512_srai_epi32(v, e);
502 d = _mm512_add_epi32(d, w);
503 _mm512_store_si512((__m512i*)dst, d);
// General coefficient: dst[k] -= (b + a * (src1[k] + src2[k])) >> e,
// using the low 32 bits of the product
509 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
511 __m512i s1 = _mm512_load_si512((__m512i*)src1);
512 __m512i s2 = _mm512_load_si512((__m512i*)src2);
513 __m512i d = _mm512_load_si512((__m512i*)dst);
514 __m512i t = _mm512_add_epi32(s1, s2);
515 __m512i u = _mm512_mullo_epi32(va, t);
516 __m512i v = _mm512_add_epi32(vb, u);
517 __m512i w = _mm512_srai_epi32(v, e);
518 d = _mm512_sub_epi32(d, w);
519 _mm512_store_si512((__m512i*)dst, d);
// General coefficient: dst[k] += (b + a * (src1[k] + src2[k])) >> e
522 for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
524 __m512i s1 = _mm512_load_si512((__m512i*)src1);
525 __m512i s2 = _mm512_load_si512((__m512i*)src2);
526 __m512i d = _mm512_load_si512((__m512i*)dst);
527 __m512i t = _mm512_add_epi32(s1, s2);
528 __m512i u = _mm512_mullo_epi32(va, t);
529 __m512i v = _mm512_add_epi32(vb, u);
530 __m512i w = _mm512_srai_epi32(v, e);
531 d = _mm512_add_epi32(d, w);
532 _mm512_store_si512((__m512i*)dst, d);
// Reversible (integer) vertical lifting step on 64-bit samples.  The
// vector loops update the `aug` row from the `sig` and `other` rows, 8
// samples per iteration with aligned 512-bit loads/stores; each is a
// form of
//   dst[k] -= (b + a * (src1[k] + src2[k])) >> e     (first of each pair)
//   dst[k] += (b + a * (src1[k] + src2[k])) >> e     (second of each pair)
// where a, b, e come from s->rev and >> is an arithmetic shift.  The
// general-coefficient case at the end is done with scalar loops.
// NOTE(review): braces, the declaration of `i`, and most of the if/else
// lines that select a loop are not visible in this view; the pairs are
// presumably chosen by the value of `a`, and within a pair by
// `synthesis` -- confirm against the full file.
538 void avx512_rev_vert_step64(
const lifting_step* s,
const line_buf* sig,
539 const line_buf* other,
const line_buf* aug,
540 ui32 repeat,
bool synthesis)
542 const si32 a = s->rev.Aatk;
543 const si32 b = s->rev.Batk;
544 const ui8 e = s->rev.Eatk;
545 __m512i vb = _mm512_set1_epi64(b);
547 si64* dst = aug->i64;
548 const si64* src1 = sig->i64, * src2 = other->i64;
// No multiply: dst[k] -= (b + (src1[k] + src2[k])) >> e
556 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
558 __m512i s1 = _mm512_load_si512((__m512i*)src1);
559 __m512i s2 = _mm512_load_si512((__m512i*)src2);
560 __m512i d = _mm512_load_si512((__m512i*)dst);
561 __m512i t = _mm512_add_epi64(s1, s2);
562 __m512i v = _mm512_add_epi64(vb, t);
563 __m512i w = _mm512_srai_epi64(v, e);
564 d = _mm512_sub_epi64(d, w);
565 _mm512_store_si512((__m512i*)dst, d);
// No multiply: dst[k] += (b + (src1[k] + src2[k])) >> e
568 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
570 __m512i s1 = _mm512_load_si512((__m512i*)src1);
571 __m512i s2 = _mm512_load_si512((__m512i*)src2);
572 __m512i d = _mm512_load_si512((__m512i*)dst);
573 __m512i t = _mm512_add_epi64(s1, s2);
574 __m512i v = _mm512_add_epi64(vb, t);
575 __m512i w = _mm512_srai_epi64(v, e);
576 d = _mm512_add_epi64(d, w);
577 _mm512_store_si512((__m512i*)dst, d);
// Special case a = -1, b = 1, e = 1.  Here (1 - t) >> 1 == -(t >> 1), so
// the offset and the negation are dropped and the add/subtract is
// flipped instead.
580 else if (a == -1 && b == 1 && e == 1)
// dst[k] += (src1[k] + src2[k]) >> e
584 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
586 __m512i s1 = _mm512_load_si512((__m512i*)src1);
587 __m512i s2 = _mm512_load_si512((__m512i*)src2);
588 __m512i d = _mm512_load_si512((__m512i*)dst);
589 __m512i t = _mm512_add_epi64(s1, s2);
590 __m512i w = _mm512_srai_epi64(t, e);
591 d = _mm512_add_epi64(d, w);
592 _mm512_store_si512((__m512i*)dst, d);
// dst[k] -= (src1[k] + src2[k]) >> e
595 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
597 __m512i s1 = _mm512_load_si512((__m512i*)src1);
598 __m512i s2 = _mm512_load_si512((__m512i*)src2);
599 __m512i d = _mm512_load_si512((__m512i*)dst);
600 __m512i t = _mm512_add_epi64(s1, s2);
601 __m512i w = _mm512_srai_epi64(t, e);
602 d = _mm512_sub_epi64(d, w);
603 _mm512_store_si512((__m512i*)dst, d);
// Negated sum without multiply: dst[k] -= (b - (src1[k] + src2[k])) >> e
610 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
612 __m512i s1 = _mm512_load_si512((__m512i*)src1);
613 __m512i s2 = _mm512_load_si512((__m512i*)src2);
614 __m512i d = _mm512_load_si512((__m512i*)dst);
615 __m512i t = _mm512_add_epi64(s1, s2);
616 __m512i v = _mm512_sub_epi64(vb, t);
617 __m512i w = _mm512_srai_epi64(v, e);
618 d = _mm512_sub_epi64(d, w);
619 _mm512_store_si512((__m512i*)dst, d);
// Negated sum without multiply: dst[k] += (b - (src1[k] + src2[k])) >> e
622 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
624 __m512i s1 = _mm512_load_si512((__m512i*)src1);
625 __m512i s2 = _mm512_load_si512((__m512i*)src2);
626 __m512i d = _mm512_load_si512((__m512i*)dst);
627 __m512i t = _mm512_add_epi64(s1, s2);
628 __m512i v = _mm512_sub_epi64(vb, t);
629 __m512i w = _mm512_srai_epi64(v, e);
630 d = _mm512_add_epi64(d, w);
631 _mm512_store_si512((__m512i*)dst, d);
// General coefficient, scalar: exactly `repeat` samples are updated
// (no rounding up to a vector multiple), subtracting ...
639 for (
ui32 i = repeat; i > 0; --i)
640 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
// ... or adding the lifting term.
642 for (
ui32 i = repeat; i > 0; --i)
643 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
// Forwards the step unchanged to the 32-bit or to the 64-bit
// implementation.
// NOTE(review): the start of this definition (name, leading parameters
// `s` and `sig`), its braces and the condition that selects between the
// two calls are not visible in this view; presumably
// avx512_rev_vert_step -- confirm against the full file.
681 const line_buf* other,
const line_buf* aug,
682 ui32 repeat,
bool synthesis)
691 avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
698 avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
// Reversible (integer) horizontal analysis on 32-bit samples: `src` is
// de-interleaved into ldst/hdst, then lifting updates of the form
//   dp[k] += (b + a * (sp[k] + sp[k +/- 1])) >> e
// are applied, 16 samples per iteration, with j running from num_steps
// down to 1.  b and e (and presumably a) come from the step's `rev`
// fields; >> is an arithmetic shift.
// NOTE(review): braces, the if/else lines that select between the loops
// (other than the one `else if` shown), and the declarations of `w`, `s`,
// `a`, `sp` and `dp` are not visible in this view.  Comments describe
// visible statements only.
703 void avx512_rev_horz_ana32(
const param_atk* atk,
const line_buf* ldst,
704 const line_buf* hdst,
const line_buf* src,
705 ui32 width,
bool even)
// De-interleave through the f32 members: the kernel only moves 32-bit
// words.  With `even` set, samples 0, 2, 4, ... go to ldst and samples
// 1, 3, 5, ... to hdst; otherwise the two destinations swap.
// (presumably f32 and i32 refer to the same storage -- confirm in
// line_buf)
711 float* dpl = even ? ldst->f32 : hdst->f32;
712 float* dph = even ? hdst->f32 : ldst->f32;
713 float* sp = src->f32;
715 avx512_deinterleave32(dpl, dph, sp, w);
718 si32* hp = hdst->i32, * lp = ldst->i32;
// even:  l_width = ceil(width / 2),  h_width = floor(width / 2)
// !even: l_width = floor(width / 2), h_width = ceil(width / 2)
719 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
720 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
721 ui32 num_steps = atk->get_num_steps();
722 for (
ui32 j = num_steps; j > 0; --j)
727 const si32 b = s->rev.Batk;
728 const ui8 e = s->rev.Eatk;
729 __m512i va = _mm512_set1_epi32(a);
730 __m512i vb = _mm512_set1_epi32(b);
// Copy the last lp sample one slot past the end so that the sp[k + 1]
// reads below have a value at index l_width (boundary extension).
734 lp[l_width] = lp[l_width - 1];
740 int i = (int)h_width;
// No multiply: dp[k] += (b + (sp[k] + sp[k + 1])) >> e
// (the sp + 1 load is unaligned)
743 for (; i > 0; i -= 16, sp += 16, dp += 16)
745 __m512i s1 = _mm512_load_si512((__m512i*)sp);
746 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
747 __m512i d = _mm512_load_si512((__m512i*)dp);
748 __m512i t = _mm512_add_epi32(s1, s2);
749 __m512i v = _mm512_add_epi32(vb, t);
750 __m512i w = _mm512_srai_epi32(v, e);
751 d = _mm512_add_epi32(d, w);
752 _mm512_store_si512((__m512i*)dp, d);
// No multiply: dp[k] += (b + (sp[k] + sp[k - 1])) >> e
// (the sp - 1 load is unaligned)
757 for (; i > 0; i -= 16, sp += 16, dp += 16)
759 __m512i s1 = _mm512_load_si512((__m512i*)sp);
760 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
761 __m512i d = _mm512_load_si512((__m512i*)dp);
762 __m512i t = _mm512_add_epi32(s1, s2);
763 __m512i v = _mm512_add_epi32(vb, t);
764 __m512i w = _mm512_srai_epi32(v, e);
765 d = _mm512_add_epi32(d, w);
766 _mm512_store_si512((__m512i*)dp, d);
// Special case a = -1, b = 1, e = 1.  Here (1 - t) >> 1 == -(t >> 1), so
// adding the lifting term reduces to subtracting t >> 1.
770 else if (a == -1 && b == 1 && e == 1)
772 int i = (int)h_width;
// dp[k] -= (sp[k] + sp[k + 1]) >> e
774 for (; i > 0; i -= 16, sp += 16, dp += 16)
776 __m512i s1 = _mm512_load_si512((__m512i*)sp);
777 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
778 __m512i d = _mm512_load_si512((__m512i*)dp);
779 __m512i t = _mm512_add_epi32(s1, s2);
780 __m512i w = _mm512_srai_epi32(t, e);
781 d = _mm512_sub_epi32(d, w);
782 _mm512_store_si512((__m512i*)dp, d);
// dp[k] -= (sp[k] + sp[k - 1]) >> e
785 for (; i > 0; i -= 16, sp += 16, dp += 16)
787 __m512i s1 = _mm512_load_si512((__m512i*)sp);
788 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
789 __m512i d = _mm512_load_si512((__m512i*)dp);
790 __m512i t = _mm512_add_epi32(s1, s2);
791 __m512i w = _mm512_srai_epi32(t, e);
792 d = _mm512_sub_epi32(d, w);
793 _mm512_store_si512((__m512i*)dp, d);
798 int i = (int)h_width;
// Negated sum without multiply: dp[k] += (b - (sp[k] + sp[k + 1])) >> e
800 for (; i > 0; i -= 16, sp += 16, dp += 16)
802 __m512i s1 = _mm512_load_si512((__m512i*)sp);
803 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
804 __m512i d = _mm512_load_si512((__m512i*)dp);
805 __m512i t = _mm512_add_epi32(s1, s2);
806 __m512i v = _mm512_sub_epi32(vb, t);
807 __m512i w = _mm512_srai_epi32(v, e);
808 d = _mm512_add_epi32(d, w);
809 _mm512_store_si512((__m512i*)dp, d);
// Negated sum without multiply: dp[k] += (b - (sp[k] + sp[k - 1])) >> e
812 for (; i > 0; i -= 16, sp += 16, dp += 16)
814 __m512i s1 = _mm512_load_si512((__m512i*)sp);
815 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
816 __m512i d = _mm512_load_si512((__m512i*)dp);
817 __m512i t = _mm512_add_epi32(s1, s2);
818 __m512i v = _mm512_sub_epi32(vb, t);
819 __m512i w = _mm512_srai_epi32(v, e);
820 d = _mm512_add_epi32(d, w);
821 _mm512_store_si512((__m512i*)dp, d);
826 int i = (int)h_width;
// General coefficient: dp[k] += (b + a * (sp[k] + sp[k + 1])) >> e,
// using the low 32 bits of the product
828 for (; i > 0; i -= 16, sp += 16, dp += 16)
830 __m512i s1 = _mm512_load_si512((__m512i*)sp);
831 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
832 __m512i d = _mm512_load_si512((__m512i*)dp);
833 __m512i t = _mm512_add_epi32(s1, s2);
834 __m512i u = _mm512_mullo_epi32(va, t);
835 __m512i v = _mm512_add_epi32(vb, u);
836 __m512i w = _mm512_srai_epi32(v, e);
837 d = _mm512_add_epi32(d, w);
838 _mm512_store_si512((__m512i*)dp, d);
// General coefficient: dp[k] += (b + a * (sp[k] + sp[k - 1])) >> e
841 for (; i > 0; i -= 16, sp += 16, dp += 16)
843 __m512i s1 = _mm512_load_si512((__m512i*)sp);
844 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
845 __m512i d = _mm512_load_si512((__m512i*)dp);
846 __m512i t = _mm512_add_epi32(s1, s2);
847 __m512i u = _mm512_mullo_epi32(va, t);
848 __m512i v = _mm512_add_epi32(vb, u);
849 __m512i w = _mm512_srai_epi32(v, e);
850 d = _mm512_add_epi32(d, w);
851 _mm512_store_si512((__m512i*)dp, d);
// Exchange the two bands (pointers and widths) -- presumably at the end
// of each step so the next step updates the other band.
856 si32* t = lp; lp = hp; hp = t;
858 ui32 w = l_width; l_width = h_width; h_width = w;
// Single-sample copies: src[0] goes unchanged to ldst, or shifted left
// by one to hdst.  NOTE(review): the conditions choosing between these
// two statements are not visible here.
863 ldst->i32[0] = src->i32[0];
865 hdst->i32[0] = src->i32[0] << 1;
// Reversible (integer) horizontal analysis on 64-bit samples: `src` is
// de-interleaved into ldst/hdst, then lifting updates of the form
//   dp[k] += (b + a * (sp[k] + sp[k +/- 1])) >> e
// are applied with j running from num_steps down to 1.  The vector loops
// handle 8 samples per iteration; the general-coefficient case is done
// with scalar loops.  b and e (and presumably a) come from the step's
// `rev` fields; >> is an arithmetic shift.
// NOTE(review): braces, the if/else lines that select between the loops
// (other than the one `else if` shown), and the declarations of `w`, `s`,
// `a`, `sp` and `dp` are not visible in this view.  Comments describe
// visible statements only.
870 void avx512_rev_horz_ana64(
const param_atk* atk,
const line_buf* ldst,
871 const line_buf* hdst,
const line_buf* src,
872 ui32 width,
bool even)
// De-interleave through the untyped `p` members.  With `even` set,
// samples 0, 2, 4, ... go to ldst and samples 1, 3, 5, ... to hdst;
// otherwise the two destinations swap.
878 void* dpl = even ? ldst->p : hdst->p;
879 void* dph = even ? hdst->p : ldst->p;
880 const void* sp = src->p;
882 avx512_deinterleave64(dpl, dph, sp, w);
885 si64* hp = hdst->i64, * lp = ldst->i64;
// even:  l_width = ceil(width / 2),  h_width = floor(width / 2)
// !even: l_width = floor(width / 2), h_width = ceil(width / 2)
886 ui32 l_width = (width + (even ? 1 : 0)) >> 1;
887 ui32 h_width = (width + (even ? 0 : 1)) >> 1;
888 ui32 num_steps = atk->get_num_steps();
889 for (
ui32 j = num_steps; j > 0; --j)
894 const si32 b = s->rev.Batk;
895 const ui8 e = s->rev.Eatk;
896 __m512i vb = _mm512_set1_epi64(b);
// Copy the last lp sample one slot past the end so that the sp[k + 1]
// reads below have a value at index l_width (boundary extension).
900 lp[l_width] = lp[l_width - 1];
906 int i = (int)h_width;
// No multiply: dp[k] += (b + (sp[k] + sp[k + 1])) >> e
// (the sp + 1 load is unaligned)
909 for (; i > 0; i -= 8, sp += 8, dp += 8)
911 __m512i s1 = _mm512_load_si512((__m512i*)sp);
912 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
913 __m512i d = _mm512_load_si512((__m512i*)dp);
914 __m512i t = _mm512_add_epi64(s1, s2);
915 __m512i v = _mm512_add_epi64(vb, t);
916 __m512i w = _mm512_srai_epi64(v, e);
917 d = _mm512_add_epi64(d, w);
918 _mm512_store_si512((__m512i*)dp, d);
// No multiply: dp[k] += (b + (sp[k] + sp[k - 1])) >> e
// (the sp - 1 load is unaligned)
923 for (; i > 0; i -= 8, sp += 8, dp += 8)
925 __m512i s1 = _mm512_load_si512((__m512i*)sp);
926 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
927 __m512i d = _mm512_load_si512((__m512i*)dp);
928 __m512i t = _mm512_add_epi64(s1, s2);
929 __m512i v = _mm512_add_epi64(vb, t);
930 __m512i w = _mm512_srai_epi64(v, e);
931 d = _mm512_add_epi64(d, w);
932 _mm512_store_si512((__m512i*)dp, d);
// Special case a = -1, b = 1, e = 1.  Here (1 - t) >> 1 == -(t >> 1), so
// adding the lifting term reduces to subtracting t >> 1.
936 else if (a == -1 && b == 1 && e == 1)
938 int i = (int)h_width;
// dp[k] -= (sp[k] + sp[k + 1]) >> e
940 for (; i > 0; i -= 8, sp += 8, dp += 8)
942 __m512i s1 = _mm512_load_si512((__m512i*)sp);
943 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
944 __m512i d = _mm512_load_si512((__m512i*)dp);
945 __m512i t = _mm512_add_epi64(s1, s2);
946 __m512i w = _mm512_srai_epi64(t, e);
947 d = _mm512_sub_epi64(d, w);
948 _mm512_store_si512((__m512i*)dp, d);
// dp[k] -= (sp[k] + sp[k - 1]) >> e
951 for (; i > 0; i -= 8, sp += 8, dp += 8)
953 __m512i s1 = _mm512_load_si512((__m512i*)sp);
954 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
955 __m512i d = _mm512_load_si512((__m512i*)dp);
956 __m512i t = _mm512_add_epi64(s1, s2);
957 __m512i w = _mm512_srai_epi64(t, e);
958 d = _mm512_sub_epi64(d, w);
959 _mm512_store_si512((__m512i*)dp, d);
964 int i = (int)h_width;
// Negated sum without multiply: dp[k] += (b - (sp[k] + sp[k + 1])) >> e
966 for (; i > 0; i -= 8, sp += 8, dp += 8)
968 __m512i s1 = _mm512_load_si512((__m512i*)sp);
969 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
970 __m512i d = _mm512_load_si512((__m512i*)dp);
971 __m512i t = _mm512_add_epi64(s1, s2);
972 __m512i v = _mm512_sub_epi64(vb, t);
973 __m512i w = _mm512_srai_epi64(v, e);
974 d = _mm512_add_epi64(d, w);
975 _mm512_store_si512((__m512i*)dp, d);
// Negated sum without multiply: dp[k] += (b - (sp[k] + sp[k - 1])) >> e
978 for (; i > 0; i -= 8, sp += 8, dp += 8)
980 __m512i s1 = _mm512_load_si512((__m512i*)sp);
981 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
982 __m512i d = _mm512_load_si512((__m512i*)dp);
983 __m512i t = _mm512_add_epi64(s1, s2);
984 __m512i v = _mm512_sub_epi64(vb, t);
985 __m512i w = _mm512_srai_epi64(v, e);
986 d = _mm512_add_epi64(d, w);
987 _mm512_store_si512((__m512i*)dp, d);
// General coefficient, scalar: exactly h_width samples are updated,
// pairing each sample with its right neighbour ...
996 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
997 *dp += (b + a * (sp[0] + sp[1])) >> e;
// ... or with its left neighbour.
999 for (
ui32 i = h_width; i > 0; --i, sp++, dp++)
1000 *dp += (b + a * (sp[-1] + sp[0])) >> e;
// Exchange the two bands (pointers and widths) -- presumably at the end
// of each step so the next step updates the other band.
1037 si64* t = lp; lp = hp; hp = t;
1039 ui32 w = l_width; l_width = h_width; h_width = w;
// Single-sample copies: src[0] goes unchanged to ldst, or shifted left
// by one to hdst.  NOTE(review): the conditions choosing between these
// two statements are not visible here.
1044 ldst->i64[0] = src->i64[0];
1046 hdst->i64[0] = src->i64[0] << 1;
// Forwards the call unchanged to the 32-bit or to the 64-bit analysis
// implementation.
// NOTE(review): the start of this definition (name, leading parameters
// `atk` and `ldst`), its braces and the condition that selects between
// the two calls are not visible in this view; presumably
// avx512_rev_horz_ana -- confirm against the full file.
1052 const line_buf* hdst,
const line_buf* src,
1053 ui32 width,
bool even)
1059 avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even);
1066 avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even);
// Reversible (integer) horizontal synthesis on 32-bit samples: lifting
// updates of the form
//   dp[k] -= (b + a * (sp[k] + sp[k -/+ 1])) >> e
// are applied to the lsrc/hsrc bands, 16 samples per iteration, with j
// running from 0 up to num_steps - 1; the two bands are then interleaved
// into dst.  b and e (and presumably a) come from the step's `rev`
// fields; >> is an arithmetic shift.
// NOTE(review): braces, the if/else lines that select between the loops
// (other than the one `else if` shown), and the declarations of `w`, `s`,
// `a` and the in-loop `dp` are not visible in this view.  Comments
// describe visible statements only.
1071 void avx512_rev_horz_syn32(
const param_atk* atk,
const line_buf* dst,
1072 const line_buf* lsrc,
const line_buf* hsrc,
1073 ui32 width,
bool even)
1078 si32* oth = hsrc->i32, * aug = lsrc->i32;
// even:  aug_width = ceil(width / 2),  oth_width = floor(width / 2)
// !even: aug_width = floor(width / 2), oth_width = ceil(width / 2)
1079 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
1080 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
1081 ui32 num_steps = atk->get_num_steps();
1082 for (
ui32 j = 0; j < num_steps; ++j)
1086 const si32 b = s->rev.Batk;
1087 const ui8 e = s->rev.Eatk;
1088 __m512i va = _mm512_set1_epi32(a);
1089 __m512i vb = _mm512_set1_epi32(b);
// Copy the last oth sample one slot past the end so that the sp[k + 1]
// reads below have a value at index oth_width (boundary extension).
1093 oth[oth_width] = oth[oth_width - 1];
1095 const si32* sp = oth;
1099 int i = (int)aug_width;
// No multiply: dp[k] -= (b + (sp[k] + sp[k - 1])) >> e
// (the sp - 1 load is unaligned)
1102 for (; i > 0; i -= 16, sp += 16, dp += 16)
1104 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1105 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1106 __m512i d = _mm512_load_si512((__m512i*)dp);
1107 __m512i t = _mm512_add_epi32(s1, s2);
1108 __m512i v = _mm512_add_epi32(vb, t);
1109 __m512i w = _mm512_srai_epi32(v, e);
1110 d = _mm512_sub_epi32(d, w);
1111 _mm512_store_si512((__m512i*)dp, d);
// No multiply: dp[k] -= (b + (sp[k] + sp[k + 1])) >> e
// (the sp + 1 load is unaligned)
1116 for (; i > 0; i -= 16, sp += 16, dp += 16)
1118 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1119 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1120 __m512i d = _mm512_load_si512((__m512i*)dp);
1121 __m512i t = _mm512_add_epi32(s1, s2);
1122 __m512i v = _mm512_add_epi32(vb, t);
1123 __m512i w = _mm512_srai_epi32(v, e);
1124 d = _mm512_sub_epi32(d, w);
1125 _mm512_store_si512((__m512i*)dp, d);
// Special case a = -1, b = 1, e = 1.  Here (1 - t) >> 1 == -(t >> 1), so
// subtracting the lifting term reduces to adding t >> 1.
1129 else if (a == -1 && b == 1 && e == 1)
1131 int i = (int)aug_width;
// dp[k] += (sp[k] + sp[k - 1]) >> e
1133 for (; i > 0; i -= 16, sp += 16, dp += 16)
1135 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1136 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1137 __m512i d = _mm512_load_si512((__m512i*)dp);
1138 __m512i t = _mm512_add_epi32(s1, s2);
1139 __m512i w = _mm512_srai_epi32(t, e);
1140 d = _mm512_add_epi32(d, w);
1141 _mm512_store_si512((__m512i*)dp, d);
// dp[k] += (sp[k] + sp[k + 1]) >> e
1144 for (; i > 0; i -= 16, sp += 16, dp += 16)
1146 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1147 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1148 __m512i d = _mm512_load_si512((__m512i*)dp);
1149 __m512i t = _mm512_add_epi32(s1, s2);
1150 __m512i w = _mm512_srai_epi32(t, e);
1151 d = _mm512_add_epi32(d, w);
1152 _mm512_store_si512((__m512i*)dp, d);
1157 int i = (int)aug_width;
// Negated sum without multiply: dp[k] -= (b - (sp[k] + sp[k - 1])) >> e
1159 for (; i > 0; i -= 16, sp += 16, dp += 16)
1161 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1162 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1163 __m512i d = _mm512_load_si512((__m512i*)dp);
1164 __m512i t = _mm512_add_epi32(s1, s2);
1165 __m512i v = _mm512_sub_epi32(vb, t);
1166 __m512i w = _mm512_srai_epi32(v, e);
1167 d = _mm512_sub_epi32(d, w);
1168 _mm512_store_si512((__m512i*)dp, d);
// Negated sum without multiply: dp[k] -= (b - (sp[k] + sp[k + 1])) >> e
1171 for (; i > 0; i -= 16, sp += 16, dp += 16)
1173 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1174 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1175 __m512i d = _mm512_load_si512((__m512i*)dp);
1176 __m512i t = _mm512_add_epi32(s1, s2);
1177 __m512i v = _mm512_sub_epi32(vb, t);
1178 __m512i w = _mm512_srai_epi32(v, e);
1179 d = _mm512_sub_epi32(d, w);
1180 _mm512_store_si512((__m512i*)dp, d);
1185 int i = (int)aug_width;
// General coefficient: dp[k] -= (b + a * (sp[k] + sp[k - 1])) >> e,
// using the low 32 bits of the product
1187 for (; i > 0; i -= 16, sp += 16, dp += 16)
1189 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1190 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1191 __m512i d = _mm512_load_si512((__m512i*)dp);
1192 __m512i t = _mm512_add_epi32(s1, s2);
1193 __m512i u = _mm512_mullo_epi32(va, t);
1194 __m512i v = _mm512_add_epi32(vb, u);
1195 __m512i w = _mm512_srai_epi32(v, e);
1196 d = _mm512_sub_epi32(d, w);
1197 _mm512_store_si512((__m512i*)dp, d);
// General coefficient: dp[k] -= (b + a * (sp[k] + sp[k + 1])) >> e
1200 for (; i > 0; i -= 16, sp += 16, dp += 16)
1202 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1203 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1204 __m512i d = _mm512_load_si512((__m512i*)dp);
1205 __m512i t = _mm512_add_epi32(s1, s2);
1206 __m512i u = _mm512_mullo_epi32(va, t);
1207 __m512i v = _mm512_add_epi32(vb, u);
1208 __m512i w = _mm512_srai_epi32(v, e);
1209 d = _mm512_sub_epi32(d, w);
1210 _mm512_store_si512((__m512i*)dp, d);
// Exchange the two bands (pointers and widths) -- presumably at the end
// of each step so the next step updates the other band.
1215 si32* t = aug; aug = oth; oth = t;
1217 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// Interleave through the f32 members: the kernel only moves 32-bit
// words.  With `even` set, lsrc supplies the even-indexed output samples
// and hsrc the odd-indexed ones; otherwise the two sources swap.
// (presumably f32 and i32 refer to the same storage -- confirm in
// line_buf)
1222 float* dp = dst->f32;
1223 float* spl = even ? lsrc->f32 : hsrc->f32;
1224 float* sph = even ? hsrc->f32 : lsrc->f32;
1226 avx512_interleave32(dp, spl, sph, w);
// Single-sample copies: dst[0] is lsrc[0] unchanged, or hsrc[0] shifted
// right by one.  NOTE(review): the conditions choosing between these two
// statements are not visible here.
1231 dst->i32[0] = lsrc->i32[0];
1233 dst->i32[0] = hsrc->i32[0] >> 1;
// Reversible (integer) horizontal synthesis on 64-bit samples: lifting
// updates of the form
//   dp[k] -= (b + a * (sp[k] + sp[k -/+ 1])) >> e
// are applied to the lsrc/hsrc bands with j running from 0 up to
// num_steps - 1; the two bands are then interleaved into dst.  The vector
// loops handle 8 samples per iteration; the general-coefficient case is
// done with scalar loops.  b and e (and presumably a) come from the
// step's `rev` fields; >> is an arithmetic shift.
// NOTE(review): braces, the if/else lines that select between the loops
// (other than the one `else if` shown), and the declarations of `w`, `s`,
// `a` and `dp` are not visible in this view.  Comments describe visible
// statements only.
1238 void avx512_rev_horz_syn64(
const param_atk* atk,
const line_buf* dst,
1239 const line_buf* lsrc,
const line_buf* hsrc,
1240 ui32 width,
bool even)
1245 si64* oth = hsrc->i64, * aug = lsrc->i64;
// even:  aug_width = ceil(width / 2),  oth_width = floor(width / 2)
// !even: aug_width = floor(width / 2), oth_width = ceil(width / 2)
1246 ui32 aug_width = (width + (even ? 1 : 0)) >> 1;
1247 ui32 oth_width = (width + (even ? 0 : 1)) >> 1;
1248 ui32 num_steps = atk->get_num_steps();
1249 for (
ui32 j = 0; j < num_steps; ++j)
1253 const si32 b = s->rev.Batk;
1254 const ui8 e = s->rev.Eatk;
1255 __m512i vb = _mm512_set1_epi64(b);
// Copy the last oth sample one slot past the end so that the sp[k + 1]
// reads below have a value at index oth_width (boundary extension).
1259 oth[oth_width] = oth[oth_width - 1];
1261 const si64* sp = oth;
1265 int i = (int)aug_width;
// No multiply: dp[k] -= (b + (sp[k] + sp[k - 1])) >> e
// (the sp - 1 load is unaligned)
1268 for (; i > 0; i -= 8, sp += 8, dp += 8)
1270 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1271 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1272 __m512i d = _mm512_load_si512((__m512i*)dp);
1273 __m512i t = _mm512_add_epi64(s1, s2);
1274 __m512i v = _mm512_add_epi64(vb, t);
1275 __m512i w = _mm512_srai_epi64(v, e);
1276 d = _mm512_sub_epi64(d, w);
1277 _mm512_store_si512((__m512i*)dp, d);
// No multiply: dp[k] -= (b + (sp[k] + sp[k + 1])) >> e
// (the sp + 1 load is unaligned)
1282 for (; i > 0; i -= 8, sp += 8, dp += 8)
1284 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1285 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1286 __m512i d = _mm512_load_si512((__m512i*)dp);
1287 __m512i t = _mm512_add_epi64(s1, s2);
1288 __m512i v = _mm512_add_epi64(vb, t);
1289 __m512i w = _mm512_srai_epi64(v, e);
1290 d = _mm512_sub_epi64(d, w);
1291 _mm512_store_si512((__m512i*)dp, d);
// Special case a = -1, b = 1, e = 1.  Here (1 - t) >> 1 == -(t >> 1), so
// subtracting the lifting term reduces to adding t >> 1.
1295 else if (a == -1 && b == 1 && e == 1)
1297 int i = (int)aug_width;
// dp[k] += (sp[k] + sp[k - 1]) >> e
1299 for (; i > 0; i -= 8, sp += 8, dp += 8)
1301 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1302 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1303 __m512i d = _mm512_load_si512((__m512i*)dp);
1304 __m512i t = _mm512_add_epi64(s1, s2);
1305 __m512i w = _mm512_srai_epi64(t, e);
1306 d = _mm512_add_epi64(d, w);
1307 _mm512_store_si512((__m512i*)dp, d);
// dp[k] += (sp[k] + sp[k + 1]) >> e
1310 for (; i > 0; i -= 8, sp += 8, dp += 8)
1312 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1313 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1314 __m512i d = _mm512_load_si512((__m512i*)dp);
1315 __m512i t = _mm512_add_epi64(s1, s2);
1316 __m512i w = _mm512_srai_epi64(t, e);
1317 d = _mm512_add_epi64(d, w);
1318 _mm512_store_si512((__m512i*)dp, d);
1323 int i = (int)aug_width;
// Negated sum without multiply: dp[k] -= (b - (sp[k] + sp[k - 1])) >> e
1325 for (; i > 0; i -= 8, sp += 8, dp += 8)
1327 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1328 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
1329 __m512i d = _mm512_load_si512((__m512i*)dp);
1330 __m512i t = _mm512_add_epi64(s1, s2);
1331 __m512i v = _mm512_sub_epi64(vb, t);
1332 __m512i w = _mm512_srai_epi64(v, e);
1333 d = _mm512_sub_epi64(d, w);
1334 _mm512_store_si512((__m512i*)dp, d);
// Negated sum without multiply: dp[k] -= (b - (sp[k] + sp[k + 1])) >> e
1337 for (; i > 0; i -= 8, sp += 8, dp += 8)
1339 __m512i s1 = _mm512_load_si512((__m512i*)sp);
1340 __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
1341 __m512i d = _mm512_load_si512((__m512i*)dp);
1342 __m512i t = _mm512_add_epi64(s1, s2);
1343 __m512i v = _mm512_sub_epi64(vb, t);
1344 __m512i w = _mm512_srai_epi64(v, e);
1345 d = _mm512_sub_epi64(d, w);
1346 _mm512_store_si512((__m512i*)dp, d);
// General coefficient, scalar: exactly aug_width samples are updated,
// pairing each sample with its left neighbour ...
1355 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
1356 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
// ... or with its right neighbour.
1358 for (
ui32 i = aug_width; i > 0; --i, sp++, dp++)
1359 *dp -= (b + a * (sp[0] + sp[1])) >> e;
// Exchange the two bands (pointers and widths) -- presumably at the end
// of each step so the next step updates the other band.
1396 si64* t = aug; aug = oth; oth = t;
1398 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
// Interleave through the untyped `p` members.  With `even` set, lsrc
// supplies the even-indexed output samples and hsrc the odd-indexed
// ones; otherwise the two sources swap.
1404 const void* spl = even ? lsrc->p : hsrc->p;
1405 const void* sph = even ? hsrc->p : lsrc->p;
1407 avx512_interleave64(dp, spl, sph, w);
// Single-sample copies: dst[0] is lsrc[0] unchanged, or hsrc[0] shifted
// right by one.  NOTE(review): the conditions choosing between these two
// statements are not visible here.
1412 dst->i64[0] = lsrc->i64[0];
1414 dst->i64[0] = hsrc->i64[0] >> 1;
// Forwards the call unchanged to the 32-bit or to the 64-bit synthesis
// implementation.
// NOTE(review): the start of this definition (name, leading parameters
// `atk` and `dst`), its braces and the condition that selects between
// the two calls are not visible in this view; presumably
// avx512_rev_horz_syn -- confirm against the full file.
1420 const line_buf* lsrc,
const line_buf* hsrc,
1421 ui32 width,
bool even)
1427 avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1434 avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
void avx512_irv_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx512_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx512_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx512_irv_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
void avx512_irv_vert_times_K(float K, const line_buf *aug, ui32 repeat)
void avx512_irv_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx512_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)