OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_transform_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_transform_avx2.cpp
34// Author: Aous Naman
35// Date: 28 August 2019
36//***************************************************************************/
37
38#include "ojph_arch.h"
39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
40
41#include <climits>
42#include <cstdio>
43
44#include "ojph_defs.h"
45#include "ojph_mem.h"
46#include "ojph_params.h"
48
49#include "ojph_transform.h"
51
52#include <immintrin.h>
53
54namespace ojph {
55 namespace local {
56
58 // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
59 static inline
60 __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
61 {
62 // note than m must be obtained using
63 // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
64 __m256i x = _mm256_srli_epi64(a, amt);
65 x = _mm256_xor_si256(x, m);
66 __m256i result = _mm256_sub_epi64(x, m);
67 return result;
68 }
69
71 static inline
72 void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
73 {
74 for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
75 {
76 __m256 a = _mm256_load_ps(sp);
77 __m256 b = _mm256_load_ps(sp + 8);
78 __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
79 __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
80 __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
81 __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
82 _mm256_store_ps(dpl, e);
83 _mm256_store_ps(dph, f);
84 }
85 }
86
88 static inline
89 void avx2_interleave32(float* dp, float* spl, float* sph, int width)
90 {
91 for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
92 {
93 __m256 a = _mm256_load_ps(spl);
94 __m256 b = _mm256_load_ps(sph);
95 __m256 c = _mm256_unpacklo_ps(a, b);
96 __m256 d = _mm256_unpackhi_ps(a, b);
97 __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
98 __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
99 _mm256_store_ps(dp, e);
100 _mm256_store_ps(dp + 8, f);
101 }
102 }
103
105 static inline
106 void avx2_deinterleave64(void* dpl, void* dph, const void* sp, int width)
107 {
108 for (; width > 0; width -= 8,
109 sp = (const char*)sp + 64,
110 dpl = (char*)dpl + 32,
111 dph = (char*)dph + 32)
112 {
113 __m256i a = _mm256_load_si256((const __m256i*)sp);
114 __m256i b = _mm256_load_si256((const __m256i*)((const char*)sp + 32));
115 __m256i c = _mm256_permute2f128_si256(a, b, (2 << 4) | (0));
116 __m256i d = _mm256_permute2f128_si256(a, b, (3 << 4) | (1));
117 __m256i e = _mm256_unpacklo_epi64(c, d);
118 __m256i f = _mm256_unpackhi_epi64(c, d);
119 _mm256_store_si256((__m256i*)dpl, e);
120 _mm256_store_si256((__m256i*)dph, f);
121 }
122 }
123
125 static inline
126 void avx2_interleave64(void* dp, const void* spl, const void* sph,
127 int width)
128 {
129 for (; width > 0; width -= 8,
130 dp = (char*)dp + 64,
131 spl = (const char*)spl + 32,
132 sph = (const char*)sph + 32)
133 {
134 __m256i a = _mm256_load_si256((const __m256i*)spl);
135 __m256i b = _mm256_load_si256((const __m256i*)sph);
136 __m256i c = _mm256_unpacklo_epi64(a, b);
137 __m256i d = _mm256_unpackhi_epi64(a, b);
138 __m256i e = _mm256_permute2f128_si256(c, d, (2 << 4) | (0));
139 __m256i f = _mm256_permute2f128_si256(c, d, (3 << 4) | (1));
140 _mm256_store_si256((__m256i*)dp, e);
141 _mm256_store_si256((__m256i*)((char*)dp + 32), f);
142 }
143 }
144
146 static
147 void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
148 const line_buf* other, const line_buf* aug,
149 ui32 repeat, bool synthesis)
150 {
151 const si32 a = s->rev.Aatk;
152 const si32 b = s->rev.Batk;
153 const ui8 e = s->rev.Eatk;
154 __m256i va = _mm256_set1_epi32(a);
155 __m256i vb = _mm256_set1_epi32(b);
156
157 si32* dst = aug->i32;
158 const si32* src1 = sig->i32, * src2 = other->i32;
159 // The general definition of the wavelet in Part 2 is slightly
160 // different to part 2, although they are mathematically equivalent
161 // here, we identify the simpler form from Part 1 and employ them
162 if (a == 1)
163 { // 5/3 update and any case with a == 1
164 int i = (int)repeat;
165 if (synthesis)
166 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
167 {
168 __m256i s1 = _mm256_load_si256((__m256i*)src1);
169 __m256i s2 = _mm256_load_si256((__m256i*)src2);
170 __m256i d = _mm256_load_si256((__m256i*)dst);
171 __m256i t = _mm256_add_epi32(s1, s2);
172 __m256i v = _mm256_add_epi32(vb, t);
173 __m256i w = _mm256_srai_epi32(v, e);
174 d = _mm256_sub_epi32(d, w);
175 _mm256_store_si256((__m256i*)dst, d);
176 }
177 else
178 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
179 {
180 __m256i s1 = _mm256_load_si256((__m256i*)src1);
181 __m256i s2 = _mm256_load_si256((__m256i*)src2);
182 __m256i d = _mm256_load_si256((__m256i*)dst);
183 __m256i t = _mm256_add_epi32(s1, s2);
184 __m256i v = _mm256_add_epi32(vb, t);
185 __m256i w = _mm256_srai_epi32(v, e);
186 d = _mm256_add_epi32(d, w);
187 _mm256_store_si256((__m256i*)dst, d);
188 }
189 }
190 else if (a == -1 && b == 1 && e == 1)
191 { // 5/3 predict
192 int i = (int)repeat;
193 if (synthesis)
194 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
195 {
196 __m256i s1 = _mm256_load_si256((__m256i*)src1);
197 __m256i s2 = _mm256_load_si256((__m256i*)src2);
198 __m256i d = _mm256_load_si256((__m256i*)dst);
199 __m256i t = _mm256_add_epi32(s1, s2);
200 __m256i w = _mm256_srai_epi32(t, e);
201 d = _mm256_add_epi32(d, w);
202 _mm256_store_si256((__m256i*)dst, d);
203 }
204 else
205 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
206 {
207 __m256i s1 = _mm256_load_si256((__m256i*)src1);
208 __m256i s2 = _mm256_load_si256((__m256i*)src2);
209 __m256i d = _mm256_load_si256((__m256i*)dst);
210 __m256i t = _mm256_add_epi32(s1, s2);
211 __m256i w = _mm256_srai_epi32(t, e);
212 d = _mm256_sub_epi32(d, w);
213 _mm256_store_si256((__m256i*)dst, d);
214 }
215 }
216 else if (a == -1)
217 { // any case with a == -1, which is not 5/3 predict
218 int i = (int)repeat;
219 if (synthesis)
220 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
221 {
222 __m256i s1 = _mm256_load_si256((__m256i*)src1);
223 __m256i s2 = _mm256_load_si256((__m256i*)src2);
224 __m256i d = _mm256_load_si256((__m256i*)dst);
225 __m256i t = _mm256_add_epi32(s1, s2);
226 __m256i v = _mm256_sub_epi32(vb, t);
227 __m256i w = _mm256_srai_epi32(v, e);
228 d = _mm256_sub_epi32(d, w);
229 _mm256_store_si256((__m256i*)dst, d);
230 }
231 else
232 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
233 {
234 __m256i s1 = _mm256_load_si256((__m256i*)src1);
235 __m256i s2 = _mm256_load_si256((__m256i*)src2);
236 __m256i d = _mm256_load_si256((__m256i*)dst);
237 __m256i t = _mm256_add_epi32(s1, s2);
238 __m256i v = _mm256_sub_epi32(vb, t);
239 __m256i w = _mm256_srai_epi32(v, e);
240 d = _mm256_add_epi32(d, w);
241 _mm256_store_si256((__m256i*)dst, d);
242 }
243 }
244 else { // general case
245 int i = (int)repeat;
246 if (synthesis)
247 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
248 {
249 __m256i s1 = _mm256_load_si256((__m256i*)src1);
250 __m256i s2 = _mm256_load_si256((__m256i*)src2);
251 __m256i d = _mm256_load_si256((__m256i*)dst);
252 __m256i t = _mm256_add_epi32(s1, s2);
253 __m256i u = _mm256_mullo_epi32(va, t);
254 __m256i v = _mm256_add_epi32(vb, u);
255 __m256i w = _mm256_srai_epi32(v, e);
256 d = _mm256_sub_epi32(d, w);
257 _mm256_store_si256((__m256i*)dst, d);
258 }
259 else
260 for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
261 {
262 __m256i s1 = _mm256_load_si256((__m256i*)src1);
263 __m256i s2 = _mm256_load_si256((__m256i*)src2);
264 __m256i d = _mm256_load_si256((__m256i*)dst);
265 __m256i t = _mm256_add_epi32(s1, s2);
266 __m256i u = _mm256_mullo_epi32(va, t);
267 __m256i v = _mm256_add_epi32(vb, u);
268 __m256i w = _mm256_srai_epi32(v, e);
269 d = _mm256_add_epi32(d, w);
270 _mm256_store_si256((__m256i*)dst, d);
271 }
272 }
273 }
274
276 static
277 void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
278 const line_buf* other, const line_buf* aug,
279 ui32 repeat, bool synthesis)
280 {
281 const si32 a = s->rev.Aatk;
282 const si32 b = s->rev.Batk;
283 const ui8 e = s->rev.Eatk;
284 __m256i vb = _mm256_set1_epi64x(b);
285 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
286
287 si64* dst = aug->i64;
288 const si64* src1 = sig->i64, * src2 = other->i64;
289 // The general definition of the wavelet in Part 2 is slightly
290 // different to part 2, although they are mathematically equivalent
291 // here, we identify the simpler form from Part 1 and employ them
292 if (a == 1)
293 { // 5/3 update and any case with a == 1
294 int i = (int)repeat;
295 if (synthesis)
296 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
297 {
298 __m256i s1 = _mm256_load_si256((__m256i*)src1);
299 __m256i s2 = _mm256_load_si256((__m256i*)src2);
300 __m256i d = _mm256_load_si256((__m256i*)dst);
301 __m256i t = _mm256_add_epi64(s1, s2);
302 __m256i v = _mm256_add_epi64(vb, t);
303 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
304 d = _mm256_sub_epi64(d, w);
305 _mm256_store_si256((__m256i*)dst, d);
306 }
307 else
308 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
309 {
310 __m256i s1 = _mm256_load_si256((__m256i*)src1);
311 __m256i s2 = _mm256_load_si256((__m256i*)src2);
312 __m256i d = _mm256_load_si256((__m256i*)dst);
313 __m256i t = _mm256_add_epi64(s1, s2);
314 __m256i v = _mm256_add_epi64(vb, t);
315 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
316 d = _mm256_add_epi64(d, w);
317 _mm256_store_si256((__m256i*)dst, d);
318 }
319 }
320 else if (a == -1 && b == 1 && e == 1)
321 { // 5/3 predict
322 int i = (int)repeat;
323 if (synthesis)
324 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
325 {
326 __m256i s1 = _mm256_load_si256((__m256i*)src1);
327 __m256i s2 = _mm256_load_si256((__m256i*)src2);
328 __m256i d = _mm256_load_si256((__m256i*)dst);
329 __m256i t = _mm256_add_epi64(s1, s2);
330 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
331 d = _mm256_add_epi64(d, w);
332 _mm256_store_si256((__m256i*)dst, d);
333 }
334 else
335 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
336 {
337 __m256i s1 = _mm256_load_si256((__m256i*)src1);
338 __m256i s2 = _mm256_load_si256((__m256i*)src2);
339 __m256i d = _mm256_load_si256((__m256i*)dst);
340 __m256i t = _mm256_add_epi64(s1, s2);
341 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
342 d = _mm256_sub_epi64(d, w);
343 _mm256_store_si256((__m256i*)dst, d);
344 }
345 }
346 else if (a == -1)
347 { // any case with a == -1, which is not 5/3 predict
348 int i = (int)repeat;
349 if (synthesis)
350 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
351 {
352 __m256i s1 = _mm256_load_si256((__m256i*)src1);
353 __m256i s2 = _mm256_load_si256((__m256i*)src2);
354 __m256i d = _mm256_load_si256((__m256i*)dst);
355 __m256i t = _mm256_add_epi64(s1, s2);
356 __m256i v = _mm256_sub_epi64(vb, t);
357 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
358 d = _mm256_sub_epi64(d, w);
359 _mm256_store_si256((__m256i*)dst, d);
360 }
361 else
362 for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
363 {
364 __m256i s1 = _mm256_load_si256((__m256i*)src1);
365 __m256i s2 = _mm256_load_si256((__m256i*)src2);
366 __m256i d = _mm256_load_si256((__m256i*)dst);
367 __m256i t = _mm256_add_epi64(s1, s2);
368 __m256i v = _mm256_sub_epi64(vb, t);
369 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
370 d = _mm256_add_epi64(d, w);
371 _mm256_store_si256((__m256i*)dst, d);
372 }
373 }
374 else { // general case
375 // 64bit multiplication is not supported in avx2;
376 // in particular, _mm256_mullo_epi64.
377 if (synthesis)
378 for (ui32 i = repeat; i > 0; --i)
379 *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
380 else
381 for (ui32 i = repeat; i > 0; --i)
382 *dst++ += (b + a * (*src1++ + *src2++)) >> e;
383 }
384 }
385
387 void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
388 const line_buf* other, const line_buf* aug,
389 ui32 repeat, bool synthesis)
390 {
391 if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
392 ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
393 ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
394 {
395 assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
396 (other == NULL || other->flags & line_buf::LFT_32BIT) &&
397 (aug == NULL || aug->flags & line_buf::LFT_32BIT));
398 avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
399 }
400 else
401 {
402 assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
403 (other == NULL || other->flags & line_buf::LFT_64BIT) &&
404 (aug == NULL || aug->flags & line_buf::LFT_64BIT));
405 avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
406 }
407 }
408
410 static
411 void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
412 const line_buf* hdst, const line_buf* src,
413 ui32 width, bool even)
414 {
415 if (width > 1)
416 {
417 // split src into ldst and hdst
418 {
419 float* dpl = even ? ldst->f32 : hdst->f32;
420 float* dph = even ? hdst->f32 : ldst->f32;
421 float* sp = src->f32;
422 int w = (int)width;
423 avx2_deinterleave32(dpl, dph, sp, w);
424 }
425
426 si32* hp = hdst->i32, * lp = ldst->i32;
427 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
428 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
429 ui32 num_steps = atk->get_num_steps();
430 for (ui32 j = num_steps; j > 0; --j)
431 {
432 // first lifting step
433 const lifting_step* s = atk->get_step(j - 1);
434 const si32 a = s->rev.Aatk;
435 const si32 b = s->rev.Batk;
436 const ui8 e = s->rev.Eatk;
437 __m256i va = _mm256_set1_epi32(a);
438 __m256i vb = _mm256_set1_epi32(b);
439
440 // extension
441 lp[-1] = lp[0];
442 lp[l_width] = lp[l_width - 1];
443 // lifting step
444 const si32* sp = lp;
445 si32* dp = hp;
446 if (a == 1)
447 { // 5/3 update and any case with a == 1
448 int i = (int)h_width;
449 if (even)
450 {
451 for (; i > 0; i -= 8, sp += 8, dp += 8)
452 {
453 __m256i s1 = _mm256_load_si256((__m256i*)sp);
454 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
455 __m256i d = _mm256_load_si256((__m256i*)dp);
456 __m256i t = _mm256_add_epi32(s1, s2);
457 __m256i v = _mm256_add_epi32(vb, t);
458 __m256i w = _mm256_srai_epi32(v, e);
459 d = _mm256_add_epi32(d, w);
460 _mm256_store_si256((__m256i*)dp, d);
461 }
462 }
463 else
464 {
465 for (; i > 0; i -= 8, sp += 8, dp += 8)
466 {
467 __m256i s1 = _mm256_load_si256((__m256i*)sp);
468 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
469 __m256i d = _mm256_load_si256((__m256i*)dp);
470 __m256i t = _mm256_add_epi32(s1, s2);
471 __m256i v = _mm256_add_epi32(vb, t);
472 __m256i w = _mm256_srai_epi32(v, e);
473 d = _mm256_add_epi32(d, w);
474 _mm256_store_si256((__m256i*)dp, d);
475 }
476 }
477 }
478 else if (a == -1 && b == 1 && e == 1)
479 { // 5/3 predict
480 int i = (int)h_width;
481 if (even)
482 for (; i > 0; i -= 8, sp += 8, dp += 8)
483 {
484 __m256i s1 = _mm256_load_si256((__m256i*)sp);
485 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
486 __m256i d = _mm256_load_si256((__m256i*)dp);
487 __m256i t = _mm256_add_epi32(s1, s2);
488 __m256i w = _mm256_srai_epi32(t, e);
489 d = _mm256_sub_epi32(d, w);
490 _mm256_store_si256((__m256i*)dp, d);
491 }
492 else
493 for (; i > 0; i -= 8, sp += 8, dp += 8)
494 {
495 __m256i s1 = _mm256_load_si256((__m256i*)sp);
496 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
497 __m256i d = _mm256_load_si256((__m256i*)dp);
498 __m256i t = _mm256_add_epi32(s1, s2);
499 __m256i w = _mm256_srai_epi32(t, e);
500 d = _mm256_sub_epi32(d, w);
501 _mm256_store_si256((__m256i*)dp, d);
502 }
503 }
504 else if (a == -1)
505 { // any case with a == -1, which is not 5/3 predict
506 int i = (int)h_width;
507 if (even)
508 for (; i > 0; i -= 8, sp += 8, dp += 8)
509 {
510 __m256i s1 = _mm256_load_si256((__m256i*)sp);
511 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
512 __m256i d = _mm256_load_si256((__m256i*)dp);
513 __m256i t = _mm256_add_epi32(s1, s2);
514 __m256i v = _mm256_sub_epi32(vb, t);
515 __m256i w = _mm256_srai_epi32(v, e);
516 d = _mm256_add_epi32(d, w);
517 _mm256_store_si256((__m256i*)dp, d);
518 }
519 else
520 for (; i > 0; i -= 8, sp += 8, dp += 8)
521 {
522 __m256i s1 = _mm256_load_si256((__m256i*)sp);
523 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
524 __m256i d = _mm256_load_si256((__m256i*)dp);
525 __m256i t = _mm256_add_epi32(s1, s2);
526 __m256i v = _mm256_sub_epi32(vb, t);
527 __m256i w = _mm256_srai_epi32(v, e);
528 d = _mm256_add_epi32(d, w);
529 _mm256_store_si256((__m256i*)dp, d);
530 }
531 }
532 else {
533 // general case
534 int i = (int)h_width;
535 if (even)
536 for (; i > 0; i -= 8, sp += 8, dp += 8)
537 {
538 __m256i s1 = _mm256_load_si256((__m256i*)sp);
539 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
540 __m256i d = _mm256_load_si256((__m256i*)dp);
541 __m256i t = _mm256_add_epi32(s1, s2);
542 __m256i u = _mm256_mullo_epi32(va, t);
543 __m256i v = _mm256_add_epi32(vb, u);
544 __m256i w = _mm256_srai_epi32(v, e);
545 d = _mm256_add_epi32(d, w);
546 _mm256_store_si256((__m256i*)dp, d);
547 }
548 else
549 for (; i > 0; i -= 8, sp += 8, dp += 8)
550 {
551 __m256i s1 = _mm256_load_si256((__m256i*)sp);
552 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
553 __m256i d = _mm256_load_si256((__m256i*)dp);
554 __m256i t = _mm256_add_epi32(s1, s2);
555 __m256i u = _mm256_mullo_epi32(va, t);
556 __m256i v = _mm256_add_epi32(vb, u);
557 __m256i w = _mm256_srai_epi32(v, e);
558 d = _mm256_add_epi32(d, w);
559 _mm256_store_si256((__m256i*)dp, d);
560 }
561 }
562
563 // swap buffers
564 si32* t = lp; lp = hp; hp = t;
565 even = !even;
566 ui32 w = l_width; l_width = h_width; h_width = w;
567 }
568 }
569 else {
570 if (even)
571 ldst->i32[0] = src->i32[0];
572 else
573 hdst->i32[0] = src->i32[0] << 1;
574 }
575 }
576
578 static
579 void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
580 const line_buf* hdst, const line_buf* src,
581 ui32 width, bool even)
582 {
583 if (width > 1)
584 {
585 // split src into ldst and hdst
586 {
587 void* dpl = even ? ldst->p : hdst->p;
588 void* dph = even ? hdst->p : ldst->p;
589 const void* sp = src->p;
590 int w = (int)width;
591 avx2_deinterleave64(dpl, dph, sp, w);
592 }
593
594 si64* hp = hdst->i64, * lp = ldst->i64;
595 ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
596 ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
597 ui32 num_steps = atk->get_num_steps();
598 for (ui32 j = num_steps; j > 0; --j)
599 {
600 // first lifting step
601 const lifting_step* s = atk->get_step(j - 1);
602 const si32 a = s->rev.Aatk;
603 const si32 b = s->rev.Batk;
604 const ui8 e = s->rev.Eatk;
605 __m256i vb = _mm256_set1_epi64x(b);
606 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
607
608 // extension
609 lp[-1] = lp[0];
610 lp[l_width] = lp[l_width - 1];
611 // lifting step
612 const si64* sp = lp;
613 si64* dp = hp;
614 if (a == 1)
615 { // 5/3 update and any case with a == 1
616 int i = (int)h_width;
617 if (even)
618 {
619 for (; i > 0; i -= 4, sp += 4, dp += 4)
620 {
621 __m256i s1 = _mm256_load_si256((__m256i*)sp);
622 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
623 __m256i d = _mm256_load_si256((__m256i*)dp);
624 __m256i t = _mm256_add_epi64(s1, s2);
625 __m256i v = _mm256_add_epi64(vb, t);
626 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
627 d = _mm256_add_epi64(d, w);
628 _mm256_store_si256((__m256i*)dp, d);
629 }
630 }
631 else
632 {
633 for (; i > 0; i -= 4, sp += 4, dp += 4)
634 {
635 __m256i s1 = _mm256_load_si256((__m256i*)sp);
636 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
637 __m256i d = _mm256_load_si256((__m256i*)dp);
638 __m256i t = _mm256_add_epi64(s1, s2);
639 __m256i v = _mm256_add_epi64(vb, t);
640 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
641 d = _mm256_add_epi64(d, w);
642 _mm256_store_si256((__m256i*)dp, d);
643 }
644 }
645 }
646 else if (a == -1 && b == 1 && e == 1)
647 { // 5/3 predict
648 int i = (int)h_width;
649 if (even)
650 for (; i > 0; i -= 4, sp += 4, dp += 4)
651 {
652 __m256i s1 = _mm256_load_si256((__m256i*)sp);
653 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
654 __m256i d = _mm256_load_si256((__m256i*)dp);
655 __m256i t = _mm256_add_epi64(s1, s2);
656 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
657 d = _mm256_sub_epi64(d, w);
658 _mm256_store_si256((__m256i*)dp, d);
659 }
660 else
661 for (; i > 0; i -= 4, sp += 4, dp += 4)
662 {
663 __m256i s1 = _mm256_load_si256((__m256i*)sp);
664 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
665 __m256i d = _mm256_load_si256((__m256i*)dp);
666 __m256i t = _mm256_add_epi64(s1, s2);
667 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
668 d = _mm256_sub_epi64(d, w);
669 _mm256_store_si256((__m256i*)dp, d);
670 }
671 }
672 else if (a == -1)
673 { // any case with a == -1, which is not 5/3 predict
674 int i = (int)h_width;
675 if (even)
676 for (; i > 0; i -= 4, sp += 4, dp += 4)
677 {
678 __m256i s1 = _mm256_load_si256((__m256i*)sp);
679 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
680 __m256i d = _mm256_load_si256((__m256i*)dp);
681 __m256i t = _mm256_add_epi64(s1, s2);
682 __m256i v = _mm256_sub_epi64(vb, t);
683 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
684 d = _mm256_add_epi64(d, w);
685 _mm256_store_si256((__m256i*)dp, d);
686 }
687 else
688 for (; i > 0; i -= 4, sp += 4, dp += 4)
689 {
690 __m256i s1 = _mm256_load_si256((__m256i*)sp);
691 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
692 __m256i d = _mm256_load_si256((__m256i*)dp);
693 __m256i t = _mm256_add_epi64(s1, s2);
694 __m256i v = _mm256_sub_epi64(vb, t);
695 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
696 d = _mm256_add_epi64(d, w);
697 _mm256_store_si256((__m256i*)dp, d);
698 }
699 }
700 else {
701 // general case
702 // 64bit multiplication is not supported in avx2;
703 // in particular, _mm256_mullo_epi64.
704 if (even)
705 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
706 *dp += (b + a * (sp[0] + sp[1])) >> e;
707 else
708 for (ui32 i = h_width; i > 0; --i, sp++, dp++)
709 *dp += (b + a * (sp[-1] + sp[0])) >> e;
710 }
711
712 // swap buffers
713 si64* t = lp; lp = hp; hp = t;
714 even = !even;
715 ui32 w = l_width; l_width = h_width; h_width = w;
716 }
717 }
718 else {
719 if (even)
720 ldst->i64[0] = src->i64[0];
721 else
722 hdst->i64[0] = src->i64[0] << 1;
723 }
724 }
725
727 void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
728 const line_buf* hdst, const line_buf* src,
729 ui32 width, bool even)
730 {
731 if (src->flags & line_buf::LFT_32BIT)
732 {
733 assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
734 (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
735 avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even);
736 }
737 else
738 {
739 assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
740 (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) &&
741 (src == NULL || src->flags & line_buf::LFT_64BIT));
742 avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even);
743 }
744 }
745
747 static
748 void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst,
749 const line_buf* lsrc, const line_buf* hsrc,
750 ui32 width, bool even)
751 {
752 if (width > 1)
753 {
754 bool ev = even;
755 si32* oth = hsrc->i32, * aug = lsrc->i32;
756 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
757 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
758 ui32 num_steps = atk->get_num_steps();
759 for (ui32 j = 0; j < num_steps; ++j)
760 {
761 const lifting_step* s = atk->get_step(j);
762 const si32 a = s->rev.Aatk;
763 const si32 b = s->rev.Batk;
764 const ui8 e = s->rev.Eatk;
765 __m256i va = _mm256_set1_epi32(a);
766 __m256i vb = _mm256_set1_epi32(b);
767
768 // extension
769 oth[-1] = oth[0];
770 oth[oth_width] = oth[oth_width - 1];
771 // lifting step
772 const si32* sp = oth;
773 si32* dp = aug;
774 if (a == 1)
775 { // 5/3 update and any case with a == 1
776 int i = (int)aug_width;
777 if (ev)
778 {
779 for (; i > 0; i -= 8, sp += 8, dp += 8)
780 {
781 __m256i s1 = _mm256_load_si256((__m256i*)sp);
782 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
783 __m256i d = _mm256_load_si256((__m256i*)dp);
784 __m256i t = _mm256_add_epi32(s1, s2);
785 __m256i v = _mm256_add_epi32(vb, t);
786 __m256i w = _mm256_srai_epi32(v, e);
787 d = _mm256_sub_epi32(d, w);
788 _mm256_store_si256((__m256i*)dp, d);
789 }
790 }
791 else
792 {
793 for (; i > 0; i -= 8, sp += 8, dp += 8)
794 {
795 __m256i s1 = _mm256_load_si256((__m256i*)sp);
796 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
797 __m256i d = _mm256_load_si256((__m256i*)dp);
798 __m256i t = _mm256_add_epi32(s1, s2);
799 __m256i v = _mm256_add_epi32(vb, t);
800 __m256i w = _mm256_srai_epi32(v, e);
801 d = _mm256_sub_epi32(d, w);
802 _mm256_store_si256((__m256i*)dp, d);
803 }
804 }
805 }
806 else if (a == -1 && b == 1 && e == 1)
807 { // 5/3 predict
808 int i = (int)aug_width;
809 if (ev)
810 for (; i > 0; i -= 8, sp += 8, dp += 8)
811 {
812 __m256i s1 = _mm256_load_si256((__m256i*)sp);
813 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
814 __m256i d = _mm256_load_si256((__m256i*)dp);
815 __m256i t = _mm256_add_epi32(s1, s2);
816 __m256i w = _mm256_srai_epi32(t, e);
817 d = _mm256_add_epi32(d, w);
818 _mm256_store_si256((__m256i*)dp, d);
819 }
820 else
821 for (; i > 0; i -= 8, sp += 8, dp += 8)
822 {
823 __m256i s1 = _mm256_load_si256((__m256i*)sp);
824 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
825 __m256i d = _mm256_load_si256((__m256i*)dp);
826 __m256i t = _mm256_add_epi32(s1, s2);
827 __m256i w = _mm256_srai_epi32(t, e);
828 d = _mm256_add_epi32(d, w);
829 _mm256_store_si256((__m256i*)dp, d);
830 }
831 }
832 else if (a == -1)
833 { // any case with a == -1, which is not 5/3 predict
834 int i = (int)aug_width;
835 if (ev)
836 for (; i > 0; i -= 8, sp += 8, dp += 8)
837 {
838 __m256i s1 = _mm256_load_si256((__m256i*)sp);
839 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
840 __m256i d = _mm256_load_si256((__m256i*)dp);
841 __m256i t = _mm256_add_epi32(s1, s2);
842 __m256i v = _mm256_sub_epi32(vb, t);
843 __m256i w = _mm256_srai_epi32(v, e);
844 d = _mm256_sub_epi32(d, w);
845 _mm256_store_si256((__m256i*)dp, d);
846 }
847 else
848 for (; i > 0; i -= 8, sp += 8, dp += 8)
849 {
850 __m256i s1 = _mm256_load_si256((__m256i*)sp);
851 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
852 __m256i d = _mm256_load_si256((__m256i*)dp);
853 __m256i t = _mm256_add_epi32(s1, s2);
854 __m256i v = _mm256_sub_epi32(vb, t);
855 __m256i w = _mm256_srai_epi32(v, e);
856 d = _mm256_sub_epi32(d, w);
857 _mm256_store_si256((__m256i*)dp, d);
858 }
859 }
860 else {
861 // general case
862 int i = (int)aug_width;
863 if (ev)
864 for (; i > 0; i -= 8, sp += 8, dp += 8)
865 {
866 __m256i s1 = _mm256_load_si256((__m256i*)sp);
867 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
868 __m256i d = _mm256_load_si256((__m256i*)dp);
869 __m256i t = _mm256_add_epi32(s1, s2);
870 __m256i u = _mm256_mullo_epi32(va, t);
871 __m256i v = _mm256_add_epi32(vb, u);
872 __m256i w = _mm256_srai_epi32(v, e);
873 d = _mm256_sub_epi32(d, w);
874 _mm256_store_si256((__m256i*)dp, d);
875 }
876 else
877 for (; i > 0; i -= 8, sp += 8, dp += 8)
878 {
879 __m256i s1 = _mm256_load_si256((__m256i*)sp);
880 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
881 __m256i d = _mm256_load_si256((__m256i*)dp);
882 __m256i t = _mm256_add_epi32(s1, s2);
883 __m256i u = _mm256_mullo_epi32(va, t);
884 __m256i v = _mm256_add_epi32(vb, u);
885 __m256i w = _mm256_srai_epi32(v, e);
886 d = _mm256_sub_epi32(d, w);
887 _mm256_store_si256((__m256i*)dp, d);
888 }
889 }
890
891 // swap buffers
892 si32* t = aug; aug = oth; oth = t;
893 ev = !ev;
894 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
895 }
896
897 // combine both lsrc and hsrc into dst
898 {
899 float* dp = dst->f32;
900 float* spl = even ? lsrc->f32 : hsrc->f32;
901 float* sph = even ? hsrc->f32 : lsrc->f32;
902 int w = (int)width;
903 avx2_interleave32(dp, spl, sph, w);
904 }
905 }
906 else {
907 if (even)
908 dst->i32[0] = lsrc->i32[0];
909 else
910 dst->i32[0] = hsrc->i32[0] >> 1;
911 }
912 }
913
915 static
916 void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst,
917 const line_buf* lsrc, const line_buf* hsrc,
918 ui32 width, bool even)
919 {
920 if (width > 1)
921 {
922 bool ev = even;
923 si64* oth = hsrc->i64, * aug = lsrc->i64;
924 ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass
925 ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass
926 ui32 num_steps = atk->get_num_steps();
927 for (ui32 j = 0; j < num_steps; ++j)
928 {
929 const lifting_step* s = atk->get_step(j);
930 const si32 a = s->rev.Aatk;
931 const si32 b = s->rev.Batk;
932 const ui8 e = s->rev.Eatk;
933 __m256i vb = _mm256_set1_epi64x(b);
934 __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
935
936 // extension
937 oth[-1] = oth[0];
938 oth[oth_width] = oth[oth_width - 1];
939 // lifting step
940 const si64* sp = oth;
941 si64* dp = aug;
942 if (a == 1)
943 { // 5/3 update and any case with a == 1
944 int i = (int)aug_width;
945 if (ev)
946 {
947 for (; i > 0; i -= 4, sp += 4, dp += 4)
948 {
949 __m256i s1 = _mm256_load_si256((__m256i*)sp);
950 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
951 __m256i d = _mm256_load_si256((__m256i*)dp);
952 __m256i t = _mm256_add_epi64(s1, s2);
953 __m256i v = _mm256_add_epi64(vb, t);
954 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
955 d = _mm256_sub_epi64(d, w);
956 _mm256_store_si256((__m256i*)dp, d);
957 }
958 }
959 else
960 {
961 for (; i > 0; i -= 4, sp += 4, dp += 4)
962 {
963 __m256i s1 = _mm256_load_si256((__m256i*)sp);
964 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
965 __m256i d = _mm256_load_si256((__m256i*)dp);
966 __m256i t = _mm256_add_epi64(s1, s2);
967 __m256i v = _mm256_add_epi64(vb, t);
968 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
969 d = _mm256_sub_epi64(d, w);
970 _mm256_store_si256((__m256i*)dp, d);
971 }
972 }
973 }
974 else if (a == -1 && b == 1 && e == 1)
975 { // 5/3 predict
976 int i = (int)aug_width;
977 if (ev)
978 for (; i > 0; i -= 4, sp += 4, dp += 4)
979 {
980 __m256i s1 = _mm256_load_si256((__m256i*)sp);
981 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
982 __m256i d = _mm256_load_si256((__m256i*)dp);
983 __m256i t = _mm256_add_epi64(s1, s2);
984 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
985 d = _mm256_add_epi64(d, w);
986 _mm256_store_si256((__m256i*)dp, d);
987 }
988 else
989 for (; i > 0; i -= 4, sp += 4, dp += 4)
990 {
991 __m256i s1 = _mm256_load_si256((__m256i*)sp);
992 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
993 __m256i d = _mm256_load_si256((__m256i*)dp);
994 __m256i t = _mm256_add_epi64(s1, s2);
995 __m256i w = avx2_mm256_srai_epi64(t, e, ve);
996 d = _mm256_add_epi64(d, w);
997 _mm256_store_si256((__m256i*)dp, d);
998 }
999 }
1000 else if (a == -1)
1001 { // any case with a == -1, which is not 5/3 predict
1002 int i = (int)aug_width;
1003 if (ev)
1004 for (; i > 0; i -= 4, sp += 4, dp += 4)
1005 {
1006 __m256i s1 = _mm256_load_si256((__m256i*)sp);
1007 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
1008 __m256i d = _mm256_load_si256((__m256i*)dp);
1009 __m256i t = _mm256_add_epi64(s1, s2);
1010 __m256i v = _mm256_sub_epi64(vb, t);
1011 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
1012 d = _mm256_sub_epi64(d, w);
1013 _mm256_store_si256((__m256i*)dp, d);
1014 }
1015 else
1016 for (; i > 0; i -= 4, sp += 4, dp += 4)
1017 {
1018 __m256i s1 = _mm256_load_si256((__m256i*)sp);
1019 __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
1020 __m256i d = _mm256_load_si256((__m256i*)dp);
1021 __m256i t = _mm256_add_epi64(s1, s2);
1022 __m256i v = _mm256_sub_epi64(vb, t);
1023 __m256i w = avx2_mm256_srai_epi64(v, e, ve);
1024 d = _mm256_sub_epi64(d, w);
1025 _mm256_store_si256((__m256i*)dp, d);
1026 }
1027 }
1028 else {
1029 // general case
1030 // 64bit multiplication is not supported in avx2;
1031 // in particular, _mm_mullo_epi64.
1032 if (ev)
1033 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
1034 *dp -= (b + a * (sp[-1] + sp[0])) >> e;
1035 else
1036 for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
1037 *dp -= (b + a * (sp[0] + sp[1])) >> e;
1038 }
1039
1040 // swap buffers
1041 si64* t = aug; aug = oth; oth = t;
1042 ev = !ev;
1043 ui32 w = aug_width; aug_width = oth_width; oth_width = w;
1044 }
1045
1046 // combine both lsrc and hsrc into dst
1047 {
1048 void* dp = dst->p;
1049 const void* spl = even ? lsrc->p : hsrc->p;
1050 const void* sph = even ? hsrc->p : lsrc->p;
1051 int w = (int)width;
1052 avx2_interleave64(dp, spl, sph, w);
1053 }
1054 }
1055 else {
1056 if (even)
1057 dst->i64[0] = lsrc->i64[0];
1058 else
1059 dst->i64[0] = hsrc->i64[0] >> 1;
1060 }
1061 }
1062
1064 void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
1065 const line_buf* lsrc, const line_buf* hsrc,
1066 ui32 width, bool even)
1067 {
1068 if (dst->flags & line_buf::LFT_32BIT)
1069 {
1070 assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
1071 (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
1072 avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
1073 }
1074 else
1075 {
1076 assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
1077 (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
1078 (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
1079 avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
1080 }
1081 }
1082
1083 } // !local
1084} // !ojph
1085
1086#endif
void avx2_rev_horz_syn(const param_atk *atk, const line_buf *dst, const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even)
void avx2_rev_vert_step(const lifting_step *s, const line_buf *sig, const line_buf *other, const line_buf *aug, ui32 repeat, bool synthesis)
void avx2_rev_horz_ana(const param_atk *atk, const line_buf *ldst, const line_buf *hdst, const line_buf *src, ui32 width, bool even)
int64_t si64
Definition ojph_defs.h:57
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50