OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_colour_vsx.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2021, Aous Naman
6// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2021, The University of New South Wales, Australia
8//
9// Redistribution and use in source and binary forms, with or without
10// modification, are permitted provided that the following conditions are
11// met:
12//
13// 1. Redistributions of source code must retain the above copyright
14// notice, this list of conditions and the following disclaimer.
15//
16// 2. Redistributions in binary form must reproduce the above copyright
17// notice, this list of conditions and the following disclaimer in the
18// documentation and/or other materials provided with the distribution.
19//
20// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31//***************************************************************************/
32// This file is part of the OpenJPH software implementation.
33// File: ojph_colour_vsx.cpp
34// Author: Aous Naman
35// Date: 9 February 2021
36//***************************************************************************/
37
38#include <climits>
39#include <cmath>
40#include "ojph_simd_vsx.h"
41
42#include "ojph_defs.h"
43#include "ojph_mem.h"
44#include "ojph_colour.h"
45#include "ojph_colour_local.h"
46
47namespace ojph {
48 namespace local {
49
51 static inline
53 { // We implement ojph_round, which is
54 // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float; this is
55 // round to nearest with ties away from zero, which is exactly what
56 // xvrspi does. The instruction is used via inline asm because
57 // GCC's vec_round rounds ties to even.
58 vsx_v_f32 w;
59 __asm__("xvrspi %x0,%x1" : "=wa"(w) : "wa"((vsx_v_f32)a));
60 return (v128_t)vec_cts(w, 0); // saturating convert to int32
61 }
62
64 void vsx_rev_convert(const line_buf *src_line,
65 const ui32 src_line_offset,
66 line_buf *dst_line,
67 const ui32 dst_line_offset,
68 si64 shift, ui32 width)
69 {
70 if (src_line->flags & line_buf::LFT_32BIT)
71 {
72 if (dst_line->flags & line_buf::LFT_32BIT)
73 {
74 const si32 *sp = src_line->i32 + src_line_offset;
75 si32 *dp = dst_line->i32 + dst_line_offset;
76 v128_t sh = vsx_i32x4_splat((si32)shift);
77 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
78 {
79 v128_t s = vsx_v128_load(sp);
80 s = vsx_i32x4_add(s, sh);
81 vsx_v128_store(dp, s);
82 }
83 }
84 else
85 {
86 const si32 *sp = src_line->i32 + src_line_offset;
87 si64 *dp = dst_line->i64 + dst_line_offset;
88 v128_t sh = vsx_i64x2_splat(shift);
89 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
90 {
91 v128_t s, t;
92 s = vsx_v128_load(sp);
93
95 t = vsx_i64x2_add(t, sh);
96 vsx_v128_store(dp, t);
97
99 t = vsx_i64x2_add(t, sh);
100 vsx_v128_store(dp + 2, t);
101 }
102 }
103 }
104 else
105 {
106 assert(src_line->flags | line_buf::LFT_64BIT);
107 assert(dst_line->flags | line_buf::LFT_32BIT);
108 const si64 *sp = src_line->i64 + src_line_offset;
109 si32 *dp = dst_line->i32 + dst_line_offset;
110 v128_t sh = vsx_i64x2_splat(shift);
111 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
112 {
113 v128_t s0, s1;
114 s0 = vsx_v128_load(sp);
115 s0 = vsx_i64x2_add(s0, sh);
116 s1 = vsx_v128_load(sp + 2);
117 s1 = vsx_i64x2_add(s1, sh);
118 s0 = vsx_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
119 vsx_v128_store(dp, s0);
120 }
121 }
122 }
123
126 const ui32 src_line_offset,
127 line_buf *dst_line,
128 const ui32 dst_line_offset,
129 si64 shift, ui32 width)
130 {
131 if (src_line->flags & line_buf::LFT_32BIT)
132 {
133 if (dst_line->flags & line_buf::LFT_32BIT)
134 {
135 const si32 *sp = src_line->i32 + src_line_offset;
136 si32 *dp = dst_line->i32 + dst_line_offset;
137 v128_t sh = vsx_i32x4_splat((si32)(-shift));
138 v128_t zero = vsx_i32x4_splat(0);
139 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
140 {
141 v128_t s = vsx_v128_load(sp);
142 v128_t c = vsx_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value
143 v128_t v_m_sh = vsx_i32x4_sub(sh, s); // - shift - value
144 v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value
145 s = vsx_v128_andnot(s, c); // keep only +ve or 0
146 s = vsx_v128_or(s, v_m_sh); // combine
147 vsx_v128_store(dp, s);
148 }
149 }
150 else
151 {
152 const si32 *sp = src_line->i32 + src_line_offset;
153 si64 *dp = dst_line->i64 + dst_line_offset;
154 v128_t sh = vsx_i64x2_splat(-shift);
155 v128_t zero = vsx_i32x4_splat(0);
156 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
157 {
158 v128_t s, u, c, v_m_sh;
159 s = vsx_v128_load(sp);
160
162 c = vsx_i64x2_lt(u, zero); // 64b -1 for -ve value
163 v_m_sh = vsx_i64x2_sub(sh, u); // - shift - value
164 v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value
165 u = vsx_v128_andnot(u, c); // keep only +ve or 0
166 u = vsx_v128_or(u, v_m_sh); // combine
167
168 vsx_v128_store(dp, u);
169
171 c = vsx_i64x2_lt(u, zero); // 64b -1 for -ve value
172 v_m_sh = vsx_i64x2_sub(sh, u); // - shift - value
173 v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value
174 u = vsx_v128_andnot(u, c); // keep only +ve or 0
175 u = vsx_v128_or(u, v_m_sh); // combine
176
177 vsx_v128_store(dp + 2, u);
178 }
179 }
180 }
181 else
182 {
183 assert(src_line->flags | line_buf::LFT_64BIT);
184 assert(dst_line->flags | line_buf::LFT_32BIT);
185 const si64 *sp = src_line->i64 + src_line_offset;
186 si32 *dp = dst_line->i32 + dst_line_offset;
187 v128_t sh = vsx_i64x2_splat(-shift);
188 v128_t zero = vsx_i32x4_splat(0);
189 for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
190 {
191 // s for source, t for target, p for positive, n for negative,
192 // m for mask, and tm for temp
193 v128_t s, t0, t1, p, n, m, tm;
194 s = vsx_v128_load(sp);
195 m = vsx_i64x2_lt(s, zero); // 64b -1 for -ve value
196 tm = vsx_i64x2_sub(sh, s); // - shift - value
197 n = vsx_v128_and(m, tm); // -ve
198 p = vsx_v128_andnot(s, m); // +ve
199 t0 = vsx_v128_or(n, p);
200
201 s = vsx_v128_load(sp + 2);
202 m = vsx_i64x2_lt(s, zero); // 64b -1 for -ve value
203 tm = vsx_i64x2_sub(sh, s); // - shift - value
204 n = vsx_v128_and(m, tm); // -ve
205 p = vsx_v128_andnot(s, m); // +ve
206 t1 = vsx_v128_or(n, p);
207
208 t0 = vsx_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
209 vsx_v128_store(dp, t0);
210 }
211 }
212 }
213
215 void vsx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
216 ui32 width)
217 {
218 v128_t shift = vsx_f32x4_splat(0.5f);
219 v128_t m = vsx_f32x4_splat(mul);
220 for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
221 {
222 v128_t t = vsx_v128_load(sp);
224 s = vsx_f32x4_mul(s, m);
225 s = vsx_f32x4_sub(s, shift);
226 vsx_v128_store(dp, s);
227 }
228 }
229
231 void vsx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
232 ui32 width)
233 {
234 v128_t m = vsx_f32x4_splat(mul);
235 for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
236 {
237 v128_t t = vsx_v128_load(sp);
239 s = vsx_f32x4_mul(s, m);
240 vsx_v128_store(dp, s);
241 }
242 }
243
245 void vsx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
246 ui32 width)
247 {
248 const v128_t half = vsx_f32x4_splat(0.5f);
249 v128_t m = vsx_f32x4_splat(mul);
250 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
251 {
252 v128_t t = vsx_v128_load(sp);
253 v128_t s = vsx_f32x4_add(t, half);
254 s = vsx_f32x4_mul(s, m);
255 s = vsx_f32x4_add(s, half); // + 0.5 and followed by floor next
257 }
258 }
259
261 void vsx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
262 ui32 width)
263 {
264 const v128_t half = vsx_f32x4_splat(0.5f);
265 v128_t m = vsx_f32x4_splat(mul);
266 for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
267 {
268 v128_t t = vsx_v128_load(sp);
269 v128_t s = vsx_f32x4_mul(t, m);
270 s = vsx_f32x4_add(s, half); // + 0.5 and followed by floor next
272 }
273 }
274
276 static inline
278 {
279 v128_t c = vsx_f32x4_ge(x, y); // 0xFFFFFFFF for x >= y
280 return (v128_t)vec_sel((vsx_v_u32)b, (vsx_v_u32)a, (vsx_v_u32)c);
281 }
282
284 static inline
286 {
287 v128_t c = vsx_f32x4_lt(x, y); // 0xFFFFFFFF for x < y
288 return (v128_t)vec_sel((vsx_v_u32)b, (vsx_v_u32)a, (vsx_v_u32)c);
289 }
290
292 template <bool NLT_TYPE3>
293 static inline
295 line_buf *dst_line, ui32 dst_line_offset,
296 ui32 bit_depth, bool is_signed, ui32 width)
297 {
298 assert((src_line->flags & line_buf::LFT_32BIT) &&
299 (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
300 (dst_line->flags & line_buf::LFT_32BIT) &&
301 (dst_line->flags & line_buf::LFT_INTEGER));
302
303 assert(bit_depth <= 32);
304 const float* sp = src_line->f32;
305 si32* dp = dst_line->i32 + dst_line_offset;
306 // There is the possibility that converting to integer will
307 // exceed the dynamic range of 32bit integer; therefore, care must be
308 // exercised.
309 // We look if the floating point number is outside the half-closed
310 // interval [-0.5f, 0.5f). If so, we limit the resulting integer
311 // to the maximum/minimum that number supports.
312 si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
313 v128_t mul = vsx_f32x4_splat((float)(1ull << bit_depth));
314 v128_t fl_up_lim = vsx_f32x4_splat(-(float)neg_limit); // val < upper
315 v128_t fl_low_lim = vsx_f32x4_splat((float)neg_limit); // val >= lower
316 v128_t s32_up_lim = vsx_i32x4_splat(INT_MAX >> (32 - bit_depth));
317 v128_t s32_low_lim = vsx_i32x4_splat(INT_MIN >> (32 - bit_depth));
318
319 if (is_signed)
320 {
321 const v128_t zero = vsx_f32x4_splat(0.0f);
322 v128_t bias = vsx_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
323 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
324 v128_t t = vsx_v128_load(sp);
325 t = vsx_f32x4_mul(t, mul);
327 u = ojph_vsx_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
328 u = ojph_vsx_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
329 if (NLT_TYPE3)
330 {
331 v128_t c = vsx_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value
332 v128_t neg = vsx_i32x4_sub(bias, u); // -bias -value
333 neg = vsx_v128_and(c, neg); // keep only - bias - value
334 u = vsx_v128_andnot(u, c); // keep only +ve or 0
335 u = vsx_v128_or(neg, u); // combine
336 }
337 vsx_v128_store(dp, u);
338 }
339 }
340 else
341 {
342 v128_t ihalf = vsx_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
343 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
344 v128_t t = vsx_v128_load(sp);
345 t = vsx_f32x4_mul(t, mul);
347 u = ojph_vsx_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
348 u = ojph_vsx_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
349 u = vsx_i32x4_add(u, ihalf);
350 vsx_v128_store(dp, u);
351 }
352 }
353 }
354
357 line_buf *dst_line, ui32 dst_line_offset,
358 ui32 bit_depth, bool is_signed, ui32 width)
359 {
360 local_vsx_irv_convert_to_integer<false>(src_line, dst_line,
361 dst_line_offset, bit_depth, is_signed, width);
362 }
363
366 line_buf *dst_line, ui32 dst_line_offset,
367 ui32 bit_depth, bool is_signed, ui32 width)
368 {
369 local_vsx_irv_convert_to_integer<true>(src_line, dst_line,
370 dst_line_offset, bit_depth, is_signed, width);
371 }
372
374 template <bool NLT_TYPE3>
375 static inline
377 ui32 src_line_offset, line_buf *dst_line,
378 ui32 bit_depth, bool is_signed, ui32 width)
379 {
380 assert((src_line->flags & line_buf::LFT_32BIT) &&
381 (src_line->flags & line_buf::LFT_INTEGER) &&
382 (dst_line->flags & line_buf::LFT_32BIT) &&
383 (dst_line->flags & line_buf::LFT_INTEGER) == 0);
384
385 assert(bit_depth <= 32);
386 v128_t mul = vsx_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));
387
388 const si32* sp = src_line->i32 + src_line_offset;
389 float* dp = dst_line->f32;
390 if (is_signed)
391 {
392 v128_t zero = vsx_i32x4_splat(0);
393 v128_t bias = vsx_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
394 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
395 v128_t t = vsx_v128_load(sp);
396 if (NLT_TYPE3)
397 {
398 v128_t c = vsx_i32x4_lt(t, zero); // 0xFFFFFFFF for -ve value
399 v128_t neg = vsx_i32x4_sub(bias, t); // - bias - value
400 neg = vsx_v128_and(c, neg); // keep only - bias - value
401 c = vsx_v128_andnot(t, c); // keep only +ve or 0
402 t = vsx_v128_or(neg, c); // combine
403 }
405 v = vsx_f32x4_mul(v, mul);
406 vsx_v128_store(dp, v);
407 }
408 }
409 else
410 {
411 v128_t half = vsx_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
412 for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
413 v128_t t = vsx_v128_load(sp);
414 t = vsx_i32x4_sub(t, half);
416 v = vsx_f32x4_mul(v, mul);
417 vsx_v128_store(dp, v);
418 }
419 }
420 }
421
423 void vsx_irv_convert_to_float(const line_buf *src_line,
424 ui32 src_line_offset, line_buf *dst_line,
425 ui32 bit_depth, bool is_signed, ui32 width)
426 {
427 local_vsx_irv_convert_to_float<false>(src_line, src_line_offset,
428 dst_line, bit_depth, is_signed, width);
429 }
430
433 ui32 src_line_offset, line_buf *dst_line,
434 ui32 bit_depth, bool is_signed, ui32 width)
435 {
436 local_vsx_irv_convert_to_float<true>(src_line, src_line_offset,
437 dst_line, bit_depth, is_signed, width);
438 }
439
442 const line_buf *g,
443 const line_buf *b,
444 line_buf *y, line_buf *cb, line_buf *cr,
445 ui32 repeat)
446 {
447 assert((y->flags & line_buf::LFT_INTEGER) &&
453
454 if (y->flags & line_buf::LFT_32BIT)
455 {
456 assert((y->flags & line_buf::LFT_32BIT) &&
457 (cb->flags & line_buf::LFT_32BIT) &&
458 (cr->flags & line_buf::LFT_32BIT) &&
459 (r->flags & line_buf::LFT_32BIT) &&
460 (g->flags & line_buf::LFT_32BIT) &&
462 const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
463 si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
464
465 for (int i = (repeat + 3) >> 2; i > 0; --i)
466 {
467 v128_t mr = vsx_v128_load(rp);
468 v128_t mg = vsx_v128_load(gp);
469 v128_t mb = vsx_v128_load(bp);
470 v128_t t = vsx_i32x4_add(mr, mb);
471 t = vsx_i32x4_add(t, vsx_i32x4_shl(mg, 1));
472 vsx_v128_store(yp, vsx_i32x4_shr(t, 2));
473 t = vsx_i32x4_sub(mb, mg);
474 vsx_v128_store(cbp, t);
475 t = vsx_i32x4_sub(mr, mg);
476 vsx_v128_store(crp, t);
477
478 rp += 4; gp += 4; bp += 4;
479 yp += 4; cbp += 4; crp += 4;
480 }
481 }
482 else
483 {
484 assert((y->flags & line_buf::LFT_64BIT) &&
485 (cb->flags & line_buf::LFT_64BIT) &&
486 (cr->flags & line_buf::LFT_64BIT) &&
487 (r->flags & line_buf::LFT_32BIT) &&
488 (g->flags & line_buf::LFT_32BIT) &&
490 const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
491 si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
492 for (int i = (repeat + 3) >> 2; i > 0; --i)
493 {
494 v128_t mr32 = vsx_v128_load(rp);
495 v128_t mg32 = vsx_v128_load(gp);
496 v128_t mb32 = vsx_v128_load(bp);
497 v128_t mr, mg, mb, t;
501
502 t = vsx_i64x2_add(mr, mb);
503 t = vsx_i64x2_add(t, vsx_i64x2_shl(mg, 1));
504 vsx_v128_store(yp, vsx_i64x2_shr(t, 2));
505 t = vsx_i64x2_sub(mb, mg);
506 vsx_v128_store(cbp, t);
507 t = vsx_i64x2_sub(mr, mg);
508 vsx_v128_store(crp, t);
509
510 yp += 2; cbp += 2; crp += 2;
511
515
516 t = vsx_i64x2_add(mr, mb);
517 t = vsx_i64x2_add(t, vsx_i64x2_shl(mg, 1));
518 vsx_v128_store(yp, vsx_i64x2_shr(t, 2));
519 t = vsx_i64x2_sub(mb, mg);
520 vsx_v128_store(cbp, t);
521 t = vsx_i64x2_sub(mr, mg);
522 vsx_v128_store(crp, t);
523
524 rp += 4; gp += 4; bp += 4;
525 yp += 2; cbp += 2; crp += 2;
526 }
527 }
528 }
529
532 const line_buf *cb,
533 const line_buf *cr,
534 line_buf *r, line_buf *g, line_buf *b,
535 ui32 repeat)
536 {
537 assert((y->flags & line_buf::LFT_INTEGER) &&
543
544 if (y->flags & line_buf::LFT_32BIT)
545 {
546 assert((y->flags & line_buf::LFT_32BIT) &&
547 (cb->flags & line_buf::LFT_32BIT) &&
548 (cr->flags & line_buf::LFT_32BIT) &&
549 (r->flags & line_buf::LFT_32BIT) &&
550 (g->flags & line_buf::LFT_32BIT) &&
552 const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
553 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
554 for (int i = (repeat + 3) >> 2; i > 0; --i)
555 {
556 v128_t my = vsx_v128_load(yp);
557 v128_t mcb = vsx_v128_load(cbp);
558 v128_t mcr = vsx_v128_load(crp);
559
560 v128_t t = vsx_i32x4_add(mcb, mcr);
561 t = vsx_i32x4_sub(my, vsx_i32x4_shr(t, 2));
562 vsx_v128_store(gp, t);
563 v128_t u = vsx_i32x4_add(mcb, t);
564 vsx_v128_store(bp, u);
565 u = vsx_i32x4_add(mcr, t);
566 vsx_v128_store(rp, u);
567
568 yp += 4; cbp += 4; crp += 4;
569 rp += 4; gp += 4; bp += 4;
570 }
571 }
572 else
573 {
574 assert((y->flags & line_buf::LFT_64BIT) &&
575 (cb->flags & line_buf::LFT_64BIT) &&
576 (cr->flags & line_buf::LFT_64BIT) &&
577 (r->flags & line_buf::LFT_32BIT) &&
578 (g->flags & line_buf::LFT_32BIT) &&
580 const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
581 si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
582 for (int i = (repeat + 3) >> 2; i > 0; --i)
583 {
584 v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
585 my = vsx_v128_load(yp);
586 mcb = vsx_v128_load(cbp);
587 mcr = vsx_v128_load(crp);
588
589 tg0 = vsx_i64x2_add(mcb, mcr);
590 tg0 = vsx_i64x2_sub(my, vsx_i64x2_shr(tg0, 2));
591 tb0 = vsx_i64x2_add(mcb, tg0);
592 tr0 = vsx_i64x2_add(mcr, tg0);
593
594 yp += 2; cbp += 2; crp += 2;
595
596 my = vsx_v128_load(yp);
597 mcb = vsx_v128_load(cbp);
598 mcr = vsx_v128_load(crp);
599
600 tg1 = vsx_i64x2_add(mcb, mcr);
601 tg1 = vsx_i64x2_sub(my, vsx_i64x2_shr(tg1, 2));
602 tb1 = vsx_i64x2_add(mcb, tg1);
603 tr1 = vsx_i64x2_add(mcr, tg1);
604
605 tr0 = vsx_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
606 tg0 = vsx_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
607 tb0 = vsx_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);
608
609 vsx_v128_store(rp, tr0);
610 vsx_v128_store(gp, tg0);
611 vsx_v128_store(bp, tb0);
612
613 yp += 2; cbp += 2; crp += 2;
614 rp += 4; gp += 4; bp += 4;
615 }
616 }
617 }
618
620 void vsx_ict_forward(const float *r, const float *g, const float *b,
621 float *y, float *cb, float *cr, ui32 repeat)
622 {
628 for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
629 {
630 v128_t mr = vsx_v128_load(r);
631 v128_t mb = vsx_v128_load(b);
632 v128_t my = vsx_f32x4_mul(alpha_rf, mr);
633 my = vsx_f32x4_add(my, vsx_f32x4_mul(alpha_gf, vsx_v128_load(g)));
634 my = vsx_f32x4_add(my, vsx_f32x4_mul(alpha_bf, mb));
635 vsx_v128_store(y, my);
636 vsx_v128_store(cb, vsx_f32x4_mul(beta_cbf, vsx_f32x4_sub(mb, my)));
637 vsx_v128_store(cr, vsx_f32x4_mul(beta_crf, vsx_f32x4_sub(mr, my)));
638
639 r += 4; g += 4; b += 4;
640 y += 4; cb += 4; cr += 4;
641 }
642 }
643
645 void vsx_ict_backward(const float *y, const float *cb, const float *cr,
646 float *r, float *g, float *b, ui32 repeat)
647 {
652 for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
653 {
654 v128_t my = vsx_v128_load(y);
655 v128_t mcr = vsx_v128_load(cr);
656 v128_t mcb = vsx_v128_load(cb);
657 v128_t mg = vsx_f32x4_sub(my, vsx_f32x4_mul(gamma_cr2g, mcr));
658 vsx_v128_store(g, vsx_f32x4_sub(mg, vsx_f32x4_mul(gamma_cb2g, mcb)));
659 vsx_v128_store(r, vsx_f32x4_add(my, vsx_f32x4_mul(gamma_cr2r, mcr)));
660 vsx_v128_store(b, vsx_f32x4_add(my, vsx_f32x4_mul(gamma_cb2b, mcb)));
661
662 y += 4; cb += 4; cr += 4;
663 r += 4; g += 4; b += 4;
664 }
665 }
666
667 }
668}
float * f32
Definition ojph_mem.h:187
static v128_t ojph_vsx_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
void vsx_irv_convert_to_float_nlt_type3(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
static void local_vsx_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void vsx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
void vsx_rev_convert_nlt_type3(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
void vsx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
void vsx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void vsx_rct_backward(const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
static v128_t ojph_convert_float_to_i32(v128_t a)
void vsx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void vsx_rev_convert(const line_buf *src_line, const ui32 src_line_offset, line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width)
static void local_vsx_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
void vsx_rct_forward(const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
void vsx_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
void vsx_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width)
static v128_t ojph_vsx_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
int64_t si64
Definition ojph_defs.h:57
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
__vector unsigned int vsx_v_u32
static v128_t vsx_i32x4_sub(v128_t a, v128_t b)
static v128_t vsx_f32x4_mul(v128_t a, v128_t b)
static v128_t vsx_f32x4_sub(v128_t a, v128_t b)
static v128_t vsx_i64x2_lt(v128_t a, v128_t b)
static v128_t vsx_f32x4_convert_i32x4(v128_t a)
static v128_t vsx_f32x4_add(v128_t a, v128_t b)
__vector float vsx_v_f32
static v128_t vsx_f32x4_ge(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_high_i32x4(v128_t a)
static v128_t vsx_f32x4_splat(float x)
static v128_t vsx_i32x4_shl(v128_t a, int n)
static v128_t vsx_i32x4_add(v128_t a, v128_t b)
static v128_t vsx_f32x4_lt(v128_t a, v128_t b)
static v128_t vsx_i64x2_extend_low_i32x4(v128_t a)
__vector unsigned char v128_t
static v128_t vsx_v128_andnot(v128_t a, v128_t b)
static void vsx_v128_store(void *p, v128_t a)
static v128_t vsx_v128_and(v128_t a, v128_t b)
static v128_t vsx_i64x2_shl(v128_t a, int n)
static v128_t vsx_v128_or(v128_t a, v128_t b)
static v128_t vsx_i32x4_lt(v128_t a, v128_t b)
#define vsx_i32x4_shuffle(a, b, c0, c1, c2, c3)
static v128_t vsx_i64x2_splat(long long x)
static v128_t vsx_i64x2_shr(v128_t a, int n)
static v128_t vsx_i64x2_add(v128_t a, v128_t b)
static v128_t vsx_i64x2_sub(v128_t a, v128_t b)
static v128_t vsx_i32x4_splat(int x)
static v128_t vsx_v128_load(const void *p)
static v128_t vsx_i32x4_shr(v128_t a, int n)
static v128_t vsx_i32x4_gt(v128_t a, v128_t b)
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF