#ifndef PACS_ENCODING_SIMD_RLE_HPP
#define PACS_ENCODING_SIMD_RLE_HPP

#include <cstddef>
#include <cstdint>
// Scalar reference implementations.  These are the portable fallbacks and are
// also used by the SIMD kernels to process the remainder tail.

/// Deinterleave packed RGBRGBRGB... into three separate planes of
/// pixel_count bytes each.
void interleaved_to_planar_rgb8_scalar(const uint8_t* src, uint8_t* r,
                                       uint8_t* g, uint8_t* b,
                                       size_t pixel_count) noexcept;

/// Re-interleave three planes into packed RGB (writes pixel_count * 3 bytes).
void planar_to_interleaved_rgb8_scalar(const uint8_t* r, const uint8_t* g,
                                       const uint8_t* b, uint8_t* dst,
                                       size_t pixel_count) noexcept;

/// Split 16-bit samples into byte planes; byte 1 of each sample goes to
/// `high`, byte 0 to `low` (i.e. samples are treated as little-endian).
void split_16bit_to_planes_scalar(const uint8_t* src, uint8_t* high,
                                  uint8_t* low, size_t pixel_count) noexcept;

/// Inverse of split_16bit_to_planes_scalar: writes pixel_count * 2 bytes.
void merge_planes_to_16bit_scalar(const uint8_t* high, const uint8_t* low,
                                  uint8_t* dst, size_t pixel_count) noexcept;
/// @brief Scalar deinterleave of packed RGB into three planes.
/// @param src Interleaved RGBRGB... input, pixel_count * 3 bytes.
/// @param r,g,b Output planes, pixel_count bytes each.
/// @param pixel_count Number of pixels; zero is a no-op.
inline void interleaved_to_planar_rgb8_scalar(const uint8_t* src, uint8_t* r,
                                              uint8_t* g, uint8_t* b,
                                              size_t pixel_count) noexcept {
  for (size_t i = 0; i < pixel_count; ++i) {
    r[i] = src[i * 3];
    g[i] = src[i * 3 + 1];
    b[i] = src[i * 3 + 2];
  }
}
/// @brief Scalar re-interleave of three planes into packed RGB.
/// @param r,g,b Input planes, pixel_count bytes each.
/// @param dst Output buffer, pixel_count * 3 bytes.
/// @param pixel_count Number of pixels; zero is a no-op.
inline void planar_to_interleaved_rgb8_scalar(const uint8_t* r, const uint8_t* g,
                                              const uint8_t* b, uint8_t* dst,
                                              size_t pixel_count) noexcept {
  for (size_t i = 0; i < pixel_count; ++i) {
    dst[i * 3] = r[i];
    dst[i * 3 + 1] = g[i];
    dst[i * 3 + 2] = b[i];
  }
}
/// @brief Scalar split of 16-bit samples into high/low byte planes.
///        Byte 0 of each sample goes to `low`, byte 1 to `high`
///        (little-endian sample layout).
/// @param src Input, pixel_count * 2 bytes.
/// @param high,low Output planes, pixel_count bytes each.
inline void split_16bit_to_planes_scalar(const uint8_t* src, uint8_t* high,
                                         uint8_t* low,
                                         size_t pixel_count) noexcept {
  for (size_t i = 0; i < pixel_count; ++i) {
    low[i] = src[i * 2];
    high[i] = src[i * 2 + 1];
  }
}
/// @brief Scalar merge of high/low byte planes back into 16-bit samples
///        (inverse of split_16bit_to_planes_scalar).
/// @param high,low Input planes, pixel_count bytes each.
/// @param dst Output, pixel_count * 2 bytes; low byte first per sample.
inline void merge_planes_to_16bit_scalar(const uint8_t* high,
                                         const uint8_t* low, uint8_t* dst,
                                         size_t pixel_count) noexcept {
  for (size_t i = 0; i < pixel_count; ++i) {
    dst[i * 2] = low[i];
    dst[i * 2 + 1] = high[i];
  }
}
92#if defined(PACS_SIMD_SSSE3)
100inline void interleaved_to_planar_rgb8_ssse3(
const uint8_t* src, uint8_t* r,
101 uint8_t* g, uint8_t* b,
102 size_t pixel_count)
noexcept {
107 const __m128i shuffle_r0 =
108 _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
109 const __m128i shuffle_r1 =
110 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
111 const __m128i shuffle_r2 =
112 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
114 const __m128i shuffle_g0 =
115 _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
116 const __m128i shuffle_g1 =
117 _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
118 const __m128i shuffle_g2 =
119 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
121 const __m128i shuffle_b0 =
122 _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
123 const __m128i shuffle_b1 =
124 _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
125 const __m128i shuffle_b2 =
126 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
128 const size_t simd_count = (pixel_count / 16) * 16;
131 for (; i < simd_count; i += 16) {
133 __m128i v0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3));
134 __m128i v1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 16));
135 __m128i v2 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 32));
138 __m128i r_vec = _mm_or_si128(
139 _mm_or_si128(_mm_shuffle_epi8(v0, shuffle_r0),
140 _mm_shuffle_epi8(v1, shuffle_r1)),
141 _mm_shuffle_epi8(v2, shuffle_r2));
144 __m128i g_vec = _mm_or_si128(
145 _mm_or_si128(_mm_shuffle_epi8(v0, shuffle_g0),
146 _mm_shuffle_epi8(v1, shuffle_g1)),
147 _mm_shuffle_epi8(v2, shuffle_g2));
150 __m128i b_vec = _mm_or_si128(
151 _mm_or_si128(_mm_shuffle_epi8(v0, shuffle_b0),
152 _mm_shuffle_epi8(v1, shuffle_b1)),
153 _mm_shuffle_epi8(v2, shuffle_b2));
156 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(r + i), r_vec);
157 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(g + i), g_vec);
158 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(b + i), b_vec);
172inline void planar_to_interleaved_rgb8_ssse3(
const uint8_t* r,
const uint8_t* g,
173 const uint8_t* b, uint8_t* dst,
174 size_t pixel_count)
noexcept {
176 const __m128i shuffle_r =
177 _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
178 const __m128i shuffle_g =
179 _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
180 const __m128i shuffle_b =
181 _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
183 const __m128i shuffle_r2 =
184 _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
185 const __m128i shuffle_g2 =
186 _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
187 const __m128i shuffle_b2 =
188 _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
190 const __m128i shuffle_r3 =
191 _mm_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1);
192 const __m128i shuffle_g3 =
193 _mm_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1);
194 const __m128i shuffle_b3 =
195 _mm_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15);
197 const size_t simd_count = (pixel_count / 16) * 16;
200 for (; i < simd_count; i += 16) {
202 __m128i r_vec = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(r + i));
203 __m128i g_vec = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(g + i));
204 __m128i b_vec = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(b + i));
207 __m128i out0 = _mm_or_si128(
208 _mm_or_si128(_mm_shuffle_epi8(r_vec, shuffle_r),
209 _mm_shuffle_epi8(g_vec, shuffle_g)),
210 _mm_shuffle_epi8(b_vec, shuffle_b));
213 __m128i out1 = _mm_or_si128(
214 _mm_or_si128(_mm_shuffle_epi8(r_vec, shuffle_r2),
215 _mm_shuffle_epi8(g_vec, shuffle_g2)),
216 _mm_shuffle_epi8(b_vec, shuffle_b2));
219 __m128i out2 = _mm_or_si128(
220 _mm_or_si128(_mm_shuffle_epi8(r_vec, shuffle_r3),
221 _mm_shuffle_epi8(g_vec, shuffle_g3)),
222 _mm_shuffle_epi8(b_vec, shuffle_b3));
225 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i * 3), out0);
226 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i * 3 + 16), out1);
227 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i * 3 + 32), out2);
241inline void split_16bit_to_planes_ssse3(
const uint8_t* src, uint8_t* high,
243 size_t pixel_count)
noexcept {
245 const __m128i shuffle_low =
246 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
248 const __m128i shuffle_high =
249 _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
251 const size_t simd_count = (pixel_count / 16) * 16;
254 for (; i < simd_count; i += 16) {
256 __m128i v0 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 2));
257 __m128i v1 = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 2 + 16));
260 __m128i low0 = _mm_shuffle_epi8(v0, shuffle_low);
261 __m128i high0 = _mm_shuffle_epi8(v0, shuffle_high);
262 __m128i low1 = _mm_shuffle_epi8(v1, shuffle_low);
263 __m128i high1 = _mm_shuffle_epi8(v1, shuffle_high);
266 __m128i low_vec = _mm_or_si128(low0, _mm_slli_si128(low1, 8));
267 __m128i high_vec = _mm_or_si128(high0, _mm_slli_si128(high1, 8));
270 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(low + i), low_vec);
271 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(high + i), high_vec);
284inline void merge_planes_to_16bit_ssse3(
const uint8_t* high,
const uint8_t* low,
286 size_t pixel_count)
noexcept {
287 const size_t simd_count = (pixel_count / 16) * 16;
290 for (; i < simd_count; i += 16) {
292 __m128i low_vec = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(low + i));
293 __m128i high_vec = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(high + i));
296 __m128i out0 = _mm_unpacklo_epi8(low_vec, high_vec);
297 __m128i out1 = _mm_unpackhi_epi8(low_vec, high_vec);
300 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i * 2), out0);
301 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i * 2 + 16), out1);
314#if defined(PACS_SIMD_AVX2)
321inline void interleaved_to_planar_rgb8_avx2(
const uint8_t* src, uint8_t* r,
322 uint8_t* g, uint8_t* b,
323 size_t pixel_count)
noexcept {
325 const __m256i shuffle_r0 = _mm256_setr_epi8(
326 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
327 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
328 const __m256i shuffle_r1 = _mm256_setr_epi8(
329 -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1,
330 -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
331 const __m256i shuffle_r2 = _mm256_setr_epi8(
332 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13,
333 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
335 const __m256i shuffle_g0 = _mm256_setr_epi8(
336 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
337 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
338 const __m256i shuffle_g1 = _mm256_setr_epi8(
339 -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1,
340 -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
341 const __m256i shuffle_g2 = _mm256_setr_epi8(
342 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14,
343 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
345 const __m256i shuffle_b0 = _mm256_setr_epi8(
346 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
347 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
348 const __m256i shuffle_b1 = _mm256_setr_epi8(
349 -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1,
350 -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
351 const __m256i shuffle_b2 = _mm256_setr_epi8(
352 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15,
353 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
355 const size_t simd_count = (pixel_count / 32) * 32;
358 for (; i < simd_count; i += 32) {
361 __m128i v0_lo = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3));
362 __m128i v1_lo = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 16));
363 __m128i v2_lo = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 32));
364 __m128i v0_hi = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 48));
365 __m128i v1_hi = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 64));
366 __m128i v2_hi = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i * 3 + 80));
368 __m256i v0 = _mm256_set_m128i(v0_hi, v0_lo);
369 __m256i v1 = _mm256_set_m128i(v1_hi, v1_lo);
370 __m256i v2 = _mm256_set_m128i(v2_hi, v2_lo);
373 __m256i r_vec = _mm256_or_si256(
374 _mm256_or_si256(_mm256_shuffle_epi8(v0, shuffle_r0),
375 _mm256_shuffle_epi8(v1, shuffle_r1)),
376 _mm256_shuffle_epi8(v2, shuffle_r2));
379 __m256i g_vec = _mm256_or_si256(
380 _mm256_or_si256(_mm256_shuffle_epi8(v0, shuffle_g0),
381 _mm256_shuffle_epi8(v1, shuffle_g1)),
382 _mm256_shuffle_epi8(v2, shuffle_g2));
385 __m256i b_vec = _mm256_or_si256(
386 _mm256_or_si256(_mm256_shuffle_epi8(v0, shuffle_b0),
387 _mm256_shuffle_epi8(v1, shuffle_b1)),
388 _mm256_shuffle_epi8(v2, shuffle_b2));
391 r_vec = _mm256_permute4x64_epi64(r_vec, 0xD8);
392 g_vec = _mm256_permute4x64_epi64(g_vec, 0xD8);
393 b_vec = _mm256_permute4x64_epi64(b_vec, 0xD8);
396 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(r + i), r_vec);
397 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(g + i), g_vec);
398 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(b + i), b_vec);
402#if defined(PACS_SIMD_SSSE3)
403 interleaved_to_planar_rgb8_ssse3(src + i * 3, r + i, g + i, b + i,
416inline void planar_to_interleaved_rgb8_avx2(
const uint8_t* r,
const uint8_t* g,
417 const uint8_t* b, uint8_t* dst,
418 size_t pixel_count)
noexcept {
420 const __m256i shuffle_r = _mm256_setr_epi8(
421 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5,
422 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
423 const __m256i shuffle_g = _mm256_setr_epi8(
424 -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1,
425 -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
426 const __m256i shuffle_b = _mm256_setr_epi8(
427 -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1,
428 -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
430 const __m256i shuffle_r2 = _mm256_setr_epi8(
431 -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1,
432 -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
433 const __m256i shuffle_g2 = _mm256_setr_epi8(
434 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10,
435 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
436 const __m256i shuffle_b2 = _mm256_setr_epi8(
437 -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1,
438 -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
440 const __m256i shuffle_r3 = _mm256_setr_epi8(
441 -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1,
442 -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1);
443 const __m256i shuffle_g3 = _mm256_setr_epi8(
444 -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1,
445 -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1);
446 const __m256i shuffle_b3 = _mm256_setr_epi8(
447 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15,
448 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15);
450 const size_t simd_count = (pixel_count / 32) * 32;
453 for (; i < simd_count; i += 32) {
455 __m256i r_vec = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(r + i));
456 __m256i g_vec = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(g + i));
457 __m256i b_vec = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(b + i));
460 __m256i out0 = _mm256_or_si256(
461 _mm256_or_si256(_mm256_shuffle_epi8(r_vec, shuffle_r),
462 _mm256_shuffle_epi8(g_vec, shuffle_g)),
463 _mm256_shuffle_epi8(b_vec, shuffle_b));
466 __m256i out1 = _mm256_or_si256(
467 _mm256_or_si256(_mm256_shuffle_epi8(r_vec, shuffle_r2),
468 _mm256_shuffle_epi8(g_vec, shuffle_g2)),
469 _mm256_shuffle_epi8(b_vec, shuffle_b2));
472 __m256i out2 = _mm256_or_si256(
473 _mm256_or_si256(_mm256_shuffle_epi8(r_vec, shuffle_r3),
474 _mm256_shuffle_epi8(g_vec, shuffle_g3)),
475 _mm256_shuffle_epi8(b_vec, shuffle_b3));
478 out0 = _mm256_permute4x64_epi64(out0, 0xD8);
479 out1 = _mm256_permute4x64_epi64(out1, 0xD8);
480 out2 = _mm256_permute4x64_epi64(out2, 0xD8);
483 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i * 3), out0);
484 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i * 3 + 32), out1);
485 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i * 3 + 64), out2);
489#if defined(PACS_SIMD_SSSE3)
490 planar_to_interleaved_rgb8_ssse3(r + i, g + i, b + i, dst + i * 3,
503inline void split_16bit_to_planes_avx2(
const uint8_t* src, uint8_t* high,
505 size_t pixel_count)
noexcept {
506 const __m256i shuffle_low = _mm256_setr_epi8(
507 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
508 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
509 const __m256i shuffle_high = _mm256_setr_epi8(
510 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1,
511 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
513 const size_t simd_count = (pixel_count / 32) * 32;
516 for (; i < simd_count; i += 32) {
518 __m256i v0 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + i * 2));
519 __m256i v1 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + i * 2 + 32));
522 __m256i low0 = _mm256_shuffle_epi8(v0, shuffle_low);
523 __m256i high0 = _mm256_shuffle_epi8(v0, shuffle_high);
524 __m256i low1 = _mm256_shuffle_epi8(v1, shuffle_low);
525 __m256i high1 = _mm256_shuffle_epi8(v1, shuffle_high);
528 low0 = _mm256_permute4x64_epi64(low0, 0xD8);
529 high0 = _mm256_permute4x64_epi64(high0, 0xD8);
530 low1 = _mm256_permute4x64_epi64(low1, 0xD8);
531 high1 = _mm256_permute4x64_epi64(high1, 0xD8);
534 __m256i low_vec = _mm256_permute2x128_si256(low0, low1, 0x20);
535 __m256i high_vec = _mm256_permute2x128_si256(high0, high1, 0x20);
538 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(low + i), low_vec);
539 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(high + i), high_vec);
543#if defined(PACS_SIMD_SSSE3)
544 split_16bit_to_planes_ssse3(src + i * 2, high + i, low + i, pixel_count - i);
555inline void merge_planes_to_16bit_avx2(
const uint8_t* high,
const uint8_t* low,
557 size_t pixel_count)
noexcept {
558 const size_t simd_count = (pixel_count / 32) * 32;
561 for (; i < simd_count; i += 32) {
563 __m256i low_vec = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(low + i));
564 __m256i high_vec = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(high + i));
567 __m256i out0 = _mm256_unpacklo_epi8(low_vec, high_vec);
568 __m256i out1 = _mm256_unpackhi_epi8(low_vec, high_vec);
571 out0 = _mm256_permute4x64_epi64(out0, 0xD8);
572 out1 = _mm256_permute4x64_epi64(out1, 0xD8);
575 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i * 2), out0);
576 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i * 2 + 32), out1);
580#if defined(PACS_SIMD_SSSE3)
581 merge_planes_to_16bit_ssse3(high + i, low + i, dst + i * 2, pixel_count - i);
593#if defined(PACS_SIMD_NEON)
/// @brief NEON deinterleave of packed RGB into planes, 16 pixels per
///        iteration via the 3-way deinterleaving load; scalar tail.
/// @param src Interleaved RGBRGB... input, pixel_count * 3 bytes.
/// @param r,g,b Output planes, pixel_count bytes each.
inline void interleaved_to_planar_rgb8_neon(const uint8_t* src, uint8_t* r,
                                            uint8_t* g, uint8_t* b,
                                            size_t pixel_count) noexcept {
  const size_t simd_count = (pixel_count / 16) * 16;
  size_t i = 0;
  for (; i < simd_count; i += 16) {
    // vld3q_u8 deinterleaves 48 bytes into the three plane registers.
    uint8x16x3_t rgb = vld3q_u8(src + i * 3);
    vst1q_u8(r + i, rgb.val[0]);
    vst1q_u8(g + i, rgb.val[1]);
    vst1q_u8(b + i, rgb.val[2]);
  }

  // Scalar tail for the last pixel_count % 16 pixels.
  interleaved_to_planar_rgb8_scalar(src + i * 3, r + i, g + i, b + i,
                                    pixel_count - i);
}
/// @brief NEON re-interleave of planes into packed RGB, 16 pixels per
///        iteration via the 3-way interleaving store; scalar tail.
/// @param r,g,b Input planes, pixel_count bytes each.
/// @param dst Output buffer, pixel_count * 3 bytes.
inline void planar_to_interleaved_rgb8_neon(const uint8_t* r, const uint8_t* g,
                                            const uint8_t* b, uint8_t* dst,
                                            size_t pixel_count) noexcept {
  const size_t simd_count = (pixel_count / 16) * 16;
  size_t i = 0;
  for (; i < simd_count; i += 16) {
    uint8x16x3_t rgb;
    rgb.val[0] = vld1q_u8(r + i);
    rgb.val[1] = vld1q_u8(g + i);
    rgb.val[2] = vld1q_u8(b + i);
    // vst3q_u8 interleaves the three registers into RGBRGB... on store.
    vst3q_u8(dst + i * 3, rgb);
  }

  // Scalar tail for the last pixel_count % 16 pixels.
  planar_to_interleaved_rgb8_scalar(r + i, g + i, b + i, dst + i * 3,
                                    pixel_count - i);
}
/// @brief NEON split of 16-bit samples into high/low byte planes, 16 samples
///        (32 input bytes) per iteration; scalar tail.
/// @param src Input, pixel_count * 2 bytes (low byte first per sample).
/// @param high,low Output planes, pixel_count bytes each.
inline void split_16bit_to_planes_neon(const uint8_t* src, uint8_t* high,
                                       uint8_t* low,
                                       size_t pixel_count) noexcept {
  const size_t simd_count = (pixel_count / 16) * 16;
  size_t i = 0;
  for (; i < simd_count; i += 16) {
    uint8x16_t v0 = vld1q_u8(src + i * 2);
    uint8x16_t v1 = vld1q_u8(src + i * 2 + 16);
    // vuzpq separates even-indexed bytes (low plane) from odd-indexed
    // bytes (high plane) across the two registers.
    uint8x16x2_t deint0 = vuzpq_u8(v0, v1);
    vst1q_u8(low + i, deint0.val[0]);
    vst1q_u8(high + i, deint0.val[1]);
  }

  // Scalar tail for the last pixel_count % 16 samples.
  split_16bit_to_planes_scalar(src + i * 2, high + i, low + i, pixel_count - i);
}
/// @brief NEON merge of high/low byte planes into 16-bit samples, 16 samples
///        (32 output bytes) per iteration; scalar tail.
/// @param high,low Input planes, pixel_count bytes each.
/// @param dst Output, pixel_count * 2 bytes (low byte first per sample).
inline void merge_planes_to_16bit_neon(const uint8_t* high, const uint8_t* low,
                                       uint8_t* dst,
                                       size_t pixel_count) noexcept {
  const size_t simd_count = (pixel_count / 16) * 16;
  size_t i = 0;
  for (; i < simd_count; i += 16) {
    uint8x16_t low_vec = vld1q_u8(low + i);
    uint8x16_t high_vec = vld1q_u8(high + i);
    // vzipq interleaves the planes into (low, high) byte pairs.
    uint8x16x2_t interleaved = vzipq_u8(low_vec, high_vec);
    vst1q_u8(dst + i * 2, interleaved.val[0]);
    vst1q_u8(dst + i * 2 + 16, interleaved.val[1]);
  }

  // Scalar tail for the last pixel_count % 16 samples.
  merge_planes_to_16bit_scalar(high + i, low + i, dst + i * 2, pixel_count - i);
}
726 uint8_t* g, uint8_t* b,
727 size_t pixel_count)
noexcept {
728 if (pixel_count == 0) {
732#if defined(PACS_SIMD_AVX2)
734 detail::interleaved_to_planar_rgb8_avx2(src, r, g, b, pixel_count);
739#if defined(PACS_SIMD_SSSE3)
741 detail::interleaved_to_planar_rgb8_ssse3(src, r, g, b, pixel_count);
746#if defined(PACS_SIMD_NEON)
747 detail::interleaved_to_planar_rgb8_neon(src, r, g, b, pixel_count);
764 const uint8_t* b, uint8_t* dst,
765 size_t pixel_count)
noexcept {
766 if (pixel_count == 0) {
770#if defined(PACS_SIMD_AVX2)
772 detail::planar_to_interleaved_rgb8_avx2(r, g, b, dst, pixel_count);
777#if defined(PACS_SIMD_SSSE3)
779 detail::planar_to_interleaved_rgb8_ssse3(r, g, b, dst, pixel_count);
784#if defined(PACS_SIMD_NEON)
785 detail::planar_to_interleaved_rgb8_neon(r, g, b, dst, pixel_count);
802 size_t pixel_count)
noexcept {
803 if (pixel_count == 0) {
807#if defined(PACS_SIMD_AVX2)
809 detail::split_16bit_to_planes_avx2(src, high, low, pixel_count);
814#if defined(PACS_SIMD_SSSE3)
816 detail::split_16bit_to_planes_ssse3(src, high, low, pixel_count);
821#if defined(PACS_SIMD_NEON)
822 detail::split_16bit_to_planes_neon(src, high, low, pixel_count);
839 size_t pixel_count)
noexcept {
840 if (pixel_count == 0) {
844#if defined(PACS_SIMD_AVX2)
846 detail::merge_planes_to_16bit_avx2(high, low, dst, pixel_count);
851#if defined(PACS_SIMD_SSSE3)
853 detail::merge_planes_to_16bit_ssse3(high, low, dst, pixel_count);
858#if defined(PACS_SIMD_NEON)
859 detail::merge_planes_to_16bit_neon(high, low, dst, pixel_count);
@ rgb
Red, Green, Blue color model.
void split_16bit_to_planes_scalar(const uint8_t *src, uint8_t *high, uint8_t *low, size_t pixel_count) noexcept
void interleaved_to_planar_rgb8_scalar(const uint8_t *src, uint8_t *r, uint8_t *g, uint8_t *b, size_t pixel_count) noexcept
void merge_planes_to_16bit_scalar(const uint8_t *high, const uint8_t *low, uint8_t *dst, size_t pixel_count) noexcept
void planar_to_interleaved_rgb8_scalar(const uint8_t *r, const uint8_t *g, const uint8_t *b, uint8_t *dst, size_t pixel_count) noexcept
void merge_planes_to_16bit(const uint8_t *high, const uint8_t *low, uint8_t *dst, size_t pixel_count) noexcept
Merge high and low byte planes into 16-bit data.
bool has_avx2() noexcept
Check if AVX2 is available.
void interleaved_to_planar_rgb8(const uint8_t *src, uint8_t *r, uint8_t *g, uint8_t *b, size_t pixel_count) noexcept
Convert interleaved RGB to planar format using best available SIMD.
bool has_ssse3() noexcept
Check if SSSE3 is available.
void planar_to_interleaved_rgb8(const uint8_t *r, const uint8_t *g, const uint8_t *b, uint8_t *dst, size_t pixel_count) noexcept
Convert planar RGB to interleaved format using best available SIMD.
void split_16bit_to_planes(const uint8_t *src, uint8_t *high, uint8_t *low, size_t pixel_count) noexcept
Split 16-bit data into high and low byte planes.
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.