PACS System 0.1.0
PACS DICOM system library
Loading...
Searching...
No Matches
simd_rle.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
21#ifndef PACS_ENCODING_SIMD_RLE_HPP
22#define PACS_ENCODING_SIMD_RLE_HPP
23
24#include "simd_config.h"
25#include "simd_types.h"
26
27#include <cstddef>
28#include <cstdint>
29#include <cstring>
30
32
33// Forward declarations
34void interleaved_to_planar_rgb8(const uint8_t* src, uint8_t* r, uint8_t* g,
35 uint8_t* b, size_t pixel_count) noexcept;
36void planar_to_interleaved_rgb8(const uint8_t* r, const uint8_t* g,
37 const uint8_t* b, uint8_t* dst,
38 size_t pixel_count) noexcept;
39void split_16bit_to_planes(const uint8_t* src, uint8_t* high, uint8_t* low,
40 size_t pixel_count) noexcept;
41void merge_planes_to_16bit(const uint8_t* high, const uint8_t* low,
42 uint8_t* dst, size_t pixel_count) noexcept;
43
44namespace detail {
45
46// ============================================================================
47// Scalar fallback implementations
48// ============================================================================
49
/**
 * @brief Portable fallback: split packed RGBRGB... bytes into three planes.
 *
 * @param src         Interleaved source, 3 * pixel_count bytes.
 * @param r,g,b       Destination planes, pixel_count bytes each.
 * @param pixel_count Number of RGB triplets to convert.
 */
inline void interleaved_to_planar_rgb8_scalar(const uint8_t* src, uint8_t* r,
                                              uint8_t* g, uint8_t* b,
                                              size_t pixel_count) noexcept {
  // Walk the interleaved buffer with a single cursor instead of indexing.
  const uint8_t* cursor = src;
  for (size_t px = 0; px < pixel_count; ++px) {
    r[px] = *cursor++;
    g[px] = *cursor++;
    b[px] = *cursor++;
  }
}
59
/**
 * @brief Portable fallback: merge three planes into packed RGBRGB... bytes.
 *
 * @param r,g,b       Source planes, pixel_count bytes each.
 * @param dst         Interleaved destination, 3 * pixel_count bytes.
 * @param pixel_count Number of RGB triplets to produce.
 */
inline void planar_to_interleaved_rgb8_scalar(const uint8_t* r, const uint8_t* g,
                                              const uint8_t* b, uint8_t* dst,
                                              size_t pixel_count) noexcept {
  // Emit triplets through a single output cursor.
  uint8_t* out = dst;
  for (size_t px = 0; px < pixel_count; ++px) {
    *out++ = r[px];
    *out++ = g[px];
    *out++ = b[px];
  }
}
69
/**
 * @brief Portable fallback: split little-endian 16-bit samples into byte planes.
 *
 * Byte 0 of each sample goes to @p low, byte 1 to @p high.
 *
 * @param src         2 * pixel_count source bytes (little-endian samples).
 * @param high,low    Destination planes, pixel_count bytes each.
 * @param pixel_count Number of 16-bit samples.
 */
inline void split_16bit_to_planes_scalar(const uint8_t* src, uint8_t* high,
                                         uint8_t* low,
                                         size_t pixel_count) noexcept {
  for (size_t px = 0; px < pixel_count; ++px) {
    const uint8_t* sample = src + px * 2;
    low[px] = sample[0];   // least-significant byte first in the stream
    high[px] = sample[1];
  }
}
78
/**
 * @brief Portable fallback: interleave byte planes back into 16-bit samples.
 *
 * Inverse of split_16bit_to_planes_scalar: low byte first (little-endian).
 *
 * @param high,low    Source planes, pixel_count bytes each.
 * @param dst         2 * pixel_count destination bytes.
 * @param pixel_count Number of 16-bit samples.
 */
inline void merge_planes_to_16bit_scalar(const uint8_t* high, const uint8_t* low,
                                         uint8_t* dst,
                                         size_t pixel_count) noexcept {
  uint8_t* out = dst;
  for (size_t px = 0; px < pixel_count; ++px) {
    *out++ = low[px];
    *out++ = high[px];
  }
}
87
88// ============================================================================
89// SSE2/SSSE3 implementations
90// ============================================================================
91
92#if defined(PACS_SIMD_SSSE3)
93
100inline void interleaved_to_planar_rgb8_ssse3(const uint8_t* src, uint8_t* r,
101 uint8_t* g, uint8_t* b,
102 size_t pixel_count) noexcept {
103 // Shuffle masks for deinterleaving RGB
104 // Input: R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
105 // Output: R0 R1 R2 R3 R4 R5 ... (and similar for G, B)
106
107 const __m128i shuffle_r0 =
108 _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
109 const __m128i shuffle_r1 =
110 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
111 const __m128i shuffle_r2 =
112 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
113
114 const __m128i shuffle_g0 =
115 _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
116 const __m128i shuffle_g1 =
117 _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
118 const __m128i shuffle_g2 =
119 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
120
121 const __m128i shuffle_b0 =
122 _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
123 const __m128i shuffle_b1 =
124 _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
125 const __m128i shuffle_b2 =
126 _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
127
128 const size_t simd_count = (pixel_count / 16) * 16;
129
130 size_t i = 0;
131 for (; i < simd_count; i += 16) {
132 // Load 48 bytes (16 RGB pixels)
133 __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3));
134 __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 16));
135 __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 32));
136
137 // Extract R channel
138 __m128i r_vec = _mm_or_si128(
139 _mm_or_si128(_mm_shuffle_epi8(v0, shuffle_r0),
140 _mm_shuffle_epi8(v1, shuffle_r1)),
141 _mm_shuffle_epi8(v2, shuffle_r2));
142
143 // Extract G channel
144 __m128i g_vec = _mm_or_si128(
145 _mm_or_si128(_mm_shuffle_epi8(v0, shuffle_g0),
146 _mm_shuffle_epi8(v1, shuffle_g1)),
147 _mm_shuffle_epi8(v2, shuffle_g2));
148
149 // Extract B channel
150 __m128i b_vec = _mm_or_si128(
151 _mm_or_si128(_mm_shuffle_epi8(v0, shuffle_b0),
152 _mm_shuffle_epi8(v1, shuffle_b1)),
153 _mm_shuffle_epi8(v2, shuffle_b2));
154
155 // Store results
156 _mm_storeu_si128(reinterpret_cast<__m128i*>(r + i), r_vec);
157 _mm_storeu_si128(reinterpret_cast<__m128i*>(g + i), g_vec);
158 _mm_storeu_si128(reinterpret_cast<__m128i*>(b + i), b_vec);
159 }
160
161 // Handle remainder
162 interleaved_to_planar_rgb8_scalar(src + i * 3, r + i, g + i, b + i,
163 pixel_count - i);
164}
165
172inline void planar_to_interleaved_rgb8_ssse3(const uint8_t* r, const uint8_t* g,
173 const uint8_t* b, uint8_t* dst,
174 size_t pixel_count) noexcept {
175 // Shuffle masks for interleaving
176 const __m128i shuffle_r =
177 _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
178 const __m128i shuffle_g =
179 _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
180 const __m128i shuffle_b =
181 _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
182
183 const __m128i shuffle_r2 =
184 _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
185 const __m128i shuffle_g2 =
186 _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
187 const __m128i shuffle_b2 =
188 _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
189
190 const __m128i shuffle_r3 =
191 _mm_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1);
192 const __m128i shuffle_g3 =
193 _mm_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1);
194 const __m128i shuffle_b3 =
195 _mm_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15);
196
197 const size_t simd_count = (pixel_count / 16) * 16;
198
199 size_t i = 0;
200 for (; i < simd_count; i += 16) {
201 // Load 16 bytes from each plane
202 __m128i r_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(r + i));
203 __m128i g_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(g + i));
204 __m128i b_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b + i));
205
206 // First 16 bytes of output (pixels 0-5, partial 6)
207 __m128i out0 = _mm_or_si128(
208 _mm_or_si128(_mm_shuffle_epi8(r_vec, shuffle_r),
209 _mm_shuffle_epi8(g_vec, shuffle_g)),
210 _mm_shuffle_epi8(b_vec, shuffle_b));
211
212 // Second 16 bytes of output (partial 5, pixels 6-10, partial 11)
213 __m128i out1 = _mm_or_si128(
214 _mm_or_si128(_mm_shuffle_epi8(r_vec, shuffle_r2),
215 _mm_shuffle_epi8(g_vec, shuffle_g2)),
216 _mm_shuffle_epi8(b_vec, shuffle_b2));
217
218 // Third 16 bytes of output (partial 10, pixels 11-15)
219 __m128i out2 = _mm_or_si128(
220 _mm_or_si128(_mm_shuffle_epi8(r_vec, shuffle_r3),
221 _mm_shuffle_epi8(g_vec, shuffle_g3)),
222 _mm_shuffle_epi8(b_vec, shuffle_b3));
223
224 // Store results
225 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i * 3), out0);
226 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i * 3 + 16), out1);
227 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i * 3 + 32), out2);
228 }
229
230 // Handle remainder
231 planar_to_interleaved_rgb8_scalar(r + i, g + i, b + i, dst + i * 3,
232 pixel_count - i);
233}
234
241inline void split_16bit_to_planes_ssse3(const uint8_t* src, uint8_t* high,
242 uint8_t* low,
243 size_t pixel_count) noexcept {
244 // Shuffle mask to extract low bytes (even positions)
245 const __m128i shuffle_low =
246 _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
247 // Shuffle mask to extract high bytes (odd positions)
248 const __m128i shuffle_high =
249 _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
250
251 const size_t simd_count = (pixel_count / 16) * 16;
252
253 size_t i = 0;
254 for (; i < simd_count; i += 16) {
255 // Load 32 bytes (16 x 16-bit pixels)
256 __m128i v0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 2));
257 __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 2 + 16));
258
259 // Extract low and high bytes
260 __m128i low0 = _mm_shuffle_epi8(v0, shuffle_low);
261 __m128i high0 = _mm_shuffle_epi8(v0, shuffle_high);
262 __m128i low1 = _mm_shuffle_epi8(v1, shuffle_low);
263 __m128i high1 = _mm_shuffle_epi8(v1, shuffle_high);
264
265 // Combine into single vectors
266 __m128i low_vec = _mm_or_si128(low0, _mm_slli_si128(low1, 8));
267 __m128i high_vec = _mm_or_si128(high0, _mm_slli_si128(high1, 8));
268
269 // Store results
270 _mm_storeu_si128(reinterpret_cast<__m128i*>(low + i), low_vec);
271 _mm_storeu_si128(reinterpret_cast<__m128i*>(high + i), high_vec);
272 }
273
274 // Handle remainder
275 split_16bit_to_planes_scalar(src + i * 2, high + i, low + i, pixel_count - i);
276}
277
284inline void merge_planes_to_16bit_ssse3(const uint8_t* high, const uint8_t* low,
285 uint8_t* dst,
286 size_t pixel_count) noexcept {
287 const size_t simd_count = (pixel_count / 16) * 16;
288
289 size_t i = 0;
290 for (; i < simd_count; i += 16) {
291 // Load 16 bytes from each plane
292 __m128i low_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(low + i));
293 __m128i high_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(high + i));
294
295 // Interleave low and high bytes
296 __m128i out0 = _mm_unpacklo_epi8(low_vec, high_vec);
297 __m128i out1 = _mm_unpackhi_epi8(low_vec, high_vec);
298
299 // Store results
300 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i * 2), out0);
301 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i * 2 + 16), out1);
302 }
303
304 // Handle remainder
305 merge_planes_to_16bit_scalar(high + i, low + i, dst + i * 2, pixel_count - i);
306}
307
308#endif // PACS_SIMD_SSSE3
309
310// ============================================================================
311// AVX2 implementations
312// ============================================================================
313
314#if defined(PACS_SIMD_AVX2)
315
321inline void interleaved_to_planar_rgb8_avx2(const uint8_t* src, uint8_t* r,
322 uint8_t* g, uint8_t* b,
323 size_t pixel_count) noexcept {
324 // AVX2 shuffle masks (same pattern in both 128-bit lanes)
325 const __m256i shuffle_r0 = _mm256_setr_epi8(
326 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
327 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
328 const __m256i shuffle_r1 = _mm256_setr_epi8(
329 -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1,
330 -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
331 const __m256i shuffle_r2 = _mm256_setr_epi8(
332 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13,
333 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
334
335 const __m256i shuffle_g0 = _mm256_setr_epi8(
336 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
337 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
338 const __m256i shuffle_g1 = _mm256_setr_epi8(
339 -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1,
340 -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
341 const __m256i shuffle_g2 = _mm256_setr_epi8(
342 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14,
343 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
344
345 const __m256i shuffle_b0 = _mm256_setr_epi8(
346 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
347 2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
348 const __m256i shuffle_b1 = _mm256_setr_epi8(
349 -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1,
350 -1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
351 const __m256i shuffle_b2 = _mm256_setr_epi8(
352 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15,
353 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
354
355 const size_t simd_count = (pixel_count / 32) * 32;
356
357 size_t i = 0;
358 for (; i < simd_count; i += 32) {
359 // Load 96 bytes (32 RGB pixels) as 6 x 128-bit loads
360 // Then combine into 256-bit vectors
361 __m128i v0_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3));
362 __m128i v1_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 16));
363 __m128i v2_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 32));
364 __m128i v0_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 48));
365 __m128i v1_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 64));
366 __m128i v2_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i * 3 + 80));
367
368 __m256i v0 = _mm256_set_m128i(v0_hi, v0_lo);
369 __m256i v1 = _mm256_set_m128i(v1_hi, v1_lo);
370 __m256i v2 = _mm256_set_m128i(v2_hi, v2_lo);
371
372 // Extract R channel
373 __m256i r_vec = _mm256_or_si256(
374 _mm256_or_si256(_mm256_shuffle_epi8(v0, shuffle_r0),
375 _mm256_shuffle_epi8(v1, shuffle_r1)),
376 _mm256_shuffle_epi8(v2, shuffle_r2));
377
378 // Extract G channel
379 __m256i g_vec = _mm256_or_si256(
380 _mm256_or_si256(_mm256_shuffle_epi8(v0, shuffle_g0),
381 _mm256_shuffle_epi8(v1, shuffle_g1)),
382 _mm256_shuffle_epi8(v2, shuffle_g2));
383
384 // Extract B channel
385 __m256i b_vec = _mm256_or_si256(
386 _mm256_or_si256(_mm256_shuffle_epi8(v0, shuffle_b0),
387 _mm256_shuffle_epi8(v1, shuffle_b1)),
388 _mm256_shuffle_epi8(v2, shuffle_b2));
389
390 // Permute to get correct order across lanes
391 r_vec = _mm256_permute4x64_epi64(r_vec, 0xD8); // 0, 2, 1, 3
392 g_vec = _mm256_permute4x64_epi64(g_vec, 0xD8);
393 b_vec = _mm256_permute4x64_epi64(b_vec, 0xD8);
394
395 // Store results
396 _mm256_storeu_si256(reinterpret_cast<__m256i*>(r + i), r_vec);
397 _mm256_storeu_si256(reinterpret_cast<__m256i*>(g + i), g_vec);
398 _mm256_storeu_si256(reinterpret_cast<__m256i*>(b + i), b_vec);
399 }
400
401 // Handle remainder with SSSE3 or scalar
402#if defined(PACS_SIMD_SSSE3)
403 interleaved_to_planar_rgb8_ssse3(src + i * 3, r + i, g + i, b + i,
404 pixel_count - i);
405#else
406 interleaved_to_planar_rgb8_scalar(src + i * 3, r + i, g + i, b + i,
407 pixel_count - i);
408#endif
409}
410
416inline void planar_to_interleaved_rgb8_avx2(const uint8_t* r, const uint8_t* g,
417 const uint8_t* b, uint8_t* dst,
418 size_t pixel_count) noexcept {
419 // Shuffle masks for interleaving (same pattern in both lanes)
420 const __m256i shuffle_r = _mm256_setr_epi8(
421 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5,
422 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
423 const __m256i shuffle_g = _mm256_setr_epi8(
424 -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1,
425 -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
426 const __m256i shuffle_b = _mm256_setr_epi8(
427 -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1,
428 -1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
429
430 const __m256i shuffle_r2 = _mm256_setr_epi8(
431 -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1,
432 -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
433 const __m256i shuffle_g2 = _mm256_setr_epi8(
434 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10,
435 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
436 const __m256i shuffle_b2 = _mm256_setr_epi8(
437 -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1,
438 -1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
439
440 const __m256i shuffle_r3 = _mm256_setr_epi8(
441 -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1,
442 -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1);
443 const __m256i shuffle_g3 = _mm256_setr_epi8(
444 -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1,
445 -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1);
446 const __m256i shuffle_b3 = _mm256_setr_epi8(
447 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15,
448 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15);
449
450 const size_t simd_count = (pixel_count / 32) * 32;
451
452 size_t i = 0;
453 for (; i < simd_count; i += 32) {
454 // Load 32 bytes from each plane
455 __m256i r_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(r + i));
456 __m256i g_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(g + i));
457 __m256i b_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(b + i));
458
459 // First 32 bytes of output
460 __m256i out0 = _mm256_or_si256(
461 _mm256_or_si256(_mm256_shuffle_epi8(r_vec, shuffle_r),
462 _mm256_shuffle_epi8(g_vec, shuffle_g)),
463 _mm256_shuffle_epi8(b_vec, shuffle_b));
464
465 // Second 32 bytes
466 __m256i out1 = _mm256_or_si256(
467 _mm256_or_si256(_mm256_shuffle_epi8(r_vec, shuffle_r2),
468 _mm256_shuffle_epi8(g_vec, shuffle_g2)),
469 _mm256_shuffle_epi8(b_vec, shuffle_b2));
470
471 // Third 32 bytes
472 __m256i out2 = _mm256_or_si256(
473 _mm256_or_si256(_mm256_shuffle_epi8(r_vec, shuffle_r3),
474 _mm256_shuffle_epi8(g_vec, shuffle_g3)),
475 _mm256_shuffle_epi8(b_vec, shuffle_b3));
476
477 // Permute to interleave results from both lanes
478 out0 = _mm256_permute4x64_epi64(out0, 0xD8);
479 out1 = _mm256_permute4x64_epi64(out1, 0xD8);
480 out2 = _mm256_permute4x64_epi64(out2, 0xD8);
481
482 // Store results
483 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i * 3), out0);
484 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i * 3 + 32), out1);
485 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i * 3 + 64), out2);
486 }
487
488 // Handle remainder
489#if defined(PACS_SIMD_SSSE3)
490 planar_to_interleaved_rgb8_ssse3(r + i, g + i, b + i, dst + i * 3,
491 pixel_count - i);
492#else
493 planar_to_interleaved_rgb8_scalar(r + i, g + i, b + i, dst + i * 3,
494 pixel_count - i);
495#endif
496}
497
503inline void split_16bit_to_planes_avx2(const uint8_t* src, uint8_t* high,
504 uint8_t* low,
505 size_t pixel_count) noexcept {
506 const __m256i shuffle_low = _mm256_setr_epi8(
507 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
508 0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1);
509 const __m256i shuffle_high = _mm256_setr_epi8(
510 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1,
511 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);
512
513 const size_t simd_count = (pixel_count / 32) * 32;
514
515 size_t i = 0;
516 for (; i < simd_count; i += 32) {
517 // Load 64 bytes (32 x 16-bit pixels)
518 __m256i v0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * 2));
519 __m256i v1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * 2 + 32));
520
521 // Extract low and high bytes
522 __m256i low0 = _mm256_shuffle_epi8(v0, shuffle_low);
523 __m256i high0 = _mm256_shuffle_epi8(v0, shuffle_high);
524 __m256i low1 = _mm256_shuffle_epi8(v1, shuffle_low);
525 __m256i high1 = _mm256_shuffle_epi8(v1, shuffle_high);
526
527 // Pack results from both lanes
528 low0 = _mm256_permute4x64_epi64(low0, 0xD8);
529 high0 = _mm256_permute4x64_epi64(high0, 0xD8);
530 low1 = _mm256_permute4x64_epi64(low1, 0xD8);
531 high1 = _mm256_permute4x64_epi64(high1, 0xD8);
532
533 // Combine into final vectors
534 __m256i low_vec = _mm256_permute2x128_si256(low0, low1, 0x20);
535 __m256i high_vec = _mm256_permute2x128_si256(high0, high1, 0x20);
536
537 // Store results
538 _mm256_storeu_si256(reinterpret_cast<__m256i*>(low + i), low_vec);
539 _mm256_storeu_si256(reinterpret_cast<__m256i*>(high + i), high_vec);
540 }
541
542 // Handle remainder
543#if defined(PACS_SIMD_SSSE3)
544 split_16bit_to_planes_ssse3(src + i * 2, high + i, low + i, pixel_count - i);
545#else
546 split_16bit_to_planes_scalar(src + i * 2, high + i, low + i, pixel_count - i);
547#endif
548}
549
555inline void merge_planes_to_16bit_avx2(const uint8_t* high, const uint8_t* low,
556 uint8_t* dst,
557 size_t pixel_count) noexcept {
558 const size_t simd_count = (pixel_count / 32) * 32;
559
560 size_t i = 0;
561 for (; i < simd_count; i += 32) {
562 // Load 32 bytes from each plane
563 __m256i low_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(low + i));
564 __m256i high_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(high + i));
565
566 // Interleave low and high bytes
567 __m256i out0 = _mm256_unpacklo_epi8(low_vec, high_vec);
568 __m256i out1 = _mm256_unpackhi_epi8(low_vec, high_vec);
569
570 // Permute to get correct order
571 out0 = _mm256_permute4x64_epi64(out0, 0xD8);
572 out1 = _mm256_permute4x64_epi64(out1, 0xD8);
573
574 // Store results
575 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i * 2), out0);
576 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i * 2 + 32), out1);
577 }
578
579 // Handle remainder
580#if defined(PACS_SIMD_SSSE3)
581 merge_planes_to_16bit_ssse3(high + i, low + i, dst + i * 2, pixel_count - i);
582#else
583 merge_planes_to_16bit_scalar(high + i, low + i, dst + i * 2, pixel_count - i);
584#endif
585}
586
587#endif // PACS_SIMD_AVX2
588
589// ============================================================================
590// ARM NEON implementations
591// ============================================================================
592
593#if defined(PACS_SIMD_NEON)
594
/**
 * @brief NEON deinterleave of packed RGB8 into planes, 16 pixels per pass.
 *
 * vld3q_u8 performs the 3-way deinterleave in a single instruction.
 */
inline void interleaved_to_planar_rgb8_neon(const uint8_t* src, uint8_t* r,
                                            uint8_t* g, uint8_t* b,
                                            size_t pixel_count) noexcept {
  const size_t vectorized = pixel_count & ~static_cast<size_t>(15);

  size_t done = 0;
  for (; done < vectorized; done += 16) {
    const uint8x16x3_t rgb = vld3q_u8(src + done * 3);  // deinterleaving load
    vst1q_u8(r + done, rgb.val[0]);
    vst1q_u8(g + done, rgb.val[1]);
    vst1q_u8(b + done, rgb.val[2]);
  }

  // Scalar tail.
  interleaved_to_planar_rgb8_scalar(src + done * 3, r + done, g + done, b + done,
                                    pixel_count - done);
}
621
/**
 * @brief NEON interleave of planar RGB8 into packed triplets, 16 pixels/pass.
 *
 * vst3q_u8 performs the 3-way interleave in a single instruction.
 */
inline void planar_to_interleaved_rgb8_neon(const uint8_t* r, const uint8_t* g,
                                            const uint8_t* b, uint8_t* dst,
                                            size_t pixel_count) noexcept {
  const size_t vectorized = pixel_count & ~static_cast<size_t>(15);

  size_t done = 0;
  for (; done < vectorized; done += 16) {
    uint8x16x3_t rgb;
    rgb.val[0] = vld1q_u8(r + done);
    rgb.val[1] = vld1q_u8(g + done);
    rgb.val[2] = vld1q_u8(b + done);
    vst3q_u8(dst + done * 3, rgb);  // interleaving store
  }

  // Scalar tail.
  planar_to_interleaved_rgb8_scalar(r + done, g + done, b + done, dst + done * 3,
                                    pixel_count - done);
}
649
/**
 * @brief NEON split of 16-bit little-endian samples into byte planes,
 *        16 samples per pass.
 *
 * vuzpq_u8 separates even-position (low) and odd-position (high) bytes of
 * the concatenated 32 input bytes.
 */
inline void split_16bit_to_planes_neon(const uint8_t* src, uint8_t* high,
                                       uint8_t* low,
                                       size_t pixel_count) noexcept {
  const size_t vectorized = pixel_count & ~static_cast<size_t>(15);

  size_t done = 0;
  for (; done < vectorized; done += 16) {
    const uint8x16_t words_a = vld1q_u8(src + done * 2);
    const uint8x16_t words_b = vld1q_u8(src + done * 2 + 16);

    // val[0] = even bytes (low), val[1] = odd bytes (high).
    const uint8x16x2_t planes = vuzpq_u8(words_a, words_b);
    vst1q_u8(low + done, planes.val[0]);
    vst1q_u8(high + done, planes.val[1]);
  }

  // Scalar tail.
  split_16bit_to_planes_scalar(src + done * 2, high + done, low + done,
                               pixel_count - done);
}
678
/**
 * @brief NEON merge of byte planes into 16-bit little-endian samples,
 *        16 samples per pass.
 *
 * vzipq_u8 interleaves the low-byte and high-byte vectors, yielding the
 * packed low/high pairs directly.
 */
inline void merge_planes_to_16bit_neon(const uint8_t* high, const uint8_t* low,
                                       uint8_t* dst,
                                       size_t pixel_count) noexcept {
  const size_t vectorized = pixel_count & ~static_cast<size_t>(15);

  size_t done = 0;
  for (; done < vectorized; done += 16) {
    const uint8x16_t lo16 = vld1q_u8(low + done);
    const uint8x16_t hi16 = vld1q_u8(high + done);

    // Low byte first: little-endian sample layout.
    const uint8x16x2_t zipped = vzipq_u8(lo16, hi16);
    vst1q_u8(dst + done * 2, zipped.val[0]);
    vst1q_u8(dst + done * 2 + 16, zipped.val[1]);
  }

  // Scalar tail.
  merge_planes_to_16bit_scalar(high + done, low + done, dst + done * 2,
                               pixel_count - done);
}
707
708#endif // PACS_SIMD_NEON
709
710} // namespace detail
711
712// ============================================================================
713// Public API - dispatches to best available implementation
714// ============================================================================
715
725inline void interleaved_to_planar_rgb8(const uint8_t* src, uint8_t* r,
726 uint8_t* g, uint8_t* b,
727 size_t pixel_count) noexcept {
728 if (pixel_count == 0) {
729 return;
730 }
731
732#if defined(PACS_SIMD_AVX2)
733 if (has_avx2()) {
734 detail::interleaved_to_planar_rgb8_avx2(src, r, g, b, pixel_count);
735 return;
736 }
737#endif
738
739#if defined(PACS_SIMD_SSSE3)
740 if (has_ssse3()) {
741 detail::interleaved_to_planar_rgb8_ssse3(src, r, g, b, pixel_count);
742 return;
743 }
744#endif
745
746#if defined(PACS_SIMD_NEON)
747 detail::interleaved_to_planar_rgb8_neon(src, r, g, b, pixel_count);
748 return;
749#endif
750
751 detail::interleaved_to_planar_rgb8_scalar(src, r, g, b, pixel_count);
752}
753
763inline void planar_to_interleaved_rgb8(const uint8_t* r, const uint8_t* g,
764 const uint8_t* b, uint8_t* dst,
765 size_t pixel_count) noexcept {
766 if (pixel_count == 0) {
767 return;
768 }
769
770#if defined(PACS_SIMD_AVX2)
771 if (has_avx2()) {
772 detail::planar_to_interleaved_rgb8_avx2(r, g, b, dst, pixel_count);
773 return;
774 }
775#endif
776
777#if defined(PACS_SIMD_SSSE3)
778 if (has_ssse3()) {
779 detail::planar_to_interleaved_rgb8_ssse3(r, g, b, dst, pixel_count);
780 return;
781 }
782#endif
783
784#if defined(PACS_SIMD_NEON)
785 detail::planar_to_interleaved_rgb8_neon(r, g, b, dst, pixel_count);
786 return;
787#endif
788
789 detail::planar_to_interleaved_rgb8_scalar(r, g, b, dst, pixel_count);
790}
791
800inline void split_16bit_to_planes(const uint8_t* src, uint8_t* high,
801 uint8_t* low,
802 size_t pixel_count) noexcept {
803 if (pixel_count == 0) {
804 return;
805 }
806
807#if defined(PACS_SIMD_AVX2)
808 if (has_avx2()) {
809 detail::split_16bit_to_planes_avx2(src, high, low, pixel_count);
810 return;
811 }
812#endif
813
814#if defined(PACS_SIMD_SSSE3)
815 if (has_ssse3()) {
816 detail::split_16bit_to_planes_ssse3(src, high, low, pixel_count);
817 return;
818 }
819#endif
820
821#if defined(PACS_SIMD_NEON)
822 detail::split_16bit_to_planes_neon(src, high, low, pixel_count);
823 return;
824#endif
825
826 detail::split_16bit_to_planes_scalar(src, high, low, pixel_count);
827}
828
837inline void merge_planes_to_16bit(const uint8_t* high, const uint8_t* low,
838 uint8_t* dst,
839 size_t pixel_count) noexcept {
840 if (pixel_count == 0) {
841 return;
842 }
843
844#if defined(PACS_SIMD_AVX2)
845 if (has_avx2()) {
846 detail::merge_planes_to_16bit_avx2(high, low, dst, pixel_count);
847 return;
848 }
849#endif
850
851#if defined(PACS_SIMD_SSSE3)
852 if (has_ssse3()) {
853 detail::merge_planes_to_16bit_ssse3(high, low, dst, pixel_count);
854 return;
855 }
856#endif
857
858#if defined(PACS_SIMD_NEON)
859 detail::merge_planes_to_16bit_neon(high, low, dst, pixel_count);
860 return;
861#endif
862
863 detail::merge_planes_to_16bit_scalar(high, low, dst, pixel_count);
864}
865
866} // namespace kcenon::pacs::encoding::simd
867
868#endif // PACS_ENCODING_SIMD_RLE_HPP
void split_16bit_to_planes_scalar(const uint8_t *src, uint8_t *high, uint8_t *low, size_t pixel_count) noexcept
Definition simd_rle.h:70
void interleaved_to_planar_rgb8_scalar(const uint8_t *src, uint8_t *r, uint8_t *g, uint8_t *b, size_t pixel_count) noexcept
Definition simd_rle.h:50
void merge_planes_to_16bit_scalar(const uint8_t *high, const uint8_t *low, uint8_t *dst, size_t pixel_count) noexcept
Definition simd_rle.h:79
void planar_to_interleaved_rgb8_scalar(const uint8_t *r, const uint8_t *g, const uint8_t *b, uint8_t *dst, size_t pixel_count) noexcept
Definition simd_rle.h:60
void merge_planes_to_16bit(const uint8_t *high, const uint8_t *low, uint8_t *dst, size_t pixel_count) noexcept
Merge high and low byte planes into 16-bit data.
Definition simd_rle.h:837
bool has_avx2() noexcept
Check if AVX2 is available.
void interleaved_to_planar_rgb8(const uint8_t *src, uint8_t *r, uint8_t *g, uint8_t *b, size_t pixel_count) noexcept
Convert interleaved RGB to planar format using best available SIMD.
Definition simd_rle.h:725
bool has_ssse3() noexcept
Check if SSSE3 is available.
void planar_to_interleaved_rgb8(const uint8_t *r, const uint8_t *g, const uint8_t *b, uint8_t *dst, size_t pixel_count) noexcept
Convert planar RGB to interleaved format using best available SIMD.
Definition simd_rle.h:763
void split_16bit_to_planes(const uint8_t *src, uint8_t *high, uint8_t *low, size_t pixel_count) noexcept
Split 16-bit data into high and low byte planes.
Definition simd_rle.h:800
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.