PACS System 0.1.0
PACS DICOM system library
Loading...
Searching...
No Matches
simd_utils.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
15#ifndef PACS_ENCODING_SIMD_UTILS_HPP
16#define PACS_ENCODING_SIMD_UTILS_HPP
17
18#include "simd_config.h"
19#include "simd_types.h"
20
21#include <cstddef>
22#include <cstdint>
23#include <cstring>
24
26
// Forward declarations of the dispatching byte-swap entry points that are
// defined further down in this header.
//
// The last argument is a length in BYTES, not a number of words; it is named
// byte_count here to match the definitions.
void swap_bytes_16_simd(const uint8_t* src, uint8_t* dst,
                        size_t byte_count) noexcept;
void swap_bytes_32_simd(const uint8_t* src, uint8_t* dst,
                        size_t byte_count) noexcept;
void swap_bytes_64_simd(const uint8_t* src, uint8_t* dst,
                        size_t byte_count) noexcept;
34
35namespace detail {
36
37// Scalar fallback implementations
/**
 * @brief Portable fallback: exchange the two bytes of every 16-bit word.
 *
 * Only complete words are processed. If @p byte_count is odd, the final byte
 * is neither read nor written.
 *
 * The function may be called with @p src == @p dst (in-place swap): both
 * bytes of a word are read before either one is written.
 *
 * @param src        Input bytes.
 * @param dst        Output bytes (may be the same buffer as @p src).
 * @param byte_count Length of the buffers in bytes.
 */
inline void swap_bytes_16_scalar(const uint8_t* src, uint8_t* dst,
                                 size_t byte_count) noexcept {
    // Number of bytes that belong to complete 16-bit words.
    const size_t whole = byte_count - (byte_count % 2);

    for (size_t i = 0; i < whole; i += 2) {
        // Read first, write afterwards: otherwise an in-place call would
        // overwrite src[i] before it has been copied to dst[i + 1].
        const uint8_t first = src[i];
        const uint8_t second = src[i + 1];
        dst[i] = second;
        dst[i + 1] = first;
    }
}
45
/**
 * @brief Portable fallback: reverse the byte order of every 32-bit word.
 *
 * Only complete 4-byte words are processed. Up to three trailing bytes are
 * neither read nor written.
 *
 * The function may be called with @p src == @p dst (in-place swap): each
 * mirrored pair of bytes is read before it is written.
 *
 * @param src        Input bytes.
 * @param dst        Output bytes (may be the same buffer as @p src).
 * @param byte_count Length of the buffers in bytes.
 */
inline void swap_bytes_32_scalar(const uint8_t* src, uint8_t* dst,
                                 size_t byte_count) noexcept {
    // Number of bytes that belong to complete 32-bit words.
    const size_t whole = byte_count - (byte_count % 4);

    for (size_t i = 0; i < whole; i += 4) {
        // Exchange the mirrored pairs (0,3) and (1,2). Both bytes of a pair
        // are loaded before storing so that in-place operation is correct.
        for (size_t k = 0; k < 2; ++k) {
            const uint8_t front = src[i + k];
            const uint8_t back = src[i + 3 - k];
            dst[i + k] = back;
            dst[i + 3 - k] = front;
        }
    }
}
55
/**
 * @brief Portable fallback: reverse the byte order of every 64-bit word.
 *
 * Only complete 8-byte words are processed. Up to seven trailing bytes are
 * neither read nor written.
 *
 * The function may be called with @p src == @p dst (in-place swap): each
 * mirrored pair of bytes is read before it is written.
 *
 * @param src        Input bytes.
 * @param dst        Output bytes (may be the same buffer as @p src).
 * @param byte_count Length of the buffers in bytes.
 */
inline void swap_bytes_64_scalar(const uint8_t* src, uint8_t* dst,
                                 size_t byte_count) noexcept {
    // Number of bytes that belong to complete 64-bit words.
    const size_t whole = byte_count - (byte_count % 8);

    for (size_t i = 0; i < whole; i += 8) {
        // Exchange the mirrored pairs (0,7), (1,6), (2,5), (3,4). Both bytes
        // of a pair are loaded before storing so that in-place operation is
        // correct.
        for (size_t k = 0; k < 4; ++k) {
            const uint8_t front = src[i + k];
            const uint8_t back = src[i + 7 - k];
            dst[i + k] = back;
            dst[i + 7 - k] = front;
        }
    }
}
69
70#if defined(PACS_SIMD_SSSE3)
71
72// SSSE3 shuffle masks for byte swapping
73inline __m128i get_swap16_mask() noexcept {
74 // Swap adjacent bytes: [0,1,2,3,...] -> [1,0,3,2,...]
75 return _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
76}
77
78inline __m128i get_swap32_mask() noexcept {
79 // Reverse 4-byte groups: [0,1,2,3,...] -> [3,2,1,0,7,6,5,4,...]
80 return _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
81}
82
83inline __m128i get_swap64_mask() noexcept {
84 // Reverse 8-byte groups
85 return _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
86}
87
88inline void swap_bytes_16_ssse3(const uint8_t* src, uint8_t* dst,
89 size_t byte_count) noexcept {
90 const __m128i mask = get_swap16_mask();
91 const size_t simd_count = (byte_count / 16) * 16;
92
93 size_t i = 0;
94 for (; i < simd_count; i += 16) {
95 __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
96 v = _mm_shuffle_epi8(v, mask);
97 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
98 }
99
100 // Handle remainder
101 swap_bytes_16_scalar(src + i, dst + i, byte_count - i);
102}
103
104inline void swap_bytes_32_ssse3(const uint8_t* src, uint8_t* dst,
105 size_t byte_count) noexcept {
106 const __m128i mask = get_swap32_mask();
107 const size_t simd_count = (byte_count / 16) * 16;
108
109 size_t i = 0;
110 for (; i < simd_count; i += 16) {
111 __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
112 v = _mm_shuffle_epi8(v, mask);
113 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
114 }
115
116 // Handle remainder
117 swap_bytes_32_scalar(src + i, dst + i, byte_count - i);
118}
119
120inline void swap_bytes_64_ssse3(const uint8_t* src, uint8_t* dst,
121 size_t byte_count) noexcept {
122 const __m128i mask = get_swap64_mask();
123 const size_t simd_count = (byte_count / 16) * 16;
124
125 size_t i = 0;
126 for (; i < simd_count; i += 16) {
127 __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
128 v = _mm_shuffle_epi8(v, mask);
129 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
130 }
131
132 // Handle remainder
133 swap_bytes_64_scalar(src + i, dst + i, byte_count - i);
134}
135
136#endif // PACS_SIMD_SSSE3
137
138#if defined(PACS_SIMD_AVX2)
139
140inline __m256i get_swap16_mask_256() noexcept {
141 return _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
142 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
143}
144
145inline __m256i get_swap32_mask_256() noexcept {
146 return _mm256_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
147 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
148}
149
150inline __m256i get_swap64_mask_256() noexcept {
151 return _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
152 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
153}
154
155inline void swap_bytes_16_avx2(const uint8_t* src, uint8_t* dst,
156 size_t byte_count) noexcept {
157 const __m256i mask = get_swap16_mask_256();
158 const size_t simd_count = (byte_count / 32) * 32;
159
160 size_t i = 0;
161 for (; i < simd_count; i += 32) {
162 __m256i v =
163 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
164 v = _mm256_shuffle_epi8(v, mask);
165 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
166 }
167
168 // Handle remainder with SSSE3 or scalar
169#if defined(PACS_SIMD_SSSE3)
170 swap_bytes_16_ssse3(src + i, dst + i, byte_count - i);
171#else
172 swap_bytes_16_scalar(src + i, dst + i, byte_count - i);
173#endif
174}
175
176inline void swap_bytes_32_avx2(const uint8_t* src, uint8_t* dst,
177 size_t byte_count) noexcept {
178 const __m256i mask = get_swap32_mask_256();
179 const size_t simd_count = (byte_count / 32) * 32;
180
181 size_t i = 0;
182 for (; i < simd_count; i += 32) {
183 __m256i v =
184 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
185 v = _mm256_shuffle_epi8(v, mask);
186 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
187 }
188
189#if defined(PACS_SIMD_SSSE3)
190 swap_bytes_32_ssse3(src + i, dst + i, byte_count - i);
191#else
192 swap_bytes_32_scalar(src + i, dst + i, byte_count - i);
193#endif
194}
195
196inline void swap_bytes_64_avx2(const uint8_t* src, uint8_t* dst,
197 size_t byte_count) noexcept {
198 const __m256i mask = get_swap64_mask_256();
199 const size_t simd_count = (byte_count / 32) * 32;
200
201 size_t i = 0;
202 for (; i < simd_count; i += 32) {
203 __m256i v =
204 _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
205 v = _mm256_shuffle_epi8(v, mask);
206 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
207 }
208
209#if defined(PACS_SIMD_SSSE3)
210 swap_bytes_64_ssse3(src + i, dst + i, byte_count - i);
211#else
212 swap_bytes_64_scalar(src + i, dst + i, byte_count - i);
213#endif
214}
215
216#endif // PACS_SIMD_AVX2
217
218#if defined(PACS_SIMD_NEON)
219
220inline void swap_bytes_16_neon(const uint8_t* src, uint8_t* dst,
221 size_t byte_count) noexcept {
222 const size_t simd_count = (byte_count / 16) * 16;
223
224 size_t i = 0;
225 for (; i < simd_count; i += 16) {
226 uint8x16_t v = vld1q_u8(src + i);
227 // vrev16q_u8 reverses bytes within 16-bit elements
228 v = vrev16q_u8(v);
229 vst1q_u8(dst + i, v);
230 }
231
232 // Handle remainder
233 swap_bytes_16_scalar(src + i, dst + i, byte_count - i);
234}
235
236inline void swap_bytes_32_neon(const uint8_t* src, uint8_t* dst,
237 size_t byte_count) noexcept {
238 const size_t simd_count = (byte_count / 16) * 16;
239
240 size_t i = 0;
241 for (; i < simd_count; i += 16) {
242 uint8x16_t v = vld1q_u8(src + i);
243 // vrev32q_u8 reverses bytes within 32-bit elements
244 v = vrev32q_u8(v);
245 vst1q_u8(dst + i, v);
246 }
247
248 // Handle remainder
249 swap_bytes_32_scalar(src + i, dst + i, byte_count - i);
250}
251
252inline void swap_bytes_64_neon(const uint8_t* src, uint8_t* dst,
253 size_t byte_count) noexcept {
254 const size_t simd_count = (byte_count / 16) * 16;
255
256 size_t i = 0;
257 for (; i < simd_count; i += 16) {
258 uint8x16_t v = vld1q_u8(src + i);
259 // vrev64q_u8 reverses bytes within 64-bit elements
260 v = vrev64q_u8(v);
261 vst1q_u8(dst + i, v);
262 }
263
264 // Handle remainder
265 swap_bytes_64_scalar(src + i, dst + i, byte_count - i);
266}
267
268#endif // PACS_SIMD_NEON
269
270} // namespace detail
271
278inline void swap_bytes_16_simd(const uint8_t* src, uint8_t* dst,
279 size_t byte_count) noexcept {
280 if (byte_count < 2) {
281 return;
282 }
283
284#if defined(PACS_SIMD_AVX2)
285 if (has_avx2()) {
286 detail::swap_bytes_16_avx2(src, dst, byte_count);
287 return;
288 }
289#endif
290
291#if defined(PACS_SIMD_SSSE3)
292 if (has_ssse3()) {
293 detail::swap_bytes_16_ssse3(src, dst, byte_count);
294 return;
295 }
296#endif
297
298#if defined(PACS_SIMD_NEON)
299 detail::swap_bytes_16_neon(src, dst, byte_count);
300 return;
301#endif
302
303 detail::swap_bytes_16_scalar(src, dst, byte_count);
304}
305
312inline void swap_bytes_32_simd(const uint8_t* src, uint8_t* dst,
313 size_t byte_count) noexcept {
314 if (byte_count < 4) {
315 return;
316 }
317
318#if defined(PACS_SIMD_AVX2)
319 if (has_avx2()) {
320 detail::swap_bytes_32_avx2(src, dst, byte_count);
321 return;
322 }
323#endif
324
325#if defined(PACS_SIMD_SSSE3)
326 if (has_ssse3()) {
327 detail::swap_bytes_32_ssse3(src, dst, byte_count);
328 return;
329 }
330#endif
331
332#if defined(PACS_SIMD_NEON)
333 detail::swap_bytes_32_neon(src, dst, byte_count);
334 return;
335#endif
336
337 detail::swap_bytes_32_scalar(src, dst, byte_count);
338}
339
346inline void swap_bytes_64_simd(const uint8_t* src, uint8_t* dst,
347 size_t byte_count) noexcept {
348 if (byte_count < 8) {
349 return;
350 }
351
352#if defined(PACS_SIMD_AVX2)
353 if (has_avx2()) {
354 detail::swap_bytes_64_avx2(src, dst, byte_count);
355 return;
356 }
357#endif
358
359#if defined(PACS_SIMD_SSSE3)
360 if (has_ssse3()) {
361 detail::swap_bytes_64_ssse3(src, dst, byte_count);
362 return;
363 }
364#endif
365
366#if defined(PACS_SIMD_NEON)
367 detail::swap_bytes_64_neon(src, dst, byte_count);
368 return;
369#endif
370
371 detail::swap_bytes_64_scalar(src, dst, byte_count);
372}
373
374} // namespace kcenon::pacs::encoding::simd
375
376#endif // PACS_ENCODING_SIMD_UTILS_HPP
void swap_bytes_64_scalar(const uint8_t *src, uint8_t *dst, size_t byte_count) noexcept
Definition simd_utils.h:56
void swap_bytes_32_scalar(const uint8_t *src, uint8_t *dst, size_t byte_count) noexcept
Definition simd_utils.h:46
void swap_bytes_16_scalar(const uint8_t *src, uint8_t *dst, size_t byte_count) noexcept
Definition simd_utils.h:38
void swap_bytes_32_simd(const uint8_t *src, uint8_t *dst, size_t count) noexcept
Swap bytes in 32-bit words using best available SIMD.
Definition simd_utils.h:312
bool has_avx2() noexcept
Check if AVX2 is available.
bool has_ssse3() noexcept
Check if SSSE3 is available.
void swap_bytes_64_simd(const uint8_t *src, uint8_t *dst, size_t count) noexcept
Swap bytes in 64-bit words using best available SIMD.
Definition simd_utils.h:346
void swap_bytes_16_simd(const uint8_t *src, uint8_t *dst, size_t count) noexcept
Swap bytes in 16-bit words using best available SIMD.
Definition simd_utils.h:278
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.