#ifndef PACS_ENCODING_SIMD_UTILS_HPP
#define PACS_ENCODING_SIMD_UTILS_HPP
/// @brief Swap the byte order of every 16-bit word in `src`, writing to `dst`.
/// @param src   source buffer (`dst` may equal `src` for in-place swapping)
/// @param dst   destination buffer, at least `count` bytes
/// @param count buffer size in bytes; a trailing partial word is left untouched
void swap_bytes_16_simd(const uint8_t* src, uint8_t* dst,
                        size_t count) noexcept;

/// @brief Swap the byte order of every 32-bit word using the best available SIMD.
void swap_bytes_32_simd(const uint8_t* src, uint8_t* dst,
                        size_t count) noexcept;

/// @brief Swap the byte order of every 64-bit word using the best available SIMD.
void swap_bytes_64_simd(const uint8_t* src, uint8_t* dst,
                        size_t count) noexcept;
/// @brief Scalar fallback: swap the two bytes of each complete 16-bit word.
///
/// Safe for in-place use (`dst == src`): each word is read into temporaries
/// before either byte is written. A trailing odd byte is left untouched,
/// matching the loop bound `i + 1 < byte_count`.
inline void swap_bytes_16_scalar(const uint8_t* src, uint8_t* dst,
                                 size_t byte_count) noexcept {
  for (size_t i = 0; i + 1 < byte_count; i += 2) {
    // Read both bytes first so aliasing src/dst cannot corrupt the word.
    const uint8_t b0 = src[i];
    const uint8_t b1 = src[i + 1];
    dst[i] = b1;
    dst[i + 1] = b0;
  }
}
/// @brief Scalar fallback: reverse the four bytes of each complete 32-bit word.
///
/// Safe for in-place use (`dst == src`): all four bytes are loaded into
/// temporaries before any store (direct dst[i+1]=src[i+2]; dst[i+2]=src[i+1]
/// assignments would clobber data when the buffers alias). Trailing bytes
/// that do not form a full word are left untouched.
inline void swap_bytes_32_scalar(const uint8_t* src, uint8_t* dst,
                                 size_t byte_count) noexcept {
  for (size_t i = 0; i + 3 < byte_count; i += 4) {
    const uint8_t b0 = src[i];
    const uint8_t b1 = src[i + 1];
    const uint8_t b2 = src[i + 2];
    const uint8_t b3 = src[i + 3];
    dst[i] = b3;
    dst[i + 1] = b2;
    dst[i + 2] = b1;
    dst[i + 3] = b0;
  }
}
/// @brief Scalar fallback: reverse the eight bytes of each complete 64-bit word.
///
/// Safe for in-place use (`dst == src`): all eight bytes are loaded into a
/// temporary array before any store. Trailing bytes that do not form a full
/// word are left untouched (loop bound `i + 7 < byte_count`).
inline void swap_bytes_64_scalar(const uint8_t* src, uint8_t* dst,
                                 size_t byte_count) noexcept {
  for (size_t i = 0; i + 7 < byte_count; i += 8) {
    uint8_t tmp[8];
    for (size_t k = 0; k < 8; ++k) {
      tmp[k] = src[i + k];
    }
    for (size_t k = 0; k < 8; ++k) {
      dst[i + k] = tmp[7 - k];
    }
  }
}
70#if defined(PACS_SIMD_SSSE3)
73inline __m128i get_swap16_mask() noexcept {
75 return _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
78inline __m128i get_swap32_mask() noexcept {
80 return _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
83inline __m128i get_swap64_mask() noexcept {
85 return _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
88inline void swap_bytes_16_ssse3(
const uint8_t* src, uint8_t* dst,
89 size_t byte_count)
noexcept {
90 const __m128i mask = get_swap16_mask();
91 const size_t simd_count = (byte_count / 16) * 16;
94 for (; i < simd_count; i += 16) {
95 __m128i v = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i));
96 v = _mm_shuffle_epi8(v, mask);
97 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i), v);
104inline void swap_bytes_32_ssse3(
const uint8_t* src, uint8_t* dst,
105 size_t byte_count)
noexcept {
106 const __m128i mask = get_swap32_mask();
107 const size_t simd_count = (byte_count / 16) * 16;
110 for (; i < simd_count; i += 16) {
111 __m128i v = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i));
112 v = _mm_shuffle_epi8(v, mask);
113 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i), v);
120inline void swap_bytes_64_ssse3(
const uint8_t* src, uint8_t* dst,
121 size_t byte_count)
noexcept {
122 const __m128i mask = get_swap64_mask();
123 const size_t simd_count = (byte_count / 16) * 16;
126 for (; i < simd_count; i += 16) {
127 __m128i v = _mm_loadu_si128(
reinterpret_cast<const __m128i*
>(src + i));
128 v = _mm_shuffle_epi8(v, mask);
129 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(dst + i), v);
138#if defined(PACS_SIMD_AVX2)
140inline __m256i get_swap16_mask_256() noexcept {
141 return _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
142 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
145inline __m256i get_swap32_mask_256() noexcept {
146 return _mm256_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
147 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
150inline __m256i get_swap64_mask_256() noexcept {
151 return _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
152 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
155inline void swap_bytes_16_avx2(
const uint8_t* src, uint8_t* dst,
156 size_t byte_count)
noexcept {
157 const __m256i mask = get_swap16_mask_256();
158 const size_t simd_count = (byte_count / 32) * 32;
161 for (; i < simd_count; i += 32) {
163 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + i));
164 v = _mm256_shuffle_epi8(v, mask);
165 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i), v);
169#if defined(PACS_SIMD_SSSE3)
170 swap_bytes_16_ssse3(src + i, dst + i, byte_count - i);
176inline void swap_bytes_32_avx2(
const uint8_t* src, uint8_t* dst,
177 size_t byte_count)
noexcept {
178 const __m256i mask = get_swap32_mask_256();
179 const size_t simd_count = (byte_count / 32) * 32;
182 for (; i < simd_count; i += 32) {
184 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + i));
185 v = _mm256_shuffle_epi8(v, mask);
186 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i), v);
189#if defined(PACS_SIMD_SSSE3)
190 swap_bytes_32_ssse3(src + i, dst + i, byte_count - i);
196inline void swap_bytes_64_avx2(
const uint8_t* src, uint8_t* dst,
197 size_t byte_count)
noexcept {
198 const __m256i mask = get_swap64_mask_256();
199 const size_t simd_count = (byte_count / 32) * 32;
202 for (; i < simd_count; i += 32) {
204 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + i));
205 v = _mm256_shuffle_epi8(v, mask);
206 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + i), v);
209#if defined(PACS_SIMD_SSSE3)
210 swap_bytes_64_ssse3(src + i, dst + i, byte_count - i);
218#if defined(PACS_SIMD_NEON)
220inline void swap_bytes_16_neon(
const uint8_t* src, uint8_t* dst,
221 size_t byte_count)
noexcept {
222 const size_t simd_count = (byte_count / 16) * 16;
225 for (; i < simd_count; i += 16) {
226 uint8x16_t v = vld1q_u8(src + i);
229 vst1q_u8(dst + i, v);
236inline void swap_bytes_32_neon(
const uint8_t* src, uint8_t* dst,
237 size_t byte_count)
noexcept {
238 const size_t simd_count = (byte_count / 16) * 16;
241 for (; i < simd_count; i += 16) {
242 uint8x16_t v = vld1q_u8(src + i);
245 vst1q_u8(dst + i, v);
252inline void swap_bytes_64_neon(
const uint8_t* src, uint8_t* dst,
253 size_t byte_count)
noexcept {
254 const size_t simd_count = (byte_count / 16) * 16;
257 for (; i < simd_count; i += 16) {
258 uint8x16_t v = vld1q_u8(src + i);
261 vst1q_u8(dst + i, v);
279 size_t byte_count)
noexcept {
280 if (byte_count < 2) {
284#if defined(PACS_SIMD_AVX2)
286 detail::swap_bytes_16_avx2(src, dst, byte_count);
291#if defined(PACS_SIMD_SSSE3)
293 detail::swap_bytes_16_ssse3(src, dst, byte_count);
298#if defined(PACS_SIMD_NEON)
299 detail::swap_bytes_16_neon(src, dst, byte_count);
313 size_t byte_count)
noexcept {
314 if (byte_count < 4) {
318#if defined(PACS_SIMD_AVX2)
320 detail::swap_bytes_32_avx2(src, dst, byte_count);
325#if defined(PACS_SIMD_SSSE3)
327 detail::swap_bytes_32_ssse3(src, dst, byte_count);
332#if defined(PACS_SIMD_NEON)
333 detail::swap_bytes_32_neon(src, dst, byte_count);
347 size_t byte_count)
noexcept {
348 if (byte_count < 8) {
352#if defined(PACS_SIMD_AVX2)
354 detail::swap_bytes_64_avx2(src, dst, byte_count);
359#if defined(PACS_SIMD_SSSE3)
361 detail::swap_bytes_64_ssse3(src, dst, byte_count);
366#if defined(PACS_SIMD_NEON)
367 detail::swap_bytes_64_neon(src, dst, byte_count);
void swap_bytes_64_scalar(const uint8_t *src, uint8_t *dst, size_t byte_count) noexcept
void swap_bytes_32_scalar(const uint8_t *src, uint8_t *dst, size_t byte_count) noexcept
void swap_bytes_16_scalar(const uint8_t *src, uint8_t *dst, size_t byte_count) noexcept
void swap_bytes_32_simd(const uint8_t *src, uint8_t *dst, size_t count) noexcept
Swap bytes in 32-bit words using best available SIMD.
bool has_avx2() noexcept
Check if AVX2 is available.
bool has_ssse3() noexcept
Check if SSSE3 is available.
void swap_bytes_64_simd(const uint8_t *src, uint8_t *dst, size_t count) noexcept
Swap bytes in 64-bit words using best available SIMD.
void swap_bytes_16_simd(const uint8_t *src, uint8_t *dst, size_t count) noexcept
Swap bytes in 16-bit words using best available SIMD.
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.