Container System 0.1.0
High-performance C++20 type-safe container framework with SIMD-accelerated serialization
Loading...
Searching...
No Matches
simd_policies.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2024, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
26#pragma once
27
28#include <algorithm>
29#include <cstddef>
30#include <cstdint>
31#include <cstring>
32#include <limits>
33#include <numeric>
34#include <optional>
35#include <span>
36#include <string_view>
37#include <concepts>
38
39// Platform-specific SIMD headers
40#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
41 #define CONTAINER_HAS_X86_SIMD 1
42 #if defined(__AVX512F__) || defined(HAS_AVX512)
43 #ifndef CONTAINER_HAS_AVX512
44 #define CONTAINER_HAS_AVX512 1
45 #endif
46 #ifndef CONTAINER_HAS_AVX2
47 #define CONTAINER_HAS_AVX2 1
48 #endif
49 #include <immintrin.h>
50 #elif defined(__AVX2__) || defined(HAS_AVX2)
51 #ifndef CONTAINER_HAS_AVX2
52 #define CONTAINER_HAS_AVX2 1
53 #endif
54 #include <immintrin.h>
55 #elif defined(__SSE4_2__) || defined(HAS_SSE42)
56 #ifndef CONTAINER_HAS_SSE42
57 #define CONTAINER_HAS_SSE42 1
58 #endif
59 #include <nmmintrin.h>
60 #include <smmintrin.h>
61 #include <tmmintrin.h>
62 #elif defined(__SSE2__)
63 #ifndef CONTAINER_HAS_SSE2
64 #define CONTAINER_HAS_SSE2 1
65 #endif
66 #include <emmintrin.h>
67 #endif
68 #if defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
69 #include <xmmintrin.h>
70 #include <emmintrin.h>
71 #include <pmmintrin.h>
72 #endif
73#elif defined(__ARM_NEON) || defined(__aarch64__)
74 #define CONTAINER_HAS_ARM_NEON 1
75 #include <arm_neon.h>
76#endif
77
79
90template<typename T>
91concept SimdPolicy = requires(const T& policy, const float* data, size_t count) {
92 { T::name() } -> std::convertible_to<std::string_view>;
93 { T::simd_width } -> std::convertible_to<size_t>;
94 { policy.sum_floats(data, count) } -> std::same_as<float>;
95 { policy.min_float(data, count) } -> std::same_as<float>;
96 { policy.max_float(data, count) } -> std::same_as<float>;
97};
98
99// ============================================================================
100// Scalar Policy (Fallback)
101// ============================================================================
102
/**
 * @brief Scalar (non-SIMD) implementation of operations.
 *
 * Portable fallback that processes one element at a time (simd_width == 1).
 * All reductions walk the input front to back, so the summation order is the
 * natural left-to-right order.
 */
struct scalar_simd_policy {
    /// Human-readable policy identifier.
    static constexpr std::string_view name() noexcept { return "scalar"; }

    /// Number of floats handled per operation (no vectorisation here).
    static constexpr size_t simd_width = 1;

    /**
     * @brief Sum all floats in an array.
     * @param data  Pointer to the first element (not dereferenced when count is 0).
     * @param count Number of elements.
     * @return Left-to-right sum, 0.0f for an empty range.
     */
    [[nodiscard]] float sum_floats(const float* data, size_t count) const noexcept {
        return std::accumulate(data, data + count, 0.0f);
    }

    /**
     * @brief Find minimum float in an array.
     * @return The smallest element, or numeric_limits<float>::max() when count is 0.
     */
    [[nodiscard]] float min_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::max();
        return *std::min_element(data, data + count);
    }

    /**
     * @brief Find maximum float in an array.
     * @return The largest element, or numeric_limits<float>::lowest() when count is 0.
     */
    [[nodiscard]] float max_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::lowest();
        return *std::max_element(data, data + count);
    }

    /**
     * @brief Sum all doubles in an array.
     * @return Left-to-right sum, 0.0 for an empty range.
     */
    [[nodiscard]] double sum_doubles(const double* data, size_t count) const noexcept {
        return std::accumulate(data, data + count, 0.0);
    }
};
151
152// ============================================================================
153// SSE Policy (x86)
154// ============================================================================
155
156#if defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
/**
 * @brief SSE implementation of the SIMD policy (x86, 4 floats per 128-bit vector).
 *
 * Full groups of four floats are processed with unaligned vector loads; the
 * remaining 0-3 elements are folded in with scalar code. Only SSE/SSE2
 * instructions are used, matching the SSE2 gate under which this type is
 * compiled.
 *
 * Inputs containing NaN are not normalised: the result then follows the
 * lane-wise behaviour of the underlying min/max instructions.
 */
struct sse_simd_policy {
    /// Human-readable policy identifier.
    static constexpr std::string_view name() noexcept { return "sse"; }

    /// Number of floats handled per vector operation.
    static constexpr size_t simd_width = 4;

    /**
     * @brief Sum all floats in an array.
     * @param data  Pointer to the first element (not dereferenced when count is 0).
     * @param count Number of elements.
     * @return Sum of the elements, 0.0f for an empty range.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("sse2")))
#endif
    float sum_floats(const float* data, size_t count) const noexcept {
        __m128 sum_vec = _mm_setzero_ps();
        const size_t simd_end = count - (count % 4);

        for (size_t i = 0; i < simd_end; i += 4) {
            sum_vec = _mm_add_ps(sum_vec, _mm_loadu_ps(data + i));
        }

        // Horizontal sum without SSE3 hadd. Lanes are combined as
        // (a0 + a1) + (a2 + a3), the same order two hadd steps would use.
        const __m128 swapped = _mm_shuffle_ps(sum_vec, sum_vec, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128 pairs = _mm_add_ps(sum_vec, swapped);   // lane0 = a0+a1, lane2 = a2+a3
        const __m128 upper = _mm_movehl_ps(pairs, pairs);    // lane0 = a2+a3
        float sum = _mm_cvtss_f32(_mm_add_ss(pairs, upper));

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            sum += data[i];
        }

        return sum;
    }

    /**
     * @brief Find minimum float in an array.
     * @return The smallest element, or numeric_limits<float>::max() when count is 0.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("sse2")))
#endif
    float min_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::max();

        // Seed every lane with a real element rather than FLT_MAX so that an
        // input consisting only of +infinity yields +infinity, not FLT_MAX.
        __m128 min_vec = _mm_set1_ps(data[0]);
        const size_t simd_end = count - (count % 4);

        for (size_t i = 0; i < simd_end; i += 4) {
            min_vec = _mm_min_ps(min_vec, _mm_loadu_ps(data + i));
        }

        // Extract minimum from vector
        alignas(16) float lanes[4];
        _mm_store_ps(lanes, min_vec);
        float min_val = lanes[0];
        for (int i = 1; i < 4; ++i) {
            if (lanes[i] < min_val) min_val = lanes[i];
        }

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] < min_val) min_val = data[i];
        }

        return min_val;
    }

    /**
     * @brief Find maximum float in an array.
     * @return The largest element, or numeric_limits<float>::lowest() when count is 0.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("sse2")))
#endif
    float max_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::lowest();

        // Seed with a real element so an all -infinity input yields -infinity
        // instead of the finite lowest() sentinel.
        __m128 max_vec = _mm_set1_ps(data[0]);
        const size_t simd_end = count - (count % 4);

        for (size_t i = 0; i < simd_end; i += 4) {
            max_vec = _mm_max_ps(max_vec, _mm_loadu_ps(data + i));
        }

        // Extract maximum from vector
        alignas(16) float lanes[4];
        _mm_store_ps(lanes, max_vec);
        float max_val = lanes[0];
        for (int i = 1; i < 4; ++i) {
            if (lanes[i] > max_val) max_val = lanes[i];
        }

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] > max_val) max_val = data[i];
        }

        return max_val;
    }

    /**
     * @brief Sum all doubles in an array.
     *
     * Doubles are summed with plain scalar code in this policy.
     */
    [[nodiscard]] double sum_doubles(const double* data, size_t count) const noexcept {
        double sum = 0.0;
        for (size_t i = 0; i < count; ++i) {
            sum += data[i];
        }
        return sum;
    }
};
262#endif
263
264// ============================================================================
265// AVX2 Policy (x86)
266// ============================================================================
267
268#if defined(CONTAINER_HAS_AVX2)
/**
 * @brief AVX2 implementation of the SIMD policy (x86, 8 floats per 256-bit vector).
 *
 * Full groups of eight floats are processed with unaligned vector loads; the
 * remaining 0-7 elements are folded in with scalar code.
 *
 * Inputs containing NaN are not normalised: the result then follows the
 * lane-wise behaviour of the underlying min/max instructions.
 */
struct avx2_simd_policy {
    /// Human-readable policy identifier.
    static constexpr std::string_view name() noexcept { return "avx2"; }

    /// Number of floats handled per vector operation.
    static constexpr size_t simd_width = 8;

    /**
     * @brief Sum all floats in an array.
     * @param data  Pointer to the first element (not dereferenced when count is 0).
     * @param count Number of elements.
     * @return Sum of the elements, 0.0f for an empty range.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx2")))
#endif
    float sum_floats(const float* data, size_t count) const noexcept {
        __m256 sum_vec = _mm256_setzero_ps();
        const size_t simd_end = count - (count % 8);

        for (size_t i = 0; i < simd_end; i += 8) {
            sum_vec = _mm256_add_ps(sum_vec, _mm256_loadu_ps(data + i));
        }

        // Horizontal sum: fold the two 128-bit halves, then reduce four lanes.
        const __m128 low = _mm256_castps256_ps128(sum_vec);
        const __m128 high = _mm256_extractf128_ps(sum_vec, 1);
        __m128 sum128 = _mm_add_ps(low, high);
        sum128 = _mm_hadd_ps(sum128, sum128);
        sum128 = _mm_hadd_ps(sum128, sum128);

        float sum = _mm_cvtss_f32(sum128);

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            sum += data[i];
        }

        return sum;
    }

    /**
     * @brief Find minimum float in an array.
     * @return The smallest element, or numeric_limits<float>::max() when count is 0.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx2")))
#endif
    float min_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::max();

        // Seed every lane with a real element rather than FLT_MAX so that an
        // input consisting only of +infinity yields +infinity, not FLT_MAX.
        __m256 min_vec = _mm256_set1_ps(data[0]);
        const size_t simd_end = count - (count % 8);

        for (size_t i = 0; i < simd_end; i += 8) {
            min_vec = _mm256_min_ps(min_vec, _mm256_loadu_ps(data + i));
        }

        // Extract minimum from vector
        alignas(32) float lanes[8];
        _mm256_store_ps(lanes, min_vec);
        float min_val = lanes[0];
        for (int i = 1; i < 8; ++i) {
            if (lanes[i] < min_val) min_val = lanes[i];
        }

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] < min_val) min_val = data[i];
        }

        return min_val;
    }

    /**
     * @brief Find maximum float in an array.
     * @return The largest element, or numeric_limits<float>::lowest() when count is 0.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx2")))
#endif
    float max_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::lowest();

        // Seed with a real element so an all -infinity input yields -infinity
        // instead of the finite lowest() sentinel.
        __m256 max_vec = _mm256_set1_ps(data[0]);
        const size_t simd_end = count - (count % 8);

        for (size_t i = 0; i < simd_end; i += 8) {
            max_vec = _mm256_max_ps(max_vec, _mm256_loadu_ps(data + i));
        }

        // Extract maximum from vector
        alignas(32) float lanes[8];
        _mm256_store_ps(lanes, max_vec);
        float max_val = lanes[0];
        for (int i = 1; i < 8; ++i) {
            if (lanes[i] > max_val) max_val = lanes[i];
        }

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] > max_val) max_val = data[i];
        }

        return max_val;
    }

    /**
     * @brief Sum all doubles in an array.
     *
     * Doubles are summed with plain scalar code in this policy.
     */
    [[nodiscard]] double sum_doubles(const double* data, size_t count) const noexcept {
        double sum = 0.0;
        for (size_t i = 0; i < count; ++i) {
            sum += data[i];
        }
        return sum;
    }
};
376#endif
377
378// ============================================================================
379// AVX-512 Policy (x86)
380// ============================================================================
381
382#if defined(CONTAINER_HAS_AVX512)
/**
 * @brief AVX-512 implementation of the SIMD policy (x86, 16 floats per 512-bit vector).
 *
 * Full groups of sixteen floats (eight doubles for sum_doubles) are processed
 * with unaligned vector loads; leftover elements are folded in with scalar code.
 *
 * Inputs containing NaN are not normalised: the result then follows the
 * behaviour of the underlying min/max and reduce intrinsics.
 */
struct avx512_simd_policy {
    /// Human-readable policy identifier.
    static constexpr std::string_view name() noexcept { return "avx512"; }

    /// Number of floats handled per vector operation.
    static constexpr size_t simd_width = 16;

    /**
     * @brief Sum all floats in an array.
     * @param data  Pointer to the first element (not dereferenced when count is 0).
     * @param count Number of elements.
     * @return Sum of the elements, 0.0f for an empty range.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx512f")))
#endif
    float sum_floats(const float* data, size_t count) const noexcept {
        __m512 sum_vec = _mm512_setzero_ps();
        const size_t simd_end = count - (count % 16);

        for (size_t i = 0; i < simd_end; i += 16) {
            sum_vec = _mm512_add_ps(sum_vec, _mm512_loadu_ps(data + i));
        }

        float sum = _mm512_reduce_add_ps(sum_vec);

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            sum += data[i];
        }

        return sum;
    }

    /**
     * @brief Find minimum float in an array.
     * @return The smallest element, or numeric_limits<float>::max() when count is 0.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx512f")))
#endif
    float min_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::max();

        // Seed every lane with a real element rather than FLT_MAX so that an
        // input consisting only of +infinity yields +infinity, not FLT_MAX.
        __m512 min_vec = _mm512_set1_ps(data[0]);
        const size_t simd_end = count - (count % 16);

        for (size_t i = 0; i < simd_end; i += 16) {
            min_vec = _mm512_min_ps(min_vec, _mm512_loadu_ps(data + i));
        }

        float min_val = _mm512_reduce_min_ps(min_vec);

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] < min_val) min_val = data[i];
        }

        return min_val;
    }

    /**
     * @brief Find maximum float in an array.
     * @return The largest element, or numeric_limits<float>::lowest() when count is 0.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx512f")))
#endif
    float max_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::lowest();

        // Seed with a real element so an all -infinity input yields -infinity
        // instead of the finite lowest() sentinel.
        __m512 max_vec = _mm512_set1_ps(data[0]);
        const size_t simd_end = count - (count % 16);

        for (size_t i = 0; i < simd_end; i += 16) {
            max_vec = _mm512_max_ps(max_vec, _mm512_loadu_ps(data + i));
        }

        float max_val = _mm512_reduce_max_ps(max_vec);

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] > max_val) max_val = data[i];
        }

        return max_val;
    }

    /**
     * @brief Sum all doubles in an array, eight doubles per 512-bit vector.
     * @return Sum of the elements, 0.0 for an empty range.
     */
    [[nodiscard]]
#if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("avx512f")))
#endif
    double sum_doubles(const double* data, size_t count) const noexcept {
        __m512d sum_vec = _mm512_setzero_pd();
        const size_t simd_end = count - (count % 8);

        for (size_t i = 0; i < simd_end; i += 8) {
            sum_vec = _mm512_add_pd(sum_vec, _mm512_loadu_pd(data + i));
        }

        double sum = _mm512_reduce_add_pd(sum_vec);

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            sum += data[i];
        }

        return sum;
    }
};
482#endif
483
484// ============================================================================
485// NEON Policy (ARM)
486// ============================================================================
487
488#if defined(CONTAINER_HAS_ARM_NEON)
/**
 * @brief ARM NEON implementation of the SIMD policy (4 floats per 128-bit vector).
 *
 * Full groups of four floats are processed with vector loads; the remaining
 * 0-3 elements are folded in with scalar code.
 *
 * Inputs containing NaN are not normalised: the result then follows the
 * lane-wise behaviour of the underlying min/max instructions.
 */
struct neon_simd_policy {
    /// Human-readable policy identifier.
    static constexpr std::string_view name() noexcept { return "neon"; }

    /// Number of floats handled per vector operation.
    static constexpr size_t simd_width = 4;

    /**
     * @brief Sum all floats in an array.
     * @param data  Pointer to the first element (not dereferenced when count is 0).
     * @param count Number of elements.
     * @return Sum of the elements, 0.0f for an empty range.
     */
    [[nodiscard]] float sum_floats(const float* data, size_t count) const noexcept {
        float32x4_t sum_vec = vdupq_n_f32(0.0f);
        const size_t simd_end = count - (count % 4);

        for (size_t i = 0; i < simd_end; i += 4) {
            sum_vec = vaddq_f32(sum_vec, vld1q_f32(data + i));
        }

        // Horizontal sum: add low and high halves, then the two remaining lanes.
        const float32x2_t sum_pair = vadd_f32(vget_low_f32(sum_vec), vget_high_f32(sum_vec));
        float sum = vget_lane_f32(sum_pair, 0) + vget_lane_f32(sum_pair, 1);

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            sum += data[i];
        }

        return sum;
    }

    /**
     * @brief Find minimum float in an array.
     * @return The smallest element, or numeric_limits<float>::max() when count is 0.
     */
    [[nodiscard]] float min_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::max();

        // Seed every lane with a real element rather than FLT_MAX so that an
        // input consisting only of +infinity yields +infinity, not FLT_MAX.
        float32x4_t min_vec = vdupq_n_f32(data[0]);
        const size_t simd_end = count - (count % 4);

        for (size_t i = 0; i < simd_end; i += 4) {
            min_vec = vminq_f32(min_vec, vld1q_f32(data + i));
        }

        // Extract minimum from vector
        float lanes[4];
        vst1q_f32(lanes, min_vec);
        float min_val = lanes[0];
        for (int i = 1; i < 4; ++i) {
            if (lanes[i] < min_val) min_val = lanes[i];
        }

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] < min_val) min_val = data[i];
        }

        return min_val;
    }

    /**
     * @brief Find maximum float in an array.
     * @return The largest element, or numeric_limits<float>::lowest() when count is 0.
     */
    [[nodiscard]] float max_float(const float* data, size_t count) const noexcept {
        if (count == 0) return std::numeric_limits<float>::lowest();

        // Seed with a real element so an all -infinity input yields -infinity
        // instead of the finite lowest() sentinel.
        float32x4_t max_vec = vdupq_n_f32(data[0]);
        const size_t simd_end = count - (count % 4);

        for (size_t i = 0; i < simd_end; i += 4) {
            max_vec = vmaxq_f32(max_vec, vld1q_f32(data + i));
        }

        // Extract maximum from vector
        float lanes[4];
        vst1q_f32(lanes, max_vec);
        float max_val = lanes[0];
        for (int i = 1; i < 4; ++i) {
            if (lanes[i] > max_val) max_val = lanes[i];
        }

        // Handle remaining elements
        for (size_t i = simd_end; i < count; ++i) {
            if (data[i] > max_val) max_val = data[i];
        }

        return max_val;
    }

    /**
     * @brief Sum all doubles in an array.
     *
     * Doubles are summed with plain scalar code in this policy.
     */
    [[nodiscard]] double sum_doubles(const double* data, size_t count) const noexcept {
        double sum = 0.0;
        for (size_t i = 0; i < count; ++i) {
            sum += data[i];
        }
        return sum;
    }
};
578#endif
579
580// ============================================================================
581// Compile-Time Policy Selection
582// ============================================================================
583
594#if defined(CONTAINER_HAS_AVX512)
595 using default_simd_policy = avx512_simd_policy;
596#elif defined(CONTAINER_HAS_AVX2)
597 using default_simd_policy = avx2_simd_policy;
598#elif defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
599 using default_simd_policy = sse_simd_policy;
600#elif defined(CONTAINER_HAS_ARM_NEON)
601 using default_simd_policy = neon_simd_policy;
602#else
604#endif
605
620template<SimdPolicy Policy = default_simd_policy>
621class simd_ops {
622public:
623 explicit simd_ops(Policy policy = Policy{}) noexcept
624 : policy_(std::move(policy)) {}
625
629 static constexpr std::string_view policy_name() noexcept {
630 return Policy::name();
631 }
632
636 static constexpr size_t simd_width() noexcept {
637 return Policy::simd_width;
638 }
639
643 [[nodiscard]] float sum_floats(const float* data, size_t count) const noexcept {
644 return policy_.sum_floats(data, count);
645 }
646
650 [[nodiscard]] float min_float(const float* data, size_t count) const noexcept {
651 return policy_.min_float(data, count);
652 }
653
657 [[nodiscard]] float max_float(const float* data, size_t count) const noexcept {
658 return policy_.max_float(data, count);
659 }
660
664 [[nodiscard]] double sum_doubles(const double* data, size_t count) const noexcept {
665 return policy_.sum_doubles(data, count);
666 }
667
668private:
669 [[no_unique_address]] Policy policy_;
670};
671
672// Static assertions to verify policy compliance
673static_assert(SimdPolicy<scalar_simd_policy>, "scalar_simd_policy must satisfy SimdPolicy");
674#if defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
675static_assert(SimdPolicy<sse_simd_policy>, "sse_simd_policy must satisfy SimdPolicy");
676#endif
677#if defined(CONTAINER_HAS_AVX2)
678static_assert(SimdPolicy<avx2_simd_policy>, "avx2_simd_policy must satisfy SimdPolicy");
679#endif
680#if defined(CONTAINER_HAS_AVX512)
681static_assert(SimdPolicy<avx512_simd_policy>, "avx512_simd_policy must satisfy SimdPolicy");
682#endif
683#if defined(CONTAINER_HAS_ARM_NEON)
684static_assert(SimdPolicy<neon_simd_policy>, "neon_simd_policy must satisfy SimdPolicy");
685#endif
686
687} // namespace kcenon::container::simd
SIMD operations wrapper with compile-time policy selection.
double sum_doubles(const double *data, size_t count) const noexcept
Sum all doubles in an array.
static constexpr std::string_view policy_name() noexcept
Get the name of the active SIMD policy.
float max_float(const float *data, size_t count) const noexcept
Find maximum float in an array.
float sum_floats(const float *data, size_t count) const noexcept
Sum all floats in an array.
simd_ops(Policy policy=Policy{}) noexcept
float min_float(const float *data, size_t count) const noexcept
Find minimum float in an array.
static constexpr size_t simd_width() noexcept
Get the SIMD width (number of floats per operation).
Concept for SIMD policy classes.
scalar_simd_policy default_simd_policy
Default SIMD policy selected at compile time based on platform.
Scalar (non-SIMD) implementation of operations.
float min_float(const float *data, size_t count) const noexcept
double sum_doubles(const double *data, size_t count) const noexcept
float max_float(const float *data, size_t count) const noexcept
static constexpr std::string_view name() noexcept
float sum_floats(const float *data, size_t count) const noexcept