Container System 0.1.0
High-performance C++20 type-safe container framework with SIMD-accelerated serialization
Loading...
Searching...
No Matches
simd_processor.cpp
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2024, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
5#include "simd_processor.h"
6#include <algorithm>
7#include <cmath>
8#include <limits>
9
10#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
11 #if defined(__GNUC__) || defined(__clang__)
12 #include <cpuid.h>
13 #endif
14#endif
15
16namespace kcenon::container
17{
18namespace simd
19{
20 // Scalar implementations (fallback)
21 float simd_processor::sum_floats_scalar(const float* data, size_t count)
22 {
23 float sum = 0.0f;
24 for (size_t i = 0; i < count; ++i) {
25 sum += data[i];
26 }
27 return sum;
28 }
29
30 float simd_processor::min_float_scalar(const float* data, size_t count)
31 {
32 if (count == 0) return std::numeric_limits<float>::max();
33
34 float min_val = data[0];
35 for (size_t i = 1; i < count; ++i) {
36 if (data[i] < min_val) {
37 min_val = data[i];
38 }
39 }
40 return min_val;
41 }
42
43 float simd_processor::max_float_scalar(const float* data, size_t count)
44 {
45 if (count == 0) return std::numeric_limits<float>::lowest();
46
47 float max_val = data[0];
48 for (size_t i = 1; i < count; ++i) {
49 if (data[i] > max_val) {
50 max_val = data[i];
51 }
52 }
53 return max_val;
54 }
55
56#if defined(HAS_AVX512)
57 __attribute__((target("avx512f")))
58 float simd_processor::sum_floats_avx512(const float* data, size_t count)
59 {
60 __m512 sum_vec = _mm512_setzero_ps();
61 size_t simd_end = count - (count % 16);
62
63 // Process 16 floats at a time
64 for (size_t i = 0; i < simd_end; i += 16) {
65 __m512 vec = _mm512_loadu_ps(&data[i]);
66 sum_vec = _mm512_add_ps(sum_vec, vec);
67 }
68
69 // Horizontal sum using _mm512_reduce_add_ps
70 float sum = _mm512_reduce_add_ps(sum_vec);
71
72 // Handle remaining elements
73 for (size_t i = simd_end; i < count; ++i) {
74 sum += data[i];
75 }
76
77 return sum;
78 }
79
80 __attribute__((target("avx512f")))
81 float simd_processor::min_float_avx512(const float* data, size_t count)
82 {
83 if (count == 0) return std::numeric_limits<float>::max();
84
85 __m512 min_vec = _mm512_set1_ps(std::numeric_limits<float>::max());
86 size_t simd_end = count - (count % 16);
87
88 for (size_t i = 0; i < simd_end; i += 16) {
89 __m512 vec = _mm512_loadu_ps(&data[i]);
90 min_vec = _mm512_min_ps(min_vec, vec);
91 }
92
93 // Horizontal minimum using _mm512_reduce_min_ps
94 float min_val = _mm512_reduce_min_ps(min_vec);
95
96 // Handle remaining elements
97 for (size_t i = simd_end; i < count; ++i) {
98 if (data[i] < min_val) min_val = data[i];
99 }
100
101 return min_val;
102 }
103
104 __attribute__((target("avx512f")))
105 float simd_processor::max_float_avx512(const float* data, size_t count)
106 {
107 if (count == 0) return std::numeric_limits<float>::lowest();
108
109 __m512 max_vec = _mm512_set1_ps(std::numeric_limits<float>::lowest());
110 size_t simd_end = count - (count % 16);
111
112 for (size_t i = 0; i < simd_end; i += 16) {
113 __m512 vec = _mm512_loadu_ps(&data[i]);
114 max_vec = _mm512_max_ps(max_vec, vec);
115 }
116
117 // Horizontal maximum using _mm512_reduce_max_ps
118 float max_val = _mm512_reduce_max_ps(max_vec);
119
120 // Handle remaining elements
121 for (size_t i = simd_end; i < count; ++i) {
122 if (data[i] > max_val) max_val = data[i];
123 }
124
125 return max_val;
126 }
127
128 __attribute__((target("avx512f")))
129 double simd_processor::sum_doubles_avx512(const double* data, size_t count)
130 {
131 __m512d sum_vec = _mm512_setzero_pd();
132 size_t simd_end = count - (count % 8);
133
134 // Process 8 doubles at a time
135 for (size_t i = 0; i < simd_end; i += 8) {
136 __m512d vec = _mm512_loadu_pd(&data[i]);
137 sum_vec = _mm512_add_pd(sum_vec, vec);
138 }
139
140 // Horizontal sum using _mm512_reduce_add_pd
141 double sum = _mm512_reduce_add_pd(sum_vec);
142
143 // Handle remaining elements
144 for (size_t i = simd_end; i < count; ++i) {
145 sum += data[i];
146 }
147
148 return sum;
149 }
150#endif
151
152#if defined(HAS_AVX2)
153 __attribute__((target("avx2")))
154 float simd_processor::sum_floats_avx2(const float* data, size_t count)
155 {
156 __m256 sum_vec = _mm256_setzero_ps();
157 size_t simd_end = count - (count % 8);
158
159 // Process 8 floats at a time
160 for (size_t i = 0; i < simd_end; i += 8) {
161 __m256 vec = _mm256_loadu_ps(&data[i]);
162 sum_vec = _mm256_add_ps(sum_vec, vec);
163 }
164
165 // Horizontal sum
166 __m128 low = _mm256_castps256_ps128(sum_vec);
167 __m128 high = _mm256_extractf128_ps(sum_vec, 1);
168 __m128 sum128 = _mm_add_ps(low, high);
169 sum128 = _mm_hadd_ps(sum128, sum128);
170 sum128 = _mm_hadd_ps(sum128, sum128);
171
172 float sum = _mm_cvtss_f32(sum128);
173
174 // Handle remaining elements
175 for (size_t i = simd_end; i < count; ++i) {
176 sum += data[i];
177 }
178
179 return sum;
180 }
181
182 __attribute__((target("avx2")))
183 float simd_processor::min_float_avx2(const float* data, size_t count)
184 {
185 if (count == 0) return std::numeric_limits<float>::max();
186
187 __m256 min_vec = _mm256_set1_ps(std::numeric_limits<float>::max());
188 size_t simd_end = count - (count % 8);
189
190 for (size_t i = 0; i < simd_end; i += 8) {
191 __m256 vec = _mm256_loadu_ps(&data[i]);
192 min_vec = _mm256_min_ps(min_vec, vec);
193 }
194
195 // Extract minimum from vector
196 float result[8];
197 _mm256_storeu_ps(result, min_vec);
198 float min_val = result[0];
199 for (int i = 1; i < 8; ++i) {
200 if (result[i] < min_val) min_val = result[i];
201 }
202
203 // Handle remaining elements
204 for (size_t i = simd_end; i < count; ++i) {
205 if (data[i] < min_val) min_val = data[i];
206 }
207
208 return min_val;
209 }
210
211 __attribute__((target("avx2")))
212 float simd_processor::max_float_avx2(const float* data, size_t count)
213 {
214 if (count == 0) return std::numeric_limits<float>::lowest();
215
216 __m256 max_vec = _mm256_set1_ps(std::numeric_limits<float>::lowest());
217 size_t simd_end = count - (count % 8);
218
219 for (size_t i = 0; i < simd_end; i += 8) {
220 __m256 vec = _mm256_loadu_ps(&data[i]);
221 max_vec = _mm256_max_ps(max_vec, vec);
222 }
223
224 // Extract maximum from vector
225 float result[8];
226 _mm256_storeu_ps(result, max_vec);
227 float max_val = result[0];
228 for (int i = 1; i < 8; ++i) {
229 if (result[i] > max_val) max_val = result[i];
230 }
231
232 // Handle remaining elements
233 for (size_t i = simd_end; i < count; ++i) {
234 if (data[i] > max_val) max_val = data[i];
235 }
236
237 return max_val;
238 }
239#endif
240
241#if defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
242 __attribute__((target("sse3")))
243 float simd_processor::sum_floats_sse(const float* data, size_t count)
244 {
245 __m128 sum_vec = _mm_setzero_ps();
246 size_t simd_end = count - (count % 4);
247
248 // Process 4 floats at a time
249 for (size_t i = 0; i < simd_end; i += 4) {
250 __m128 vec = _mm_loadu_ps(&data[i]);
251 sum_vec = _mm_add_ps(sum_vec, vec);
252 }
253
254 // Horizontal sum
255 sum_vec = _mm_hadd_ps(sum_vec, sum_vec);
256 sum_vec = _mm_hadd_ps(sum_vec, sum_vec);
257
258 float sum = _mm_cvtss_f32(sum_vec);
259
260 // Handle remaining elements
261 for (size_t i = simd_end; i < count; ++i) {
262 sum += data[i];
263 }
264
265 return sum;
266 }
267
268 __attribute__((target("sse2")))
269 float simd_processor::min_float_sse(const float* data, size_t count)
270 {
271 if (count == 0) return std::numeric_limits<float>::max();
272
273 __m128 min_vec = _mm_set1_ps(std::numeric_limits<float>::max());
274 size_t simd_end = count - (count % 4);
275
276 for (size_t i = 0; i < simd_end; i += 4) {
277 __m128 vec = _mm_loadu_ps(&data[i]);
278 min_vec = _mm_min_ps(min_vec, vec);
279 }
280
281 // Extract minimum from vector
282 float result[4];
283 _mm_storeu_ps(result, min_vec);
284 float min_val = result[0];
285 for (int i = 1; i < 4; ++i) {
286 if (result[i] < min_val) min_val = result[i];
287 }
288
289 // Handle remaining elements
290 for (size_t i = simd_end; i < count; ++i) {
291 if (data[i] < min_val) min_val = data[i];
292 }
293
294 return min_val;
295 }
296
297 __attribute__((target("sse2")))
298 float simd_processor::max_float_sse(const float* data, size_t count)
299 {
300 if (count == 0) return std::numeric_limits<float>::lowest();
301
302 __m128 max_vec = _mm_set1_ps(std::numeric_limits<float>::lowest());
303 size_t simd_end = count - (count % 4);
304
305 for (size_t i = 0; i < simd_end; i += 4) {
306 __m128 vec = _mm_loadu_ps(&data[i]);
307 max_vec = _mm_max_ps(max_vec, vec);
308 }
309
310 // Extract maximum from vector
311 float result[4];
312 _mm_storeu_ps(result, max_vec);
313 float max_val = result[0];
314 for (int i = 1; i < 4; ++i) {
315 if (result[i] > max_val) max_val = result[i];
316 }
317
318 // Handle remaining elements
319 for (size_t i = simd_end; i < count; ++i) {
320 if (data[i] > max_val) max_val = data[i];
321 }
322
323 return max_val;
324 }
325#endif
326
327#if defined(HAS_ARM_NEON)
// NEON sum of `count` floats starting at `data`.
//
// Whole groups of 4 floats are accumulated lane-wise; the accumulator's high
// and low halves are then added together and the two remaining lanes summed.
// The leftover (< 4) elements are added one by one.
//
// data  - pointer to the first element; indices [0, count) are read.
// count - number of elements to add.
// Returns the single-precision total (0.0f when count is 0).
float simd_processor::sum_floats_neon(const float* data, size_t count)
{
    constexpr size_t lanes = 4;
    const size_t vector_limit = (count / lanes) * lanes;

    float32x4_t acc = vdupq_n_f32(0.0f);
    size_t idx = 0;
    while (idx < vector_limit) {
        acc = vaddq_f32(acc, vld1q_f32(data + idx));
        idx += lanes;
    }

    // (lane0 + lane2, lane1 + lane3), then add the two results.
    const float32x2_t folded = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    float total = vget_lane_f32(folded, 0) + vget_lane_f32(folded, 1);

    // Scalar tail: idx already sits at the first unprocessed element.
    for (; idx < count; ++idx) {
        total += data[idx];
    }
    return total;
}
352
// NEON minimum of `count` floats starting at `data`.
//
// Whole groups of 4 floats are folded with vminq_f32, the 4 lanes are spilled
// to memory and reduced, and the leftover (< 4) elements are compared one by
// one.
//
// data  - pointer to the first element; indices [0, count) are read.
// count - number of elements to scan.
// Returns the smallest element, or std::numeric_limits<float>::max() when
// count is 0.
float simd_processor::min_float_neon(const float* data, size_t count)
{
    if (count == 0) return std::numeric_limits<float>::max();

    // Seed every lane with a real element. A FLT_MAX seed is finite, so it
    // would be returned as the "minimum" of an input made only of +infinity.
    float32x4_t min_vec = vdupq_n_f32(data[0]);
    const size_t simd_end = count - (count % 4);

    for (size_t i = 0; i < simd_end; i += 4) {
        min_vec = vminq_f32(min_vec, vld1q_f32(&data[i]));
    }

    // Horizontal reduction: spill the 4 lanes and keep the smallest.
    float lanes[4];
    vst1q_f32(lanes, min_vec);
    float min_val = lanes[0];
    for (int i = 1; i < 4; ++i) {
        if (lanes[i] < min_val) min_val = lanes[i];
    }

    // Scalar tail for the elements that did not fill a whole vector.
    for (size_t i = simd_end; i < count; ++i) {
        if (data[i] < min_val) min_val = data[i];
    }

    return min_val;
}
380
// NEON maximum of `count` floats starting at `data`.
//
// Whole groups of 4 floats are folded with vmaxq_f32, the 4 lanes are spilled
// to memory and reduced, and the leftover (< 4) elements are compared one by
// one.
//
// data  - pointer to the first element; indices [0, count) are read.
// count - number of elements to scan.
// Returns the largest element, or std::numeric_limits<float>::lowest() when
// count is 0.
float simd_processor::max_float_neon(const float* data, size_t count)
{
    if (count == 0) return std::numeric_limits<float>::lowest();

    // Seed every lane with a real element. lowest() is finite, so it would be
    // returned as the "maximum" of an input made only of -infinity.
    float32x4_t max_vec = vdupq_n_f32(data[0]);
    const size_t simd_end = count - (count % 4);

    for (size_t i = 0; i < simd_end; i += 4) {
        max_vec = vmaxq_f32(max_vec, vld1q_f32(&data[i]));
    }

    // Horizontal reduction: spill the 4 lanes and keep the largest.
    float lanes[4];
    vst1q_f32(lanes, max_vec);
    float max_val = lanes[0];
    for (int i = 1; i < 4; ++i) {
        if (lanes[i] > max_val) max_val = lanes[i];
    }

    // Scalar tail for the elements that did not fill a whole vector.
    for (size_t i = simd_end; i < count; ++i) {
        if (data[i] > max_val) max_val = data[i];
    }

    return max_val;
}
408#endif
409
410 // Main interface implementations
411 float simd_processor::sum_floats(const std::vector<ValueVariant>& values)
412 {
413 // Extract float values
414 std::vector<float> floats;
415 floats.reserve(values.size());
416
417 for (const auto& val : values) {
418 if (auto* f = std::get_if<float>(&val)) {
419 floats.push_back(*f);
420 }
421 }
422
423 if (floats.empty()) return 0.0f;
424
425 #if defined(HAS_AVX512)
426 return sum_floats_avx512(floats.data(), floats.size());
427 #elif defined(HAS_AVX2)
428 return sum_floats_avx2(floats.data(), floats.size());
429 #elif defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
430 return sum_floats_sse(floats.data(), floats.size());
431 #elif defined(HAS_ARM_NEON)
432 return sum_floats_neon(floats.data(), floats.size());
433 #else
434 return sum_floats_scalar(floats.data(), floats.size());
435 #endif
436 }
437
438 double simd_processor::sum_doubles(const std::vector<ValueVariant>& values)
439 {
440 // For now, use scalar implementation for doubles
441 double sum = 0.0;
442 for (const auto& val : values) {
443 if (auto* d = std::get_if<double>(&val)) {
444 sum += *d;
445 }
446 }
447 return sum;
448 }
449
450 std::optional<float> simd_processor::min_float(const std::vector<ValueVariant>& values)
451 {
452 std::vector<float> floats;
453 floats.reserve(values.size());
454
455 for (const auto& val : values) {
456 if (auto* f = std::get_if<float>(&val)) {
457 floats.push_back(*f);
458 }
459 }
460
461 if (floats.empty()) return std::nullopt;
462
463 #if defined(HAS_AVX512)
464 return min_float_avx512(floats.data(), floats.size());
465 #elif defined(HAS_AVX2)
466 return min_float_avx2(floats.data(), floats.size());
467 #elif defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
468 return min_float_sse(floats.data(), floats.size());
469 #elif defined(HAS_ARM_NEON)
470 return min_float_neon(floats.data(), floats.size());
471 #else
472 return min_float_scalar(floats.data(), floats.size());
473 #endif
474 }
475
476 std::optional<float> simd_processor::max_float(const std::vector<ValueVariant>& values)
477 {
478 std::vector<float> floats;
479 floats.reserve(values.size());
480
481 for (const auto& val : values) {
482 if (auto* f = std::get_if<float>(&val)) {
483 floats.push_back(*f);
484 }
485 }
486
487 if (floats.empty()) return std::nullopt;
488
489 #if defined(HAS_AVX512)
490 return max_float_avx512(floats.data(), floats.size());
491 #elif defined(HAS_AVX2)
492 return max_float_avx2(floats.data(), floats.size());
493 #elif defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
494 return max_float_sse(floats.data(), floats.size());
495 #elif defined(HAS_ARM_NEON)
496 return max_float_neon(floats.data(), floats.size());
497 #else
498 return max_float_scalar(floats.data(), floats.size());
499 #endif
500 }
501
503 const std::vector<ValueVariant>& values, float target)
504 {
505 std::vector<size_t> indices;
506
507 for (size_t i = 0; i < values.size(); ++i) {
508 if (auto* f = std::get_if<float>(&values[i])) {
509 if (*f == target) {
510 indices.push_back(i);
511 }
512 }
513 }
514
515 return indices;
516 }
517
518 void simd_processor::fast_copy(const void* src, void* dst, size_t size)
519 {
520 // Use standard memcpy which is often optimized with SIMD
521 std::memcpy(dst, src, size);
522 }
523
524 bool simd_processor::fast_compare(const void* a, const void* b, size_t size)
525 {
526 return std::memcmp(a, b, size) == 0;
527 }
528
529 // SIMD support detection
// Body of a runtime SSE2 capability check. Returns true when CPUID leaf 1
// reports EDX bit 26, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_sse2(), which is called elsewhere in
// the file — confirm against the header.
531 {
532 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
533 #if defined(__GNUC__) || defined(__clang__)
534 unsigned int eax, ebx, ecx, edx;
// __get_cpuid comes from <cpuid.h>, which is only included for GCC/Clang on
// x86; every other configuration falls through to `return false` below.
535 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
536 return (edx & (1 << 26)) != 0; // SSE2 bit
537 }
538 #endif
539 #endif
540 return false;
541 }
542
// Body of a runtime SSE4.2 capability check. Returns true when CPUID leaf 1
// reports ECX bit 20, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_sse42(), which is called elsewhere in
// the file — confirm against the header.
544 {
545 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
546 #if defined(__GNUC__) || defined(__clang__)
547 unsigned int eax, ebx, ecx, edx;
// Only GCC/Clang x86 builds perform the query; every other configuration
// falls through to `return false` below.
548 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
549 return (ecx & (1 << 20)) != 0; // SSE4.2 bit
550 }
551 #endif
552 #endif
553 return false;
554 }
555
// Body of a runtime AVX2 capability check. Returns true when CPUID leaf 7,
// sub-leaf 0 reports EBX bit 5, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_avx2(), which is called elsewhere in
// the file — confirm against the header.
// NOTE(review): only the CPUID feature bit is tested here; nothing in this
// body checks that the OS has enabled the wider register state.
557 {
558 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
559 #if defined(__GNUC__) || defined(__clang__)
560 unsigned int eax, ebx, ecx, edx;
// Only GCC/Clang x86 builds perform the query; every other configuration
// falls through to `return false` below.
561 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
562 return (ebx & (1 << 5)) != 0; // AVX2 bit
563 }
564 #endif
565 #endif
566 return false;
567 }
568
// Body of a runtime AVX-512 Foundation capability check. Returns true when
// CPUID leaf 7, sub-leaf 0 reports EBX bit 16, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_avx512f(), which is called elsewhere
// in the file — confirm against the header.
// NOTE(review): only the CPUID feature bit is tested here; nothing in this
// body checks that the OS has enabled the wider register state.
570 {
571 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
572 #if defined(__GNUC__) || defined(__clang__)
573 unsigned int eax, ebx, ecx, edx;
// Only GCC/Clang x86 builds perform the query; every other configuration
// falls through to `return false` below.
574 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
575 return (ebx & (1 << 16)) != 0; // AVX-512F bit
576 }
577 #endif
578 #endif
579 return false;
580 }
581
// Body of a runtime AVX-512DQ capability check. Returns true when CPUID
// leaf 7, sub-leaf 0 reports EBX bit 17, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_avx512dq(), which is called elsewhere
// in the file — confirm against the header.
583 {
584 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
585 #if defined(__GNUC__) || defined(__clang__)
586 unsigned int eax, ebx, ecx, edx;
// Only GCC/Clang x86 builds perform the query; every other configuration
// falls through to `return false` below.
587 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
588 return (ebx & (1 << 17)) != 0; // AVX-512DQ bit
589 }
590 #endif
591 #endif
592 return false;
593 }
594
// Body of a runtime AVX-512BW capability check. Returns true when CPUID
// leaf 7, sub-leaf 0 reports EBX bit 30, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_avx512bw(), which is called elsewhere
// in the file — confirm against the header.
596 {
597 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
598 #if defined(__GNUC__) || defined(__clang__)
599 unsigned int eax, ebx, ecx, edx;
// Only GCC/Clang x86 builds perform the query; every other configuration
// falls through to `return false` below.
600 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
601 return (ebx & (1 << 30)) != 0; // AVX-512BW bit
602 }
603 #endif
604 #endif
605 return false;
606 }
607
// Body of a runtime AVX-512VL capability check. Returns true when CPUID
// leaf 7, sub-leaf 0 reports EBX bit 31, and false otherwise.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_avx512vl(), which is called elsewhere
// in the file — confirm against the header.
609 {
610 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
611 #if defined(__GNUC__) || defined(__clang__)
612 unsigned int eax, ebx, ecx, edx;
// Only GCC/Clang x86 builds perform the query; every other configuration
// falls through to `return false` below.
613 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
// NOTE(review): `1 << 31` shifts a signed int into its sign bit; an unsigned
// literal (1u << 31) would state the intent more directly.
614 return (ebx & (1 << 31)) != 0; // AVX-512VL bit
615 }
616 #endif
617 #endif
618 return false;
619 }
620
// Body that picks the highest SIMD level to report.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; the member summary lists `static simd_level
// get_best_simd_level()`, which this body presumably implements — confirm.
622 {
// NEON is decided purely at compile time: when HAS_ARM_NEON is defined the
// function returns immediately and the x86 probes below are never reached.
623 #if defined(HAS_ARM_NEON)
624 return simd_level::neon;
625 #endif
// Runtime probes, ordered from the widest instruction set to the narrowest,
// so the first hit is the best level available.
626 if (has_avx512f()) return simd_level::avx512;
627 if (has_avx2()) return simd_level::avx2;
628 if (has_sse42()) return simd_level::sse42;
629 if (has_sse2()) return simd_level::sse2;
630 return simd_level::none;
631 }
632
// Body of the NEON availability check: a compile-time answer that is true
// exactly when HAS_ARM_NEON is defined. No runtime probing is performed.
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; presumably this is has_neon(), which is called elsewhere in
// the file — confirm against the header.
634 {
635 #if defined(HAS_ARM_NEON)
636 return true;
637 #else
638 return false;
639 #endif
640 }
641
// Body that builds a one-line, human-readable description of SIMD support:
// "SIMD Support: <compile-time level> (Compile-time), Runtime: <runtime
// level(s)> (Width: <n>)".
// NOTE(review): the line carrying this definition's signature is missing from
// this listing; the member summary lists `static std::string get_simd_info()`,
// which this body presumably implements — confirm.
643 {
644 std::string info = "SIMD Support: ";
645
// Compile-time section: only the first defined HAS_* macro in this chain is
// reported, so a build with several macros set names just the highest level.
646 #if defined(HAS_AVX512)
647 info += "AVX-512 ";
648 #elif defined(HAS_AVX2)
649 info += "AVX2 ";
650 #elif defined(HAS_SSE42)
651 info += "SSE4.2 ";
652 #elif defined(HAS_SSE2)
653 info += "SSE2 ";
654 #elif defined(HAS_ARM_NEON)
655 info += "NEON ";
656 #else
657 info += "None ";
658 #endif
659
660 // Add runtime detection info
661 info += "(Compile-time), Runtime: ";
// Runtime section: the AVX-512 sub-features (DQ/BW/VL) are listed only when
// AVX-512F itself is present; otherwise the first lower level that is detected
// is the only one named.
662 if (has_avx512f()) {
663 info += "AVX-512F ";
664 if (has_avx512dq()) info += "AVX-512DQ ";
665 if (has_avx512bw()) info += "AVX-512BW ";
666 if (has_avx512vl()) info += "AVX-512VL ";
667 } else if (has_avx2()) {
668 info += "AVX2 ";
669 } else if (has_sse42()) {
670 info += "SSE4.2 ";
671 } else if (has_sse2()) {
672 info += "SSE2 ";
673 } else if (has_neon()) {
674 info += "NEON ";
675 } else {
676 info += "None ";
677 }
678
// get_optimal_width() is not defined in this listing; its result is appended
// verbatim as the reported width.
679 info += "(Width: " + std::to_string(get_optimal_width()) + ")";
680 return info;
681 }
682
683} // namespace simd
684} // namespace kcenon::container
SIMD processor for vectorized operations on container values.
static bool fast_compare(const void *a, const void *b, size_t size)
Fast memory comparison using SIMD.
static double sum_doubles(const std::vector< ValueVariant > &values)
Sum all double values in a container (the current implementation is a scalar loop; no SIMD path is used yet).
static void fast_copy(const void *src, void *dst, size_t size)
Fast memory copy using SIMD.
static float sum_floats_scalar(const float *data, size_t count)
static float sum_floats(const std::vector< ValueVariant > &values)
Sum all float values in a container using SIMD.
static float max_float_scalar(const float *data, size_t count)
static float min_float_scalar(const float *data, size_t count)
static std::optional< float > max_float(const std::vector< ValueVariant > &values)
Find maximum float value using SIMD.
static std::optional< float > min_float(const std::vector< ValueVariant > &values)
Find minimum float value using SIMD.
static std::vector< size_t > find_equal_floats(const std::vector< ValueVariant > &values, float target)
Vectorized comparison - find all values equal to target.
static simd_level get_best_simd_level()
Get the best available SIMD instruction set level.
static std::string get_simd_info()
Get a string describing available SIMD features.
static size_t get_optimal_width()
Get the optimal SIMD width for current platform.
simd_level
SIMD instruction set level enumeration.