10#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
11 #if defined(__GNUC__) || defined(__clang__)
24 for (
size_t i = 0; i < count; ++i) {
32 if (count == 0)
return std::numeric_limits<float>::max();
34 float min_val = data[0];
35 for (
size_t i = 1; i < count; ++i) {
36 if (data[i] < min_val) {
45 if (count == 0)
return std::numeric_limits<float>::lowest();
47 float max_val = data[0];
48 for (
size_t i = 1; i < count; ++i) {
49 if (data[i] > max_val) {
56#if defined(HAS_AVX512)
57 __attribute__((target(
"avx512f")))
58 float
simd_processor::sum_floats_avx512(const
float* data,
size_t count)
60 __m512 sum_vec = _mm512_setzero_ps();
61 size_t simd_end = count - (count % 16);
64 for (
size_t i = 0; i < simd_end; i += 16) {
65 __m512 vec = _mm512_loadu_ps(&data[i]);
66 sum_vec = _mm512_add_ps(sum_vec, vec);
70 float sum = _mm512_reduce_add_ps(sum_vec);
73 for (
size_t i = simd_end; i < count; ++i) {
80 __attribute__((target(
"avx512f")))
81 float simd_processor::min_float_avx512(const
float* data,
size_t count)
83 if (count == 0)
return std::numeric_limits<float>::max();
85 __m512 min_vec = _mm512_set1_ps(std::numeric_limits<float>::max());
86 size_t simd_end = count - (count % 16);
88 for (
size_t i = 0; i < simd_end; i += 16) {
89 __m512 vec = _mm512_loadu_ps(&data[i]);
90 min_vec = _mm512_min_ps(min_vec, vec);
94 float min_val = _mm512_reduce_min_ps(min_vec);
97 for (
size_t i = simd_end; i < count; ++i) {
98 if (data[i] < min_val) min_val = data[i];
104 __attribute__((target(
"avx512f")))
105 float simd_processor::max_float_avx512(const
float* data,
size_t count)
107 if (count == 0)
return std::numeric_limits<float>::lowest();
109 __m512 max_vec = _mm512_set1_ps(std::numeric_limits<float>::lowest());
110 size_t simd_end = count - (count % 16);
112 for (
size_t i = 0; i < simd_end; i += 16) {
113 __m512 vec = _mm512_loadu_ps(&data[i]);
114 max_vec = _mm512_max_ps(max_vec, vec);
118 float max_val = _mm512_reduce_max_ps(max_vec);
121 for (
size_t i = simd_end; i < count; ++i) {
122 if (data[i] > max_val) max_val = data[i];
128 __attribute__((target(
"avx512f")))
129 double simd_processor::sum_doubles_avx512(const
double* data,
size_t count)
131 __m512d sum_vec = _mm512_setzero_pd();
132 size_t simd_end = count - (count % 8);
135 for (
size_t i = 0; i < simd_end; i += 8) {
136 __m512d vec = _mm512_loadu_pd(&data[i]);
137 sum_vec = _mm512_add_pd(sum_vec, vec);
141 double sum = _mm512_reduce_add_pd(sum_vec);
144 for (
size_t i = simd_end; i < count; ++i) {
153 __attribute__((target(
"avx2")))
154 float simd_processor::sum_floats_avx2(const
float* data,
size_t count)
156 __m256 sum_vec = _mm256_setzero_ps();
157 size_t simd_end = count - (count % 8);
160 for (
size_t i = 0; i < simd_end; i += 8) {
161 __m256 vec = _mm256_loadu_ps(&data[i]);
162 sum_vec = _mm256_add_ps(sum_vec, vec);
166 __m128 low = _mm256_castps256_ps128(sum_vec);
167 __m128 high = _mm256_extractf128_ps(sum_vec, 1);
168 __m128 sum128 = _mm_add_ps(low, high);
169 sum128 = _mm_hadd_ps(sum128, sum128);
170 sum128 = _mm_hadd_ps(sum128, sum128);
172 float sum = _mm_cvtss_f32(sum128);
175 for (
size_t i = simd_end; i < count; ++i) {
182 __attribute__((target(
"avx2")))
183 float simd_processor::min_float_avx2(const
float* data,
size_t count)
185 if (count == 0)
return std::numeric_limits<float>::max();
187 __m256 min_vec = _mm256_set1_ps(std::numeric_limits<float>::max());
188 size_t simd_end = count - (count % 8);
190 for (
size_t i = 0; i < simd_end; i += 8) {
191 __m256 vec = _mm256_loadu_ps(&data[i]);
192 min_vec = _mm256_min_ps(min_vec, vec);
197 _mm256_storeu_ps(result, min_vec);
198 float min_val = result[0];
199 for (
int i = 1; i < 8; ++i) {
200 if (result[i] < min_val) min_val = result[i];
204 for (
size_t i = simd_end; i < count; ++i) {
205 if (data[i] < min_val) min_val = data[i];
211 __attribute__((target(
"avx2")))
212 float simd_processor::max_float_avx2(const
float* data,
size_t count)
214 if (count == 0)
return std::numeric_limits<float>::lowest();
216 __m256 max_vec = _mm256_set1_ps(std::numeric_limits<float>::lowest());
217 size_t simd_end = count - (count % 8);
219 for (
size_t i = 0; i < simd_end; i += 8) {
220 __m256 vec = _mm256_loadu_ps(&data[i]);
221 max_vec = _mm256_max_ps(max_vec, vec);
226 _mm256_storeu_ps(result, max_vec);
227 float max_val = result[0];
228 for (
int i = 1; i < 8; ++i) {
229 if (result[i] > max_val) max_val = result[i];
233 for (
size_t i = simd_end; i < count; ++i) {
234 if (data[i] > max_val) max_val = data[i];
241#if defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
242 __attribute__((target(
"sse3")))
243 float simd_processor::sum_floats_sse(const
float* data,
size_t count)
245 __m128 sum_vec = _mm_setzero_ps();
246 size_t simd_end = count - (count % 4);
249 for (
size_t i = 0; i < simd_end; i += 4) {
250 __m128 vec = _mm_loadu_ps(&data[i]);
251 sum_vec = _mm_add_ps(sum_vec, vec);
255 sum_vec = _mm_hadd_ps(sum_vec, sum_vec);
256 sum_vec = _mm_hadd_ps(sum_vec, sum_vec);
258 float sum = _mm_cvtss_f32(sum_vec);
261 for (
size_t i = simd_end; i < count; ++i) {
268 __attribute__((target(
"sse2")))
269 float simd_processor::min_float_sse(const
float* data,
size_t count)
271 if (count == 0)
return std::numeric_limits<float>::max();
273 __m128 min_vec = _mm_set1_ps(std::numeric_limits<float>::max());
274 size_t simd_end = count - (count % 4);
276 for (
size_t i = 0; i < simd_end; i += 4) {
277 __m128 vec = _mm_loadu_ps(&data[i]);
278 min_vec = _mm_min_ps(min_vec, vec);
283 _mm_storeu_ps(result, min_vec);
284 float min_val = result[0];
285 for (
int i = 1; i < 4; ++i) {
286 if (result[i] < min_val) min_val = result[i];
290 for (
size_t i = simd_end; i < count; ++i) {
291 if (data[i] < min_val) min_val = data[i];
297 __attribute__((target(
"sse2")))
298 float simd_processor::max_float_sse(const
float* data,
size_t count)
300 if (count == 0)
return std::numeric_limits<float>::lowest();
302 __m128 max_vec = _mm_set1_ps(std::numeric_limits<float>::lowest());
303 size_t simd_end = count - (count % 4);
305 for (
size_t i = 0; i < simd_end; i += 4) {
306 __m128 vec = _mm_loadu_ps(&data[i]);
307 max_vec = _mm_max_ps(max_vec, vec);
312 _mm_storeu_ps(result, max_vec);
313 float max_val = result[0];
314 for (
int i = 1; i < 4; ++i) {
315 if (result[i] > max_val) max_val = result[i];
319 for (
size_t i = simd_end; i < count; ++i) {
320 if (data[i] > max_val) max_val = data[i];
327#if defined(HAS_ARM_NEON)
/**
 * @brief Adds up `count` floats using 128-bit ARM NEON vector additions.
 *
 * Complete groups of 4 elements are accumulated lane-wise; the low and
 * high halves of the accumulator are added together and the two
 * resulting lanes are summed. The remaining (count % 4) elements are
 * added one at a time.
 *
 * @param data  Pointer to the first element.
 * @param count Number of elements to sum.
 * @return The accumulated sum (0.0f when count is 0).
 */
float simd_processor::sum_floats_neon(const float* data, size_t count)
{
    constexpr size_t kLanes = 4;
    const size_t vector_limit = (count / kLanes) * kLanes;

    // Vector part: one running accumulator, 4 floats per step.
    float32x4_t acc = vdupq_n_f32(0.0f);
    size_t pos = 0;
    for (; pos < vector_limit; pos += kLanes) {
        acc = vaddq_f32(acc, vld1q_f32(data + pos));
    }

    // Fold 4 lanes -> 2 lanes -> 1 value.
    const float32x2_t halves = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    float total = vget_lane_f32(halves, 0) + vget_lane_f32(halves, 1);

    // Scalar part: whatever did not fill a whole vector.
    for (; pos < count; ++pos) {
        total += data[pos];
    }
    return total;
}
/**
 * @brief Finds the smallest of `count` floats using 128-bit ARM NEON vectors.
 *
 * The running minimum is seeded from data[0] rather than from
 * std::numeric_limits<float>::max(): with a FLT_MAX seed an input made
 * only of +infinity would yield FLT_MAX, a value that is not in the
 * input. Seeding from a real element gives the same answer as a plain
 * scalar scan that starts at data[0].
 *
 * @param data  Pointer to the first element.
 * @param count Number of elements to examine.
 * @return The minimum element, or std::numeric_limits<float>::max()
 *         when count is 0.
 */
float simd_processor::min_float_neon(const float* data, size_t count)
{
    if (count == 0) return std::numeric_limits<float>::max();

    // count > 0 here, so data[0] is a valid seed for every lane.
    float32x4_t min_vec = vdupq_n_f32(data[0]);
    const size_t simd_end = count - (count % 4);

    // Lane-wise minimum over complete groups of 4 elements.
    for (size_t i = 0; i < simd_end; i += 4) {
        const float32x4_t vec = vld1q_f32(&data[i]);
        min_vec = vminq_f32(min_vec, vec);
    }

    // Spill the lanes to memory and reduce them with a scalar scan.
    float result[4];
    vst1q_f32(result, min_vec);
    float min_val = result[0];
    for (int i = 1; i < 4; ++i) {
        if (result[i] < min_val) min_val = result[i];
    }

    // Remaining (count % 4) elements.
    for (size_t i = simd_end; i < count; ++i) {
        if (data[i] < min_val) min_val = data[i];
    }

    return min_val;
}
/**
 * @brief Finds the largest of `count` floats using 128-bit ARM NEON vectors.
 *
 * The running maximum is seeded from data[0] rather than from
 * std::numeric_limits<float>::lowest(): with a -FLT_MAX seed an input
 * made only of -infinity would yield -FLT_MAX, a value that is not in
 * the input. Seeding from a real element gives the same answer as a
 * plain scalar scan that starts at data[0].
 *
 * @param data  Pointer to the first element.
 * @param count Number of elements to examine.
 * @return The maximum element, or std::numeric_limits<float>::lowest()
 *         when count is 0.
 */
float simd_processor::max_float_neon(const float* data, size_t count)
{
    if (count == 0) return std::numeric_limits<float>::lowest();

    // count > 0 here, so data[0] is a valid seed for every lane.
    float32x4_t max_vec = vdupq_n_f32(data[0]);
    const size_t simd_end = count - (count % 4);

    // Lane-wise maximum over complete groups of 4 elements.
    for (size_t i = 0; i < simd_end; i += 4) {
        const float32x4_t vec = vld1q_f32(&data[i]);
        max_vec = vmaxq_f32(max_vec, vec);
    }

    // Spill the lanes to memory and reduce them with a scalar scan.
    float result[4];
    vst1q_f32(result, max_vec);
    float max_val = result[0];
    for (int i = 1; i < 4; ++i) {
        if (result[i] > max_val) max_val = result[i];
    }

    // Remaining (count % 4) elements.
    for (size_t i = simd_end; i < count; ++i) {
        if (data[i] > max_val) max_val = data[i];
    }

    return max_val;
}
414 std::vector<float> floats;
415 floats.reserve(values.size());
417 for (
const auto& val : values) {
418 if (
auto* f = std::get_if<float>(&val)) {
419 floats.push_back(*f);
423 if (floats.empty())
return 0.0f;
425 #if defined(HAS_AVX512)
426 return sum_floats_avx512(floats.data(), floats.size());
427 #elif defined(HAS_AVX2)
428 return sum_floats_avx2(floats.data(), floats.size());
429 #elif defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
430 return sum_floats_sse(floats.data(), floats.size());
431 #elif defined(HAS_ARM_NEON)
432 return sum_floats_neon(floats.data(), floats.size());
442 for (
const auto& val : values) {
443 if (
auto* d = std::get_if<double>(&val)) {
452 std::vector<float> floats;
453 floats.reserve(values.size());
455 for (
const auto& val : values) {
456 if (
auto* f = std::get_if<float>(&val)) {
457 floats.push_back(*f);
461 if (floats.empty())
return std::nullopt;
463 #if defined(HAS_AVX512)
464 return min_float_avx512(floats.data(), floats.size());
465 #elif defined(HAS_AVX2)
466 return min_float_avx2(floats.data(), floats.size());
467 #elif defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
468 return min_float_sse(floats.data(), floats.size());
469 #elif defined(HAS_ARM_NEON)
470 return min_float_neon(floats.data(), floats.size());
478 std::vector<float> floats;
479 floats.reserve(values.size());
481 for (
const auto& val : values) {
482 if (
auto* f = std::get_if<float>(&val)) {
483 floats.push_back(*f);
487 if (floats.empty())
return std::nullopt;
489 #if defined(HAS_AVX512)
490 return max_float_avx512(floats.data(), floats.size());
491 #elif defined(HAS_AVX2)
492 return max_float_avx2(floats.data(), floats.size());
493 #elif defined(HAS_X86_SIMD) && (defined(HAS_SSE2) || defined(HAS_SSE42))
494 return max_float_sse(floats.data(), floats.size());
495 #elif defined(HAS_ARM_NEON)
496 return max_float_neon(floats.data(), floats.size());
503 const std::vector<ValueVariant>& values,
float target)
505 std::vector<size_t> indices;
507 for (
size_t i = 0; i < values.size(); ++i) {
508 if (
auto* f = std::get_if<float>(&values[i])) {
510 indices.push_back(i);
521 std::memcpy(dst, src, size);
526 return std::memcmp(a, b, size) == 0;
532 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
533 #if defined(__GNUC__) || defined(__clang__)
534 unsigned int eax, ebx, ecx, edx;
535 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
536 return (edx & (1 << 26)) != 0;
545 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
546 #if defined(__GNUC__) || defined(__clang__)
547 unsigned int eax, ebx, ecx, edx;
548 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
549 return (ecx & (1 << 20)) != 0;
558 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
559 #if defined(__GNUC__) || defined(__clang__)
560 unsigned int eax, ebx, ecx, edx;
561 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
562 return (ebx & (1 << 5)) != 0;
571 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
572 #if defined(__GNUC__) || defined(__clang__)
573 unsigned int eax, ebx, ecx, edx;
574 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
575 return (ebx & (1 << 16)) != 0;
584 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
585 #if defined(__GNUC__) || defined(__clang__)
586 unsigned int eax, ebx, ecx, edx;
587 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
588 return (ebx & (1 << 17)) != 0;
597 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
598 #if defined(__GNUC__) || defined(__clang__)
599 unsigned int eax, ebx, ecx, edx;
600 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
601 return (ebx & (1 << 30)) != 0;
610 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
611 #if defined(__GNUC__) || defined(__clang__)
612 unsigned int eax, ebx, ecx, edx;
613 if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
614 return (ebx & (1 << 31)) != 0;
623 #if defined(HAS_ARM_NEON)
635 #if defined(HAS_ARM_NEON)
644 std::string info =
"SIMD Support: ";
646 #if defined(HAS_AVX512)
648 #elif defined(HAS_AVX2)
650 #elif defined(HAS_SSE42)
652 #elif defined(HAS_SSE2)
654 #elif defined(HAS_ARM_NEON)
661 info +=
"(Compile-time), Runtime: ";
SIMD processor for vectorized operations on container values.
static bool fast_compare(const void *a, const void *b, size_t size)
Fast memory comparison using SIMD.
static double sum_doubles(const std::vector< ValueVariant > &values)
Sum all double values in a container using SIMD.
static void fast_copy(const void *src, void *dst, size_t size)
Fast memory copy using SIMD.
static float sum_floats_scalar(const float *data, size_t count)
static float sum_floats(const std::vector< ValueVariant > &values)
Sum all float values in a container using SIMD.
static float max_float_scalar(const float *data, size_t count)
static float min_float_scalar(const float *data, size_t count)
static std::optional< float > max_float(const std::vector< ValueVariant > &values)
Find maximum float value using SIMD.
static std::optional< float > min_float(const std::vector< ValueVariant > &values)
Find minimum float value using SIMD.
static std::vector< size_t > find_equal_floats(const std::vector< ValueVariant > &values, float target)
Vectorized comparison - find all values equal to target.
static bool has_avx512vl()
static bool has_avx512dq()
static bool has_avx512bw()
static simd_level get_best_simd_level()
Get the best available SIMD instruction set level.
static bool has_avx512f()
static std::string get_simd_info()
Get a string describing available SIMD features.
static size_t get_optimal_width()
Get the optimal SIMD width for current platform.
simd_level
SIMD instruction set level enumeration.