25#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
27 #include <immintrin.h>
28 #define SIMD_AVX2_AVAILABLE 1
29 #elif defined(__SSE4_1__)
30 #include <smmintrin.h>
31 #define SIMD_SSE4_AVAILABLE 1
32 #elif defined(__SSE2__)
33 #include <emmintrin.h>
34 #define SIMD_SSE2_AVAILABLE 1
36#elif defined(__aarch64__) || defined(_M_ARM64)
38 #define SIMD_NEON_AVAILABLE 1
60#if defined(SIMD_AVX2_AVAILABLE)
65#elif defined(SIMD_SSE4_AVAILABLE)
68#elif defined(SIMD_SSE2_AVAILABLE)
70#elif defined(SIMD_NEON_AVAILABLE)
134 if (
this != &
other) {
150 if (
this != &
other) {
168 return (
static_cast<double>(
simd_operations.load()) /
static_cast<double>(total)) * 100.0;
209 common::Result<double>
sum(
const std::vector<double>& data) {
227 return common::ok(result);
235 common::Result<double>
mean(
const std::vector<double>& data) {
240 auto sum_result =
sum(data);
241 if (sum_result.is_err()) {
245 return common::ok(sum_result.value() /
static_cast<double>(data.size()));
253 common::Result<double>
min(
const std::vector<double>& data) {
271 return common::ok(result);
279 common::Result<double>
max(
const std::vector<double>& data) {
297 return common::ok(result);
305 common::Result<double>
variance(
const std::vector<double>& data) {
310 if (data.size() == 1) {
311 return common::ok(0.0);
314 auto mean_result =
mean(data);
315 if (mean_result.is_err()) {
319 double data_mean = mean_result.value();
320 double sum_sq_diff = 0.0;
322 for (
const auto& val : data) {
323 double diff = val - data_mean;
324 sum_sq_diff += diff * diff;
327 return common::ok(sum_sq_diff /
static_cast<double>(data.size() - 1));
335 common::Result<statistical_summary>
compute_summary(
const std::vector<double>& data) {
344 auto sum_result =
sum(data);
345 if (sum_result.is_err()) {
348 summary.sum = sum_result.value();
352 auto min_result =
min(data);
353 auto max_result =
max(data);
355 if (min_result.is_err() || max_result.is_err()) {
359 summary.min_val = min_result.value();
360 summary.max_val = max_result.value();
365 if (var_result.is_ok()) {
366 summary.variance = var_result.value();
388 std::vector<double> test_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
391 auto sum_result =
sum(test_data);
392 if (sum_result.is_err()) {
393 return common::ok(
false);
396 if (std::abs(sum_result.value() - 36.0) > 1e-10) {
397 return common::ok(
false);
401 auto mean_result =
mean(test_data);
402 if (mean_result.is_err()) {
403 return common::ok(
false);
406 if (std::abs(mean_result.value() - 4.5) > 1e-10) {
407 return common::ok(
false);
411 auto min_result =
min(test_data);
412 auto max_result =
max(test_data);
414 if (min_result.is_err() || max_result.is_err()) {
415 return common::ok(
false);
418 if (std::abs(min_result.value() - 1.0) > 1e-10 ||
419 std::abs(max_result.value() - 8.0) > 1e-10) {
420 return common::ok(
false);
423 return common::ok(
true);
459 return std::accumulate(data.begin(), data.end(), 0.0);
462 double sum_simd(
const std::vector<double>& data)
const {
463#if defined(SIMD_AVX2_AVAILABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) && defined(__AVX2__)
464 const size_t simd_width = 4;
465 size_t simd_count = data.size() / simd_width;
467 __m256d sum_vec = _mm256_setzero_pd();
469 for (
size_t i = 0; i < simd_count; ++i) {
470 __m256d vec = _mm256_loadu_pd(&data[i * simd_width]);
471 sum_vec = _mm256_add_pd(sum_vec, vec);
475 alignas(32)
double temp[4];
476 _mm256_storeu_pd(temp, sum_vec);
477 double result = temp[0] + temp[1] + temp[2] + temp[3];
480 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
485#elif defined(SIMD_SSE2_AVAILABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86))
486 const size_t simd_width = 2;
487 size_t simd_count = data.size() / simd_width;
489 __m128d sum_vec = _mm_setzero_pd();
491 for (
size_t i = 0; i < simd_count; ++i) {
492 __m128d vec = _mm_loadu_pd(&data[i * simd_width]);
493 sum_vec = _mm_add_pd(sum_vec, vec);
496 alignas(16)
double temp[2];
497 _mm_storeu_pd(temp, sum_vec);
498 double result = temp[0] + temp[1];
500 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
505#elif defined(SIMD_NEON_AVAILABLE) && (defined(__aarch64__) || defined(_M_ARM64))
506 const size_t simd_width = 2;
507 size_t simd_count = data.size() / simd_width;
509 float64x2_t sum_vec = vdupq_n_f64(0.0);
511 for (
size_t i = 0; i < simd_count; ++i) {
512 float64x2_t vec = vld1q_f64(&data[i * simd_width]);
513 sum_vec = vaddq_f64(sum_vec, vec);
516 double result = vgetq_lane_f64(sum_vec, 0) + vgetq_lane_f64(sum_vec, 1);
518 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
529 return *std::min_element(data.begin(), data.end());
532 double min_simd(
const std::vector<double>& data)
const {
533#if defined(SIMD_AVX2_AVAILABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) && defined(__AVX2__)
534 const size_t simd_width = 4;
535 size_t simd_count = data.size() / simd_width;
537 __m256d min_vec = _mm256_set1_pd(std::numeric_limits<double>::max());
539 for (
size_t i = 0; i < simd_count; ++i) {
540 __m256d vec = _mm256_loadu_pd(&data[i * simd_width]);
541 min_vec = _mm256_min_pd(min_vec, vec);
544 alignas(32)
double temp[4];
545 _mm256_storeu_pd(temp, min_vec);
546 double result = std::min({temp[0], temp[1], temp[2], temp[3]});
548 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
549 result = std::min(result, data[i]);
553#elif defined(SIMD_SSE2_AVAILABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86))
554 const size_t simd_width = 2;
555 size_t simd_count = data.size() / simd_width;
557 __m128d min_vec = _mm_set1_pd(std::numeric_limits<double>::max());
559 for (
size_t i = 0; i < simd_count; ++i) {
560 __m128d vec = _mm_loadu_pd(&data[i * simd_width]);
561 min_vec = _mm_min_pd(min_vec, vec);
564 alignas(16)
double temp[2];
565 _mm_storeu_pd(temp, min_vec);
566 double result = std::min(temp[0], temp[1]);
568 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
569 result = std::min(result, data[i]);
573#elif defined(SIMD_NEON_AVAILABLE) && (defined(__aarch64__) || defined(_M_ARM64))
574 const size_t simd_width = 2;
575 size_t simd_count = data.size() / simd_width;
577 float64x2_t min_vec = vdupq_n_f64(std::numeric_limits<double>::max());
579 for (
size_t i = 0; i < simd_count; ++i) {
580 float64x2_t vec = vld1q_f64(&data[i * simd_width]);
581 min_vec = vminq_f64(min_vec, vec);
584 double result = std::min(vgetq_lane_f64(min_vec, 0), vgetq_lane_f64(min_vec, 1));
586 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
587 result = std::min(result, data[i]);
597 return *std::max_element(data.begin(), data.end());
600 double max_simd(
const std::vector<double>& data)
const {
601#if defined(SIMD_AVX2_AVAILABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)) && defined(__AVX2__)
602 const size_t simd_width = 4;
603 size_t simd_count = data.size() / simd_width;
605 __m256d max_vec = _mm256_set1_pd(std::numeric_limits<double>::lowest());
607 for (
size_t i = 0; i < simd_count; ++i) {
608 __m256d vec = _mm256_loadu_pd(&data[i * simd_width]);
609 max_vec = _mm256_max_pd(max_vec, vec);
612 alignas(32)
double temp[4];
613 _mm256_storeu_pd(temp, max_vec);
614 double result = std::max({temp[0], temp[1], temp[2], temp[3]});
616 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
617 result = std::max(result, data[i]);
621#elif defined(SIMD_SSE2_AVAILABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86))
622 const size_t simd_width = 2;
623 size_t simd_count = data.size() / simd_width;
625 __m128d max_vec = _mm_set1_pd(std::numeric_limits<double>::lowest());
627 for (
size_t i = 0; i < simd_count; ++i) {
628 __m128d vec = _mm_loadu_pd(&data[i * simd_width]);
629 max_vec = _mm_max_pd(max_vec, vec);
632 alignas(16)
double temp[2];
633 _mm_storeu_pd(temp, max_vec);
634 double result = std::max(temp[0], temp[1]);
636 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
637 result = std::max(result, data[i]);
641#elif defined(SIMD_NEON_AVAILABLE) && (defined(__aarch64__) || defined(_M_ARM64))
642 const size_t simd_width = 2;
643 size_t simd_count = data.size() / simd_width;
645 float64x2_t max_vec = vdupq_n_f64(std::numeric_limits<double>::lowest());
647 for (
size_t i = 0; i < simd_count; ++i) {
648 float64x2_t vec = vld1q_f64(&data[i * simd_width]);
649 max_vec = vmaxq_f64(max_vec, vec);
652 double result = std::max(vgetq_lane_f64(max_vec, 0), vgetq_lane_f64(max_vec, 1));
654 for (
size_t i = simd_count * simd_width; i < data.size(); ++i) {
655 result = std::max(result, data[i]);
674 return std::make_unique<simd_aggregator>();
683 return std::make_unique<simd_aggregator>(config);
693 {.enable_simd =
true, .vector_size = 8, .alignment = 32, .use_fma =
true},
695 {.enable_simd =
false, .vector_size = 8, .alignment = 32, .use_fma =
false},
697 {.enable_simd =
true, .vector_size = 4, .alignment = 16, .use_fma =
true},
699 {.enable_simd =
true, .vector_size = 16, .alignment = 64, .use_fma =
true}
SIMD-accelerated statistical aggregator.
simd_aggregator()
Default constructor with default configuration.
void reset_statistics()
Reset statistics.
common::Result< statistical_summary > compute_summary(const std::vector< double > &data)
Compute full statistical summary.
double min_scalar(const std::vector< double > &data) const
bool should_use_simd(size_t data_size) const
const simd_aggregator_statistics & get_statistics() const
Get aggregator statistics.
double max_scalar(const std::vector< double > &data) const
common::Result< double > mean(const std::vector< double > &data)
Calculate mean of elements.
double max_simd(const std::vector< double > &data) const
simd_capabilities capabilities_
common::Result< bool > test_simd()
Self-test SIMD functionality.
common::Result< double > sum(const std::vector< double > &data)
Calculate sum of elements.
common::Result< double > variance(const std::vector< double > &data)
Calculate variance of elements.
double min_simd(const std::vector< double > &data) const
simd_aggregator(const simd_config &config)
Construct with configuration.
simd_aggregator_statistics stats_
double sum_simd(const std::vector< double > &data) const
const simd_capabilities & get_capabilities() const
Get SIMD capabilities.
common::Result< double > min(const std::vector< double > &data)
Find minimum value.
double sum_scalar(const std::vector< double > &data) const
common::Result< double > max(const std::vector< double > &data)
Find maximum value.
summary
Pre-calculated quantiles and count/sum.
std::unique_ptr< simd_aggregator > make_simd_aggregator()
Create a SIMD aggregator with default configuration.
std::vector< simd_config > create_default_simd_configs()
Create default SIMD configurations for different use cases.
Result pattern type definitions for monitoring system.
Extended error information with context.
Statistics for SIMD aggregator operations.
simd_aggregator_statistics(simd_aggregator_statistics &&other) noexcept
simd_aggregator_statistics(const simd_aggregator_statistics &other)
simd_aggregator_statistics()=default
simd_aggregator_statistics & operator=(simd_aggregator_statistics &&other) noexcept
std::atomic< size_t > scalar_operations
std::atomic< size_t > simd_operations
void reset()
Reset all statistics.
simd_aggregator_statistics & operator=(const simd_aggregator_statistics &other)
double get_simd_utilization() const
Get SIMD utilization rate.
std::atomic< size_t > total_operations
std::atomic< size_t > total_elements_processed
SIMD capabilities detection.
static simd_capabilities detect()
Detect available SIMD features at runtime.
Configuration for SIMD aggregator.
bool validate() const
Validate configuration.
bool enable_simd
Enable SIMD acceleration.
size_t alignment
Memory alignment for SIMD operations.
size_t vector_size
SIMD vector width for processing.
bool use_fma
Use fused multiply-add if available.
Statistical summary result.