// Compile-time contract that a SIMD policy type T has to satisfy.
// The requirements are checked against a `const T&`, so the member functions
// listed below must be callable on a const policy object with a
// `const float*` buffer and a `size_t` element count.
// NOTE(review): no sum_doubles requirement is visible in this requires-block,
// although simd_ops forwards to policy_.sum_doubles — confirm whether that
// omission is intentional.
91concept SimdPolicy =
requires(
const T& policy,
const float* data,
size_t count) {
// A static name() whose result converts to std::string_view.
92 { T::name() } -> std::convertible_to<std::string_view>;
// A static simd_width member whose value converts to size_t.
93 { T::simd_width } -> std::convertible_to<size_t>;
// The three float reductions must return exactly `float` (same_as, not merely convertible).
94 { policy.sum_floats(data, count) } -> std::same_as<float>;
95 { policy.min_float(data, count) } -> std::same_as<float>;
96 { policy.max_float(data, count) } -> std::same_as<float>;
// Identifier of this policy; always the literal "scalar".
// Static, constexpr and noexcept, so it is usable in constant expressions.
110 static constexpr std::string_view
name() noexcept {
return "scalar"; }
// Scalar (non-intrinsic) float sum over `count` elements starting at `data`.
// @param data  pointer to the first float to read
// @param count number of floats to visit
// @return float; [[nodiscard]] makes discarding the result a warning
// NOTE(review): presumably accumulates data[i] into a running total — confirm
// against the loop body.
113 [[nodiscard]]
float sum_floats(
const float* data,
size_t count)
const noexcept {
// Visits every index in [0, count); a zero count skips the loop entirely.
115 for (
size_t i = 0; i < count; ++i) {
// Scalar minimum over `count` floats starting at `data`.
// @param data  pointer to the first float to read
// @param count number of floats to examine
// @return float; std::numeric_limits<float>::max() when count == 0
121 [[nodiscard]]
float min_float(
const float* data,
size_t count)
const noexcept {
// Empty input: return the largest finite float as a sentinel and never touch data.
122 if (count == 0)
return std::numeric_limits<float>::max();
// Seed the running minimum with the first element, then scan from index 1.
123 float min_val = data[0];
124 for (
size_t i = 1; i < count; ++i) {
// Strict `<`: a NaN element never compares smaller, so it is never selected here.
// NOTE(review): presumably this branch assigns data[i] to min_val — confirm.
125 if (data[i] < min_val) {
// Scalar maximum over `count` floats starting at `data`.
// @param data  pointer to the first float to read
// @param count number of floats to examine
// @return float; std::numeric_limits<float>::lowest() when count == 0
132 [[nodiscard]]
float max_float(
const float* data,
size_t count)
const noexcept {
// Empty input: return the most negative finite float as a sentinel and never touch data.
133 if (count == 0)
return std::numeric_limits<float>::lowest();
// Seed the running maximum with the first element, then scan from index 1.
134 float max_val = data[0];
135 for (
size_t i = 1; i < count; ++i) {
// Strict `>`: a NaN element never compares greater, so it is never selected here.
// NOTE(review): presumably this branch assigns data[i] to max_val — confirm.
136 if (data[i] > max_val) {
// Scalar (non-intrinsic) double-precision sum over `count` elements starting at `data`.
// @param data  pointer to the first double to read
// @param count number of doubles to visit
// @return double; [[nodiscard]] makes discarding the result a warning
// NOTE(review): presumably accumulates data[i] into a running total — confirm
// against the loop body.
143 [[nodiscard]]
double sum_doubles(
const double* data,
size_t count)
const noexcept {
// Visits every index in [0, count); a zero count skips the loop entirely.
145 for (
size_t i = 0; i < count; ++i) {
156#if defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
// SSE policy: float reductions done four lanes at a time in 128-bit __m128
// registers, with a scalar loop for the leftover (count % 4) elements.
160struct sse_simd_policy {
// Identifier of this policy; always the literal "sse".
161 static constexpr std::string_view name() noexcept {
return "sse"; }
// Number of float lanes processed per vector operation.
162 static constexpr size_t simd_width = 4;
// sum_floats: vectorised float sum.
// On GCC/Clang the function is compiled for the "sse3" target because the
// horizontal add (_mm_hadd_ps) used for the final reduction is an SSE3
// instruction.
// NOTE(review): the enclosing guard also enables this struct when only
// CONTAINER_HAS_SSE2 is defined — confirm this policy is never selected on a
// CPU without SSE3.
165#if defined(__GNUC__) || defined(__clang__)
166 __attribute__((target(
"sse3")))
168 float sum_floats(
const float* data,
size_t count)
const noexcept {
// Four partial sums, all starting at zero.
169 __m128 sum_vec = _mm_setzero_ps();
// Largest multiple of 4 that does not exceed count; the vector loop stops there.
170 size_t simd_end = count - (count % 4);
172 for (
size_t i = 0; i < simd_end; i += 4) {
// Unaligned load, so `data` needs no particular alignment.
173 __m128 vec = _mm_loadu_ps(&data[i]);
174 sum_vec = _mm_add_ps(sum_vec, vec);
// Two horizontal adds collapse the four partial sums into one total,
// which ends up replicated in every lane.
178 sum_vec = _mm_hadd_ps(sum_vec, sum_vec);
179 sum_vec = _mm_hadd_ps(sum_vec, sum_vec);
// Extract lane 0 as the scalar running total.
181 float sum = _mm_cvtss_f32(sum_vec);
// Scalar tail over the remaining count % 4 elements.
184 for (
size_t i = simd_end; i < count; ++i) {
// min_float: vectorised minimum; compiled for the "sse2" target on GCC/Clang.
// Returns std::numeric_limits<float>::max() for empty input.
// NOTE(review): _mm_min_ps returns its second operand when either input is
// NaN, so NaN-containing input may give a different result than a plain
// `<` scan — confirm that is acceptable.
192#if defined(__GNUC__) || defined(__clang__)
193 __attribute__((target(
"sse2")))
195 float min_float(
const float* data,
size_t count)
const noexcept {
196 if (count == 0)
return std::numeric_limits<float>::max();
// Every lane starts at the largest finite float so any real element replaces it.
198 __m128 min_vec = _mm_set1_ps(std::numeric_limits<float>::max());
199 size_t simd_end = count - (count % 4);
201 for (
size_t i = 0; i < simd_end; i += 4) {
202 __m128 vec = _mm_loadu_ps(&data[i]);
// Lane-wise minimum of the running vector and the freshly loaded one.
203 min_vec = _mm_min_ps(min_vec, vec);
// _mm_store_ps is an aligned store, hence the 16-byte aligned scratch array.
207 alignas(16)
float result[4];
208 _mm_store_ps(result, min_vec);
// Reduce the four lane minima to a single value.
209 float min_val = result[0];
210 for (
int i = 1; i < 4; ++i) {
211 if (result[i] < min_val) min_val = result[i];
// Scalar tail over the remaining count % 4 elements.
215 for (
size_t i = simd_end; i < count; ++i) {
216 if (data[i] < min_val) min_val = data[i];
// max_float: vectorised maximum; compiled for the "sse2" target on GCC/Clang.
// Returns std::numeric_limits<float>::lowest() for empty input.
// NOTE(review): _mm_max_ps has the same second-operand-on-NaN behaviour as
// _mm_min_ps — confirm NaN handling is acceptable.
223#if defined(__GNUC__) || defined(__clang__)
224 __attribute__((target(
"sse2")))
226 float max_float(
const float* data,
size_t count)
const noexcept {
227 if (count == 0)
return std::numeric_limits<float>::lowest();
// Every lane starts at the most negative finite float so any real element replaces it.
229 __m128 max_vec = _mm_set1_ps(std::numeric_limits<float>::lowest());
230 size_t simd_end = count - (count % 4);
232 for (
size_t i = 0; i < simd_end; i += 4) {
233 __m128 vec = _mm_loadu_ps(&data[i]);
// Lane-wise maximum of the running vector and the freshly loaded one.
234 max_vec = _mm_max_ps(max_vec, vec);
// Aligned store into a 16-byte aligned scratch array.
238 alignas(16)
float result[4];
239 _mm_store_ps(result, max_vec);
// Reduce the four lane maxima to a single value.
240 float max_val = result[0];
241 for (
int i = 1; i < 4; ++i) {
242 if (result[i] > max_val) max_val = result[i];
// Scalar tail over the remaining count % 4 elements.
246 for (
size_t i = simd_end; i < count; ++i) {
247 if (data[i] > max_val) max_val = data[i];
// sum_doubles: double-precision sum. No intrinsics appear in the visible
// lines; the loop walks every index in [0, count) one element at a time.
// NOTE(review): presumably accumulates data[i] — confirm against the loop body.
253 [[nodiscard]]
double sum_doubles(
const double* data,
size_t count)
const noexcept {
256 for (
size_t i = 0; i < count; ++i) {
268#if defined(CONTAINER_HAS_AVX2)
// AVX2 policy: float reductions done eight lanes at a time in 256-bit __m256
// registers, with a scalar loop for the leftover (count % 8) elements.
272struct avx2_simd_policy {
// Identifier of this policy; always the literal "avx2".
273 static constexpr std::string_view name() noexcept {
return "avx2"; }
// Number of float lanes processed per vector operation.
274 static constexpr size_t simd_width = 8;
// sum_floats: vectorised float sum; compiled for the "avx2" target on GCC/Clang.
277#if defined(__GNUC__) || defined(__clang__)
278 __attribute__((target(
"avx2")))
280 float sum_floats(
const float* data,
size_t count)
const noexcept {
// Eight partial sums, all starting at zero.
281 __m256 sum_vec = _mm256_setzero_ps();
// Largest multiple of 8 that does not exceed count; the vector loop stops there.
282 size_t simd_end = count - (count % 8);
284 for (
size_t i = 0; i < simd_end; i += 8) {
// Unaligned load, so `data` needs no particular alignment.
285 __m256 vec = _mm256_loadu_ps(&data[i]);
286 sum_vec = _mm256_add_ps(sum_vec, vec);
// Fold the 256-bit accumulator in half: lower 128 bits + upper 128 bits.
290 __m128 low = _mm256_castps256_ps128(sum_vec);
291 __m128 high = _mm256_extractf128_ps(sum_vec, 1);
292 __m128 sum128 = _mm_add_ps(low, high);
// Two horizontal adds collapse the remaining four partial sums into one total.
293 sum128 = _mm_hadd_ps(sum128, sum128);
294 sum128 = _mm_hadd_ps(sum128, sum128);
// Extract lane 0 as the scalar running total.
296 float sum = _mm_cvtss_f32(sum128);
// Scalar tail over the remaining count % 8 elements.
299 for (
size_t i = simd_end; i < count; ++i) {
// min_float: vectorised minimum; compiled for the "avx2" target on GCC/Clang.
// Returns std::numeric_limits<float>::max() for empty input.
307#if defined(__GNUC__) || defined(__clang__)
308 __attribute__((target(
"avx2")))
310 float min_float(
const float* data,
size_t count)
const noexcept {
311 if (count == 0)
return std::numeric_limits<float>::max();
// Every lane starts at the largest finite float so any real element replaces it.
313 __m256 min_vec = _mm256_set1_ps(std::numeric_limits<float>::max());
314 size_t simd_end = count - (count % 8);
316 for (
size_t i = 0; i < simd_end; i += 8) {
317 __m256 vec = _mm256_loadu_ps(&data[i]);
// Lane-wise minimum of the running vector and the freshly loaded one.
318 min_vec = _mm256_min_ps(min_vec, vec);
// _mm256_store_ps is an aligned store, hence the 32-byte aligned scratch array.
322 alignas(32)
float result[8];
323 _mm256_store_ps(result, min_vec);
// Reduce the eight lane minima to a single value.
324 float min_val = result[0];
325 for (
int i = 1; i < 8; ++i) {
326 if (result[i] < min_val) min_val = result[i];
// Scalar tail over the remaining count % 8 elements.
330 for (
size_t i = simd_end; i < count; ++i) {
331 if (data[i] < min_val) min_val = data[i];
// max_float: vectorised maximum; compiled for the "avx2" target on GCC/Clang.
// Returns std::numeric_limits<float>::lowest() for empty input.
338#if defined(__GNUC__) || defined(__clang__)
339 __attribute__((target(
"avx2")))
341 float max_float(
const float* data,
size_t count)
const noexcept {
342 if (count == 0)
return std::numeric_limits<float>::lowest();
// Every lane starts at the most negative finite float so any real element replaces it.
344 __m256 max_vec = _mm256_set1_ps(std::numeric_limits<float>::lowest());
345 size_t simd_end = count - (count % 8);
347 for (
size_t i = 0; i < simd_end; i += 8) {
348 __m256 vec = _mm256_loadu_ps(&data[i]);
// Lane-wise maximum of the running vector and the freshly loaded one.
349 max_vec = _mm256_max_ps(max_vec, vec);
// Aligned store into a 32-byte aligned scratch array.
353 alignas(32)
float result[8];
354 _mm256_store_ps(result, max_vec);
// Reduce the eight lane maxima to a single value.
355 float max_val = result[0];
356 for (
int i = 1; i < 8; ++i) {
357 if (result[i] > max_val) max_val = result[i];
// Scalar tail over the remaining count % 8 elements.
361 for (
size_t i = simd_end; i < count; ++i) {
362 if (data[i] > max_val) max_val = data[i];
// sum_doubles: double-precision sum. No intrinsics appear in the visible
// lines; the loop walks every index in [0, count) one element at a time.
// NOTE(review): presumably accumulates data[i] — confirm against the loop body.
368 [[nodiscard]]
double sum_doubles(
const double* data,
size_t count)
const noexcept {
370 for (
size_t i = 0; i < count; ++i) {
382#if defined(CONTAINER_HAS_AVX512)
// AVX-512 policy: float reductions done sixteen lanes at a time in 512-bit
// __m512 registers (eight lanes of double for sum_doubles), with a scalar
// loop for the leftover elements.
386struct avx512_simd_policy {
// Identifier of this policy; always the literal "avx512".
387 static constexpr std::string_view name() noexcept {
return "avx512"; }
// Number of float lanes processed per vector operation.
388 static constexpr size_t simd_width = 16;
// sum_floats: vectorised float sum; compiled for the "avx512f" target on GCC/Clang.
391#if defined(__GNUC__) || defined(__clang__)
392 __attribute__((target(
"avx512f")))
394 float sum_floats(
const float* data,
size_t count)
const noexcept {
// Sixteen partial sums, all starting at zero.
395 __m512 sum_vec = _mm512_setzero_ps();
// Largest multiple of 16 that does not exceed count; the vector loop stops there.
396 size_t simd_end = count - (count % 16);
398 for (
size_t i = 0; i < simd_end; i += 16) {
// Unaligned load, so `data` needs no particular alignment.
399 __m512 vec = _mm512_loadu_ps(&data[i]);
400 sum_vec = _mm512_add_ps(sum_vec, vec);
// Horizontal reduction of all sixteen lanes to one scalar.
403 float sum = _mm512_reduce_add_ps(sum_vec);
// Scalar tail over the remaining count % 16 elements.
405 for (
size_t i = simd_end; i < count; ++i) {
// min_float: vectorised minimum; compiled for the "avx512f" target on GCC/Clang.
// Returns std::numeric_limits<float>::max() for empty input.
413#if defined(__GNUC__) || defined(__clang__)
414 __attribute__((target(
"avx512f")))
416 float min_float(
const float* data,
size_t count)
const noexcept {
417 if (count == 0)
return std::numeric_limits<float>::max();
// Every lane starts at the largest finite float so any real element replaces it.
419 __m512 min_vec = _mm512_set1_ps(std::numeric_limits<float>::max());
420 size_t simd_end = count - (count % 16);
422 for (
size_t i = 0; i < simd_end; i += 16) {
423 __m512 vec = _mm512_loadu_ps(&data[i]);
// Lane-wise minimum of the running vector and the freshly loaded one.
424 min_vec = _mm512_min_ps(min_vec, vec);
// Horizontal reduction: smallest of the sixteen lane minima.
427 float min_val = _mm512_reduce_min_ps(min_vec);
// Scalar tail over the remaining count % 16 elements.
429 for (
size_t i = simd_end; i < count; ++i) {
430 if (data[i] < min_val) min_val = data[i];
// max_float: vectorised maximum; compiled for the "avx512f" target on GCC/Clang.
// Returns std::numeric_limits<float>::lowest() for empty input.
437#if defined(__GNUC__) || defined(__clang__)
438 __attribute__((target(
"avx512f")))
440 float max_float(
const float* data,
size_t count)
const noexcept {
441 if (count == 0)
return std::numeric_limits<float>::lowest();
// Every lane starts at the most negative finite float so any real element replaces it.
443 __m512 max_vec = _mm512_set1_ps(std::numeric_limits<float>::lowest());
444 size_t simd_end = count - (count % 16);
446 for (
size_t i = 0; i < simd_end; i += 16) {
447 __m512 vec = _mm512_loadu_ps(&data[i]);
// Lane-wise maximum of the running vector and the freshly loaded one.
448 max_vec = _mm512_max_ps(max_vec, vec);
// Horizontal reduction: largest of the sixteen lane maxima.
451 float max_val = _mm512_reduce_max_ps(max_vec);
// Scalar tail over the remaining count % 16 elements.
453 for (
size_t i = simd_end; i < count; ++i) {
454 if (data[i] > max_val) max_val = data[i];
// sum_doubles: vectorised double-precision sum, eight doubles per 512-bit
// register; compiled for the "avx512f" target on GCC/Clang.
461#if defined(__GNUC__) || defined(__clang__)
462 __attribute__((target(
"avx512f")))
464 double sum_doubles(
const double* data,
size_t count)
const noexcept {
// Eight partial sums, all starting at zero.
465 __m512d sum_vec = _mm512_setzero_pd();
// Largest multiple of 8 that does not exceed count; the vector loop stops there.
466 size_t simd_end = count - (count % 8);
468 for (
size_t i = 0; i < simd_end; i += 8) {
// Unaligned load, so `data` needs no particular alignment.
469 __m512d vec = _mm512_loadu_pd(&data[i]);
470 sum_vec = _mm512_add_pd(sum_vec, vec);
// Horizontal reduction of all eight lanes to one scalar.
473 double sum = _mm512_reduce_add_pd(sum_vec);
// Scalar tail over the remaining count % 8 elements.
475 for (
size_t i = simd_end; i < count; ++i) {
488#if defined(CONTAINER_HAS_ARM_NEON)
// ARM NEON policy: float reductions done four lanes at a time in 128-bit
// float32x4_t registers, with a scalar loop for the leftover (count % 4)
// elements.
492struct neon_simd_policy {
// Identifier of this policy; always the literal "neon".
493 static constexpr std::string_view name() noexcept {
return "neon"; }
// Number of float lanes processed per vector operation.
494 static constexpr size_t simd_width = 4;
// sum_floats: vectorised float sum over `count` elements starting at `data`.
496 [[nodiscard]]
float sum_floats(
const float* data,
size_t count)
const noexcept {
// Four partial sums, all starting at zero.
497 float32x4_t sum_vec = vdupq_n_f32(0.0f);
// Largest multiple of 4 that does not exceed count; the vector loop stops there.
498 size_t simd_end = count - (count % 4);
500 for (
size_t i = 0; i < simd_end; i += 4) {
501 float32x4_t vec = vld1q_f32(&data[i]);
502 sum_vec = vaddq_f32(sum_vec, vec);
// Fold the four lanes to two (low half + high half), then add the two
// remaining lanes to get the scalar running total.
506 float32x2_t sum_low = vget_low_f32(sum_vec);
507 float32x2_t sum_high = vget_high_f32(sum_vec);
508 float32x2_t sum_pair = vadd_f32(sum_low, sum_high);
509 float sum = vget_lane_f32(sum_pair, 0) + vget_lane_f32(sum_pair, 1);
// Scalar tail over the remaining count % 4 elements.
511 for (
size_t i = simd_end; i < count; ++i) {
// min_float: vectorised minimum.
// Returns std::numeric_limits<float>::max() for empty input.
518 [[nodiscard]]
float min_float(
const float* data,
size_t count)
const noexcept {
519 if (count == 0)
return std::numeric_limits<float>::max();
// Every lane starts at the largest finite float so any real element replaces it.
521 float32x4_t min_vec = vdupq_n_f32(std::numeric_limits<float>::max());
522 size_t simd_end = count - (count % 4);
524 for (
size_t i = 0; i < simd_end; i += 4) {
525 float32x4_t vec = vld1q_f32(&data[i]);
// Lane-wise minimum of the running vector and the freshly loaded one.
526 min_vec = vminq_f32(min_vec, vec);
// Spill the four lane minima to `result` and reduce them with scalar compares.
531 vst1q_f32(result, min_vec);
532 float min_val = result[0];
533 for (
int i = 1; i < 4; ++i) {
534 if (result[i] < min_val) min_val = result[i];
// Scalar tail over the remaining count % 4 elements.
537 for (
size_t i = simd_end; i < count; ++i) {
538 if (data[i] < min_val) min_val = data[i];
// max_float: vectorised maximum.
// Returns std::numeric_limits<float>::lowest() for empty input.
544 [[nodiscard]]
float max_float(
const float* data,
size_t count)
const noexcept {
545 if (count == 0)
return std::numeric_limits<float>::lowest();
// Every lane starts at the most negative finite float so any real element replaces it.
547 float32x4_t max_vec = vdupq_n_f32(std::numeric_limits<float>::lowest());
548 size_t simd_end = count - (count % 4);
550 for (
size_t i = 0; i < simd_end; i += 4) {
551 float32x4_t vec = vld1q_f32(&data[i]);
// Lane-wise maximum of the running vector and the freshly loaded one.
552 max_vec = vmaxq_f32(max_vec, vec);
// Spill the four lane maxima to `result` and reduce them with scalar compares.
557 vst1q_f32(result, max_vec);
558 float max_val = result[0];
559 for (
int i = 1; i < 4; ++i) {
560 if (result[i] > max_val) max_val = result[i];
// Scalar tail over the remaining count % 4 elements.
563 for (
size_t i = simd_end; i < count; ++i) {
564 if (data[i] > max_val) max_val = data[i];
// sum_doubles: double-precision sum. No intrinsics appear in the visible
// lines; the loop walks every index in [0, count) one element at a time.
// NOTE(review): presumably accumulates data[i] — confirm against the loop body.
570 [[nodiscard]]
double sum_doubles(
const double* data,
size_t count)
const noexcept {
572 for (
size_t i = 0; i < count; ++i) {
594#if defined(CONTAINER_HAS_AVX512)
596#elif defined(CONTAINER_HAS_AVX2)
598#elif defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
600#elif defined(CONTAINER_HAS_ARM_NEON)
// Thin wrapper that owns a policy object and forwards each reduction to it.
// Policy must satisfy SimdPolicy and defaults to default_simd_policy.
620template<SimdPolicy Policy = default_simd_policy>
// Takes the policy by value (default-constructed when omitted) and moves it
// into the policy_ member; explicit, so no implicit conversion from Policy.
623 explicit simd_ops(Policy policy = Policy{})
noexcept
624 :
policy_(std::move(policy)) {}
// Forwards the policy's static name().
630 return Policy::name();
// Forwards the policy's static simd_width.
637 return Policy::simd_width;
// Sum of `count` floats starting at `data`, delegated to the stored policy.
643 [[nodiscard]]
float sum_floats(
const float* data,
size_t count)
const noexcept {
644 return policy_.sum_floats(data, count);
// Minimum of `count` floats starting at `data`, delegated to the stored policy.
650 [[nodiscard]]
float min_float(
const float* data,
size_t count)
const noexcept {
651 return policy_.min_float(data, count);
// Maximum of `count` floats starting at `data`, delegated to the stored policy.
657 [[nodiscard]]
float max_float(
const float* data,
size_t count)
const noexcept {
658 return policy_.max_float(data, count);
// Sum of `count` doubles starting at `data`, delegated to the stored policy.
// NOTE(review): the visible SimdPolicy requirements do not mention
// sum_doubles, so a conforming Policy without it would only fail when this
// member is instantiated — confirm that is intended.
664 [[nodiscard]]
double sum_doubles(
const double* data,
size_t count)
const noexcept {
665 return policy_.sum_doubles(data, count);
674#if defined(CONTAINER_HAS_SSE42) || defined(CONTAINER_HAS_SSE2)
677#if defined(CONTAINER_HAS_AVX2)
680#if defined(CONTAINER_HAS_AVX512)
683#if defined(CONTAINER_HAS_ARM_NEON)