PACS System 0.1.0
PACS DICOM system library
Loading...
Searching...
No Matches
simd_windowing.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
21#ifndef PACS_ENCODING_SIMD_WINDOWING_HPP
22#define PACS_ENCODING_SIMD_WINDOWING_HPP
23
24#include "simd_config.h"
25#include "simd_types.h"
26
27#include <algorithm>
28#include <cmath>
29#include <cstddef>
30#include <cstdint>
31#include <vector>
32
34
39 double center;
40 double width;
41 bool invert;
42
43 constexpr window_level_params(double c = 128.0, double w = 256.0,
44 bool inv = false) noexcept
45 : center(c), width(w), invert(inv) {}
46};
47
48// Forward declarations
49void apply_window_level_8bit(const uint8_t* src, uint8_t* dst,
50 size_t pixel_count,
51 const window_level_params& params) noexcept;
52void apply_window_level_16bit(const uint16_t* src, uint8_t* dst,
53 size_t pixel_count,
54 const window_level_params& params) noexcept;
55void apply_window_level_16bit_signed(const int16_t* src, uint8_t* dst,
56 size_t pixel_count,
57 const window_level_params& params) noexcept;
58
63public:
69 lut.lut_8bit_.resize(256);
70
71 const double min_val = params.center - params.width / 2.0;
72 const double scale = 255.0 / params.width;
73
74 for (int i = 0; i < 256; ++i) {
75 double val = (i - min_val) * scale;
76 val = std::clamp(val, 0.0, 255.0);
77 if (params.invert) {
78 val = 255.0 - val;
79 }
80 lut.lut_8bit_[i] = static_cast<uint8_t>(std::round(val));
81 }
82
83 return lut;
84 }
85
91 lut.lut_16bit_.resize(4096);
92
93 const double min_val = params.center - params.width / 2.0;
94 const double scale = 255.0 / params.width;
95
96 for (int i = 0; i < 4096; ++i) {
97 double val = (i - min_val) * scale;
98 val = std::clamp(val, 0.0, 255.0);
99 if (params.invert) {
100 val = 255.0 - val;
101 }
102 lut.lut_16bit_[i] = static_cast<uint8_t>(std::round(val));
103 }
104
105 return lut;
106 }
107
113 lut.lut_16bit_.resize(65536);
114
115 const double min_val = params.center - params.width / 2.0;
116 const double scale = 255.0 / params.width;
117
118 for (int i = 0; i < 65536; ++i) {
119 double val = (i - min_val) * scale;
120 val = std::clamp(val, 0.0, 255.0);
121 if (params.invert) {
122 val = 255.0 - val;
123 }
124 lut.lut_16bit_[i] = static_cast<uint8_t>(std::round(val));
125 }
126
127 return lut;
128 }
129
133 void apply_8bit(const uint8_t* src, uint8_t* dst,
134 size_t pixel_count) const noexcept {
135 for (size_t i = 0; i < pixel_count; ++i) {
136 dst[i] = lut_8bit_[src[i]];
137 }
138 }
139
143 void apply_16bit(const uint16_t* src, uint8_t* dst,
144 size_t pixel_count) const noexcept {
145 const size_t lut_size = lut_16bit_.size();
146 for (size_t i = 0; i < pixel_count; ++i) {
147 const uint16_t val = src[i];
148 if (val < lut_size) {
149 dst[i] = lut_16bit_[val];
150 } else {
151 dst[i] = lut_16bit_[lut_size - 1];
152 }
153 }
154 }
155
156 [[nodiscard]] bool is_valid_8bit() const noexcept {
157 return !lut_8bit_.empty();
158 }
159
160 [[nodiscard]] bool is_valid_16bit() const noexcept {
161 return !lut_16bit_.empty();
162 }
163
164private:
165 std::vector<uint8_t> lut_8bit_;
166 std::vector<uint8_t> lut_16bit_;
167};
168
169namespace detail {
170
171// ============================================================================
172// Scalar fallback implementations
173// ============================================================================
174
178inline void apply_window_level_8bit_scalar(const uint8_t* src, uint8_t* dst,
179 size_t pixel_count,
180 const window_level_params& params) noexcept {
181 const double min_val = params.center - params.width / 2.0;
182 const double scale = 255.0 / params.width;
183
184 for (size_t i = 0; i < pixel_count; ++i) {
185 double val = (src[i] - min_val) * scale;
186 val = std::clamp(val, 0.0, 255.0);
187 if (params.invert) {
188 val = 255.0 - val;
189 }
190 dst[i] = static_cast<uint8_t>(val);
191 }
192}
193
197inline void apply_window_level_16bit_scalar(const uint16_t* src, uint8_t* dst,
198 size_t pixel_count,
199 const window_level_params& params) noexcept {
200 const double min_val = params.center - params.width / 2.0;
201 const double scale = 255.0 / params.width;
202
203 for (size_t i = 0; i < pixel_count; ++i) {
204 double val = (src[i] - min_val) * scale;
205 val = std::clamp(val, 0.0, 255.0);
206 if (params.invert) {
207 val = 255.0 - val;
208 }
209 dst[i] = static_cast<uint8_t>(val);
210 }
211}
212
217 const int16_t* src, uint8_t* dst, size_t pixel_count,
218 const window_level_params& params) noexcept {
219 const double min_val = params.center - params.width / 2.0;
220 const double scale = 255.0 / params.width;
221
222 for (size_t i = 0; i < pixel_count; ++i) {
223 double val = (src[i] - min_val) * scale;
224 val = std::clamp(val, 0.0, 255.0);
225 if (params.invert) {
226 val = 255.0 - val;
227 }
228 dst[i] = static_cast<uint8_t>(val);
229 }
230}
231
232// ============================================================================
233// SSE2 implementations
234// ============================================================================
235
236#if defined(PACS_SIMD_SSE2)
237
244inline void apply_window_level_8bit_sse2(const uint8_t* src, uint8_t* dst,
245 size_t pixel_count,
246 const window_level_params& params) noexcept {
247 // Precompute fixed-point parameters (16.16 format)
248 const int32_t min_val_fp =
249 static_cast<int32_t>((params.center - params.width / 2.0) * 256);
250 const int32_t scale_fp =
251 static_cast<int32_t>((255.0 / params.width) * 256);
252
253 const __m128i min_vec = _mm_set1_epi16(static_cast<int16_t>(min_val_fp >> 8));
254 const __m128i scale_vec = _mm_set1_epi16(static_cast<int16_t>(scale_fp));
255 const __m128i zero = _mm_setzero_si128();
256 const __m128i max_255 = _mm_set1_epi16(255);
257 const __m128i all_ones = _mm_set1_epi8(static_cast<char>(0xFF));
258
259 const size_t simd_count = (pixel_count / 16) * 16;
260
261 size_t i = 0;
262 for (; i < simd_count; i += 16) {
263 // Load 16 pixels
264 __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
265
266 // Unpack to 16-bit (low 8 pixels)
267 __m128i pixels_lo = _mm_unpacklo_epi8(pixels, zero);
268 // Unpack to 16-bit (high 8 pixels)
269 __m128i pixels_hi = _mm_unpackhi_epi8(pixels, zero);
270
271 // Subtract minimum value
272 pixels_lo = _mm_sub_epi16(pixels_lo, min_vec);
273 pixels_hi = _mm_sub_epi16(pixels_hi, min_vec);
274
275 // Multiply by scale (fixed-point, keep high 16 bits)
276 pixels_lo = _mm_mulhi_epi16(pixels_lo, scale_vec);
277 pixels_hi = _mm_mulhi_epi16(pixels_hi, scale_vec);
278
279 // Clamp to [0, 255]
280 pixels_lo = _mm_max_epi16(_mm_min_epi16(pixels_lo, max_255), zero);
281 pixels_hi = _mm_max_epi16(_mm_min_epi16(pixels_hi, max_255), zero);
282
283 // Pack back to 8-bit
284 __m128i result = _mm_packus_epi16(pixels_lo, pixels_hi);
285
286 // Apply inversion if needed
287 if (params.invert) {
288 result = _mm_xor_si128(result, all_ones);
289 }
290
291 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), result);
292 }
293
294 // Handle remainder
295 apply_window_level_8bit_scalar(src + i, dst + i, pixel_count - i, params);
296}
297
302inline void apply_window_level_16bit_sse2(const uint16_t* src, uint8_t* dst,
303 size_t pixel_count,
304 const window_level_params& params) noexcept {
305 // For 16-bit input, we need 32-bit arithmetic
306 const float min_val = static_cast<float>(params.center - params.width / 2.0);
307 const float scale = 255.0f / static_cast<float>(params.width);
308
309 const __m128 min_vec = _mm_set1_ps(min_val);
310 const __m128 scale_vec = _mm_set1_ps(scale);
311 const __m128 zero_f = _mm_setzero_ps();
312 const __m128 max_255_f = _mm_set1_ps(255.0f);
313 const __m128i all_ones = _mm_set1_epi8(static_cast<char>(0xFF));
314
315 const size_t simd_count = (pixel_count / 8) * 8;
316
317 size_t i = 0;
318 for (; i < simd_count; i += 8) {
319 // Load 8 16-bit pixels
320 __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
321
322 // Convert to float (need to split into two 4-element vectors)
323 __m128i lo = _mm_unpacklo_epi16(pixels, _mm_setzero_si128());
324 __m128i hi = _mm_unpackhi_epi16(pixels, _mm_setzero_si128());
325
326 __m128 lo_f = _mm_cvtepi32_ps(lo);
327 __m128 hi_f = _mm_cvtepi32_ps(hi);
328
329 // Apply window/level: (pixel - min) * scale
330 lo_f = _mm_mul_ps(_mm_sub_ps(lo_f, min_vec), scale_vec);
331 hi_f = _mm_mul_ps(_mm_sub_ps(hi_f, min_vec), scale_vec);
332
333 // Clamp to [0, 255]
334 lo_f = _mm_max_ps(_mm_min_ps(lo_f, max_255_f), zero_f);
335 hi_f = _mm_max_ps(_mm_min_ps(hi_f, max_255_f), zero_f);
336
337 // Convert back to integer
338 __m128i lo_i = _mm_cvtps_epi32(lo_f);
339 __m128i hi_i = _mm_cvtps_epi32(hi_f);
340
341 // Pack to 16-bit then 8-bit
342 __m128i packed16 = _mm_packs_epi32(lo_i, hi_i);
343 __m128i packed8 = _mm_packus_epi16(packed16, packed16);
344
345 // Apply inversion if needed
346 if (params.invert) {
347 packed8 = _mm_xor_si128(packed8, all_ones);
348 }
349
350 // Store 8 bytes
351 _mm_storel_epi64(reinterpret_cast<__m128i*>(dst + i), packed8);
352 }
353
354 // Handle remainder
355 apply_window_level_16bit_scalar(src + i, dst + i, pixel_count - i, params);
356}
357
361inline void apply_window_level_16bit_signed_sse2(
362 const int16_t* src, uint8_t* dst, size_t pixel_count,
363 const window_level_params& params) noexcept {
364 const float min_val = static_cast<float>(params.center - params.width / 2.0);
365 const float scale = 255.0f / static_cast<float>(params.width);
366
367 const __m128 min_vec = _mm_set1_ps(min_val);
368 const __m128 scale_vec = _mm_set1_ps(scale);
369 const __m128 zero_f = _mm_setzero_ps();
370 const __m128 max_255_f = _mm_set1_ps(255.0f);
371 const __m128i all_ones = _mm_set1_epi8(static_cast<char>(0xFF));
372
373 const size_t simd_count = (pixel_count / 8) * 8;
374
375 size_t i = 0;
376 for (; i < simd_count; i += 8) {
377 __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
378
379 // Sign-extend to 32-bit
380 __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(pixels, pixels), 16);
381 __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(pixels, pixels), 16);
382
383 __m128 lo_f = _mm_cvtepi32_ps(lo);
384 __m128 hi_f = _mm_cvtepi32_ps(hi);
385
386 lo_f = _mm_mul_ps(_mm_sub_ps(lo_f, min_vec), scale_vec);
387 hi_f = _mm_mul_ps(_mm_sub_ps(hi_f, min_vec), scale_vec);
388
389 lo_f = _mm_max_ps(_mm_min_ps(lo_f, max_255_f), zero_f);
390 hi_f = _mm_max_ps(_mm_min_ps(hi_f, max_255_f), zero_f);
391
392 __m128i lo_i = _mm_cvtps_epi32(lo_f);
393 __m128i hi_i = _mm_cvtps_epi32(hi_f);
394
395 __m128i packed16 = _mm_packs_epi32(lo_i, hi_i);
396 __m128i packed8 = _mm_packus_epi16(packed16, packed16);
397
398 if (params.invert) {
399 packed8 = _mm_xor_si128(packed8, all_ones);
400 }
401
402 _mm_storel_epi64(reinterpret_cast<__m128i*>(dst + i), packed8);
403 }
404
405 apply_window_level_16bit_signed_scalar(src + i, dst + i, pixel_count - i, params);
406}
407
408#endif // PACS_SIMD_SSE2
409
410// ============================================================================
411// AVX2 implementations
412// ============================================================================
413
414#if defined(PACS_SIMD_AVX2)
415
420inline void apply_window_level_8bit_avx2(const uint8_t* src, uint8_t* dst,
421 size_t pixel_count,
422 const window_level_params& params) noexcept {
423 const int32_t min_val_fp =
424 static_cast<int32_t>((params.center - params.width / 2.0) * 256);
425 const int32_t scale_fp =
426 static_cast<int32_t>((255.0 / params.width) * 256);
427
428 const __m256i min_vec = _mm256_set1_epi16(static_cast<int16_t>(min_val_fp >> 8));
429 const __m256i scale_vec = _mm256_set1_epi16(static_cast<int16_t>(scale_fp));
430 const __m256i zero = _mm256_setzero_si256();
431 const __m256i max_255 = _mm256_set1_epi16(255);
432 const __m256i all_ones = _mm256_set1_epi8(static_cast<char>(0xFF));
433
434 const size_t simd_count = (pixel_count / 32) * 32;
435
436 size_t i = 0;
437 for (; i < simd_count; i += 32) {
438 __m256i pixels = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
439
440 // Unpack to 16-bit
441 __m256i pixels_lo = _mm256_unpacklo_epi8(pixels, zero);
442 __m256i pixels_hi = _mm256_unpackhi_epi8(pixels, zero);
443
444 // Subtract minimum and multiply by scale
445 pixels_lo = _mm256_sub_epi16(pixels_lo, min_vec);
446 pixels_hi = _mm256_sub_epi16(pixels_hi, min_vec);
447
448 pixels_lo = _mm256_mulhi_epi16(pixels_lo, scale_vec);
449 pixels_hi = _mm256_mulhi_epi16(pixels_hi, scale_vec);
450
451 // Clamp
452 pixels_lo = _mm256_max_epi16(_mm256_min_epi16(pixels_lo, max_255), zero);
453 pixels_hi = _mm256_max_epi16(_mm256_min_epi16(pixels_hi, max_255), zero);
454
455 // Pack back to 8-bit
456 __m256i result = _mm256_packus_epi16(pixels_lo, pixels_hi);
457
458 // Fix lane crossing issue with packus
459 result = _mm256_permute4x64_epi64(result, 0xD8);
460
461 if (params.invert) {
462 result = _mm256_xor_si256(result, all_ones);
463 }
464
465 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), result);
466 }
467
468 // Handle remainder with SSE2
469#if defined(PACS_SIMD_SSE2)
470 if (pixel_count - i >= 16) {
471 apply_window_level_8bit_sse2(src + i, dst + i, pixel_count - i, params);
472 } else
473#endif
474 {
475 apply_window_level_8bit_scalar(src + i, dst + i, pixel_count - i, params);
476 }
477}
478
483inline void apply_window_level_16bit_avx2(const uint16_t* src, uint8_t* dst,
484 size_t pixel_count,
485 const window_level_params& params) noexcept {
486 const float min_val = static_cast<float>(params.center - params.width / 2.0);
487 const float scale = 255.0f / static_cast<float>(params.width);
488
489 const __m256 min_vec = _mm256_set1_ps(min_val);
490 const __m256 scale_vec = _mm256_set1_ps(scale);
491 const __m256 zero_f = _mm256_setzero_ps();
492 const __m256 max_255_f = _mm256_set1_ps(255.0f);
493 const __m128i all_ones_128 = _mm_set1_epi8(static_cast<char>(0xFF));
494
495 const size_t simd_count = (pixel_count / 16) * 16;
496
497 size_t i = 0;
498 for (; i < simd_count; i += 16) {
499 // Load 16 16-bit pixels
500 __m256i pixels = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
501
502 // Convert to 32-bit and float
503 __m128i lo_128 = _mm256_castsi256_si128(pixels);
504 __m128i hi_128 = _mm256_extracti128_si256(pixels, 1);
505
506 __m256i lo_32 = _mm256_cvtepu16_epi32(lo_128);
507 __m256i hi_32 = _mm256_cvtepu16_epi32(hi_128);
508
509 __m256 lo_f = _mm256_cvtepi32_ps(lo_32);
510 __m256 hi_f = _mm256_cvtepi32_ps(hi_32);
511
512 // Apply transformation
513 lo_f = _mm256_mul_ps(_mm256_sub_ps(lo_f, min_vec), scale_vec);
514 hi_f = _mm256_mul_ps(_mm256_sub_ps(hi_f, min_vec), scale_vec);
515
516 // Clamp
517 lo_f = _mm256_max_ps(_mm256_min_ps(lo_f, max_255_f), zero_f);
518 hi_f = _mm256_max_ps(_mm256_min_ps(hi_f, max_255_f), zero_f);
519
520 // Convert back to integer
521 __m256i lo_i = _mm256_cvtps_epi32(lo_f);
522 __m256i hi_i = _mm256_cvtps_epi32(hi_f);
523
524 // Pack to 16-bit
525 __m256i packed16 = _mm256_packs_epi32(lo_i, hi_i);
526 packed16 = _mm256_permute4x64_epi64(packed16, 0xD8);
527
528 // Pack to 8-bit
529 __m128i lo_16 = _mm256_castsi256_si128(packed16);
530 __m128i hi_16 = _mm256_extracti128_si256(packed16, 1);
531 __m128i packed8 = _mm_packus_epi16(lo_16, hi_16);
532
533 if (params.invert) {
534 packed8 = _mm_xor_si128(packed8, all_ones_128);
535 }
536
537 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), packed8);
538 }
539
540 // Handle remainder
541#if defined(PACS_SIMD_SSE2)
542 if (pixel_count - i >= 8) {
543 apply_window_level_16bit_sse2(src + i, dst + i, pixel_count - i, params);
544 } else
545#endif
546 {
547 apply_window_level_16bit_scalar(src + i, dst + i, pixel_count - i, params);
548 }
549}
550
554inline void apply_window_level_16bit_signed_avx2(
555 const int16_t* src, uint8_t* dst, size_t pixel_count,
556 const window_level_params& params) noexcept {
557 const float min_val = static_cast<float>(params.center - params.width / 2.0);
558 const float scale = 255.0f / static_cast<float>(params.width);
559
560 const __m256 min_vec = _mm256_set1_ps(min_val);
561 const __m256 scale_vec = _mm256_set1_ps(scale);
562 const __m256 zero_f = _mm256_setzero_ps();
563 const __m256 max_255_f = _mm256_set1_ps(255.0f);
564 const __m128i all_ones_128 = _mm_set1_epi8(static_cast<char>(0xFF));
565
566 const size_t simd_count = (pixel_count / 16) * 16;
567
568 size_t i = 0;
569 for (; i < simd_count; i += 16) {
570 __m256i pixels = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
571
572 // Sign-extend to 32-bit
573 __m128i lo_128 = _mm256_castsi256_si128(pixels);
574 __m128i hi_128 = _mm256_extracti128_si256(pixels, 1);
575
576 __m256i lo_32 = _mm256_cvtepi16_epi32(lo_128);
577 __m256i hi_32 = _mm256_cvtepi16_epi32(hi_128);
578
579 __m256 lo_f = _mm256_cvtepi32_ps(lo_32);
580 __m256 hi_f = _mm256_cvtepi32_ps(hi_32);
581
582 lo_f = _mm256_mul_ps(_mm256_sub_ps(lo_f, min_vec), scale_vec);
583 hi_f = _mm256_mul_ps(_mm256_sub_ps(hi_f, min_vec), scale_vec);
584
585 lo_f = _mm256_max_ps(_mm256_min_ps(lo_f, max_255_f), zero_f);
586 hi_f = _mm256_max_ps(_mm256_min_ps(hi_f, max_255_f), zero_f);
587
588 __m256i lo_i = _mm256_cvtps_epi32(lo_f);
589 __m256i hi_i = _mm256_cvtps_epi32(hi_f);
590
591 __m256i packed16 = _mm256_packs_epi32(lo_i, hi_i);
592 packed16 = _mm256_permute4x64_epi64(packed16, 0xD8);
593
594 __m128i lo_16 = _mm256_castsi256_si128(packed16);
595 __m128i hi_16 = _mm256_extracti128_si256(packed16, 1);
596 __m128i packed8 = _mm_packus_epi16(lo_16, hi_16);
597
598 if (params.invert) {
599 packed8 = _mm_xor_si128(packed8, all_ones_128);
600 }
601
602 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), packed8);
603 }
604
605#if defined(PACS_SIMD_SSE2)
606 if (pixel_count - i >= 8) {
607 apply_window_level_16bit_signed_sse2(src + i, dst + i, pixel_count - i, params);
608 } else
609#endif
610 {
611 apply_window_level_16bit_signed_scalar(src + i, dst + i, pixel_count - i, params);
612 }
613}
614
615#endif // PACS_SIMD_AVX2
616
617// ============================================================================
618// NEON implementations (ARM)
619// ============================================================================
620
621#if defined(PACS_SIMD_NEON)
622
627inline void apply_window_level_8bit_neon(const uint8_t* src, uint8_t* dst,
628 size_t pixel_count,
629 const window_level_params& params) noexcept {
630 const float min_val = static_cast<float>(params.center - params.width / 2.0);
631 const float scale = 255.0f / static_cast<float>(params.width);
632
633 const float32x4_t min_vec = vdupq_n_f32(min_val);
634 const float32x4_t scale_vec = vdupq_n_f32(scale);
635 const float32x4_t zero_f = vdupq_n_f32(0.0f);
636 const float32x4_t max_255_f = vdupq_n_f32(255.0f);
637 const uint8x16_t all_ones = vdupq_n_u8(0xFF);
638
639 const size_t simd_count = (pixel_count / 16) * 16;
640
641 size_t i = 0;
642 for (; i < simd_count; i += 16) {
643 uint8x16_t pixels = vld1q_u8(src + i);
644
645 // Process in 4-element chunks (NEON doesn't have 8-wide float)
646 uint8x8_t lo8 = vget_low_u8(pixels);
647 uint8x8_t hi8 = vget_high_u8(pixels);
648
649 uint16x8_t lo16 = vmovl_u8(lo8);
650 uint16x8_t hi16 = vmovl_u8(hi8);
651
652 // Process 4 groups of 4 pixels each
653 uint32x4_t p0 = vmovl_u16(vget_low_u16(lo16));
654 uint32x4_t p1 = vmovl_u16(vget_high_u16(lo16));
655 uint32x4_t p2 = vmovl_u16(vget_low_u16(hi16));
656 uint32x4_t p3 = vmovl_u16(vget_high_u16(hi16));
657
658 float32x4_t f0 = vcvtq_f32_u32(p0);
659 float32x4_t f1 = vcvtq_f32_u32(p1);
660 float32x4_t f2 = vcvtq_f32_u32(p2);
661 float32x4_t f3 = vcvtq_f32_u32(p3);
662
663 // Apply transformation
664 f0 = vmulq_f32(vsubq_f32(f0, min_vec), scale_vec);
665 f1 = vmulq_f32(vsubq_f32(f1, min_vec), scale_vec);
666 f2 = vmulq_f32(vsubq_f32(f2, min_vec), scale_vec);
667 f3 = vmulq_f32(vsubq_f32(f3, min_vec), scale_vec);
668
669 // Clamp
670 f0 = vmaxq_f32(vminq_f32(f0, max_255_f), zero_f);
671 f1 = vmaxq_f32(vminq_f32(f1, max_255_f), zero_f);
672 f2 = vmaxq_f32(vminq_f32(f2, max_255_f), zero_f);
673 f3 = vmaxq_f32(vminq_f32(f3, max_255_f), zero_f);
674
675 // Convert back to integer
676 uint32x4_t i0 = vcvtq_u32_f32(f0);
677 uint32x4_t i1 = vcvtq_u32_f32(f1);
678 uint32x4_t i2 = vcvtq_u32_f32(f2);
679 uint32x4_t i3 = vcvtq_u32_f32(f3);
680
681 // Narrow to 16-bit then 8-bit
682 uint16x4_t n0 = vmovn_u32(i0);
683 uint16x4_t n1 = vmovn_u32(i1);
684 uint16x4_t n2 = vmovn_u32(i2);
685 uint16x4_t n3 = vmovn_u32(i3);
686
687 uint16x8_t n_lo = vcombine_u16(n0, n1);
688 uint16x8_t n_hi = vcombine_u16(n2, n3);
689
690 uint8x8_t r_lo = vmovn_u16(n_lo);
691 uint8x8_t r_hi = vmovn_u16(n_hi);
692
693 uint8x16_t result = vcombine_u8(r_lo, r_hi);
694
695 if (params.invert) {
696 result = veorq_u8(result, all_ones);
697 }
698
699 vst1q_u8(dst + i, result);
700 }
701
702 apply_window_level_8bit_scalar(src + i, dst + i, pixel_count - i, params);
703}
704
708inline void apply_window_level_16bit_neon(const uint16_t* src, uint8_t* dst,
709 size_t pixel_count,
710 const window_level_params& params) noexcept {
711 const float min_val = static_cast<float>(params.center - params.width / 2.0);
712 const float scale = 255.0f / static_cast<float>(params.width);
713
714 const float32x4_t min_vec = vdupq_n_f32(min_val);
715 const float32x4_t scale_vec = vdupq_n_f32(scale);
716 const float32x4_t zero_f = vdupq_n_f32(0.0f);
717 const float32x4_t max_255_f = vdupq_n_f32(255.0f);
718 const uint8x8_t all_ones = vdup_n_u8(0xFF);
719
720 const size_t simd_count = (pixel_count / 8) * 8;
721
722 size_t i = 0;
723 for (; i < simd_count; i += 8) {
724 uint16x8_t pixels = vld1q_u16(src + i);
725
726 uint32x4_t lo32 = vmovl_u16(vget_low_u16(pixels));
727 uint32x4_t hi32 = vmovl_u16(vget_high_u16(pixels));
728
729 float32x4_t lo_f = vcvtq_f32_u32(lo32);
730 float32x4_t hi_f = vcvtq_f32_u32(hi32);
731
732 lo_f = vmulq_f32(vsubq_f32(lo_f, min_vec), scale_vec);
733 hi_f = vmulq_f32(vsubq_f32(hi_f, min_vec), scale_vec);
734
735 lo_f = vmaxq_f32(vminq_f32(lo_f, max_255_f), zero_f);
736 hi_f = vmaxq_f32(vminq_f32(hi_f, max_255_f), zero_f);
737
738 uint32x4_t lo_i = vcvtq_u32_f32(lo_f);
739 uint32x4_t hi_i = vcvtq_u32_f32(hi_f);
740
741 uint16x4_t lo16 = vmovn_u32(lo_i);
742 uint16x4_t hi16 = vmovn_u32(hi_i);
743
744 uint16x8_t packed16 = vcombine_u16(lo16, hi16);
745 uint8x8_t packed8 = vmovn_u16(packed16);
746
747 if (params.invert) {
748 packed8 = veor_u8(packed8, all_ones);
749 }
750
751 vst1_u8(dst + i, packed8);
752 }
753
754 apply_window_level_16bit_scalar(src + i, dst + i, pixel_count - i, params);
755}
756
760inline void apply_window_level_16bit_signed_neon(
761 const int16_t* src, uint8_t* dst, size_t pixel_count,
762 const window_level_params& params) noexcept {
763 const float min_val = static_cast<float>(params.center - params.width / 2.0);
764 const float scale = 255.0f / static_cast<float>(params.width);
765
766 const float32x4_t min_vec = vdupq_n_f32(min_val);
767 const float32x4_t scale_vec = vdupq_n_f32(scale);
768 const float32x4_t zero_f = vdupq_n_f32(0.0f);
769 const float32x4_t max_255_f = vdupq_n_f32(255.0f);
770 const uint8x8_t all_ones = vdup_n_u8(0xFF);
771
772 const size_t simd_count = (pixel_count / 8) * 8;
773
774 size_t i = 0;
775 for (; i < simd_count; i += 8) {
776 int16x8_t pixels = vld1q_s16(src + i);
777
778 int32x4_t lo32 = vmovl_s16(vget_low_s16(pixels));
779 int32x4_t hi32 = vmovl_s16(vget_high_s16(pixels));
780
781 float32x4_t lo_f = vcvtq_f32_s32(lo32);
782 float32x4_t hi_f = vcvtq_f32_s32(hi32);
783
784 lo_f = vmulq_f32(vsubq_f32(lo_f, min_vec), scale_vec);
785 hi_f = vmulq_f32(vsubq_f32(hi_f, min_vec), scale_vec);
786
787 lo_f = vmaxq_f32(vminq_f32(lo_f, max_255_f), zero_f);
788 hi_f = vmaxq_f32(vminq_f32(hi_f, max_255_f), zero_f);
789
790 uint32x4_t lo_i = vcvtq_u32_f32(lo_f);
791 uint32x4_t hi_i = vcvtq_u32_f32(hi_f);
792
793 uint16x4_t lo16 = vmovn_u32(lo_i);
794 uint16x4_t hi16 = vmovn_u32(hi_i);
795
796 uint16x8_t packed16 = vcombine_u16(lo16, hi16);
797 uint8x8_t packed8 = vmovn_u16(packed16);
798
799 if (params.invert) {
800 packed8 = veor_u8(packed8, all_ones);
801 }
802
803 vst1_u8(dst + i, packed8);
804 }
805
806 apply_window_level_16bit_signed_scalar(src + i, dst + i, pixel_count - i, params);
807}
808
809#endif // PACS_SIMD_NEON
810
811} // namespace detail
812
813// ============================================================================
814// Public API with runtime dispatch
815// ============================================================================
816
828inline void apply_window_level_8bit(const uint8_t* src, uint8_t* dst,
829 size_t pixel_count,
830 const window_level_params& params) noexcept {
831#if defined(PACS_SIMD_AVX2)
832 if (has_avx2()) {
833 detail::apply_window_level_8bit_avx2(src, dst, pixel_count, params);
834 return;
835 }
836#endif
837#if defined(PACS_SIMD_SSE2)
838 if (has_sse2()) {
839 detail::apply_window_level_8bit_sse2(src, dst, pixel_count, params);
840 return;
841 }
842#endif
843#if defined(PACS_SIMD_NEON)
844 if (has_neon()) {
845 detail::apply_window_level_8bit_neon(src, dst, pixel_count, params);
846 return;
847 }
848#endif
849 detail::apply_window_level_8bit_scalar(src, dst, pixel_count, params);
850}
851
860inline void apply_window_level_16bit(const uint16_t* src, uint8_t* dst,
861 size_t pixel_count,
862 const window_level_params& params) noexcept {
863#if defined(PACS_SIMD_AVX2)
864 if (has_avx2()) {
865 detail::apply_window_level_16bit_avx2(src, dst, pixel_count, params);
866 return;
867 }
868#endif
869#if defined(PACS_SIMD_SSE2)
870 if (has_sse2()) {
871 detail::apply_window_level_16bit_sse2(src, dst, pixel_count, params);
872 return;
873 }
874#endif
875#if defined(PACS_SIMD_NEON)
876 if (has_neon()) {
877 detail::apply_window_level_16bit_neon(src, dst, pixel_count, params);
878 return;
879 }
880#endif
881 detail::apply_window_level_16bit_scalar(src, dst, pixel_count, params);
882}
883
894inline void apply_window_level_16bit_signed(const int16_t* src, uint8_t* dst,
895 size_t pixel_count,
896 const window_level_params& params) noexcept {
897#if defined(PACS_SIMD_AVX2)
898 if (has_avx2()) {
899 detail::apply_window_level_16bit_signed_avx2(src, dst, pixel_count, params);
900 return;
901 }
902#endif
903#if defined(PACS_SIMD_SSE2)
904 if (has_sse2()) {
905 detail::apply_window_level_16bit_signed_sse2(src, dst, pixel_count, params);
906 return;
907 }
908#endif
909#if defined(PACS_SIMD_NEON)
910 if (has_neon()) {
911 detail::apply_window_level_16bit_signed_neon(src, dst, pixel_count, params);
912 return;
913 }
914#endif
915 detail::apply_window_level_16bit_signed_scalar(src, dst, pixel_count, params);
916}
917
918} // namespace kcenon::pacs::encoding::simd
919
920#endif // PACS_ENCODING_SIMD_WINDOWING_HPP
Precomputed LUT for fast repeated window/level application.
void apply_16bit(const uint16_t *src, uint8_t *dst, size_t pixel_count) const noexcept
Apply LUT to 16-bit data (uses clamping for out-of-range values)
void apply_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) const noexcept
Apply LUT to 8-bit data.
static window_level_lut create_8bit(const window_level_params &params)
Construct LUT for 8-bit input.
static window_level_lut create_16bit(const window_level_params &params)
Construct LUT for 16-bit input.
static window_level_lut create_12bit(const window_level_params &params)
Construct LUT for 12-bit input.
void apply_window_level_16bit_scalar(const uint16_t *src, uint8_t *dst, size_t pixel_count, const window_level_params &params) noexcept
Scalar 16-bit window/level application.
void apply_window_level_16bit_signed_scalar(const int16_t *src, uint8_t *dst, size_t pixel_count, const window_level_params &params) noexcept
Scalar signed 16-bit window/level application.
void apply_window_level_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count, const window_level_params &params) noexcept
Scalar 8-bit window/level application.
bool has_neon() noexcept
Check if NEON is available.
bool has_sse2() noexcept
Check if SSE2 is available.
void apply_window_level_16bit_signed(const int16_t *src, uint8_t *dst, size_t pixel_count, const window_level_params &params) noexcept
Apply window/level transformation to 16-bit signed grayscale data.
bool has_avx2() noexcept
Check if AVX2 is available.
void apply_window_level_16bit(const uint16_t *src, uint8_t *dst, size_t pixel_count, const window_level_params &params) noexcept
Apply window/level transformation to 16-bit unsigned grayscale data.
void apply_window_level_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count, const window_level_params &params) noexcept
Apply window/level transformation to 8-bit grayscale data.
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.
constexpr window_level_params(double c=128.0, double w=256.0, bool inv=false) noexcept
bool invert
Invert output (for MONOCHROME1)