PACS System 0.1.0
PACS DICOM system library
Loading...
Searching...
No Matches
simd_photometric.h
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
19#ifndef PACS_ENCODING_SIMD_PHOTOMETRIC_HPP
20#define PACS_ENCODING_SIMD_PHOTOMETRIC_HPP
21
22#include "simd_config.h"
23#include "simd_types.h"
24
25#include <cstddef>
26#include <cstdint>
27#include <cstring>
28
30
31// Forward declarations
32void invert_monochrome_8bit(const uint8_t* src, uint8_t* dst,
33 size_t pixel_count) noexcept;
34void invert_monochrome_16bit(const uint16_t* src, uint16_t* dst,
35 size_t pixel_count, uint16_t max_value) noexcept;
36void rgb_to_ycbcr_8bit(const uint8_t* src, uint8_t* dst,
37 size_t pixel_count) noexcept;
38void ycbcr_to_rgb_8bit(const uint8_t* src, uint8_t* dst,
39 size_t pixel_count) noexcept;
40
41namespace detail {
42
43// ============================================================================
44// Scalar fallback implementations
45// ============================================================================
46
/// @brief Scalar 8-bit monochrome inversion (MONOCHROME1 <-> MONOCHROME2).
///
/// @param src         Source pixels, pixel_count bytes.
/// @param dst         Destination pixels, pixel_count bytes.
/// @param pixel_count Number of pixels to invert.
inline void invert_monochrome_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                          size_t pixel_count) noexcept {
  for (size_t idx = 0; idx < pixel_count; ++idx) {
    // For 8-bit values, 255 - x is exactly the bitwise complement.
    dst[idx] = static_cast<uint8_t>(~src[idx]);
  }
}
56
/// @brief Scalar 16-bit monochrome inversion.
///
/// Each output pixel is max_value - input; like the 8-bit variant this flips
/// MONOCHROME1 <-> MONOCHROME2 for data whose maximum stored value is
/// max_value (e.g. 4095 for 12-bit data).
///
/// @param src         Source pixels.
/// @param dst         Destination pixels.
/// @param pixel_count Number of pixels to invert.
/// @param max_value   Maximum pixel value of the data.
inline void invert_monochrome_16bit_scalar(const uint16_t* src, uint16_t* dst,
                                           size_t pixel_count,
                                           uint16_t max_value) noexcept {
  const uint16_t* const end = src + pixel_count;
  while (src != end) {
    // Same modular arithmetic as `max_value - src[i]` assigned to uint16_t.
    *dst++ = static_cast<uint16_t>(max_value - *src++);
  }
}
68
/// @brief Scalar RGB to YCbCr conversion (ITU-R BT.601, full range).
///
/// Input and output are interleaved triplets, pixel_count * 3 bytes each.
///
/// Fix: Cb and Cr must be clamped like Y. For a fully saturated blue pixel
/// (0,0,255) the rounded fixed-point Cb is 128, so cb + 128 == 256 and the
/// unclamped uint8_t cast wrapped it to 0 instead of 255 (likewise Cr for
/// pure red).
///
/// @param src         Interleaved RGB source.
/// @param dst         Interleaved YCbCr destination.
/// @param pixel_count Number of pixels to convert.
inline void rgb_to_ycbcr_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                     size_t pixel_count) noexcept {
  for (size_t i = 0; i < pixel_count; ++i) {
    const int r = src[i * 3];
    const int g = src[i * 3 + 1];
    const int b = src[i * 3 + 2];

    // Fixed-point arithmetic (16.16 format for precision).
    // Coefficients scaled by 65536.
    const int y = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
    const int cb = ((-11056 * r - 21712 * g + 32768 * b + 32768) >> 16) + 128;
    const int cr = ((32768 * r - 27440 * g - 5328 * b + 32768) >> 16) + 128;

    // Clamp all three channels to [0, 255]; Cb/Cr can reach 256 for fully
    // saturated blue/red input.
    dst[i * 3] = static_cast<uint8_t>(y < 0 ? 0 : (y > 255 ? 255 : y));
    dst[i * 3 + 1] = static_cast<uint8_t>(cb < 0 ? 0 : (cb > 255 ? 255 : cb));
    dst[i * 3 + 2] = static_cast<uint8_t>(cr < 0 ? 0 : (cr > 255 ? 255 : cr));
  }
}
94
/// @brief Scalar YCbCr to RGB conversion (ITU-R BT.601, full range).
///
/// Input and output are interleaved triplets, pixel_count * 3 bytes each.
/// Uses 16.16 fixed-point coefficients and clamps every channel to [0, 255].
///
/// @param src         Interleaved YCbCr source.
/// @param dst         Interleaved RGB destination.
/// @param pixel_count Number of pixels to convert.
inline void ycbcr_to_rgb_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                     size_t pixel_count) noexcept {
  const auto clamp_u8 = [](int v) noexcept -> uint8_t {
    return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
  };

  for (size_t px = 0; px < pixel_count; ++px) {
    const uint8_t* in = src + px * 3;
    uint8_t* out = dst + px * 3;

    const int y = in[0];
    const int cb = in[1] - 128;
    const int cr = in[2] - 128;

    // Fixed-point arithmetic (coefficients scaled by 65536).
    out[0] = clamp_u8(y + ((91881 * cr + 32768) >> 16));
    out[1] = clamp_u8(y - ((22554 * cb + 46802 * cr + 32768) >> 16));
    out[2] = clamp_u8(y + ((116130 * cb + 32768) >> 16));
  }
}
119
120// ============================================================================
121// SSE2 implementations
122// ============================================================================
123
124#if defined(PACS_SIMD_SSE2)
125
130inline void invert_monochrome_8bit_sse2(const uint8_t* src, uint8_t* dst,
131 size_t pixel_count) noexcept {
132 const __m128i all_ones = _mm_set1_epi8(static_cast<char>(0xFF));
133 const size_t simd_count = (pixel_count / 16) * 16;
134
135 size_t i = 0;
136 for (; i < simd_count; i += 16) {
137 __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
138 v = _mm_xor_si128(v, all_ones);
139 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
140 }
141
142 // Handle remainder
143 invert_monochrome_8bit_scalar(src + i, dst + i, pixel_count - i);
144}
145
150inline void invert_monochrome_16bit_sse2(const uint16_t* src, uint16_t* dst,
151 size_t pixel_count,
152 uint16_t max_value) noexcept {
153 const __m128i max_vec = _mm_set1_epi16(static_cast<int16_t>(max_value));
154 const size_t simd_count = (pixel_count / 8) * 8;
155
156 size_t i = 0;
157 for (; i < simd_count; i += 8) {
158 __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
159 v = _mm_sub_epi16(max_vec, v);
160 _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
161 }
162
163 // Handle remainder
164 invert_monochrome_16bit_scalar(src + i, dst + i, pixel_count - i, max_value);
165}
166
167#endif // PACS_SIMD_SSE2
168
169// ============================================================================
170// AVX2 implementations
171// ============================================================================
172
173#if defined(PACS_SIMD_AVX2)
174
179inline void invert_monochrome_8bit_avx2(const uint8_t* src, uint8_t* dst,
180 size_t pixel_count) noexcept {
181 const __m256i all_ones = _mm256_set1_epi8(static_cast<char>(0xFF));
182 const size_t simd_count = (pixel_count / 32) * 32;
183
184 size_t i = 0;
185 for (; i < simd_count; i += 32) {
186 __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
187 v = _mm256_xor_si256(v, all_ones);
188 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
189 }
190
191 // Handle remainder with SSE2
192#if defined(PACS_SIMD_SSE2)
193 if (pixel_count - i >= 16) {
194 invert_monochrome_8bit_sse2(src + i, dst + i, pixel_count - i);
195 } else
196#endif
197 {
198 invert_monochrome_8bit_scalar(src + i, dst + i, pixel_count - i);
199 }
200}
201
206inline void invert_monochrome_16bit_avx2(const uint16_t* src, uint16_t* dst,
207 size_t pixel_count,
208 uint16_t max_value) noexcept {
209 const __m256i max_vec = _mm256_set1_epi16(static_cast<int16_t>(max_value));
210 const size_t simd_count = (pixel_count / 16) * 16;
211
212 size_t i = 0;
213 for (; i < simd_count; i += 16) {
214 __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
215 v = _mm256_sub_epi16(max_vec, v);
216 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
217 }
218
219 // Handle remainder with SSE2
220#if defined(PACS_SIMD_SSE2)
221 if (pixel_count - i >= 8) {
222 invert_monochrome_16bit_sse2(src + i, dst + i, pixel_count - i, max_value);
223 } else
224#endif
225 {
226 invert_monochrome_16bit_scalar(src + i, dst + i, pixel_count - i, max_value);
227 }
228}
229
236inline void rgb_to_ycbcr_8bit_avx2(const uint8_t* src, uint8_t* dst,
237 size_t pixel_count) noexcept {
238 // Coefficients scaled by 256 (8-bit precision)
239 // Y coefficients: 77, 150, 29 (sum = 256)
240 // Cb coefficients: -43, -85, 128
241 // Cr coefficients: 128, -107, -21
242 const __m256i coeff_y_r = _mm256_set1_epi16(77);
243 const __m256i coeff_y_g = _mm256_set1_epi16(150);
244 const __m256i coeff_y_b = _mm256_set1_epi16(29);
245
246 const __m256i coeff_cb_r = _mm256_set1_epi16(-43);
247 const __m256i coeff_cb_g = _mm256_set1_epi16(-85);
248 const __m256i coeff_cb_b = _mm256_set1_epi16(128);
249
250 const __m256i coeff_cr_r = _mm256_set1_epi16(128);
251 const __m256i coeff_cr_g = _mm256_set1_epi16(-107);
252 const __m256i coeff_cr_b = _mm256_set1_epi16(-21);
253
254 const __m256i offset_128 = _mm256_set1_epi16(128);
255 const __m256i round_add = _mm256_set1_epi16(128); // Rounding for >>8
256
257 const size_t simd_count = (pixel_count / 8) * 8;
258
259 size_t i = 0;
260 for (; i < simd_count; i += 8) {
261 // Load 24 bytes (8 RGB pixels)
262 // Note: We need to handle the scattered RGB data
263 alignas(32) int16_t r[8], g[8], b[8];
264 for (int j = 0; j < 8; ++j) {
265 r[j] = src[(i + j) * 3];
266 g[j] = src[(i + j) * 3 + 1];
267 b[j] = src[(i + j) * 3 + 2];
268 }
269
270 __m256i r_vec = _mm256_load_si256(reinterpret_cast<const __m256i*>(r));
271 __m256i g_vec = _mm256_load_si256(reinterpret_cast<const __m256i*>(g));
272 __m256i b_vec = _mm256_load_si256(reinterpret_cast<const __m256i*>(b));
273
274 // Calculate Y = (77*R + 150*G + 29*B + 128) >> 8
275 __m256i y = _mm256_add_epi16(
276 _mm256_add_epi16(_mm256_mullo_epi16(r_vec, coeff_y_r),
277 _mm256_mullo_epi16(g_vec, coeff_y_g)),
278 _mm256_mullo_epi16(b_vec, coeff_y_b));
279 y = _mm256_add_epi16(y, round_add);
280 y = _mm256_srai_epi16(y, 8);
281
282 // Calculate Cb = (-43*R - 85*G + 128*B + 128) >> 8 + 128
283 __m256i cb = _mm256_add_epi16(
284 _mm256_add_epi16(_mm256_mullo_epi16(r_vec, coeff_cb_r),
285 _mm256_mullo_epi16(g_vec, coeff_cb_g)),
286 _mm256_mullo_epi16(b_vec, coeff_cb_b));
287 cb = _mm256_add_epi16(cb, round_add);
288 cb = _mm256_srai_epi16(cb, 8);
289 cb = _mm256_add_epi16(cb, offset_128);
290
291 // Calculate Cr = (128*R - 107*G - 21*B + 128) >> 8 + 128
292 __m256i cr = _mm256_add_epi16(
293 _mm256_add_epi16(_mm256_mullo_epi16(r_vec, coeff_cr_r),
294 _mm256_mullo_epi16(g_vec, coeff_cr_g)),
295 _mm256_mullo_epi16(b_vec, coeff_cr_b));
296 cr = _mm256_add_epi16(cr, round_add);
297 cr = _mm256_srai_epi16(cr, 8);
298 cr = _mm256_add_epi16(cr, offset_128);
299
300 // Store results
301 alignas(32) int16_t y_out[8], cb_out[8], cr_out[8];
302 _mm256_store_si256(reinterpret_cast<__m256i*>(y_out), y);
303 _mm256_store_si256(reinterpret_cast<__m256i*>(cb_out), cb);
304 _mm256_store_si256(reinterpret_cast<__m256i*>(cr_out), cr);
305
306 for (int j = 0; j < 8; ++j) {
307 dst[(i + j) * 3] = static_cast<uint8_t>(
308 y_out[j] < 0 ? 0 : (y_out[j] > 255 ? 255 : y_out[j]));
309 dst[(i + j) * 3 + 1] = static_cast<uint8_t>(
310 cb_out[j] < 0 ? 0 : (cb_out[j] > 255 ? 255 : cb_out[j]));
311 dst[(i + j) * 3 + 2] = static_cast<uint8_t>(
312 cr_out[j] < 0 ? 0 : (cr_out[j] > 255 ? 255 : cr_out[j]));
313 }
314 }
315
316 // Handle remainder
317 rgb_to_ycbcr_8bit_scalar(src + i * 3, dst + i * 3, pixel_count - i);
318}
319
323inline void ycbcr_to_rgb_8bit_avx2(const uint8_t* src, uint8_t* dst,
324 size_t pixel_count) noexcept {
325 // Coefficients scaled by 256
326 // R = Y + 1.402*Cr -> coeff = 359
327 // G = Y - 0.344*Cb - 0.714*Cr -> coeffs = -88, -183
328 // B = Y + 1.772*Cb -> coeff = 454
329 const __m256i coeff_r_cr = _mm256_set1_epi16(359);
330 const __m256i coeff_g_cb = _mm256_set1_epi16(-88);
331 const __m256i coeff_g_cr = _mm256_set1_epi16(-183);
332 const __m256i coeff_b_cb = _mm256_set1_epi16(454);
333
334 const __m256i round_add = _mm256_set1_epi16(128);
335 const __m256i zero = _mm256_setzero_si256();
336 const __m256i max_255 = _mm256_set1_epi16(255);
337
338 const size_t simd_count = (pixel_count / 8) * 8;
339
340 size_t i = 0;
341 for (; i < simd_count; i += 8) {
342 // Load YCbCr values
343 alignas(32) int16_t y[8], cb[8], cr[8];
344 for (int j = 0; j < 8; ++j) {
345 y[j] = src[(i + j) * 3];
346 cb[j] = src[(i + j) * 3 + 1] - 128;
347 cr[j] = src[(i + j) * 3 + 2] - 128;
348 }
349
350 __m256i y_vec = _mm256_load_si256(reinterpret_cast<const __m256i*>(y));
351 __m256i cb_vec = _mm256_load_si256(reinterpret_cast<const __m256i*>(cb));
352 __m256i cr_vec = _mm256_load_si256(reinterpret_cast<const __m256i*>(cr));
353
354 // Calculate R = Y + (359*Cr + 128) >> 8
355 __m256i r = _mm256_add_epi16(
356 _mm256_srai_epi16(
357 _mm256_add_epi16(_mm256_mullo_epi16(cr_vec, coeff_r_cr), round_add),
358 8),
359 y_vec);
360
361 // Calculate G = Y + (-88*Cb - 183*Cr + 128) >> 8
362 __m256i g = _mm256_add_epi16(
363 _mm256_srai_epi16(
364 _mm256_add_epi16(
365 _mm256_add_epi16(_mm256_mullo_epi16(cb_vec, coeff_g_cb),
366 _mm256_mullo_epi16(cr_vec, coeff_g_cr)),
367 round_add),
368 8),
369 y_vec);
370
371 // Calculate B = Y + (454*Cb + 128) >> 8
372 __m256i b = _mm256_add_epi16(
373 _mm256_srai_epi16(
374 _mm256_add_epi16(_mm256_mullo_epi16(cb_vec, coeff_b_cb), round_add),
375 8),
376 y_vec);
377
378 // Clamp to [0, 255]
379 r = _mm256_max_epi16(_mm256_min_epi16(r, max_255), zero);
380 g = _mm256_max_epi16(_mm256_min_epi16(g, max_255), zero);
381 b = _mm256_max_epi16(_mm256_min_epi16(b, max_255), zero);
382
383 // Store results
384 alignas(32) int16_t r_out[8], g_out[8], b_out[8];
385 _mm256_store_si256(reinterpret_cast<__m256i*>(r_out), r);
386 _mm256_store_si256(reinterpret_cast<__m256i*>(g_out), g);
387 _mm256_store_si256(reinterpret_cast<__m256i*>(b_out), b);
388
389 for (int j = 0; j < 8; ++j) {
390 dst[(i + j) * 3] = static_cast<uint8_t>(r_out[j]);
391 dst[(i + j) * 3 + 1] = static_cast<uint8_t>(g_out[j]);
392 dst[(i + j) * 3 + 2] = static_cast<uint8_t>(b_out[j]);
393 }
394 }
395
396 // Handle remainder
397 ycbcr_to_rgb_8bit_scalar(src + i * 3, dst + i * 3, pixel_count - i);
398}
399
400#endif // PACS_SIMD_AVX2
401
402// ============================================================================
403// NEON implementations (ARM)
404// ============================================================================
405
406#if defined(PACS_SIMD_NEON)
407
412inline void invert_monochrome_8bit_neon(const uint8_t* src, uint8_t* dst,
413 size_t pixel_count) noexcept {
414 const uint8x16_t all_ones = vdupq_n_u8(0xFF);
415 const size_t simd_count = (pixel_count / 16) * 16;
416
417 size_t i = 0;
418 for (; i < simd_count; i += 16) {
419 uint8x16_t v = vld1q_u8(src + i);
420 v = veorq_u8(v, all_ones);
421 vst1q_u8(dst + i, v);
422 }
423
424 // Handle remainder
425 invert_monochrome_8bit_scalar(src + i, dst + i, pixel_count - i);
426}
427
432inline void invert_monochrome_16bit_neon(const uint16_t* src, uint16_t* dst,
433 size_t pixel_count,
434 uint16_t max_value) noexcept {
435 const uint16x8_t max_vec = vdupq_n_u16(max_value);
436 const size_t simd_count = (pixel_count / 8) * 8;
437
438 size_t i = 0;
439 for (; i < simd_count; i += 8) {
440 uint16x8_t v = vld1q_u16(src + i);
441 v = vsubq_u16(max_vec, v);
442 vst1q_u16(dst + i, v);
443 }
444
445 // Handle remainder
446 invert_monochrome_16bit_scalar(src + i, dst + i, pixel_count - i, max_value);
447}
448
453inline void rgb_to_ycbcr_8bit_neon(const uint8_t* src, uint8_t* dst,
454 size_t pixel_count) noexcept {
455 // Coefficients (scaled by 256)
456 const int16x8_t coeff_y_r = vdupq_n_s16(77);
457 const int16x8_t coeff_y_g = vdupq_n_s16(150);
458 const int16x8_t coeff_y_b = vdupq_n_s16(29);
459
460 const int16x8_t coeff_cb_r = vdupq_n_s16(-43);
461 const int16x8_t coeff_cb_g = vdupq_n_s16(-85);
462 const int16x8_t coeff_cb_b = vdupq_n_s16(128);
463
464 const int16x8_t coeff_cr_r = vdupq_n_s16(128);
465 const int16x8_t coeff_cr_g = vdupq_n_s16(-107);
466 const int16x8_t coeff_cr_b = vdupq_n_s16(-21);
467
468 const int16x8_t offset_128 = vdupq_n_s16(128);
469
470 const size_t simd_count = (pixel_count / 8) * 8;
471
472 size_t i = 0;
473 for (; i < simd_count; i += 8) {
474 // Load and deinterleave 8 RGB pixels
475 uint8x8x3_t rgb = vld3_u8(src + i * 3);
476
477 // Widen to 16-bit for arithmetic
478 int16x8_t r = vreinterpretq_s16_u16(vmovl_u8(rgb.val[0]));
479 int16x8_t g = vreinterpretq_s16_u16(vmovl_u8(rgb.val[1]));
480 int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(rgb.val[2]));
481
482 // Y = (77*R + 150*G + 29*B + 128) >> 8
483 int16x8_t y = vaddq_s16(vaddq_s16(vmulq_s16(r, coeff_y_r),
484 vmulq_s16(g, coeff_y_g)),
485 vmulq_s16(b, coeff_y_b));
486 y = vaddq_s16(y, offset_128);
487 y = vshrq_n_s16(y, 8);
488
489 // Cb = (-43*R - 85*G + 128*B + 128) >> 8 + 128
490 int16x8_t cb = vaddq_s16(vaddq_s16(vmulq_s16(r, coeff_cb_r),
491 vmulq_s16(g, coeff_cb_g)),
492 vmulq_s16(b, coeff_cb_b));
493 cb = vaddq_s16(cb, offset_128);
494 cb = vshrq_n_s16(cb, 8);
495 cb = vaddq_s16(cb, offset_128);
496
497 // Cr = (128*R - 107*G - 21*B + 128) >> 8 + 128
498 int16x8_t cr = vaddq_s16(vaddq_s16(vmulq_s16(r, coeff_cr_r),
499 vmulq_s16(g, coeff_cr_g)),
500 vmulq_s16(b, coeff_cr_b));
501 cr = vaddq_s16(cr, offset_128);
502 cr = vshrq_n_s16(cr, 8);
503 cr = vaddq_s16(cr, offset_128);
504
505 // Narrow back to 8-bit with saturation
506 uint8x8x3_t ycbcr;
507 ycbcr.val[0] = vqmovun_s16(y);
508 ycbcr.val[1] = vqmovun_s16(cb);
509 ycbcr.val[2] = vqmovun_s16(cr);
510
511 // Store interleaved YCbCr
512 vst3_u8(dst + i * 3, ycbcr);
513 }
514
515 // Handle remainder
516 rgb_to_ycbcr_8bit_scalar(src + i * 3, dst + i * 3, pixel_count - i);
517}
518
522inline void ycbcr_to_rgb_8bit_neon(const uint8_t* src, uint8_t* dst,
523 size_t pixel_count) noexcept {
524 const int16x8_t coeff_r_cr = vdupq_n_s16(359);
525 const int16x8_t coeff_g_cb = vdupq_n_s16(-88);
526 const int16x8_t coeff_g_cr = vdupq_n_s16(-183);
527 const int16x8_t coeff_b_cb = vdupq_n_s16(454);
528 const int16x8_t offset_128 = vdupq_n_s16(128);
529
530 const size_t simd_count = (pixel_count / 8) * 8;
531
532 size_t i = 0;
533 for (; i < simd_count; i += 8) {
534 // Load and deinterleave 8 YCbCr pixels
535 uint8x8x3_t ycbcr = vld3_u8(src + i * 3);
536
537 // Widen to 16-bit
538 int16x8_t y = vreinterpretq_s16_u16(vmovl_u8(ycbcr.val[0]));
539 int16x8_t cb = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(ycbcr.val[1])),
540 offset_128);
541 int16x8_t cr = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(ycbcr.val[2])),
542 offset_128);
543
544 // R = Y + (359*Cr + 128) >> 8
545 int16x8_t r = vaddq_s16(y, vshrq_n_s16(vaddq_s16(vmulq_s16(cr, coeff_r_cr),
546 offset_128), 8));
547
548 // G = Y + (-88*Cb - 183*Cr + 128) >> 8
549 int16x8_t g = vaddq_s16(y, vshrq_n_s16(
550 vaddq_s16(vaddq_s16(vmulq_s16(cb, coeff_g_cb),
551 vmulq_s16(cr, coeff_g_cr)),
552 offset_128), 8));
553
554 // B = Y + (454*Cb + 128) >> 8
555 int16x8_t bl = vaddq_s16(y, vshrq_n_s16(vaddq_s16(vmulq_s16(cb, coeff_b_cb),
556 offset_128), 8));
557
558 // Narrow with saturation
559 uint8x8x3_t rgb;
560 rgb.val[0] = vqmovun_s16(r);
561 rgb.val[1] = vqmovun_s16(g);
562 rgb.val[2] = vqmovun_s16(bl);
563
564 // Store interleaved RGB
565 vst3_u8(dst + i * 3, rgb);
566 }
567
568 // Handle remainder
569 ycbcr_to_rgb_8bit_scalar(src + i * 3, dst + i * 3, pixel_count - i);
570}
571
572#endif // PACS_SIMD_NEON
573
574} // namespace detail
575
576// ============================================================================
577// Public API with runtime dispatch
578// ============================================================================
579
/// @brief Invert 8-bit monochrome pixels (MONOCHROME1 <-> MONOCHROME2).
///
/// Dispatches at runtime to the widest implementation that is both compiled
/// in and reported available by the CPU feature checks, in the order
/// AVX2 -> SSE2 -> NEON, falling back to the portable scalar loop.
///
/// @param src         Source pixels, pixel_count bytes.
/// @param dst         Destination pixels, pixel_count bytes.
/// @param pixel_count Number of pixels to invert.
inline void invert_monochrome_8bit(const uint8_t* src, uint8_t* dst,
                                   size_t pixel_count) noexcept {
#if defined(PACS_SIMD_AVX2)
  if (has_avx2()) {
    detail::invert_monochrome_8bit_avx2(src, dst, pixel_count);
    return;
  }
#endif
#if defined(PACS_SIMD_SSE2)
  if (has_sse2()) {
    detail::invert_monochrome_8bit_sse2(src, dst, pixel_count);
    return;
  }
#endif
#if defined(PACS_SIMD_NEON)
  if (has_neon()) {
    detail::invert_monochrome_8bit_neon(src, dst, pixel_count);
    return;
  }
#endif
  detail::invert_monochrome_8bit_scalar(src, dst, pixel_count);
}
612
/// @brief Invert 16-bit monochrome pixels.
///
/// Each output pixel is max_value - input. Dispatches at runtime to the
/// widest implementation that is both compiled in and reported available by
/// the CPU feature checks (AVX2 -> SSE2 -> NEON), falling back to the
/// portable scalar loop.
///
/// @param src         Source pixels.
/// @param dst         Destination pixels.
/// @param pixel_count Number of pixels to invert.
/// @param max_value   Maximum pixel value of the data (e.g. 4095 for
///                    12-bit data).
inline void invert_monochrome_16bit(const uint16_t* src, uint16_t* dst,
                                    size_t pixel_count,
                                    uint16_t max_value) noexcept {
#if defined(PACS_SIMD_AVX2)
  if (has_avx2()) {
    detail::invert_monochrome_16bit_avx2(src, dst, pixel_count, max_value);
    return;
  }
#endif
#if defined(PACS_SIMD_SSE2)
  if (has_sse2()) {
    detail::invert_monochrome_16bit_sse2(src, dst, pixel_count, max_value);
    return;
  }
#endif
#if defined(PACS_SIMD_NEON)
  if (has_neon()) {
    detail::invert_monochrome_16bit_neon(src, dst, pixel_count, max_value);
    return;
  }
#endif
  detail::invert_monochrome_16bit_scalar(src, dst, pixel_count, max_value);
}
644
/// @brief Convert RGB to YCbCr color space (ITU-R BT.601).
///
/// Dispatches at runtime to AVX2 or NEON when compiled in and available
/// (there is no SSE2 kernel for this conversion), otherwise uses the scalar
/// loop. Buffers are interleaved triplets, pixel_count * 3 bytes each.
///
/// @param src         Interleaved RGB source.
/// @param dst         Interleaved YCbCr destination.
/// @param pixel_count Number of pixels to convert.
inline void rgb_to_ycbcr_8bit(const uint8_t* src, uint8_t* dst,
                              size_t pixel_count) noexcept {
#if defined(PACS_SIMD_AVX2)
  if (has_avx2()) {
    detail::rgb_to_ycbcr_8bit_avx2(src, dst, pixel_count);
    return;
  }
#endif
#if defined(PACS_SIMD_NEON)
  if (has_neon()) {
    detail::rgb_to_ycbcr_8bit_neon(src, dst, pixel_count);
    return;
  }
#endif
  detail::rgb_to_ycbcr_8bit_scalar(src, dst, pixel_count);
}
668
/// @brief Convert YCbCr to RGB color space (ITU-R BT.601).
///
/// Dispatches at runtime to AVX2 or NEON when compiled in and available
/// (there is no SSE2 kernel for this conversion), otherwise uses the scalar
/// loop. Buffers are interleaved triplets, pixel_count * 3 bytes each.
///
/// @param src         Interleaved YCbCr source.
/// @param dst         Interleaved RGB destination.
/// @param pixel_count Number of pixels to convert.
inline void ycbcr_to_rgb_8bit(const uint8_t* src, uint8_t* dst,
                              size_t pixel_count) noexcept {
#if defined(PACS_SIMD_AVX2)
  if (has_avx2()) {
    detail::ycbcr_to_rgb_8bit_avx2(src, dst, pixel_count);
    return;
  }
#endif
#if defined(PACS_SIMD_NEON)
  if (has_neon()) {
    detail::ycbcr_to_rgb_8bit_neon(src, dst, pixel_count);
    return;
  }
#endif
  detail::ycbcr_to_rgb_8bit_scalar(src, dst, pixel_count);
}
692
693} // namespace kcenon::pacs::encoding::simd
694
695#endif // PACS_ENCODING_SIMD_PHOTOMETRIC_HPP
void invert_monochrome_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Scalar 8-bit monochrome inversion (MONOCHROME1 <-> MONOCHROME2)
void ycbcr_to_rgb_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Scalar YCbCr to RGB conversion (ITU-R BT.601)
void invert_monochrome_16bit_scalar(const uint16_t *src, uint16_t *dst, size_t pixel_count, uint16_t max_value) noexcept
Scalar 16-bit monochrome inversion.
void rgb_to_ycbcr_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Scalar RGB to YCbCr conversion (ITU-R BT.601)
void invert_monochrome_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Invert 8-bit monochrome pixels (MONOCHROME1 <-> MONOCHROME2)
bool has_neon() noexcept
Check if NEON is available.
bool has_sse2() noexcept
Check if SSE2 is available.
void invert_monochrome_16bit(const uint16_t *src, uint16_t *dst, size_t pixel_count, uint16_t max_value) noexcept
Invert 16-bit monochrome pixels.
bool has_avx2() noexcept
Check if AVX2 is available.
void ycbcr_to_rgb_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Convert YCbCr to RGB color space (ITU-R BT.601)
void rgb_to_ycbcr_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Convert RGB to YCbCr color space (ITU-R BT.601)
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.