19#ifndef PACS_ENCODING_SIMD_PHOTOMETRIC_HPP
20#define PACS_ENCODING_SIMD_PHOTOMETRIC_HPP
/// @brief Scalar 8-bit monochrome inversion (MONOCHROME1 <-> MONOCHROME2).
void invert_monochrome_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                   size_t pixel_count) noexcept;

/// @brief Scalar 16-bit monochrome inversion.
void invert_monochrome_16bit_scalar(const uint16_t* src, uint16_t* dst,
                                    size_t pixel_count,
                                    uint16_t max_value) noexcept;

/// @brief Scalar RGB to YCbCr conversion (ITU-R BT.601).
void rgb_to_ycbcr_8bit_scalar(const uint8_t* src, uint8_t* dst,
                              size_t pixel_count) noexcept;

/// @brief Scalar YCbCr to RGB conversion (ITU-R BT.601).
void ycbcr_to_rgb_8bit_scalar(const uint8_t* src, uint8_t* dst,
                              size_t pixel_count) noexcept;
/// @brief Scalar 8-bit monochrome inversion (MONOCHROME1 <-> MONOCHROME2).
///
/// Writes dst[i] = 255 - src[i] for every pixel.  Each element is read
/// before it is written, so dst may alias src for in-place inversion.
///
/// @param src Source pixels, pixel_count bytes.
/// @param dst Destination pixels, pixel_count bytes.
/// @param pixel_count Number of pixels; 0 is a no-op.
inline void invert_monochrome_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                          size_t pixel_count) noexcept {
    for (size_t i = 0; i < pixel_count; ++i) {
        dst[i] = static_cast<uint8_t>(255 - src[i]);
    }
}
/// @brief Scalar 16-bit monochrome inversion.
///
/// Writes dst[i] = max_value - src[i].  Intended for src values in
/// [0, max_value] (e.g. max_value = 2^bits_stored - 1); values above
/// max_value wrap modulo 2^16 in the uint16_t assignment.
///
/// @param src Source pixels, pixel_count elements.
/// @param dst Destination pixels, pixel_count elements; may alias src.
/// @param pixel_count Number of pixels; 0 is a no-op.
/// @param max_value Maximum representable pixel value.
inline void invert_monochrome_16bit_scalar(const uint16_t* src, uint16_t* dst,
                                           size_t pixel_count,
                                           uint16_t max_value) noexcept {
    for (size_t i = 0; i < pixel_count; ++i) {
        dst[i] = static_cast<uint16_t>(max_value - src[i]);
    }
}
/// @brief Scalar RGB to YCbCr conversion (ITU-R BT.601, full range).
///
/// Fixed-point Q16: coefficients are the BT.601 weights scaled by 65536,
/// rounded by adding 32768 (0.5) before the arithmetic shift.
///
/// @param src Interleaved R,G,B — 3 * pixel_count bytes.
/// @param dst Interleaved Y,Cb,Cr — 3 * pixel_count bytes; may alias src
///            (each pixel is fully read before it is written).
/// @param pixel_count Number of pixels; 0 is a no-op.
inline void rgb_to_ycbcr_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                     size_t pixel_count) noexcept {
    for (size_t i = 0; i < pixel_count; ++i) {
        const int r = src[i * 3];
        const int g = src[i * 3 + 1];
        const int b = src[i * 3 + 2];

        const int y  = (19595 * r + 38470 * g + 7471 * b + 32768) >> 16;
        const int cb = ((-11056 * r - 21712 * g + 32768 * b + 32768) >> 16) + 128;
        const int cr = ((32768 * r - 27440 * g - 5328 * b + 32768) >> 16) + 128;

        // Clamp ALL three channels: cb/cr reach 256 for saturated inputs
        // (e.g. pure blue gives cb = 128 + 128), which previously wrapped
        // to 0 in the unclamped uint8_t cast.
        dst[i * 3]     = static_cast<uint8_t>(y  < 0 ? 0 : (y  > 255 ? 255 : y));
        dst[i * 3 + 1] = static_cast<uint8_t>(cb < 0 ? 0 : (cb > 255 ? 255 : cb));
        dst[i * 3 + 2] = static_cast<uint8_t>(cr < 0 ? 0 : (cr > 255 ? 255 : cr));
    }
}
/// @brief Scalar YCbCr to RGB conversion (ITU-R BT.601, full range).
///
/// Fixed-point Q16 inverse of rgb_to_ycbcr_8bit_scalar: chroma is
/// re-centered by subtracting 128, the Q16 products are rounded with
/// +32768 before the shift, and results are clamped to [0, 255].
///
/// @param src Interleaved Y,Cb,Cr — 3 * pixel_count bytes.
/// @param dst Interleaved R,G,B — 3 * pixel_count bytes; may alias src
///            (each pixel is fully read before it is written).
/// @param pixel_count Number of pixels; 0 is a no-op.
inline void ycbcr_to_rgb_8bit_scalar(const uint8_t* src, uint8_t* dst,
                                     size_t pixel_count) noexcept {
    for (size_t i = 0; i < pixel_count; ++i) {
        const int y  = src[i * 3];
        const int cb = src[i * 3 + 1] - 128;
        const int cr = src[i * 3 + 2] - 128;

        const int r = y + ((91881 * cr + 32768) >> 16);
        const int g = y - ((22554 * cb + 46802 * cr + 32768) >> 16);
        const int b = y + ((116130 * cb + 32768) >> 16);

        dst[i * 3]     = static_cast<uint8_t>(r < 0 ? 0 : (r > 255 ? 255 : r));
        dst[i * 3 + 1] = static_cast<uint8_t>(g < 0 ? 0 : (g > 255 ? 255 : g));
        dst[i * 3 + 2] = static_cast<uint8_t>(b < 0 ? 0 : (b > 255 ? 255 : b));
    }
}
124#if defined(PACS_SIMD_SSE2)
/// @brief SSE2 8-bit monochrome inversion, 16 pixels per iteration.
///
/// 255 - x is computed as x XOR 0xFF.  Unaligned loads/stores are used,
/// so src/dst need no particular alignment.  The sub-16-pixel remainder
/// is handled by a scalar loop.
///
/// @param src Source pixels, pixel_count bytes.
/// @param dst Destination pixels, pixel_count bytes; may alias src.
/// @param pixel_count Number of pixels; 0 is a no-op.
inline void invert_monochrome_8bit_sse2(const uint8_t* src, uint8_t* dst,
                                        size_t pixel_count) noexcept {
    const __m128i all_ones = _mm_set1_epi8(static_cast<char>(0xFF));
    const size_t simd_count = (pixel_count / 16) * 16;
    size_t i = 0;
    for (; i < simd_count; i += 16) {
        __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        v = _mm_xor_si128(v, all_ones);  // 255 - x == x ^ 0xFF for uint8
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
    }
    // Scalar tail for the last pixel_count % 16 pixels.
    for (; i < pixel_count; ++i) {
        dst[i] = static_cast<uint8_t>(255 - src[i]);
    }
}
/// @brief SSE2 16-bit monochrome inversion, 8 pixels per iteration.
///
/// Computes max_value - src[i] with 16-bit subtraction; arithmetic wraps
/// modulo 2^16, matching the scalar kernel for src values > max_value.
/// The sub-8-pixel remainder is handled by a scalar loop.
///
/// @param src Source pixels, pixel_count elements.
/// @param dst Destination pixels, pixel_count elements; may alias src.
/// @param pixel_count Number of pixels; 0 is a no-op.
/// @param max_value Maximum representable pixel value.
inline void invert_monochrome_16bit_sse2(const uint16_t* src, uint16_t* dst,
                                         size_t pixel_count,
                                         uint16_t max_value) noexcept {
    const __m128i max_vec = _mm_set1_epi16(static_cast<int16_t>(max_value));
    const size_t simd_count = (pixel_count / 8) * 8;
    size_t i = 0;
    for (; i < simd_count; i += 8) {
        __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
        v = _mm_sub_epi16(max_vec, v);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + i), v);
    }
    // Scalar tail for the last pixel_count % 8 pixels.
    for (; i < pixel_count; ++i) {
        dst[i] = static_cast<uint16_t>(max_value - src[i]);
    }
}
173#if defined(PACS_SIMD_AVX2)
/// @brief AVX2 8-bit monochrome inversion, 32 pixels per iteration.
///
/// 255 - x is computed as x XOR 0xFF.  A remainder of 16..31 pixels is
/// delegated to the SSE2 kernel (which has its own scalar tail); anything
/// smaller is finished by a scalar loop.
///
/// @param src Source pixels, pixel_count bytes.
/// @param dst Destination pixels, pixel_count bytes; may alias src.
/// @param pixel_count Number of pixels; 0 is a no-op.
inline void invert_monochrome_8bit_avx2(const uint8_t* src, uint8_t* dst,
                                        size_t pixel_count) noexcept {
    const __m256i all_ones = _mm256_set1_epi8(static_cast<char>(0xFF));
    const size_t simd_count = (pixel_count / 32) * 32;
    size_t i = 0;
    for (; i < simd_count; i += 32) {
        __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
        v = _mm256_xor_si256(v, all_ones);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
    }
#if defined(PACS_SIMD_SSE2)
    if (pixel_count - i >= 16) {
        invert_monochrome_8bit_sse2(src + i, dst + i, pixel_count - i);
        return;
    }
#endif
    for (; i < pixel_count; ++i) {
        dst[i] = static_cast<uint8_t>(255 - src[i]);
    }
}
/// @brief AVX2 16-bit monochrome inversion, 16 pixels per iteration.
///
/// Computes max_value - src[i] with 16-bit lanes (wraps modulo 2^16 like
/// the scalar kernel).  A remainder of 8..15 pixels is delegated to the
/// SSE2 kernel; anything smaller is finished by a scalar loop.
///
/// @param src Source pixels, pixel_count elements.
/// @param dst Destination pixels, pixel_count elements; may alias src.
/// @param pixel_count Number of pixels; 0 is a no-op.
/// @param max_value Maximum representable pixel value.
inline void invert_monochrome_16bit_avx2(const uint16_t* src, uint16_t* dst,
                                         size_t pixel_count,
                                         uint16_t max_value) noexcept {
    const __m256i max_vec = _mm256_set1_epi16(static_cast<int16_t>(max_value));
    const size_t simd_count = (pixel_count / 16) * 16;
    size_t i = 0;
    for (; i < simd_count; i += 16) {
        __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
        v = _mm256_sub_epi16(max_vec, v);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), v);
    }
#if defined(PACS_SIMD_SSE2)
    if (pixel_count - i >= 8) {
        invert_monochrome_16bit_sse2(src + i, dst + i, pixel_count - i, max_value);
        return;
    }
#endif
    for (; i < pixel_count; ++i) {
        dst[i] = static_cast<uint16_t>(max_value - src[i]);
    }
}
236inline void rgb_to_ycbcr_8bit_avx2(
const uint8_t* src, uint8_t* dst,
237 size_t pixel_count)
noexcept {
242 const __m256i coeff_y_r = _mm256_set1_epi16(77);
243 const __m256i coeff_y_g = _mm256_set1_epi16(150);
244 const __m256i coeff_y_b = _mm256_set1_epi16(29);
246 const __m256i coeff_cb_r = _mm256_set1_epi16(-43);
247 const __m256i coeff_cb_g = _mm256_set1_epi16(-85);
248 const __m256i coeff_cb_b = _mm256_set1_epi16(128);
250 const __m256i coeff_cr_r = _mm256_set1_epi16(128);
251 const __m256i coeff_cr_g = _mm256_set1_epi16(-107);
252 const __m256i coeff_cr_b = _mm256_set1_epi16(-21);
254 const __m256i offset_128 = _mm256_set1_epi16(128);
255 const __m256i round_add = _mm256_set1_epi16(128);
257 const size_t simd_count = (pixel_count / 8) * 8;
260 for (; i < simd_count; i += 8) {
263 alignas(32) int16_t r[8], g[8], b[8];
264 for (
int j = 0; j < 8; ++j) {
265 r[j] = src[(i + j) * 3];
266 g[j] = src[(i + j) * 3 + 1];
267 b[j] = src[(i + j) * 3 + 2];
270 __m256i r_vec = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(r));
271 __m256i g_vec = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(g));
272 __m256i b_vec = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(b));
275 __m256i y = _mm256_add_epi16(
276 _mm256_add_epi16(_mm256_mullo_epi16(r_vec, coeff_y_r),
277 _mm256_mullo_epi16(g_vec, coeff_y_g)),
278 _mm256_mullo_epi16(b_vec, coeff_y_b));
279 y = _mm256_add_epi16(y, round_add);
280 y = _mm256_srai_epi16(y, 8);
283 __m256i cb = _mm256_add_epi16(
284 _mm256_add_epi16(_mm256_mullo_epi16(r_vec, coeff_cb_r),
285 _mm256_mullo_epi16(g_vec, coeff_cb_g)),
286 _mm256_mullo_epi16(b_vec, coeff_cb_b));
287 cb = _mm256_add_epi16(cb, round_add);
288 cb = _mm256_srai_epi16(cb, 8);
289 cb = _mm256_add_epi16(cb, offset_128);
292 __m256i
cr = _mm256_add_epi16(
293 _mm256_add_epi16(_mm256_mullo_epi16(r_vec, coeff_cr_r),
294 _mm256_mullo_epi16(g_vec, coeff_cr_g)),
295 _mm256_mullo_epi16(b_vec, coeff_cr_b));
296 cr = _mm256_add_epi16(cr, round_add);
297 cr = _mm256_srai_epi16(cr, 8);
298 cr = _mm256_add_epi16(cr, offset_128);
301 alignas(32) int16_t y_out[8], cb_out[8], cr_out[8];
302 _mm256_store_si256(
reinterpret_cast<__m256i*
>(y_out), y);
303 _mm256_store_si256(
reinterpret_cast<__m256i*
>(cb_out), cb);
304 _mm256_store_si256(
reinterpret_cast<__m256i*
>(cr_out), cr);
306 for (
int j = 0; j < 8; ++j) {
307 dst[(i + j) * 3] =
static_cast<uint8_t
>(
308 y_out[j] < 0 ? 0 : (y_out[j] > 255 ? 255 : y_out[j]));
309 dst[(i + j) * 3 + 1] =
static_cast<uint8_t
>(
310 cb_out[j] < 0 ? 0 : (cb_out[j] > 255 ? 255 : cb_out[j]));
311 dst[(i + j) * 3 + 2] =
static_cast<uint8_t
>(
312 cr_out[j] < 0 ? 0 : (cr_out[j] > 255 ? 255 : cr_out[j]));
323inline void ycbcr_to_rgb_8bit_avx2(
const uint8_t* src, uint8_t* dst,
324 size_t pixel_count)
noexcept {
329 const __m256i coeff_r_cr = _mm256_set1_epi16(359);
330 const __m256i coeff_g_cb = _mm256_set1_epi16(-88);
331 const __m256i coeff_g_cr = _mm256_set1_epi16(-183);
332 const __m256i coeff_b_cb = _mm256_set1_epi16(454);
334 const __m256i round_add = _mm256_set1_epi16(128);
335 const __m256i zero = _mm256_setzero_si256();
336 const __m256i max_255 = _mm256_set1_epi16(255);
338 const size_t simd_count = (pixel_count / 8) * 8;
341 for (; i < simd_count; i += 8) {
343 alignas(32) int16_t y[8], cb[8], cr[8];
344 for (
int j = 0; j < 8; ++j) {
345 y[j] = src[(i + j) * 3];
346 cb[j] = src[(i + j) * 3 + 1] - 128;
347 cr[j] = src[(i + j) * 3 + 2] - 128;
350 __m256i y_vec = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(y));
351 __m256i cb_vec = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(cb));
352 __m256i cr_vec = _mm256_load_si256(
reinterpret_cast<const __m256i*
>(cr));
355 __m256i r = _mm256_add_epi16(
357 _mm256_add_epi16(_mm256_mullo_epi16(cr_vec, coeff_r_cr), round_add),
362 __m256i g = _mm256_add_epi16(
365 _mm256_add_epi16(_mm256_mullo_epi16(cb_vec, coeff_g_cb),
366 _mm256_mullo_epi16(cr_vec, coeff_g_cr)),
372 __m256i b = _mm256_add_epi16(
374 _mm256_add_epi16(_mm256_mullo_epi16(cb_vec, coeff_b_cb), round_add),
379 r = _mm256_max_epi16(_mm256_min_epi16(r, max_255), zero);
380 g = _mm256_max_epi16(_mm256_min_epi16(g, max_255), zero);
381 b = _mm256_max_epi16(_mm256_min_epi16(b, max_255), zero);
384 alignas(32) int16_t r_out[8], g_out[8], b_out[8];
385 _mm256_store_si256(
reinterpret_cast<__m256i*
>(r_out), r);
386 _mm256_store_si256(
reinterpret_cast<__m256i*
>(g_out), g);
387 _mm256_store_si256(
reinterpret_cast<__m256i*
>(b_out), b);
389 for (
int j = 0; j < 8; ++j) {
390 dst[(i + j) * 3] =
static_cast<uint8_t
>(r_out[j]);
391 dst[(i + j) * 3 + 1] =
static_cast<uint8_t
>(g_out[j]);
392 dst[(i + j) * 3 + 2] =
static_cast<uint8_t
>(b_out[j]);
406#if defined(PACS_SIMD_NEON)
412inline void invert_monochrome_8bit_neon(
const uint8_t* src, uint8_t* dst,
413 size_t pixel_count)
noexcept {
414 const uint8x16_t all_ones = vdupq_n_u8(0xFF);
415 const size_t simd_count = (pixel_count / 16) * 16;
418 for (; i < simd_count; i += 16) {
419 uint8x16_t v = vld1q_u8(src + i);
420 v = veorq_u8(v, all_ones);
421 vst1q_u8(dst + i, v);
432inline void invert_monochrome_16bit_neon(
const uint16_t* src, uint16_t* dst,
434 uint16_t max_value)
noexcept {
435 const uint16x8_t max_vec = vdupq_n_u16(max_value);
436 const size_t simd_count = (pixel_count / 8) * 8;
439 for (; i < simd_count; i += 8) {
440 uint16x8_t v = vld1q_u16(src + i);
441 v = vsubq_u16(max_vec, v);
442 vst1q_u16(dst + i, v);
453inline void rgb_to_ycbcr_8bit_neon(
const uint8_t* src, uint8_t* dst,
454 size_t pixel_count)
noexcept {
456 const int16x8_t coeff_y_r = vdupq_n_s16(77);
457 const int16x8_t coeff_y_g = vdupq_n_s16(150);
458 const int16x8_t coeff_y_b = vdupq_n_s16(29);
460 const int16x8_t coeff_cb_r = vdupq_n_s16(-43);
461 const int16x8_t coeff_cb_g = vdupq_n_s16(-85);
462 const int16x8_t coeff_cb_b = vdupq_n_s16(128);
464 const int16x8_t coeff_cr_r = vdupq_n_s16(128);
465 const int16x8_t coeff_cr_g = vdupq_n_s16(-107);
466 const int16x8_t coeff_cr_b = vdupq_n_s16(-21);
468 const int16x8_t offset_128 = vdupq_n_s16(128);
470 const size_t simd_count = (pixel_count / 8) * 8;
473 for (; i < simd_count; i += 8) {
475 uint8x8x3_t
rgb = vld3_u8(src + i * 3);
478 int16x8_t r = vreinterpretq_s16_u16(vmovl_u8(
rgb.val[0]));
479 int16x8_t g = vreinterpretq_s16_u16(vmovl_u8(
rgb.val[1]));
480 int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(
rgb.val[2]));
483 int16x8_t y = vaddq_s16(vaddq_s16(vmulq_s16(r, coeff_y_r),
484 vmulq_s16(g, coeff_y_g)),
485 vmulq_s16(b, coeff_y_b));
486 y = vaddq_s16(y, offset_128);
487 y = vshrq_n_s16(y, 8);
490 int16x8_t cb = vaddq_s16(vaddq_s16(vmulq_s16(r, coeff_cb_r),
491 vmulq_s16(g, coeff_cb_g)),
492 vmulq_s16(b, coeff_cb_b));
493 cb = vaddq_s16(cb, offset_128);
494 cb = vshrq_n_s16(cb, 8);
495 cb = vaddq_s16(cb, offset_128);
498 int16x8_t
cr = vaddq_s16(vaddq_s16(vmulq_s16(r, coeff_cr_r),
499 vmulq_s16(g, coeff_cr_g)),
500 vmulq_s16(b, coeff_cr_b));
501 cr = vaddq_s16(cr, offset_128);
502 cr = vshrq_n_s16(cr, 8);
503 cr = vaddq_s16(cr, offset_128);
507 ycbcr.val[0] = vqmovun_s16(y);
508 ycbcr.val[1] = vqmovun_s16(cb);
509 ycbcr.val[2] = vqmovun_s16(cr);
512 vst3_u8(dst + i * 3, ycbcr);
522inline void ycbcr_to_rgb_8bit_neon(
const uint8_t* src, uint8_t* dst,
523 size_t pixel_count)
noexcept {
524 const int16x8_t coeff_r_cr = vdupq_n_s16(359);
525 const int16x8_t coeff_g_cb = vdupq_n_s16(-88);
526 const int16x8_t coeff_g_cr = vdupq_n_s16(-183);
527 const int16x8_t coeff_b_cb = vdupq_n_s16(454);
528 const int16x8_t offset_128 = vdupq_n_s16(128);
530 const size_t simd_count = (pixel_count / 8) * 8;
533 for (; i < simd_count; i += 8) {
535 uint8x8x3_t ycbcr = vld3_u8(src + i * 3);
538 int16x8_t y = vreinterpretq_s16_u16(vmovl_u8(ycbcr.val[0]));
539 int16x8_t cb = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(ycbcr.val[1])),
541 int16x8_t
cr = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(ycbcr.val[2])),
545 int16x8_t r = vaddq_s16(y, vshrq_n_s16(vaddq_s16(vmulq_s16(cr, coeff_r_cr),
549 int16x8_t g = vaddq_s16(y, vshrq_n_s16(
550 vaddq_s16(vaddq_s16(vmulq_s16(cb, coeff_g_cb),
551 vmulq_s16(cr, coeff_g_cr)),
555 int16x8_t bl = vaddq_s16(y, vshrq_n_s16(vaddq_s16(vmulq_s16(cb, coeff_b_cb),
560 rgb.val[0] = vqmovun_s16(r);
561 rgb.val[1] = vqmovun_s16(g);
562 rgb.val[2] = vqmovun_s16(bl);
565 vst3_u8(dst + i * 3, rgb);
591 size_t pixel_count)
noexcept {
592#if defined(PACS_SIMD_AVX2)
594 detail::invert_monochrome_8bit_avx2(src, dst, pixel_count);
598#if defined(PACS_SIMD_SSE2)
600 detail::invert_monochrome_8bit_sse2(src, dst, pixel_count);
604#if defined(PACS_SIMD_NEON)
606 detail::invert_monochrome_8bit_neon(src, dst, pixel_count);
623 uint16_t max_value)
noexcept {
624#if defined(PACS_SIMD_AVX2)
626 detail::invert_monochrome_16bit_avx2(src, dst, pixel_count, max_value);
630#if defined(PACS_SIMD_SSE2)
632 detail::invert_monochrome_16bit_sse2(src, dst, pixel_count, max_value);
636#if defined(PACS_SIMD_NEON)
638 detail::invert_monochrome_16bit_neon(src, dst, pixel_count, max_value);
653 size_t pixel_count)
noexcept {
654#if defined(PACS_SIMD_AVX2)
656 detail::rgb_to_ycbcr_8bit_avx2(src, dst, pixel_count);
660#if defined(PACS_SIMD_NEON)
662 detail::rgb_to_ycbcr_8bit_neon(src, dst, pixel_count);
677 size_t pixel_count)
noexcept {
678#if defined(PACS_SIMD_AVX2)
680 detail::ycbcr_to_rgb_8bit_avx2(src, dst, pixel_count);
684#if defined(PACS_SIMD_NEON)
686 detail::ycbcr_to_rgb_8bit_neon(src, dst, pixel_count);
@ rgb
Red, Green, Blue color model.
void invert_monochrome_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Scalar 8-bit monochrome inversion (MONOCHROME1 <-> MONOCHROME2)
void ycbcr_to_rgb_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Scalar YCbCr to RGB conversion (ITU-R BT.601)
void invert_monochrome_16bit_scalar(const uint16_t *src, uint16_t *dst, size_t pixel_count, uint16_t max_value) noexcept
Scalar 16-bit monochrome inversion.
void rgb_to_ycbcr_8bit_scalar(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Scalar RGB to YCbCr conversion (ITU-R BT.601)
void invert_monochrome_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Invert 8-bit monochrome pixels (MONOCHROME1 <-> MONOCHROME2)
bool has_neon() noexcept
Check if NEON is available.
bool has_sse2() noexcept
Check if SSE2 is available.
void invert_monochrome_16bit(const uint16_t *src, uint16_t *dst, size_t pixel_count, uint16_t max_value) noexcept
Invert 16-bit monochrome pixels.
bool has_avx2() noexcept
Check if AVX2 is available.
void ycbcr_to_rgb_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Convert YCbCr to RGB color space (ITU-R BT.601)
void rgb_to_ycbcr_8bit(const uint8_t *src, uint8_t *dst, size_t pixel_count) noexcept
Convert RGB to YCbCr color space (ITU-R BT.601)
@ cr
Computed Radiography.
SIMD configuration and CPU feature detection.
Platform-specific SIMD type definitions and wrappers.