PACS System 0.1.0
PACS DICOM system library
Loading...
Searching...
No Matches
character_set.cpp
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2021-2025, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
11
12#include <algorithm>
13#include <array>
14#include <cstring>
15
16#include <unicode/ucnv.h>
17#include <unicode/utypes.h>
18
19namespace kcenon::pacs::encoding {
20
21// =============================================================================
22// Character Set Registry
23// =============================================================================
24
25namespace {
26
// ISO 2022 escape sequences, stored as raw bytes. Every sequence starts with
// the ESC control byte (0x1B); the bytes that follow are printable ASCII, so
// the literals below spell them out directly after an octal ESC.
constexpr char ESC = '\033';

// --- Three-byte sequences ---------------------------------------------------
constexpr std::string_view esc_ir_6{"\033(B"};    // ASCII return:               ESC ( B
constexpr std::string_view esc_ir_13{"\033(J"};   // Japanese Katakana:          ESC ( J
constexpr std::string_view esc_ir_87{"\033$B"};   // Japanese Kanji:             ESC $ B
constexpr std::string_view esc_ir_100{"\033-A"};  // Latin-1:                    ESC - A
constexpr std::string_view esc_ir_101{"\033-B"};  // Latin-2 (Central European): ESC - B
constexpr std::string_view esc_ir_126{"\033-F"};  // Greek:                      ESC - F
constexpr std::string_view esc_ir_127{"\033-G"};  // Arabic:                     ESC - G
constexpr std::string_view esc_ir_138{"\033-H"};  // Hebrew:                     ESC - H
constexpr std::string_view esc_ir_144{"\033-L"};  // Cyrillic:                   ESC - L
constexpr std::string_view esc_ir_166{"\033-T"};  // Thai:                       ESC - T

// --- Four-byte sequences ----------------------------------------------------
constexpr std::string_view esc_ir_149{"\033$)C"};  // Korean:         ESC $ ) C
constexpr std::string_view esc_ir_58{"\033$)A"};   // Chinese GB2312: ESC $ ) A

65
67const std::array<character_set_info, 20> charset_registry = {{
68 {
69 "ISO_IR 6", // defined_term
70 "ASCII (Default)", // description
71 "ISO-IR 6", // iso_ir
72 false, // uses_code_extensions
73 {}, // escape_sequence (none for default)
74 "ASCII", // encoding_name
75 false // is_multi_byte
76 },
77 {
78 "ISO_IR 100",
79 "Latin-1 (Western European)",
80 "ISO-IR 100",
81 false,
82 {},
83 "ISO-8859-1",
84 false
85 },
86 {
87 "ISO_IR 101",
88 "Latin-2 (Central European)",
89 "ISO-IR 101",
90 false,
91 {},
92 "ISO-8859-2",
93 false
94 },
95 {
96 "ISO_IR 126",
97 "Greek",
98 "ISO-IR 126",
99 false,
100 {},
101 "ISO-8859-7",
102 false
103 },
104 {
105 "ISO_IR 127",
106 "Arabic",
107 "ISO-IR 127",
108 false,
109 {},
110 "ISO-8859-6",
111 false
112 },
113 {
114 "ISO_IR 138",
115 "Hebrew",
116 "ISO-IR 138",
117 false,
118 {},
119 "ISO-8859-8",
120 false
121 },
122 {
123 "ISO_IR 144",
124 "Cyrillic",
125 "ISO-IR 144",
126 false,
127 {},
128 "ISO-8859-5",
129 false
130 },
131 {
132 "ISO_IR 166",
133 "Thai (TIS 620-2533)",
134 "ISO-IR 166",
135 false,
136 {},
137 "TIS-620",
138 false
139 },
140 {
141 "ISO_IR 192",
142 "UTF-8 (Unicode)",
143 "ISO-IR 192",
144 false,
145 {},
146 "UTF-8",
147 true
148 },
149 {
150 "ISO 2022 IR 6",
151 "ASCII (Default, with extensions)",
152 "ISO-IR 6",
153 true,
154 esc_ir_6,
155 "ASCII",
156 false
157 },
158 {
159 "ISO 2022 IR 100",
160 "Latin-1 (with extensions)",
161 "ISO-IR 100",
162 true,
163 esc_ir_100,
164 "ISO-8859-1",
165 false
166 },
167 {
168 "ISO 2022 IR 101",
169 "Latin-2 (with extensions)",
170 "ISO-IR 101",
171 true,
172 esc_ir_101,
173 "ISO-8859-2",
174 false
175 },
176 {
177 "ISO 2022 IR 126",
178 "Greek (with extensions)",
179 "ISO-IR 126",
180 true,
181 esc_ir_126,
182 "ISO-8859-7",
183 false
184 },
185 {
186 "ISO 2022 IR 127",
187 "Arabic (with extensions)",
188 "ISO-IR 127",
189 true,
190 esc_ir_127,
191 "ISO-8859-6",
192 false
193 },
194 {
195 "ISO 2022 IR 138",
196 "Hebrew (with extensions)",
197 "ISO-IR 138",
198 true,
199 esc_ir_138,
200 "ISO-8859-8",
201 false
202 },
203 {
204 "ISO 2022 IR 144",
205 "Cyrillic (with extensions)",
206 "ISO-IR 144",
207 true,
208 esc_ir_144,
209 "ISO-8859-5",
210 false
211 },
212 {
213 "ISO 2022 IR 149",
214 "Korean (KS X 1001)",
215 "ISO-IR 149",
216 true,
217 esc_ir_149,
218 "EUC-KR",
219 true
220 },
221 {
222 "ISO 2022 IR 166",
223 "Thai (with extensions)",
224 "ISO-IR 166",
225 true,
226 esc_ir_166,
227 "TIS-620",
228 false
229 },
230 {
231 "ISO 2022 IR 87",
232 "Japanese Kanji (JIS X 0208)",
233 "ISO-IR 87",
234 true,
235 esc_ir_87,
236 "ISO-2022-JP",
237 true
238 },
239 {
240 "ISO 2022 IR 13",
241 "Japanese Katakana (JIS X 0201)",
242 "ISO-IR 13",
243 true,
244 esc_ir_13,
245 "JIS_X0201",
246 false
247 },
248}};
249
250// GB2312 is separate because it shares the same pattern as Korean
251const character_set_info charset_ir_58 = {
252 "ISO 2022 IR 58",
253 "Chinese (GB2312)",
254 "ISO-IR 58",
255 true,
256 esc_ir_58,
257 "GB2312",
258 true
259};
260
261// GB18030 is separate: replacement encoding without ISO 2022 escape sequences
262const character_set_info charset_gb18030 = {
263 "GB18030",
264 "Chinese (GB18030, full)",
265 "GB18030",
266 false,
267 {},
268 "GB18030",
269 true
270};
271
272const character_set_info* find_in_registry(std::string_view term) noexcept {
273 for (const auto& entry : charset_registry) {
274 if (entry.defined_term == term) {
275 return &entry;
276 }
277 }
278 if (charset_ir_58.defined_term == term) {
279 return &charset_ir_58;
280 }
281 if (charset_gb18030.defined_term == term) {
282 return &charset_gb18030;
283 }
284 return nullptr;
285}
286
288const character_set_info* find_by_ir_number(int ir_number) noexcept {
289 switch (ir_number) {
290 case 6: return find_in_registry("ISO_IR 6");
291 case 100: return find_in_registry("ISO_IR 100");
292 case 101: return find_in_registry("ISO_IR 101");
293 case 126: return find_in_registry("ISO_IR 126");
294 case 127: return find_in_registry("ISO_IR 127");
295 case 138: return find_in_registry("ISO_IR 138");
296 case 144: return find_in_registry("ISO_IR 144");
297 case 166: return find_in_registry("ISO_IR 166");
298 case 192: return find_in_registry("ISO_IR 192");
299 case 149: return find_in_registry("ISO 2022 IR 149");
300 case 87: return find_in_registry("ISO 2022 IR 87");
301 case 13: return find_in_registry("ISO 2022 IR 13");
302 case 58: return &charset_ir_58;
303 default: return nullptr;
304 }
305}
306
308const character_set_info* find_by_escape_sequence(
309 std::string_view text, size_t pos,
310 const specific_character_set& scs) noexcept {
311 // Check extension sets first (more specific)
312 for (const auto* cs : scs.extension_sets) {
313 if (cs && !cs->escape_sequence.empty()) {
314 auto esc_len = cs->escape_sequence.size();
315 if (pos + esc_len <= text.size() &&
316 text.substr(pos, esc_len) == cs->escape_sequence) {
317 return cs;
318 }
319 }
320 }
321 // Check ASCII return sequence
322 if (pos + esc_ir_6.size() <= text.size() &&
323 text.substr(pos, esc_ir_6.size()) == esc_ir_6) {
324 return scs.default_set;
325 }
326 return nullptr;
327}
328
329// ICU-based encoding conversion: source encoding → UTF-8
330std::string icu_convert_to_utf8(std::string_view input,
331 const character_set_info& charset) {
332 if (charset.encoding_name == "ASCII" ||
333 charset.encoding_name == "UTF-8") {
334 return std::string(input);
335 }
336
337 // ISO-2022-JP is a stateful encoding: ICU expects escape sequences
338 // in the input. Since split_by_escape_sequences strips them, we must
339 // re-wrap the raw JIS bytes before calling ucnv_convert.
340 std::string wrapped_input;
341 if (charset.encoding_name == "ISO-2022-JP") {
342 wrapped_input.append(charset.escape_sequence);
343 wrapped_input.append(input);
344 wrapped_input.append(esc_ir_6);
345 }
346 const auto& actual_input = (charset.encoding_name == "ISO-2022-JP")
347 ? std::string_view(wrapped_input)
348 : input;
349
350 // Allocate output buffer (UTF-8 can be up to 4x the input size)
351 auto out_size = static_cast<int32_t>(actual_input.size() * 4 + 4);
352 std::string output(static_cast<size_t>(out_size), '\0');
353
354 UErrorCode status = U_ZERO_ERROR;
355 int32_t result_len = ucnv_convert(
356 "UTF-8", charset.encoding_name.data(),
357 output.data(), out_size,
358 actual_input.data(), static_cast<int32_t>(actual_input.size()),
359 &status);
360
361 if (U_FAILURE(status)) {
362 // Conversion failed, return raw bytes
363 return std::string(input);
364 }
365
366 output.resize(static_cast<size_t>(result_len));
367 return output;
368}
369
370// ICU-based encoding conversion: UTF-8 → target encoding
371std::string icu_convert_from_utf8(std::string_view utf8_input,
372 const character_set_info& charset) {
373 if (charset.encoding_name == "ASCII" ||
374 charset.encoding_name == "UTF-8") {
375 return std::string(utf8_input);
376 }
377
378 auto out_size = static_cast<int32_t>(utf8_input.size() * 4 + 4);
379 std::string output(static_cast<size_t>(out_size), '\0');
380
381 UErrorCode status = U_ZERO_ERROR;
382 int32_t result_len = ucnv_convert(
383 charset.encoding_name.data(), "UTF-8",
384 output.data(), out_size,
385 utf8_input.data(), static_cast<int32_t>(utf8_input.size()),
386 &status);
387
388 if (U_FAILURE(status)) {
389 return std::string(utf8_input);
390 }
391
392 output.resize(static_cast<size_t>(result_len));
393
394 // ISO-2022-JP is stateful: ICU output includes escape sequences
395 // (ESC $ B ... ESC ( B). Since encode_from_utf8() adds its own escape
396 // sequences, strip the ICU-generated ones to avoid duplication.
397 if (charset.encoding_name == "ISO-2022-JP" && output.size() >= 6) {
398 auto esc_prefix = charset.escape_sequence;
399 auto esc_suffix = esc_ir_6;
400 bool has_prefix = (output.size() >= esc_prefix.size() &&
401 std::string_view(output).substr(0, esc_prefix.size()) == esc_prefix);
402 bool has_suffix = (output.size() >= esc_suffix.size() &&
403 std::string_view(output).substr(
404 output.size() - esc_suffix.size()) == esc_suffix);
405 if (has_prefix && has_suffix) {
406 output = output.substr(
407 esc_prefix.size(),
408 output.size() - esc_prefix.size() - esc_suffix.size());
409 }
410 }
411
412 return output;
413}
414
415} // anonymous namespace
416
417// =============================================================================
418// Public Registry Functions
419// =============================================================================
420
422 std::string_view defined_term) noexcept {
423 return find_in_registry(defined_term);
424}
425
427 int iso_ir_number) noexcept {
428 return find_by_ir_number(iso_ir_number);
429}
430
432 // ISO_IR 6 (ASCII) is always the first entry
433 return charset_registry[0];
434}
435
436std::vector<const character_set_info*> all_character_sets() noexcept {
437 std::vector<const character_set_info*> result;
438 result.reserve(charset_registry.size() + 2);
439 for (const auto& entry : charset_registry) {
440 result.push_back(&entry);
441 }
442 result.push_back(&charset_ir_58);
443 result.push_back(&charset_gb18030);
444 return result;
445}
446
447// =============================================================================
448// Specific Character Set Parsing
449// =============================================================================
450
452 return !extension_sets.empty();
453}
454
457 return false;
458 }
459 for (const auto* cs : extension_sets) {
460 if (cs && cs->is_multi_byte) {
461 return false;
462 }
463 }
464 return true;
465}
466
467bool specific_character_set::is_utf8() const noexcept {
468 return default_set && default_set->defined_term == "ISO_IR 192";
469}
470
474
475 if (value.empty()) {
476 return result;
477 }
478
479 // Split by backslash
480 std::vector<std::string_view> components;
481 size_t start = 0;
482 while (start <= value.size()) {
483 size_t pos = value.find('\\', start);
484 if (pos == std::string_view::npos) {
485 components.push_back(value.substr(start));
486 break;
487 }
488 components.push_back(value.substr(start, pos - start));
489 start = pos + 1;
490 }
491
492 if (components.empty()) {
493 return result;
494 }
495
496 // First component: default character set
497 auto first = components[0];
498 // Trim whitespace
499 while (!first.empty() && first.front() == ' ') first.remove_prefix(1);
500 while (!first.empty() && first.back() == ' ') first.remove_suffix(1);
501
502 if (!first.empty()) {
503 const auto* cs = find_in_registry(first);
504 if (cs) {
505 result.default_set = cs;
506 }
507 }
508 // Empty first component: ASCII default (already set)
509
510 // Remaining components: extension character sets
511 for (size_t i = 1; i < components.size(); ++i) {
512 auto term = components[i];
513 while (!term.empty() && term.front() == ' ') term.remove_prefix(1);
514 while (!term.empty() && term.back() == ' ') term.remove_suffix(1);
515
516 if (!term.empty()) {
517 const auto* cs = find_in_registry(term);
518 if (cs) {
519 result.extension_sets.push_back(cs);
520 }
521 }
522 }
523
524 return result;
525}
526
527// =============================================================================
528// ISO 2022 Escape Sequence Handling
529// =============================================================================
530
531std::vector<text_segment> split_by_escape_sequences(
532 std::string_view text,
533 const specific_character_set& scs) {
534
535 std::vector<text_segment> segments;
536
537 if (text.empty()) {
538 return segments;
539 }
540
541 // If no extensions, the entire text uses the default charset
542 if (!scs.uses_extensions()) {
543 segments.push_back({text, scs.default_set});
544 return segments;
545 }
546
547 const character_set_info* current_charset = scs.default_set;
548 size_t segment_start = 0;
549
550 for (size_t i = 0; i < text.size(); ++i) {
551 if (text[i] == ESC) {
552 // Try to match an escape sequence
553 const auto* new_charset = find_by_escape_sequence(text, i, scs);
554 if (new_charset) {
555 // Emit the segment before this escape sequence
556 if (i > segment_start) {
557 segments.push_back({
558 text.substr(segment_start, i - segment_start),
559 current_charset
560 });
561 }
562
563 // Skip the escape sequence
564 size_t esc_len = new_charset->escape_sequence.empty()
565 ? esc_ir_6.size() // ASCII return
566 : new_charset->escape_sequence.size();
567 i += esc_len - 1; // -1 because loop increments
568 segment_start = i + 1;
569 current_charset = new_charset;
570 }
571 }
572 }
573
574 // Emit the remaining segment
575 if (segment_start < text.size()) {
576 segments.push_back({
577 text.substr(segment_start),
578 current_charset
579 });
580 }
581
582 return segments;
583}
584
585// =============================================================================
586// String Decoding
587// =============================================================================
588
589std::string convert_to_utf8(
590 std::string_view text,
591 const character_set_info& charset) {
592
593 if (text.empty()) {
594 return {};
595 }
596
597 // ASCII and UTF-8 need no conversion
598 if (charset.encoding_name == "ASCII" ||
599 charset.encoding_name == "UTF-8") {
600 return std::string(text);
601 }
602
603 return icu_convert_to_utf8(text, charset);
604}
605
606std::string decode_to_utf8(
607 std::string_view text,
608 const specific_character_set& scs) {
609
610 if (text.empty()) {
611 return {};
612 }
613
614 // Fast path: UTF-8 passthrough
615 if (scs.is_utf8()) {
616 return std::string(text);
617 }
618
619 // Fast path: single-byte charset without extensions
620 if (!scs.uses_extensions()) {
621 return convert_to_utf8(text, *scs.default_set);
622 }
623
624 // ISO 2022: split by escape sequences and convert each segment
625 auto segments = split_by_escape_sequences(text, scs);
626
627 std::string result;
628 result.reserve(text.size() * 2);
629
630 for (const auto& seg : segments) {
631 if (seg.charset) {
632 result += convert_to_utf8(seg.text, *seg.charset);
633 } else {
634 result.append(seg.text);
635 }
636 }
637
638 return result;
639}
640
642 std::string_view pn_value,
643 const specific_character_set& scs) {
644
645 if (pn_value.empty()) {
646 return {};
647 }
648
649 // Split by '=' into component groups
650 // (Alphabetic=Ideographic=Phonetic)
651 std::string result;
652 result.reserve(pn_value.size() * 2);
653
654 size_t start = 0;
655 bool first = true;
656
657 while (start <= pn_value.size()) {
658 size_t eq_pos = pn_value.find('=', start);
659 std::string_view group;
660 if (eq_pos == std::string_view::npos) {
661 group = pn_value.substr(start);
662 start = pn_value.size() + 1;
663 } else {
664 group = pn_value.substr(start, eq_pos - start);
665 start = eq_pos + 1;
666 }
667
668 if (!first) {
669 result += '=';
670 }
671 first = false;
672
673 // Decode each component group independently
674 result += decode_to_utf8(group, scs);
675 }
676
677 return result;
678}
679
680// =============================================================================
681// String Encoding (UTF-8 to target encoding)
682// =============================================================================
683
685 std::string_view utf8_text,
686 const character_set_info& charset) {
687
688 if (utf8_text.empty()) {
689 return {};
690 }
691
692 if (charset.encoding_name == "ASCII" ||
693 charset.encoding_name == "UTF-8") {
694 return std::string(utf8_text);
695 }
696
697 return icu_convert_from_utf8(utf8_text, charset);
698}
699
701 std::string_view utf8_text,
702 const specific_character_set& scs) {
703
704 if (utf8_text.empty()) {
705 return {};
706 }
707
708 // Fast path: UTF-8 passthrough
709 if (scs.is_utf8()) {
710 return std::string(utf8_text);
711 }
712
713 // No extensions: simple single-charset conversion
714 if (!scs.uses_extensions()) {
715 return convert_from_utf8(utf8_text, *scs.default_set);
716 }
717
718 // ISO 2022 with extensions: detect non-ASCII runs and wrap with
719 // escape sequences for the first available CJK extension charset.
720 // Strategy: scan UTF-8 text byte by byte. ASCII bytes (< 0x80) use
721 // the default charset. Non-ASCII byte runs are converted using the
722 // first multi-byte extension charset, bracketed by escape sequences.
723
724 const character_set_info* mb_charset = nullptr;
725 for (const auto* cs : scs.extension_sets) {
726 if (cs && cs->is_multi_byte) {
727 mb_charset = cs;
728 break;
729 }
730 }
731
732 if (!mb_charset) {
733 // No multi-byte extension available; convert with default
734 return convert_from_utf8(utf8_text, *scs.default_set);
735 }
736
737 std::string result;
738 result.reserve(utf8_text.size() * 2);
739
740 bool in_multibyte = false;
741 size_t run_start = 0;
742
743 for (size_t i = 0; i < utf8_text.size(); ) {
744 auto uc = static_cast<unsigned char>(utf8_text[i]);
745 bool is_ascii = (uc < 0x80);
746
747 if (is_ascii) {
748 if (in_multibyte) {
749 // Flush multi-byte run
750 auto mb_run = utf8_text.substr(run_start, i - run_start);
751 result.append(mb_charset->escape_sequence);
752 result += convert_from_utf8(mb_run, *mb_charset);
753 // Return to ASCII
754 result.append(esc_ir_6);
755 in_multibyte = false;
756 }
757 if (!in_multibyte && i == run_start) {
758 // Continue ASCII
759 }
760 run_start = i;
761 result += utf8_text[i];
762 ++i;
763 run_start = i;
764 } else {
765 if (!in_multibyte) {
766 in_multibyte = true;
767 run_start = i;
768 }
769 // Skip multi-byte UTF-8 sequence
770 if (uc < 0xC0) { ++i; }
771 else if (uc < 0xE0) { i += 2; }
772 else if (uc < 0xF0) { i += 3; }
773 else { i += 4; }
774 // Clamp to text size
775 if (i > utf8_text.size()) { i = utf8_text.size(); }
776 }
777 }
778
779 // Flush remaining multi-byte run
780 if (in_multibyte && run_start < utf8_text.size()) {
781 auto mb_run = utf8_text.substr(run_start);
782 result.append(mb_charset->escape_sequence);
783 result += convert_from_utf8(mb_run, *mb_charset);
784 result.append(esc_ir_6);
785 }
786
787 return result;
788}
789
790} // namespace kcenon::pacs::encoding
DICOM Character Set registry, ISO 2022 parser, and string decoder.
constexpr dicom_tag status
Status.
std::vector< text_segment > split_by_escape_sequences(std::string_view text, const specific_character_set &scs)
Split a string into segments by ISO 2022 escape sequences.
const character_set_info * find_character_set(std::string_view defined_term) noexcept
Look up character set info by DICOM Defined Term.
std::string encode_from_utf8(std::string_view utf8_text, const specific_character_set &scs)
Encode a UTF-8 string to the target character set encoding.
specific_character_set parse_specific_character_set(std::string_view value)
Parse a Specific Character Set (0008,0005) value.
const character_set_info & default_character_set() noexcept
Get the default character set (ISO-IR 6, ASCII).
std::vector< const character_set_info * > all_character_sets() noexcept
Get all registered character sets.
const character_set_info * find_character_set_by_ir(int iso_ir_number) noexcept
Look up character set info by ISO-IR number.
std::string decode_person_name(std::string_view pn_value, const specific_character_set &scs)
Decode a Person Name (PN) value to UTF-8.
std::string decode_to_utf8(std::string_view text, const specific_character_set &scs)
Decode a DICOM string to UTF-8 using the given character set.
std::string convert_from_utf8(std::string_view utf8_text, const character_set_info &charset)
Encode a single UTF-8 segment to a specific character set.
std::string convert_to_utf8(std::string_view text, const character_set_info &charset)
Decode a single segment from a specific encoding to UTF-8.
Information about a DICOM character set.
std::string_view defined_term
DICOM Defined Term (e.g., "ISO 2022 IR 149")
std::string_view escape_sequence
Raw escape sequence bytes (empty if none)
std::string_view encoding_name
ICU converter name (e.g., "EUC-KR")
bool is_multi_byte
true if characters can be multi-byte
Parsed representation of a multi-valued Specific Character Set.
std::vector< const character_set_info * > extension_sets
Additional character sets activated by escape sequences.
bool uses_extensions() const noexcept
Whether this uses ISO 2022 code extensions.
bool is_utf8() const noexcept
Check if UTF-8 is the active character set.
const character_set_info * default_set
Character set for default (G0) repertoire.
bool is_single_byte_only() const noexcept
Whether this is a single-byte-only configuration.