16#include <unicode/ucnv.h>
17#include <unicode/utypes.h>
// Escape-sequence constants used by the ISO 2022 handling in this file.
// Every esc_ir_* view is built with an explicit length, so its size is the
// stated byte count. The numeric suffix presumably names the ISO-IR
// registration the sequence designates — confirm against the registry entries.

// The escape byte (0x1B); the segment splitter tests input bytes against it.
28constexpr char ESC =
'\x1B';
// 4 bytes: ESC '$' ')' 'C'.
31constexpr std::string_view esc_ir_149{
"\x1B\x24\x29\x43", 4};
// 3 bytes: ESC '$' 'B'.
34constexpr std::string_view esc_ir_87{
"\x1B\x24\x42", 3};
// 3 bytes: ESC '(' 'J'.
37constexpr std::string_view esc_ir_13{
"\x1B\x28\x4A", 3};
// 4 bytes: ESC '$' ')' 'A'.
40constexpr std::string_view esc_ir_58{
"\x1B\x24\x29\x41", 4};
// 3 bytes: ESC '(' 'B'. Also used elsewhere in this file as the sequence that
// maps back to the default set and as the terminator appended after
// multi-byte runs.
43constexpr std::string_view esc_ir_6{
"\x1B\x28\x42", 3};
// 3 bytes: ESC '-' 'A'.
46constexpr std::string_view esc_ir_100{
"\x1B\x2D\x41", 3};
// 3 bytes: ESC '-' 'B'.
49constexpr std::string_view esc_ir_101{
"\x1B\x2D\x42", 3};
// 3 bytes: ESC '-' 'F'.
52constexpr std::string_view esc_ir_126{
"\x1B\x2D\x46", 3};
// 3 bytes: ESC '-' 'G'.
55constexpr std::string_view esc_ir_127{
"\x1B\x2D\x47", 3};
// 3 bytes: ESC '-' 'H'.
58constexpr std::string_view esc_ir_138{
"\x1B\x2D\x48", 3};
// 3 bytes: ESC '-' 'L'.
61constexpr std::string_view esc_ir_144{
"\x1B\x2D\x4C", 3};
// 3 bytes: ESC '-' 'T'.
64constexpr std::string_view esc_ir_166{
"\x1B\x2D\x54", 3};
67const std::array<character_set_info, 20> charset_registry = {{
79 "Latin-1 (Western European)",
88 "Latin-2 (Central European)",
133 "Thai (TIS 620-2533)",
151 "ASCII (Default, with extensions)",
160 "Latin-1 (with extensions)",
169 "Latin-2 (with extensions)",
178 "Greek (with extensions)",
187 "Arabic (with extensions)",
196 "Hebrew (with extensions)",
205 "Cyrillic (with extensions)",
214 "Korean (KS X 1001)",
223 "Thai (with extensions)",
232 "Japanese Kanji (JIS X 0208)",
241 "Japanese Katakana (JIS X 0201)",
251const character_set_info charset_ir_58 = {
262const character_set_info charset_gb18030 = {
264 "Chinese (GB18030, full)",
// Looks up a character set by its defined term.
//
// `term` is compared for exact equality against the defined_term of every
// entry in charset_registry, and then against the two standalone entries
// charset_ir_58 and charset_gb18030, which live outside the array.
// Returns a pointer to static data; the standalone hits return the address of
// the matching object.
// NOTE(review): the return statement for a registry hit and the final
// no-match return are not visible in this listing — presumably `&entry` and
// `nullptr`; confirm against the full source.
272const character_set_info* find_in_registry(std::string_view term)
noexcept {
273 for (
const auto& entry : charset_registry) {
274 if (entry.defined_term == term) {
// Entries kept outside charset_registry are checked after the array scan.
278 if (charset_ir_58.defined_term == term) {
279 return &charset_ir_58;
281 if (charset_gb18030.defined_term == term) {
282 return &charset_gb18030;
// Maps an ISO-IR registration number to its character set entry.
//
// Each supported number is translated to a defined-term string and resolved
// through find_in_registry():
//   6, 100, 101, 126, 127, 138, 144, 166, 192  ->  "ISO_IR <n>"
//   149, 87, 13                                 ->  "ISO 2022 IR <n>"
// 58 bypasses the lookup and returns &charset_ir_58 directly.
// Any other number yields nullptr.
// NOTE(review): the case labels presumably sit inside `switch (ir_number)`;
// the switch header and closing braces are not visible in this listing.
288const character_set_info* find_by_ir_number(
int ir_number)
noexcept {
290 case 6:
return find_in_registry(
"ISO_IR 6");
291 case 100:
return find_in_registry(
"ISO_IR 100");
292 case 101:
return find_in_registry(
"ISO_IR 101");
293 case 126:
return find_in_registry(
"ISO_IR 126");
294 case 127:
return find_in_registry(
"ISO_IR 127");
295 case 138:
return find_in_registry(
"ISO_IR 138");
296 case 144:
return find_in_registry(
"ISO_IR 144");
297 case 166:
return find_in_registry(
"ISO_IR 166");
298 case 192:
return find_in_registry(
"ISO_IR 192");
// From here on the defined terms use the "ISO 2022 IR <n>" spelling.
299 case 149:
return find_in_registry(
"ISO 2022 IR 149");
300 case 87:
return find_in_registry(
"ISO 2022 IR 87");
301 case 13:
return find_in_registry(
"ISO 2022 IR 13");
// IR 58 is answered from the standalone object rather than by term lookup.
302 case 58:
return &charset_ir_58;
303 default:
return nullptr;
// Identifies which character set an escape sequence at `text[pos...]` selects.
//
// text: the raw string being scanned.
// pos:  offset in `text` at which the candidate escape sequence starts.
// scs:  parsed Specific Character Set; its extension_sets are the candidates.
//
// Null entries and entries with an empty escape_sequence are skipped. For the
// others, the entry's escape sequence is compared against the same-length
// slice of `text` starting at `pos`, after checking that the slice fits.
// If `text` at `pos` holds esc_ir_6 instead, scs.default_set is returned.
// NOTE(review): the value returned on an extension-set match and on no match
// is not visible in this listing — presumably `cs` and `nullptr`; confirm.
308const character_set_info* find_by_escape_sequence(
309 std::string_view text,
size_t pos,
310 const specific_character_set& scs)
noexcept {
312 for (
const auto* cs : scs.extension_sets) {
313 if (cs && !cs->escape_sequence.empty()) {
314 auto esc_len = cs->escape_sequence.size();
// Length guard first, so substr() is never asked for bytes past the end.
315 if (pos + esc_len <=
text.size() &&
316 text.substr(pos, esc_len) == cs->escape_sequence) {
// esc_ir_6 is checked separately and maps back to the default set.
322 if (pos + esc_ir_6.size() <=
text.size() &&
323 text.substr(pos, esc_ir_6.size()) == esc_ir_6) {
324 return scs.default_set;
// Converts `input` from the encoding named by charset.encoding_name to UTF-8
// using ICU's ucnv_convert().
//
// input:   bytes in the source encoding.
// charset: supplies encoding_name (the ICU converter name) and, for
//          ISO-2022-JP, the escape_sequence used to wrap the input.
// Returns the converted string. If ICU reports a failure status, a copy of
// the unconverted `input` is returned instead.
// NOTE(review): the statement that returns `output` after the resize is not
// visible in this listing.
330std::string icu_convert_to_utf8(std::string_view input,
331 const character_set_info& charset) {
// "ASCII" and "UTF-8" are passed through unchanged; ICU is not called.
332 if (charset.encoding_name ==
"ASCII" ||
333 charset.encoding_name ==
"UTF-8") {
334 return std::string(input);
// For ISO-2022-JP the input is wrapped as
// <charset escape sequence> + input + esc_ir_6 before conversion.
340 std::string wrapped_input;
341 if (charset.encoding_name ==
"ISO-2022-JP") {
342 wrapped_input.append(charset.escape_sequence);
343 wrapped_input.append(input);
344 wrapped_input.append(esc_ir_6);
// Selects the wrapped buffer for ISO-2022-JP; the alternative branch of this
// conditional is not visible in the listing (presumably `input`).
346 const auto& actual_input = (charset.encoding_name ==
"ISO-2022-JP")
347 ? std::string_view(wrapped_input)
// Output buffer is sized at 4 bytes per input byte plus 4 spare bytes.
// NOTE(review): the narrowing to int32_t is unchecked — very large inputs
// would overflow; confirm whether callers bound the input size.
351 auto out_size =
static_cast<int32_t
>(actual_input.size() * 4 + 4);
352 std::string output(
static_cast<size_t>(out_size),
'\0');
354 UErrorCode
status = U_ZERO_ERROR;
// Target converter is "UTF-8", source converter is charset.encoding_name.
// NOTE(review): encoding_name is a string_view and .data() is handed to an
// API that takes a C string — this assumes the view is NUL-terminated (e.g.
// backed by a string literal); confirm against the registry definitions.
355 int32_t result_len = ucnv_convert(
356 "UTF-8", charset.encoding_name.data(),
357 output.data(), out_size,
358 actual_input.data(),
static_cast<int32_t
>(actual_input.size()),
// On failure, fall back to the caller's bytes (not the wrapped buffer).
361 if (U_FAILURE(status)) {
363 return std::string(input);
// Trim the pre-sized buffer down to the length reported by ICU.
366 output.resize(
static_cast<size_t>(result_len));
// Converts UTF-8 text to the encoding named by charset.encoding_name using
// ICU's ucnv_convert().
//
// utf8_input: UTF-8 source bytes.
// charset:    supplies encoding_name (the ICU converter name) and, for
//             ISO-2022-JP, the escape_sequence looked for in the output.
// Returns the encoded bytes. If ICU reports a failure status, a copy of the
// unconverted `utf8_input` is returned instead.
// NOTE(review): the final return of `output` is not visible in this listing.
371std::string icu_convert_from_utf8(std::string_view utf8_input,
372 const character_set_info& charset) {
// "ASCII" and "UTF-8" are passed through unchanged; ICU is not called.
373 if (charset.encoding_name ==
"ASCII" ||
374 charset.encoding_name ==
"UTF-8") {
375 return std::string(utf8_input);
// Output buffer is sized at 4 bytes per input byte plus 4 spare bytes.
// NOTE(review): the narrowing to int32_t is unchecked — very large inputs
// would overflow; confirm whether callers bound the input size.
378 auto out_size =
static_cast<int32_t
>(utf8_input.size() * 4 + 4);
379 std::string output(
static_cast<size_t>(out_size),
'\0');
381 UErrorCode
status = U_ZERO_ERROR;
// Target converter is charset.encoding_name, source converter is "UTF-8".
// NOTE(review): encoding_name.data() is passed as a C string — this assumes
// the string_view is NUL-terminated; confirm against the registry definitions.
382 int32_t result_len = ucnv_convert(
383 charset.encoding_name.data(),
"UTF-8",
384 output.data(), out_size,
385 utf8_input.data(),
static_cast<int32_t
>(utf8_input.size()),
388 if (U_FAILURE(status)) {
389 return std::string(utf8_input);
// Trim the pre-sized buffer down to the length reported by ICU.
392 output.resize(
static_cast<size_t>(result_len));
// For ISO-2022-JP output of at least 6 bytes: if the result begins with the
// charset's escape sequence AND ends with esc_ir_6, both are removed so that
// only the bytes between them remain. Output with only one of the two is
// left as is.
397 if (charset.encoding_name ==
"ISO-2022-JP" && output.size() >= 6) {
398 auto esc_prefix = charset.escape_sequence;
399 auto esc_suffix = esc_ir_6;
400 bool has_prefix = (output.size() >= esc_prefix.size() &&
401 std::string_view(output).substr(0, esc_prefix.size()) == esc_prefix);
402 bool has_suffix = (output.size() >= esc_suffix.size() &&
403 std::string_view(output).substr(
404 output.size() - esc_suffix.size()) == esc_suffix);
405 if (has_prefix && has_suffix) {
// The substr() start argument is not visible in the listing; the visible
// length argument is the output size minus both escape sequences.
406 output = output.substr(
408 output.size() - esc_prefix.size() - esc_suffix.size());
422 std::string_view defined_term)
noexcept {
423 return find_in_registry(defined_term);
427 int iso_ir_number)
noexcept {
428 return find_by_ir_number(iso_ir_number);
433 return charset_registry[0];
437 std::vector<const character_set_info*> result;
438 result.reserve(charset_registry.size() + 2);
439 for (
const auto& entry : charset_registry) {
440 result.push_back(&entry);
442 result.push_back(&charset_ir_58);
443 result.push_back(&charset_gb18030);
460 if (cs && cs->is_multi_byte) {
480 std::vector<std::string_view> components;
482 while (start <= value.size()) {
483 size_t pos = value.find(
'\\', start);
484 if (pos == std::string_view::npos) {
485 components.push_back(value.substr(start));
488 components.push_back(value.substr(start, pos - start));
492 if (components.empty()) {
497 auto first = components[0];
499 while (!first.empty() && first.front() ==
' ') first.remove_prefix(1);
500 while (!first.empty() && first.back() ==
' ') first.remove_suffix(1);
502 if (!first.empty()) {
503 const auto* cs = find_in_registry(first);
511 for (
size_t i = 1; i < components.size(); ++i) {
512 auto term = components[i];
513 while (!term.empty() && term.front() ==
' ') term.remove_prefix(1);
514 while (!term.empty() && term.back() ==
' ') term.remove_suffix(1);
517 const auto* cs = find_in_registry(term);
532 std::string_view text,
535 std::vector<text_segment> segments;
548 size_t segment_start = 0;
550 for (
size_t i = 0; i < text.size(); ++i) {
551 if (text[i] == ESC) {
553 const auto* new_charset = find_by_escape_sequence(text, i, scs);
556 if (i > segment_start) {
558 text.substr(segment_start, i - segment_start),
564 size_t esc_len = new_charset->escape_sequence.empty()
566 : new_charset->escape_sequence.size();
568 segment_start = i + 1;
569 current_charset = new_charset;
575 if (segment_start < text.size()) {
577 text.substr(segment_start),
590 std::string_view text,
600 return std::string(text);
603 return icu_convert_to_utf8(text, charset);
607 std::string_view text,
616 return std::string(text);
628 result.reserve(text.size() * 2);
630 for (
const auto& seg : segments) {
634 result.append(seg.text);
642 std::string_view pn_value,
645 if (pn_value.empty()) {
652 result.reserve(pn_value.size() * 2);
657 while (start <= pn_value.size()) {
658 size_t eq_pos = pn_value.find(
'=', start);
659 std::string_view group;
660 if (eq_pos == std::string_view::npos) {
661 group = pn_value.substr(start);
662 start = pn_value.size() + 1;
664 group = pn_value.substr(start, eq_pos - start);
685 std::string_view utf8_text,
688 if (utf8_text.empty()) {
694 return std::string(utf8_text);
697 return icu_convert_from_utf8(utf8_text, charset);
701 std::string_view utf8_text,
704 if (utf8_text.empty()) {
710 return std::string(utf8_text);
726 if (cs && cs->is_multi_byte) {
738 result.reserve(utf8_text.size() * 2);
740 bool in_multibyte =
false;
741 size_t run_start = 0;
743 for (
size_t i = 0; i < utf8_text.size(); ) {
744 auto uc =
static_cast<unsigned char>(utf8_text[i]);
745 bool is_ascii = (uc < 0x80);
750 auto mb_run = utf8_text.substr(run_start, i - run_start);
754 result.append(esc_ir_6);
755 in_multibyte =
false;
757 if (!in_multibyte && i == run_start) {
761 result += utf8_text[i];
770 if (uc < 0xC0) { ++i; }
771 else if (uc < 0xE0) { i += 2; }
772 else if (uc < 0xF0) { i += 3; }
775 if (i > utf8_text.size()) { i = utf8_text.size(); }
780 if (in_multibyte && run_start < utf8_text.size()) {
781 auto mb_run = utf8_text.substr(run_start);
784 result.append(esc_ir_6);
DICOM Character Set registry, ISO 2022 parser, and string decoder.
std::vector< text_segment > split_by_escape_sequences(std::string_view text, const specific_character_set &scs)
Split a string into segments by ISO 2022 escape sequences.
const character_set_info * find_character_set(std::string_view defined_term) noexcept
Look up character set info by DICOM Defined Term.
std::string encode_from_utf8(std::string_view utf8_text, const specific_character_set &scs)
Encode a UTF-8 string to the target character set encoding.
specific_character_set parse_specific_character_set(std::string_view value)
Parse a Specific Character Set (0008,0005) value.
const character_set_info & default_character_set() noexcept
Get the default character set (ISO-IR 6, ASCII).
std::vector< const character_set_info * > all_character_sets() noexcept
Get all registered character sets.
const character_set_info * find_character_set_by_ir(int iso_ir_number) noexcept
Look up character set info by ISO-IR number.
std::string decode_person_name(std::string_view pn_value, const specific_character_set &scs)
Decode a Person Name (PN) value to UTF-8.
std::string decode_to_utf8(std::string_view text, const specific_character_set &scs)
Decode a DICOM string to UTF-8 using the given character set.
std::string convert_from_utf8(std::string_view utf8_text, const character_set_info &charset)
Encode a single UTF-8 segment to a specific character set.
std::string convert_to_utf8(std::string_view text, const character_set_info &charset)
Decode a single segment from a specific encoding to UTF-8.
Information about a DICOM character set.
std::string_view defined_term
DICOM Defined Term (e.g., "ISO 2022 IR 149")
std::string_view escape_sequence
Raw escape sequence bytes (empty if none)
std::string_view encoding_name
ICU converter name (e.g., "EUC-KR")
bool is_multi_byte
true if characters can be multi-byte
Parsed representation of a multi-valued Specific Character Set.
std::vector< const character_set_info * > extension_sets
Additional character sets activated by escape sequences.
bool uses_extensions() const noexcept
Whether this uses ISO 2022 code extensions.
bool is_utf8() const noexcept
Check if UTF-8 is the active character set.
const character_set_info * default_set
Character set for default (G0) repertoire.
bool is_single_byte_only() const noexcept
Whether this is a single-byte-only configuration.