// Generic encoding conversion between string-like types. The branch taken depends on the
// (from_encoding, to_encoding) names and on the code-unit width of FromType / ToType.
// Returns {converted value, nullopt} on success, or {nullopt, error message} on failure.
// NOTE(review): this listing omits several lines (the declarator line carrying the `value`
// parameter, braces, and a few declarations), so the comments only describe what is visible.
35 template <
typename FromType,
typename ToType>
37 const std::string& from_encoding,
38 const std::string& to_encoding)
39 -> std::tuple<std::optional<ToType>, std::optional<std::string>>
// Early success with a default-constructed ToType. The guarding condition is not visible
// here (presumably an empty-input check — confirm).
43 return { ToType{}, std::nullopt };
// Same source/target type and identical encoding names: copy through without transcoding.
46 if constexpr (std::is_same_v<FromType, ToType>)
48 if (from_encoding == to_encoding)
50 return { ToType(value), std::nullopt };
// Raw code-unit pointer and code-unit count. Byte lengths passed to the UTF-8 routines
// below are computed as src_len * sizeof(FromType::value_type).
54 const auto* src_data = value.data();
55 size_t src_len = value.size();
// ---- UTF-8 -> UTF-16: "UTF-16" is accepted as a synonym for "UTF-16LE". ----
58 if (from_encoding ==
"UTF-8"
59 && (to_encoding ==
"UTF-16LE" || to_encoding ==
"UTF-16"))
// Reject malformed input before sizing the output buffer.
61 if (!simdutf::validate_utf8(
reinterpret_cast<const char*
>(src_data),
62 src_len *
sizeof(
typename FromType::value_type)))
64 return { std::nullopt,
"Invalid UTF-8 input" };
// Number of UTF-16 code units the output needs.
67 size_t utf16_len = simdutf::utf16_length_from_utf8(
68 reinterpret_cast<const char*
>(src_data),
69 src_len *
sizeof(
typename FromType::value_type));
// The conversion is only compiled when the target has 2-byte code units.
71 if constexpr (
sizeof(
typename ToType::value_type) == 2)
73 ToType result(utf16_len,
typename ToType::value_type{});
74 size_t written = simdutf::convert_utf8_to_utf16le(
75 reinterpret_cast<const char*
>(src_data),
76 src_len *
sizeof(
typename FromType::value_type),
77 reinterpret_cast<char16_t*
>(result.data()));
// Zero units written although output was expected is reported as a failure.
78 if (written == 0 && utf16_len > 0)
80 return { std::nullopt,
"UTF-8 to UTF-16LE conversion failed" };
// Trim the buffer to the number of code units actually produced.
82 result.resize(written);
83 return { result, std::nullopt };
// ---- UTF-8 -> UTF-32: "UTF-32" is accepted as a synonym for "UTF-32LE". ----
88 if (from_encoding ==
"UTF-8"
89 && (to_encoding ==
"UTF-32LE" || to_encoding ==
"UTF-32"))
91 if (!simdutf::validate_utf8(
reinterpret_cast<const char*
>(src_data),
92 src_len *
sizeof(
typename FromType::value_type)))
94 return { std::nullopt,
"Invalid UTF-8 input" };
// Number of UTF-32 code units the output needs.
97 size_t utf32_len = simdutf::utf32_length_from_utf8(
98 reinterpret_cast<const char*
>(src_data),
99 src_len *
sizeof(
typename FromType::value_type));
// The conversion is only compiled when the target has 4-byte code units.
101 if constexpr (
sizeof(
typename ToType::value_type) == 4)
103 ToType result(utf32_len,
typename ToType::value_type{});
104 size_t written = simdutf::convert_utf8_to_utf32(
105 reinterpret_cast<const char*
>(src_data),
106 src_len *
sizeof(
typename FromType::value_type),
107 reinterpret_cast<char32_t*
>(result.data()));
108 if (written == 0 && utf32_len > 0)
110 return { std::nullopt,
"UTF-8 to UTF-32 conversion failed" };
112 result.resize(written);
113 return { result, std::nullopt };
// ---- UTF-16 -> UTF-8: only compiled for sources with 2-byte code units. ----
118 if ((from_encoding ==
"UTF-16LE" || from_encoding ==
"UTF-16")
119 && to_encoding ==
"UTF-8")
121 if constexpr (
sizeof(
typename FromType::value_type) == 2)
// Reinterpret the source code units as char16_t; src_len is used as the unit count.
123 const char16_t* utf16_data
124 =
reinterpret_cast<const char16_t*
>(src_data);
126 if (!simdutf::validate_utf16le(utf16_data, src_len))
128 return { std::nullopt,
"Invalid UTF-16LE input" };
// The left-hand side of this assignment is on a line missing from the listing
// (presumably the declaration of utf8_len, which is used below — confirm).
132 = simdutf::utf8_length_from_utf16le(utf16_data, src_len);
// Output is only produced when the target type is exactly std::string.
134 if constexpr (std::is_same_v<ToType, std::string>)
136 std::string result(utf8_len,
'\0');
137 size_t written = simdutf::convert_utf16le_to_utf8(
138 utf16_data, src_len, result.data());
139 if (written == 0 && utf8_len > 0)
141 return { std::nullopt,
142 "UTF-16LE to UTF-8 conversion failed" };
144 result.resize(written);
145 return { result, std::nullopt };
// ---- UTF-32 -> UTF-8: only compiled for sources with 4-byte code units. ----
151 if ((from_encoding ==
"UTF-32LE" || from_encoding ==
"UTF-32")
152 && to_encoding ==
"UTF-8")
154 if constexpr (
sizeof(
typename FromType::value_type) == 4)
156 const char32_t* utf32_data
157 =
reinterpret_cast<const char32_t*
>(src_data);
159 if (!simdutf::validate_utf32(utf32_data, src_len))
161 return { std::nullopt,
"Invalid UTF-32 input" };
// As above, the left-hand side (presumably the utf8_len declaration) is not visible.
165 = simdutf::utf8_length_from_utf32(utf32_data, src_len);
167 if constexpr (std::is_same_v<ToType, std::string>)
169 std::string result(utf8_len,
'\0');
170 size_t written = simdutf::convert_utf32_to_utf8(
171 utf32_data, src_len, result.data());
172 if (written == 0 && utf8_len > 0)
174 return { std::nullopt,
175 "UTF-32 to UTF-8 conversion failed" };
177 result.resize(written);
178 return { result, std::nullopt };
// Final fallback: no visible branch handled this encoding pair / type-width combination.
// NOTE(review): none of the visible branches match big-endian or code-page names, so those
// requests appear to end up here — confirm against the full source.
183 return { std::nullopt,
184 std::format(
"Unsupported encoding conversion: {} -> {}",
185 from_encoding, to_encoding) };
// Converts a std::wstring to a system-encoded std::string in two steps:
// wide string -> UTF-8 via convert(), then UTF-8 -> system encoding via utf8_to_system().
189 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
// The wide input is always described as little-endian wchar_t encoding.
// (The binding that receives `result` and `error` is on a line missing from this listing.)
192 = convert<std::wstring, std::string>(value, get_wchar_encoding(endian_types::little),
193 get_encoding_name(encoding_types::utf8));
195 if (result.has_value())
197 return utf8_to_system(result.value());
// The wide -> UTF-8 step failed: forward its error message.
200 return { std::nullopt, error };
// std::wstring_view variant of to_string: wide string -> UTF-8 via convert(),
// then UTF-8 -> system encoding via utf8_to_system().
204 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
// The wide input is always described as little-endian wchar_t encoding.
206 auto [result, error] = convert<std::wstring_view, std::string>(
207 value, get_wchar_encoding(endian_types::little),
208 get_encoding_name(encoding_types::utf8));
210 if (result.has_value())
212 return utf8_to_system(result.value());
// The wide -> UTF-8 step failed: forward its error message.
215 return { std::nullopt, error };
// Converts a system-encoded string to std::wstring:
// system encoding -> UTF-8, strip a leading UTF-8 BOM, then UTF-8 -> wchar_t encoding.
219 -> std::tuple<std::optional<std::wstring>, std::optional<std::string>>
221 auto [result, err] = system_to_utf8(value);
222 if (!result.has_value())
// Forward the error from the system -> UTF-8 step.
224 return { std::nullopt, err };
// Drop the BOM so it does not become a character of the wide string.
227 std::string clean_value = remove_utf8_bom(result.value());
// `endian` selects the target wchar_t encoding name.
// NOTE(review): the visible convert() branches only match the "LE"/unsuffixed target names,
// so a big-endian request presumably yields an "Unsupported encoding conversion" error — confirm.
229 return convert<std::string, std::wstring>(
230 clean_value, get_encoding_name(encoding_types::utf8), get_wchar_encoding(endian));
// View-taking variant of to_wstring: the input is copied into a std::string, converted
// system encoding -> UTF-8, stripped of a leading UTF-8 BOM, then converted to wchar_t encoding.
234 -> std::tuple<std::optional<std::wstring>, std::optional<std::string>>
// system_to_utf8 is called with an owning std::string built from `value`.
236 auto [result, err] = system_to_utf8(std::string(value));
237 if (!result.has_value())
// Forward the error from the system -> UTF-8 step.
239 return { std::nullopt, err };
242 std::string clean_value = remove_utf8_bom(result.value());
// `endian` selects the target wchar_t encoding name passed to convert().
244 return convert<std::string, std::wstring>(
245 clean_value, get_encoding_name(encoding_types::utf8), get_wchar_encoding(endian));
// Maps an (encoding, endian) pair to an encoding-name string.
// NOTE(review): the return statement of every case is missing from this listing. The names
// are presumably the ones convert() compares against ("UTF-8", "UTF-16LE", "UTF-32LE", ...)
// — confirm against the full source.
253 case encoding_types::utf8:
// UTF-16 and UTF-32 pick their name from the requested endianness.
255 case encoding_types::utf16:
256 if (endian == endian_types::little)
258 else if (endian == endian_types::big)
262 case encoding_types::utf32:
263 if (endian == endian_types::little)
265 else if (endian == endian_types::big)
// Presumably reached when no case above returned a name.
270 throw std::runtime_error(
"Unknown encoding");
// Chooses the wchar_t encoding name at compile time from sizeof(wchar_t):
// 2 bytes -> UTF-16, 4 bytes -> UTF-32, both with the requested endianness.
276 if constexpr (
sizeof(wchar_t) == 2)
278 return get_encoding_name(encoding_types::utf16, endian);
280 else if constexpr (
sizeof(wchar_t) == 4)
282 return get_encoding_name(encoding_types::utf32, endian);
// Any other wchar_t width is reported with an exception.
286 throw std::runtime_error(
"Unsupported wchar_t size");
// Guesses the byte order of UTF-16 data: first from a BOM in the first code unit,
// then from a zero-byte heuristic over at most the first 1000 code units.
// The guard for this early return is not visible (presumably an empty-input check — confirm).
293 return endian_types::unknown;
// BOM check on the first code unit as read by the host.
// NOTE(review): a code unit that reads as 0xFEFF normally means the data is already in host
// byte order, and 0xFFFE means it is byte-swapped; mapping these unconditionally to
// big/little looks host-dependent — confirm the intended semantics.
295 if (str[0] == 0xFEFF)
296 return endian_types::big;
297 if (str[0] == 0xFFFE)
298 return endian_types::little;
// No BOM: sample up to 1000 code units.
300 size_t sample_size = std::min<size_t>(str.size(), 1000);
301 int le_count = 0, be_count = 0;
302 for (
size_t i = 0; i < sample_size; ++i)
304 uint16_t ch = str[i];
// High byte zero, low byte non-zero. The counter update is on a missing line
// (presumably ++le_count — confirm).
305 if ((ch & 0xFF00) == 0 && (ch & 0x00FF) != 0)
// Low byte zero, high byte non-zero (presumably ++be_count — confirm).
307 if ((ch & 0x00FF) == 0 && (ch & 0xFF00) != 0)
// Majority vote; a tie (including no evidence at all) yields unknown.
311 if (le_count > be_count)
312 return endian_types::little;
313 if (be_count > le_count)
314 return endian_types::big;
316 return endian_types::unknown;
// Guesses the byte order of UTF-32 data: first from a BOM in the first code unit,
// then from a zero-byte heuristic over at most the first 1000 code units.
// The guard for this early return is not visible (presumably an empty-input check — confirm).
322 return endian_types::unknown;
// BOM check on the first code unit as read by the host.
// NOTE(review): as with the UTF-16 overload, whether 0x0000FEFF means "big" depends on the
// host byte order — confirm the intended semantics.
324 if (str[0] == 0x0000FEFF)
325 return endian_types::big;
326 if (str[0] == 0xFFFE0000)
327 return endian_types::little;
// No BOM: sample up to 1000 code units.
329 size_t sample_size = std::min<size_t>(str.size(), 1000);
330 int le_count = 0, be_count = 0;
331 for (
size_t i = 0; i < sample_size; ++i)
333 uint32_t ch = str[i];
// Only the lowest byte is set. The counter update is on a missing line
// (presumably ++le_count — confirm).
334 if ((ch & 0xFFFFFF00) == 0 && (ch & 0x000000FF) != 0)
// Only the highest byte is set (presumably ++be_count — confirm).
336 if ((ch & 0x00FFFFFF) == 0 && (ch & 0xFF000000) != 0)
// Majority vote; a tie yields unknown.
340 if (le_count > be_count)
341 return endian_types::little;
342 if (be_count > le_count)
343 return endian_types::big;
345 return endian_types::unknown;
/// @brief Reports whether a string starts with the UTF-8 byte order mark (EF BB BF).
/// @param str Byte string to inspect.
/// @return true when the first three bytes are 0xEF, 0xBB, 0xBF; false otherwise,
///         including when the string is shorter than three bytes.
/// NOTE(review): the owning-class qualifier is not visible in this chunk; restore it when merging.
auto has_utf8_bom(const std::string& str) -> bool
{
    if (str.length() < 3)
    {
        return false;
    }

    // Compare as unsigned bytes so a signed `char` does not break the comparison.
    const auto byte_at = [&str](std::size_t index)
    { return static_cast<unsigned char>(str[index]); };

    return byte_at(0) == 0xEF && byte_at(1) == 0xBB && byte_at(2) == 0xBF;
}
357 return has_utf8_bom(str) ? str.substr(3) : str;
362 return has_utf8_bom(str) ? str : std::string(
"\xEF\xBB\xBF") + str;
// Returns the value of the Win32 GetACP() call (the active ANSI code page), cast to int.
// NOTE(review): only this return is visible; any other platform branch is missing from the listing.
368 return static_cast<int>(GetACP());
// Builds the code page name as "CP" followed by the decimal code page number (e.g. 949 -> "CP949").
// NOTE(review): other return paths, if any, are not visible in this listing.
381 return "CP" + std::to_string(code_page);
// Converts a string in the system code page to UTF-8.
// Returns {converted string, nullopt} on success or {nullopt, error message} on failure.
386 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
388 int code_page = get_system_code_page();
// 65001 is the Windows identifier for the UTF-8 code page: the input is returned unchanged.
389 if (code_page == 65001)
391 return { value, std::nullopt };
// Otherwise delegate to convert(), using the code page name as the source encoding.
// NOTE(review): the visible convert() branches only recognise identical names or the
// "UTF-8"/"UTF-16*"/"UTF-32*" names, so a "CP<n>" source presumably ends in the
// "Unsupported encoding conversion" error — confirm.
393 return convert<std::string, std::string>(value, get_code_page_name(code_page),
394 get_encoding_name(encoding_types::utf8));
// Converts a UTF-8 string to the system code page.
// Returns {converted string, nullopt} on success or {nullopt, error message} on failure.
398 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
400 int code_page = get_system_code_page();
// 65001 is the Windows identifier for the UTF-8 code page: the input is returned unchanged.
401 if (code_page == 65001)
403 return { value, std::nullopt };
// Otherwise delegate to convert(), using the code page name as the target encoding.
// NOTE(review): the visible convert() branches do not recognise "CP<n>" names, so this
// presumably ends in the "Unsupported encoding conversion" error — confirm.
405 return convert<std::string, std::string>(value, get_encoding_name(encoding_types::utf8),
406 get_code_page_name(code_page));
/// @brief Splits @p source on every occurrence of the delimiter @p token.
/// @param source Text to split.
/// @param token  Delimiter; may be longer than one character.
/// @return {pieces, nullopt}. Adjacent delimiters and a trailing delimiter produce empty
///         pieces. An empty @p token yields a single piece equal to @p source, which also
///         avoids a non-terminating search. The error slot is never set by this function.
/// NOTE(review): the owning-class qualifier and the original guard line are not visible in
/// this chunk; the guard is reconstructed as an empty-token check. Restore the qualifier when merging.
auto split(const std::string& source, const std::string& token)
    -> std::tuple<std::optional<std::vector<std::string>>, std::optional<std::string>>
{
    if (token.empty())
    {
        return { std::vector<std::string>{ source }, std::nullopt };
    }

    std::vector<std::string> result;
    std::size_t start_pos = 0;

    for (std::size_t end_pos = source.find(token); end_pos != std::string::npos;
         end_pos = source.find(token, start_pos))
    {
        // Construct each piece in place from the source range (no temporary substr()).
        result.emplace_back(source, start_pos, end_pos - start_pos);
        start_pos = end_pos + token.length();
    }

    // Remainder after the last delimiter (possibly empty).
    result.emplace_back(source, start_pos, std::string::npos);

    // Move the vector into the tuple instead of copying it.
    return { std::move(result), std::nullopt };
}
// Converts a system-encoded string to a UTF-8 byte array.
// Returns {bytes, nullopt} on success or {nullopt, error message} when the UTF-8 step fails.
434 -> std::tuple<std::optional<std::vector<uint8_t>>, std::optional<std::string>>
436 auto [utf8, convert_error] = system_to_utf8(value);
437 if (convert_error.has_value())
439 return { std::nullopt, convert_error };
// A leading UTF-8 BOM is removed so it does not appear in the byte array.
442 auto utf8_no_bom = remove_utf8_bom(utf8.value());
// Copy the UTF-8 bytes out one char at a time into uint8_t elements.
444 return { std::vector<uint8_t>(utf8_no_bom.begin(), utf8_no_bom.end()), std::nullopt };
// Interprets the elements of `value` as UTF-8 bytes and converts them to a
// system-encoded string; returns whatever utf8_to_system() returns.
448 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
450 std::string utf8(value.begin(), value.end());
// Strip a leading UTF-8 BOM before converting. The stripped copy used to be computed and
// then ignored (the BOM-bearing string was passed on), so the BOM leaked into the result.
451 auto utf8_no_bom = remove_utf8_bom(utf8);
453 return utf8_to_system(utf8_no_bom);
// Base64-encodes a byte array via base64_encode().
// Returns {encoded text, nullopt}; a std::exception thrown while encoding is turned into
// {nullopt, e.what()}. (The `try` line itself is missing from this listing.)
457 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
461 std::string encoded = base64_encode(value);
462 return { encoded, std::nullopt };
464 catch (
const std::exception& e)
466 return { std::nullopt, e.what() };
// Decodes Base64 text into bytes; a thin forwarder that returns base64_decode()'s
// {bytes, optional error message} tuple unchanged.
471 -> std::tuple<std::vector<uint8_t>, std::optional<std::string>>
473 return base64_decode(base64_str);
// In-place variant of replace2(): replaces every occurrence of `token` in `source` with `target`.
// The returned optional carries an error message.
// NOTE(review): both return statements are missing from this listing. Presumably the error
// from replace2() is returned on failure and std::nullopt on success — confirm.
477 const std::string& token,
478 const std::string& target) -> std::optional<std::string>
480 auto [value, value_error] = replace2(source, token, target);
481 if (value_error.has_value())
// Only reached when replace2() succeeded: overwrite the caller's string with the result.
486 source = value.value();
/// @brief Returns a copy of @p source with every occurrence of @p token replaced by @p target.
/// @param source Text to search; must not be empty.
/// @param token  Substring to replace; must not be empty.
/// @param target Replacement text; may be empty to delete occurrences.
/// @return {new string, nullopt} on success; {nullopt, "Source string is empty"} or
///         {nullopt, "Token string is empty"} when the corresponding argument is empty.
///         Matches are found left to right and do not overlap.
/// NOTE(review): the owning-class qualifier is not visible in this chunk; restore it when merging.
auto replace2(const std::string& source, const std::string& token, const std::string& target)
    -> std::tuple<std::optional<std::string>, std::optional<std::string>>
{
    if (source.empty())
    {
        return { std::nullopt, "Source string is empty" };
    }

    if (token.empty())
    {
        return { std::nullopt, "Token string is empty" };
    }

    std::string result;
    // A reasonable starting capacity; the string still grows if target is longer than token.
    result.reserve(source.size());

    std::size_t last_offset = 0;
    for (std::size_t offset = source.find(token); offset != std::string::npos;
         offset = source.find(token, last_offset))
    {
        // Append the untouched span and the replacement directly. This avoids the formatting
        // machinery and the temporary substr() copies used for plain concatenation.
        result.append(source, last_offset, offset - last_offset);
        result.append(target);
        last_offset = offset + token.size();
    }

    // Tail after the last match (the whole source when nothing matched).
    result.append(source, last_offset, std::string::npos);

    return { std::move(result), std::nullopt };
}
/// @brief Encodes a byte array as Base64 text using the standard alphabet with '=' padding.
/// @param data Bytes to encode; may be empty.
/// @return The encoded text: 4 characters per started group of 3 input bytes, and an empty
///         string for empty input.
/// NOTE(review): the owning-class qualifier and the last line of the alphabet literal are
/// not visible in this chunk. The alphabet is completed with the standard "0-9+/" tail;
/// restore the qualifier when merging.
auto base64_encode(const std::vector<uint8_t>& data) -> std::string
{
    static constexpr char base64_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                           "abcdefghijklmnopqrstuvwxyz"
                                           "0123456789+/";

    std::string encoded_string;
    encoded_string.reserve(((data.size() + 2) / 3) * 4);

    // Whole 3-byte groups: each becomes four 6-bit symbols.
    const std::size_t full_end = data.size() - data.size() % 3;
    for (std::size_t i = 0; i < full_end; i += 3)
    {
        const uint32_t triple = (static_cast<uint32_t>(data[i]) << 16)
                                | (static_cast<uint32_t>(data[i + 1]) << 8)
                                | static_cast<uint32_t>(data[i + 2]);

        encoded_string += base64_chars[(triple >> 18) & 0x3F];
        encoded_string += base64_chars[(triple >> 12) & 0x3F];
        encoded_string += base64_chars[(triple >> 6) & 0x3F];
        encoded_string += base64_chars[triple & 0x3F];
    }

    // Trailing 1 or 2 bytes: missing bytes count as zero, and the symbols that would carry
    // only those zero bits are emitted as '=' padding.
    const std::size_t tail = data.size() - full_end;
    if (tail != 0)
    {
        uint32_t triple = static_cast<uint32_t>(data[full_end]) << 16;
        if (tail == 2)
        {
            triple |= static_cast<uint32_t>(data[full_end + 1]) << 8;
        }

        encoded_string += base64_chars[(triple >> 18) & 0x3F];
        encoded_string += base64_chars[(triple >> 12) & 0x3F];
        encoded_string += (tail == 2) ? base64_chars[(triple >> 6) & 0x3F] : '=';
        encoded_string += '=';
    }

    return encoded_string;
}
// Decodes Base64 text into bytes.
// Returns {bytes, nullopt} on success; on failure returns an empty vector plus an error message.
// NOTE(review): several lines are missing from this listing (the end of the alphabet literal,
// the declarations of `padding`, `buffer` and `i`, and some conditions), so the comments
// below describe only the visible statements.
555 -> std::tuple<std::vector<uint8_t>, std::optional<std::string>>
557 static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
558 "abcdefghijklmnopqrstuvwxyz"
// Input must be a whole number of 4-character groups (empty input passes this check).
561 if (base64_str.length() % 4 != 0)
563 return { std::vector<uint8_t>(),
"Invalid base64 input length" };
// Inspect the last two characters for '=' padding. The statements that update `padding`
// are on missing lines (presumably an increment per '=' — confirm).
567 if (!base64_str.empty())
569 if (base64_str[base64_str.length() - 1] ==
'=')
571 if (base64_str.length() >= 2 && base64_str[base64_str.length() - 2] ==
'=')
// The condition guarding this error return is not visible in the listing.
575 return { std::vector<uint8_t>(),
"Invalid padding in base64 string" };
// Three output bytes per 4-character group, minus one byte per padding character.
579 size_t decoded_length = (base64_str.length() / 4) * 3 - padding;
580 std::vector<uint8_t> decoded_data;
581 decoded_data.reserve(decoded_length);
// Reverse lookup table: byte value -> 6-bit symbol index, -1 for bytes outside the alphabet.
// It is rebuilt on every call.
583 std::vector<int> decoding_table(256, -1);
584 for (
int i = 0; i < 64; i++)
586 decoding_table[
static_cast<unsigned char>(base64_chars[
static_cast<size_t>(i)])] = i;
// Bit accumulator state for the main loop.
590 int bits_collected = 0;
592 for (; i < base64_str.length(); ++i)
594 char c = base64_str[i];
// Presumably inside a check for c == '=' (not visible): a padding character appearing
// before the trailing padding region is rejected.
597 if (i < base64_str.length() - padding)
599 return { std::vector<uint8_t>(),
"Invalid padding position in base64 string" };
// Any byte outside the Base64 alphabet is an error.
604 if (decoding_table[
static_cast<unsigned char>(c)] == -1)
606 return { std::vector<uint8_t>(),
"Invalid character in base64 string" };
// Shift the 6-bit symbol into the accumulator.
609 buffer = (buffer << 6) | static_cast<uint32_t>(decoding_table[
static_cast<unsigned char>(c)]);
// Once at least 8 bits are available, emit one byte. The adjustments of bits_collected
// are on missing lines.
612 if (bits_collected >= 8)
615 decoded_data.push_back((buffer >> bits_collected) & 0xFF);
// Whatever remains after the main loop stopped must consist solely of '=' characters.
619 for (; i < base64_str.length(); ++i)
621 if (base64_str[i] !=
'=')
623 return { std::vector<uint8_t>(),
624 "Invalid character after padding in base64 string" };
628 return { decoded_data, std::nullopt };
static auto to_string(const std::wstring &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Converts a std::wstring to a std::string using the system encoding.
static auto base64_encode(const std::vector< uint8_t > &data) -> std::string
Encodes a byte array into a Base64 string.
static auto get_encoding_name(encoding_types encoding, endian_types endian=endian_types::little) -> std::string
Returns the encoding name string for the given encoding type and endianness.
static auto has_utf8_bom(const std::string &value) -> bool
Checks if a string has a UTF-8 BOM (Byte Order Mark).
static auto add_utf8_bom(const std::string &value) -> std::string
Adds a UTF-8 BOM to a string if it doesn't already have one.
static auto to_base64(const std::vector< uint8_t > &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Encodes a byte array into a Base64 string.
static auto split(const std::string &source, const std::string &token) -> std::tuple< std::optional< std::vector< std::string > >, std::optional< std::string > >
Splits a string by a given delimiter.
static auto get_system_code_page() -> int
Retrieves the system code page used for conversions.
static auto replace(std::string &source, const std::string &token, const std::string &target) -> std::optional< std::string >
Replaces all occurrences of token in source with target, in place.
static auto remove_utf8_bom(const std::string &value) -> std::string
Removes a leading UTF-8 BOM from a string, if present.
encoding_types
Supported encoding types for Unicode conversion.
static auto from_base64(const std::string &base64_str) -> std::tuple< std::vector< uint8_t >, std::optional< std::string > >
Decodes a Base64 string into a byte array.
static auto get_wchar_encoding(endian_types endian=endian_types::little) -> std::string
Derives the wchar_t encoding name based on its size and endianness.
static auto system_to_utf8(const std::string &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Converts a system-encoded string to UTF-8.
static auto utf8_to_system(const std::string &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Converts a UTF-8 encoded string to the system encoding.
static auto detect_endian(const std::u16string &value) -> endian_types
Detects the endianness of a UTF-16 string by checking for BOM or content patterns.
static auto base64_decode(const std::string &base64_str) -> std::tuple< std::vector< uint8_t >, std::optional< std::string > >
Decodes a Base64 string into a byte array.
static auto replace2(const std::string &source, const std::string &token, const std::string &target) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Replaces all occurrences of token in source with target, returning a new string.
static auto get_code_page_name(int code_page) -> std::string
Retrieves a textual name for a code page (e.g., "CP949", i.e. "CP" followed by the numeric code page, or a locale-based name).
static auto to_wstring(const std::string &value) -> std::tuple< std::optional< std::wstring >, std::optional< std::string > >
Converts a std::string (system-encoded) to a std::wstring.
static auto convert(const FromType &value, const std::string &from_encoding, const std::string &to_encoding) -> std::tuple< std::optional< ToType >, std::optional< std::string > >
Converts from one encoding to another using simdutf.
static auto to_array(const std::string &value) -> std::tuple< std::optional< std::vector< uint8_t > >, std::optional< std::string > >
Converts a system-encoded string to a UTF-8 byte array.
endian_types
Possible endianness values for UTF-16 or UTF-32 data.
String encoding conversion, Base64 encoding/decoding utilities.