Thread System 0.3.1
High-performance C++20 thread pool with work stealing and DAG scheduling
Loading...
Searching...
No Matches
convert_string.cpp
Go to the documentation of this file.
1// BSD 3-Clause License
2// Copyright (c) 2024, 🍀☀🌕🌥 🌊
3// See the LICENSE file in the project root for full license information.
4
6
8
9#include <simdutf.h>
10
11#include <stdexcept>
12#include <cstdint>
13#include <format>
14
15#ifdef _WIN32
16#include <windows.h>
17#endif
18
33namespace utility_module
34{
/**
 * @brief Converts text between UTF-8 and UTF-16/UTF-32 using simdutf.
 *
 * Supported directions are UTF-8 -> "UTF-16LE"/"UTF-16", UTF-8 ->
 * "UTF-32LE"/"UTF-32", and the two reverse directions into std::string.
 * A direction is only available when the code-unit size of the matching
 * container type fits (2 bytes for UTF-16, 4 bytes for UTF-32).
 *
 * @tparam FromType String-like source type exposing value_type, data(), size(), empty().
 * @tparam ToType   String-like destination type exposing value_type.
 * @param value         Text to convert.
 * @param from_encoding Name of the source encoding.
 * @param to_encoding   Name of the destination encoding.
 * @return {converted text, nullopt} on success, {nullopt, message} on failure.
 */
template <typename FromType, typename ToType>
auto convert_string::convert(const FromType& value,
                             const std::string& from_encoding,
                             const std::string& to_encoding)
    -> std::tuple<std::optional<ToType>, std::optional<std::string>>
{
    using source_unit = typename FromType::value_type;
    using target_unit = typename ToType::value_type;

    // Empty input yields an empty result regardless of the encoding names.
    if (value.empty())
    {
        return { ToType{}, std::nullopt };
    }

    // Identical container type and identical encoding name: plain copy.
    if constexpr (std::is_same_v<FromType, ToType>)
    {
        if (from_encoding == to_encoding)
        {
            return { ToType(value), std::nullopt };
        }
    }

    const auto* units = value.data();
    const size_t unit_count = value.size();

    const bool source_is_utf8 = (from_encoding == "UTF-8");
    const bool target_is_utf8 = (to_encoding == "UTF-8");
    const bool target_is_utf16 = (to_encoding == "UTF-16LE" || to_encoding == "UTF-16");
    const bool target_is_utf32 = (to_encoding == "UTF-32LE" || to_encoding == "UTF-32");

    // UTF-8 -> UTF-16 / UTF-32. The input is validated before the width of
    // the destination code unit is considered, so malformed input is reported
    // even when the destination type cannot hold the requested encoding.
    if (source_is_utf8 && (target_is_utf16 || target_is_utf32))
    {
        const char* bytes = reinterpret_cast<const char*>(units);
        const size_t byte_count = unit_count * sizeof(source_unit);

        if (!simdutf::validate_utf8(bytes, byte_count))
        {
            return { std::nullopt, "Invalid UTF-8 input" };
        }

        if (target_is_utf16)
        {
            if constexpr (sizeof(target_unit) == 2)
            {
                const size_t expected = simdutf::utf16_length_from_utf8(bytes, byte_count);
                ToType result(expected, target_unit{});
                const size_t written = simdutf::convert_utf8_to_utf16le(
                    bytes, byte_count, reinterpret_cast<char16_t*>(result.data()));
                if (written == 0 && expected > 0)
                {
                    return { std::nullopt, "UTF-8 to UTF-16LE conversion failed" };
                }
                result.resize(written);
                return { result, std::nullopt };
            }
        }
        else
        {
            if constexpr (sizeof(target_unit) == 4)
            {
                const size_t expected = simdutf::utf32_length_from_utf8(bytes, byte_count);
                ToType result(expected, target_unit{});
                const size_t written = simdutf::convert_utf8_to_utf32(
                    bytes, byte_count, reinterpret_cast<char32_t*>(result.data()));
                if (written == 0 && expected > 0)
                {
                    return { std::nullopt, "UTF-8 to UTF-32 conversion failed" };
                }
                result.resize(written);
                return { result, std::nullopt };
            }
        }
    }

    // UTF-16 -> UTF-8 (only for sources with 2-byte code units).
    if (target_is_utf8 && (from_encoding == "UTF-16LE" || from_encoding == "UTF-16"))
    {
        if constexpr (sizeof(source_unit) == 2)
        {
            const auto* utf16_units = reinterpret_cast<const char16_t*>(units);

            if (!simdutf::validate_utf16le(utf16_units, unit_count))
            {
                return { std::nullopt, "Invalid UTF-16LE input" };
            }

            if constexpr (std::is_same_v<ToType, std::string>)
            {
                const size_t expected
                    = simdutf::utf8_length_from_utf16le(utf16_units, unit_count);
                std::string result(expected, '\0');
                const size_t written = simdutf::convert_utf16le_to_utf8(
                    utf16_units, unit_count, result.data());
                if (written == 0 && expected > 0)
                {
                    return { std::nullopt, "UTF-16LE to UTF-8 conversion failed" };
                }
                result.resize(written);
                return { result, std::nullopt };
            }
        }
    }

    // UTF-32 -> UTF-8 (only for sources with 4-byte code units).
    if (target_is_utf8 && (from_encoding == "UTF-32LE" || from_encoding == "UTF-32"))
    {
        if constexpr (sizeof(source_unit) == 4)
        {
            const auto* utf32_units = reinterpret_cast<const char32_t*>(units);

            if (!simdutf::validate_utf32(utf32_units, unit_count))
            {
                return { std::nullopt, "Invalid UTF-32 input" };
            }

            if constexpr (std::is_same_v<ToType, std::string>)
            {
                const size_t expected
                    = simdutf::utf8_length_from_utf32(utf32_units, unit_count);
                std::string result(expected, '\0');
                const size_t written = simdutf::convert_utf32_to_utf8(
                    utf32_units, unit_count, result.data());
                if (written == 0 && expected > 0)
                {
                    return { std::nullopt, "UTF-32 to UTF-8 conversion failed" };
                }
                result.resize(written);
                return { result, std::nullopt };
            }
        }
    }

    // No branch matched this combination of encodings and container types.
    return { std::nullopt,
             std::format("Unsupported encoding conversion: {} -> {}",
                         from_encoding, to_encoding) };
}
187
188 auto convert_string::to_string(const std::wstring& value)
189 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
190 {
191 auto [result, error]
192 = convert<std::wstring, std::string>(value, get_wchar_encoding(endian_types::little),
193 get_encoding_name(encoding_types::utf8));
194
195 if (result.has_value())
196 {
197 return utf8_to_system(result.value());
198 }
199
200 return { std::nullopt, error };
201 }
202
203 auto convert_string::to_string(std::wstring_view value)
204 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
205 {
206 auto [result, error] = convert<std::wstring_view, std::string>(
207 value, get_wchar_encoding(endian_types::little),
208 get_encoding_name(encoding_types::utf8));
209
210 if (result.has_value())
211 {
212 return utf8_to_system(result.value());
213 }
214
215 return { std::nullopt, error };
216 }
217
218 auto convert_string::to_wstring(const std::string& value)
219 -> std::tuple<std::optional<std::wstring>, std::optional<std::string>>
220 {
221 auto [result, err] = system_to_utf8(value);
222 if (!result.has_value())
223 {
224 return { std::nullopt, err };
225 }
226
227 std::string clean_value = remove_utf8_bom(result.value());
228 endian_types endian = endian_types::little;
229 return convert<std::string, std::wstring>(
230 clean_value, get_encoding_name(encoding_types::utf8), get_wchar_encoding(endian));
231 }
232
233 auto convert_string::to_wstring(std::string_view value)
234 -> std::tuple<std::optional<std::wstring>, std::optional<std::string>>
235 {
236 auto [result, err] = system_to_utf8(std::string(value));
237 if (!result.has_value())
238 {
239 return { std::nullopt, err };
240 }
241
242 std::string clean_value = remove_utf8_bom(result.value());
243 endian_types endian = endian_types::little;
244 return convert<std::string, std::wstring>(
245 clean_value, get_encoding_name(encoding_types::utf8), get_wchar_encoding(endian));
246 }
247
249 endian_types endian) -> std::string
250 {
251 switch (encoding)
252 {
253 case encoding_types::utf8:
254 return "UTF-8";
255 case encoding_types::utf16:
256 if (endian == endian_types::little)
257 return "UTF-16LE";
258 else if (endian == endian_types::big)
259 return "UTF-16BE";
260 else
261 return "UTF-16";
262 case encoding_types::utf32:
263 if (endian == endian_types::little)
264 return "UTF-32LE";
265 else if (endian == endian_types::big)
266 return "UTF-32BE";
267 else
268 return "UTF-32";
269 default:
270 throw std::runtime_error("Unknown encoding");
271 }
272 }
273
275 {
276 if constexpr (sizeof(wchar_t) == 2)
277 {
278 return get_encoding_name(encoding_types::utf16, endian);
279 }
280 else if constexpr (sizeof(wchar_t) == 4)
281 {
282 return get_encoding_name(encoding_types::utf32, endian);
283 }
284 else
285 {
286 throw std::runtime_error("Unsupported wchar_t size");
287 }
288 }
289
290 auto convert_string::detect_endian(const std::u16string& str) -> endian_types
291 {
292 if (str.empty())
293 return endian_types::unknown;
294
295 if (str[0] == 0xFEFF)
296 return endian_types::big;
297 if (str[0] == 0xFFFE)
298 return endian_types::little;
299
300 size_t sample_size = std::min<size_t>(str.size(), 1000);
301 int le_count = 0, be_count = 0;
302 for (size_t i = 0; i < sample_size; ++i)
303 {
304 uint16_t ch = str[i];
305 if ((ch & 0xFF00) == 0 && (ch & 0x00FF) != 0)
306 ++le_count;
307 if ((ch & 0x00FF) == 0 && (ch & 0xFF00) != 0)
308 ++be_count;
309 }
310
311 if (le_count > be_count)
312 return endian_types::little;
313 if (be_count > le_count)
314 return endian_types::big;
315
316 return endian_types::unknown;
317 }
318
319 auto convert_string::detect_endian(const std::u32string& str) -> endian_types
320 {
321 if (str.empty())
322 return endian_types::unknown;
323
324 if (str[0] == 0x0000FEFF)
325 return endian_types::big;
326 if (str[0] == 0xFFFE0000)
327 return endian_types::little;
328
329 size_t sample_size = std::min<size_t>(str.size(), 1000);
330 int le_count = 0, be_count = 0;
331 for (size_t i = 0; i < sample_size; ++i)
332 {
333 uint32_t ch = str[i];
334 if ((ch & 0xFFFFFF00) == 0 && (ch & 0x000000FF) != 0)
335 ++le_count;
336 if ((ch & 0x00FFFFFF) == 0 && (ch & 0xFF000000) != 0)
337 ++be_count;
338 }
339
340 if (le_count > be_count)
341 return endian_types::little;
342 if (be_count > le_count)
343 return endian_types::big;
344
345 return endian_types::unknown;
346 }
347
348 auto convert_string::has_utf8_bom(const std::string& str) -> bool
349 {
350 return str.length() >= 3 && static_cast<unsigned char>(str[0]) == 0xEF
351 && static_cast<unsigned char>(str[1]) == 0xBB
352 && static_cast<unsigned char>(str[2]) == 0xBF;
353 }
354
355 auto convert_string::remove_utf8_bom(const std::string& str) -> std::string
356 {
357 return has_utf8_bom(str) ? str.substr(3) : str;
358 }
359
360 auto convert_string::add_utf8_bom(const std::string& str) -> std::string
361 {
362 return has_utf8_bom(str) ? str : std::string("\xEF\xBB\xBF") + str;
363 }
364
366 {
367#ifdef _WIN32
368 return static_cast<int>(GetACP());
369#else
370 return 65001;
371#endif
372 }
373
374 auto convert_string::get_code_page_name(int code_page) -> std::string
375 {
376 switch (code_page)
377 {
378 case 65001:
379 return "UTF-8";
380 default:
381 return "CP" + std::to_string(code_page);
382 }
383 }
384
385 auto convert_string::system_to_utf8(const std::string& value)
386 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
387 {
388 int code_page = get_system_code_page();
389 if (code_page == 65001)
390 {
391 return { value, std::nullopt };
392 }
393 return convert<std::string, std::string>(value, get_code_page_name(code_page),
394 get_encoding_name(encoding_types::utf8));
395 }
396
397 auto convert_string::utf8_to_system(const std::string& value)
398 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
399 {
400 int code_page = get_system_code_page();
401 if (code_page == 65001)
402 {
403 return { value, std::nullopt };
404 }
405 return convert<std::string, std::string>(value, get_encoding_name(encoding_types::utf8),
406 get_code_page_name(code_page));
407 }
408
409 auto convert_string::split(const std::string& source, const std::string& token)
410 -> std::tuple<std::optional<std::vector<std::string>>, std::optional<std::string>>
411 {
412 if (token.empty())
413 {
414 return { std::vector{ source }, std::nullopt };
415 }
416
417 std::vector<std::string> result;
418 size_t start_pos = 0;
419 size_t end_pos = source.find(token);
420
421 while (end_pos != std::string::npos)
422 {
423 result.emplace_back(source.substr(start_pos, end_pos - start_pos));
424 start_pos = end_pos + token.length();
425 end_pos = source.find(token, start_pos);
426 }
427
428 result.emplace_back(source.substr(start_pos));
429
430 return { result, std::nullopt };
431 }
432
433 auto convert_string::to_array(const std::string& value)
434 -> std::tuple<std::optional<std::vector<uint8_t>>, std::optional<std::string>>
435 {
436 auto [utf8, convert_error] = system_to_utf8(value);
437 if (convert_error.has_value())
438 {
439 return { std::nullopt, convert_error };
440 }
441
442 auto utf8_no_bom = remove_utf8_bom(utf8.value());
443
444 return { std::vector<uint8_t>(utf8_no_bom.begin(), utf8_no_bom.end()), std::nullopt };
445 }
446
447 auto convert_string::to_string(const std::vector<uint8_t>& value)
448 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
449 {
450 std::string utf8(value.begin(), value.end());
451 auto utf8_no_bom = remove_utf8_bom(utf8);
452
453 return utf8_to_system(utf8);
454 }
455
456 auto convert_string::to_base64(const std::vector<uint8_t>& value)
457 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
458 {
459 try
460 {
461 std::string encoded = base64_encode(value);
462 return { encoded, std::nullopt };
463 }
464 catch (const std::exception& e)
465 {
466 return { std::nullopt, e.what() };
467 }
468 }
469
470 auto convert_string::from_base64(const std::string& base64_str)
471 -> std::tuple<std::vector<uint8_t>, std::optional<std::string>>
472 {
473 return base64_decode(base64_str);
474 }
475
476 auto convert_string::replace(std::string& source,
477 const std::string& token,
478 const std::string& target) -> std::optional<std::string>
479 {
480 auto [value, value_error] = replace2(source, token, target);
481 if (value_error.has_value())
482 {
483 return value_error;
484 }
485
486 source = value.value();
487 return std::nullopt;
488 }
489
490 auto convert_string::replace2(const std::string& source,
491 const std::string& token,
492 const std::string& target)
493 -> std::tuple<std::optional<std::string>, std::optional<std::string>>
494 {
495 if (source.empty())
496 {
497 return { std::nullopt, "Source string is empty" };
498 }
499
500 if (token.empty())
501 {
502 return { std::nullopt, "Token string is empty" };
503 }
504
505 std::string result;
506
507 size_t last_offset = 0;
508 for (size_t offset = source.find(token, last_offset); offset != std::string::npos;
509 last_offset = offset + token.size(), offset = source.find(token, last_offset))
510 {
511 std::format_to(std::back_inserter(result), "{}{}",
512 source.substr(last_offset, offset - last_offset), target);
513 }
514
515 std::format_to(std::back_inserter(result), "{}", source.substr(last_offset));
516
517 return { result, std::nullopt };
518 }
519
520 auto convert_string::base64_encode(const std::vector<uint8_t>& data) -> std::string
521 {
522 static const char base64_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
523 "abcdefghijklmnopqrstuvwxyz"
524 "0123456789+/";
525
526 std::string encoded_string;
527 size_t i = 0;
528 uint32_t octet_a, octet_b, octet_c;
529 uint32_t triple;
530
531 while (i < data.size())
532 {
533 octet_a = i < data.size() ? data[i++] : 0;
534 octet_b = i < data.size() ? data[i++] : 0;
535 octet_c = i < data.size() ? data[i++] : 0;
536
537 triple = (octet_a << 16) + (octet_b << 8) + octet_c;
538
539 encoded_string += base64_chars[(triple >> 18) & 0x3F];
540 encoded_string += base64_chars[(triple >> 12) & 0x3F];
541 encoded_string += base64_chars[(triple >> 6) & 0x3F];
542 encoded_string += base64_chars[triple & 0x3F];
543 }
544
545 int mod_table[] = { 0, 2, 1 };
546 for (int j = 0; j < mod_table[data.size() % 3]; j++)
547 {
548 encoded_string[encoded_string.size() - 1 - static_cast<size_t>(j)] = '=';
549 }
550
551 return encoded_string;
552 }
553
554 auto convert_string::base64_decode(const std::string& base64_str)
555 -> std::tuple<std::vector<uint8_t>, std::optional<std::string>>
556 {
557 static const std::string base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
558 "abcdefghijklmnopqrstuvwxyz"
559 "0123456789+/";
560
561 if (base64_str.length() % 4 != 0)
562 {
563 return { std::vector<uint8_t>(), "Invalid base64 input length" };
564 }
565
566 size_t padding = 0;
567 if (!base64_str.empty())
568 {
569 if (base64_str[base64_str.length() - 1] == '=')
570 padding++;
571 if (base64_str.length() >= 2 && base64_str[base64_str.length() - 2] == '=')
572 padding++;
573 if (padding > 2)
574 {
575 return { std::vector<uint8_t>(), "Invalid padding in base64 string" };
576 }
577 }
578
579 size_t decoded_length = (base64_str.length() / 4) * 3 - padding;
580 std::vector<uint8_t> decoded_data;
581 decoded_data.reserve(decoded_length);
582
583 std::vector<int> decoding_table(256, -1);
584 for (int i = 0; i < 64; i++)
585 {
586 decoding_table[static_cast<unsigned char>(base64_chars[static_cast<size_t>(i)])] = i;
587 }
588
589 uint32_t buffer = 0;
590 int bits_collected = 0;
591 size_t i = 0;
592 for (; i < base64_str.length(); ++i)
593 {
594 char c = base64_str[i];
595 if (c == '=')
596 {
597 if (i < base64_str.length() - padding)
598 {
599 return { std::vector<uint8_t>(), "Invalid padding position in base64 string" };
600 }
601 break;
602 }
603
604 if (decoding_table[static_cast<unsigned char>(c)] == -1)
605 {
606 return { std::vector<uint8_t>(), "Invalid character in base64 string" };
607 }
608
609 buffer = (buffer << 6) | static_cast<uint32_t>(decoding_table[static_cast<unsigned char>(c)]);
610 bits_collected += 6;
611
612 if (bits_collected >= 8)
613 {
614 bits_collected -= 8;
615 decoded_data.push_back((buffer >> bits_collected) & 0xFF);
616 }
617 }
618
619 for (; i < base64_str.length(); ++i)
620 {
621 if (base64_str[i] != '=')
622 {
623 return { std::vector<uint8_t>(),
624 "Invalid character after padding in base64 string" };
625 }
626 }
627
628 return { decoded_data, std::nullopt };
629 }
630
631} // namespace utility_module
static auto to_string(const std::wstring &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Converts a std::wstring to a std::string using the system encoding.
static auto base64_encode(const std::vector< uint8_t > &data) -> std::string
Encodes a byte array into a Base64 string.
static auto get_encoding_name(encoding_types encoding, endian_types endian=endian_types::little) -> std::string
Returns the encoding name string for the given encoding type and endianness.
static auto has_utf8_bom(const std::string &value) -> bool
Checks if a string has a UTF-8 BOM (Byte Order Mark).
static auto add_utf8_bom(const std::string &value) -> std::string
Adds a UTF-8 BOM to a string if it doesn't already have one.
static auto to_base64(const std::vector< uint8_t > &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Encodes a byte array into a Base64 string.
static auto split(const std::string &source, const std::string &token) -> std::tuple< std::optional< std::vector< std::string > >, std::optional< std::string > >
Splits a string by a given delimiter.
static auto get_system_code_page() -> int
Retrieves the system code page used for conversions.
static auto replace(std::string &source, const std::string &token, const std::string &target) -> std::optional< std::string >
Replaces all occurrences of token in source with target, in place.
static auto remove_utf8_bom(const std::string &value) -> std::string
Removes a leading UTF-8 BOM from a string, if present.
encoding_types
Supported encoding types for Unicode conversion.
static auto from_base64(const std::string &base64_str) -> std::tuple< std::vector< uint8_t >, std::optional< std::string > >
Decodes a Base64 string into a byte array.
static auto get_wchar_encoding(endian_types endian=endian_types::little) -> std::string
Derives the wchar_t encoding name based on its size and endianness.
static auto system_to_utf8(const std::string &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Converts a system-encoded string to UTF-8.
static auto utf8_to_system(const std::string &value) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Converts a UTF-8 encoded string to the system encoding.
static auto detect_endian(const std::u16string &value) -> endian_types
Detects the endianness of a UTF-16 string by checking for BOM or content patterns.
static auto base64_decode(const std::string &base64_str) -> std::tuple< std::vector< uint8_t >, std::optional< std::string > >
Decodes a Base64 string into a byte array.
static auto replace2(const std::string &source, const std::string &token, const std::string &target) -> std::tuple< std::optional< std::string >, std::optional< std::string > >
Replaces all occurrences of token in source with target, returning a new string.
static auto get_code_page_name(int code_page) -> std::string
Retrieves a textual name for a code page: "UTF-8" for 65001, otherwise "CP" followed by the numeric code page (e.g., "CP949").
static auto to_wstring(const std::string &value) -> std::tuple< std::optional< std::wstring >, std::optional< std::string > >
Converts a std::string (system-encoded) to a std::wstring.
static auto convert(const FromType &value, const std::string &from_encoding, const std::string &to_encoding) -> std::tuple< std::optional< ToType >, std::optional< std::string > >
Converts from one encoding to another using simdutf.
static auto to_array(const std::string &value) -> std::tuple< std::optional< std::vector< uint8_t > >, std::optional< std::string > >
Converts a system-encoded string to a UTF-8 byte array.
endian_types
Possible endianness values for UTF-16 or UTF-32 data.
String encoding conversion, Base64 encoding/decoding utilities.
Generic formatter for enum types using user-provided converter functors.