Skip to content

Commit 328f0af

Browse files
committed
Add Unicode transcoder to insert into an existing string
1 parent 167d5a3 commit 328f0af

File tree

1 file changed

+131
-50
lines changed

1 file changed

+131
-50
lines changed

fly/types/string/detail/string_unicode.hpp

+131-50
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <array>
88
#include <functional>
9+
#include <iterator>
910
#include <optional>
1011
#include <string>
1112

@@ -75,6 +76,43 @@ class BasicStringUnicode
7576
static std::optional<DesiredStringType>
7677
convert_encoding(IteratorType &it, const IteratorType &end);
7778

79+
/**
80+
* Convert the Unicode encoding of a string to another encoding, inserting the result into the
81+
* provided output iterator.
82+
*
83+
* @tparam DesiredStringType The type of string to convert to.
84+
* @tparam OutputIteratorType The type of the output iterator to insert the result into.
85+
* @tparam SourceStringType The type of string to convert.
86+
*
87+
* @param value The encoded Unicode string to convert; it is read from its beginning
88+
* through its end.
89+
* @param out The output iterator to insert the result into.
90+
*
91+
* @return Whether the conversion was successful.
92+
*/
93+
template <
94+
typename DesiredStringType,
95+
typename OutputIteratorType,
96+
typename SourceStringType = StringType>
97+
static bool convert_encoding_into(SourceStringType &&value, OutputIteratorType out);
98+
99+
/**
100+
* Convert the Unicode encoding of a string to another encoding, inserting the result into the
101+
* provided output iterator.
102+
*
103+
* @tparam DesiredStringType The type of string to convert to.
104+
* @tparam IteratorType The type of the encoded Unicode string's iterator.
104+
* @tparam OutputIteratorType The type of the output iterator to insert the result into.
105+
*
106+
* @param it,end Pointers to the beginning and end of the encoded Unicode string.
108+
* @param out The output iterator to insert the result into.
109+
*
110+
* @return Whether the conversion was successful.
111+
*/
112+
template <typename DesiredStringType, typename IteratorType, typename OutputIteratorType>
113+
static bool
114+
convert_encoding_into(IteratorType &it, const IteratorType &end, OutputIteratorType out);
115+
78116
/**
79117
* Decode a single Unicode codepoint, starting at the character pointed to by the provided
80118
* iterator. If successful, after invoking this method, that iterator will point at the first
@@ -155,6 +193,12 @@ class BasicStringUnicode
155193
static std::optional<StringType> unescape_codepoint(IteratorType &it, const IteratorType &end);
156194

157195
private:
196+
friend BasicStringUnicode<std::string>;
197+
friend BasicStringUnicode<std::wstring>;
198+
friend BasicStringUnicode<std::u8string>;
199+
friend BasicStringUnicode<std::u16string>;
200+
friend BasicStringUnicode<std::u32string>;
201+
158202
/**
159203
* Escape a single Unicode codepoint.
160204
*
@@ -234,32 +278,44 @@ class BasicStringUnicode
234278
/**
235279
* Encode a Unicode codepoint into a UTF-8 string.
236280
*
237-
* @param codepoint The codepoint to encode.
281+
* @tparam OutputIteratorType The type of the output iterator to insert the result into.
238282
*
239-
* @return A string containing the encoded Unicode codepoint.
283+
* @param codepoint The codepoint to encode.
284+
* @param out The output iterator to insert the result into.
240285
*/
241-
template <typename CharType = char_type, enable_if<size_of_type_is<CharType, 1>> = 0>
242-
static StringType codepoint_to_string(codepoint_type codepoint);
286+
template <
287+
typename OutputIteratorType,
288+
typename CharType = char_type,
289+
enable_if<size_of_type_is<CharType, 1>> = 0>
290+
static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
243291

244292
/**
245293
* Encode a Unicode codepoint into a UTF-16 string.
246294
*
247-
* @param codepoint The codepoint to encode.
295+
* @tparam OutputIteratorType The type of the output iterator to insert the result into.
248296
*
249-
* @return A string containing the encoded Unicode codepoint.
297+
* @param codepoint The codepoint to encode.
298+
* @param out The output iterator to insert the result into.
250299
*/
251-
template <typename CharType = char_type, enable_if<size_of_type_is<CharType, 2>> = 0>
252-
static StringType codepoint_to_string(codepoint_type codepoint);
300+
template <
301+
typename OutputIteratorType,
302+
typename CharType = char_type,
303+
enable_if<size_of_type_is<CharType, 2>> = 0>
304+
static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
253305

254306
/**
255307
* Encode a Unicode codepoint into a UTF-32 string.
256308
*
257-
* @param codepoint The codepoint to encode.
309+
* @tparam OutputIteratorType The type of the output iterator to insert the result into.
258310
*
259-
* @return A string containing the encoded Unicode codepoint.
311+
* @param codepoint The codepoint to encode.
312+
* @param out The output iterator to insert the result into.
260313
*/
261-
template <typename CharType = char_type, enable_if<size_of_type_is<CharType, 4>> = 0>
262-
static StringType codepoint_to_string(codepoint_type codepoint);
314+
template <
315+
typename OutputIteratorType,
316+
typename CharType = char_type,
317+
enable_if<size_of_type_is<CharType, 4>> = 0>
318+
static void codepoint_to_string(codepoint_type codepoint, OutputIteratorType out);
263319

264320
/**
265321
* Create a Unicode codepoint from either one complete codepoint or two surrogate halves. The
@@ -382,26 +438,50 @@ template <typename DesiredStringType, typename IteratorType>
382438
std::optional<DesiredStringType>
383439
BasicStringUnicode<StringType>::convert_encoding(IteratorType &it, const IteratorType &end)
384440
{
385-
using DesiredUnicodeType = BasicStringUnicode<DesiredStringType>;
386-
387441
DesiredStringType result;
388442
result.reserve(static_cast<typename StringType::size_type>(std::distance(it, end)));
389443

444+
if (convert_encoding_into<DesiredStringType>(it, end, std::back_inserter(result)))
445+
{
446+
return result;
447+
}
448+
449+
return std::nullopt;
450+
}
451+
452+
//==================================================================================================
453+
template <typename StringType>
454+
template <typename DesiredStringType, typename OutputIteratorType, typename SourceStringType>
455+
bool BasicStringUnicode<StringType>::convert_encoding_into(
456+
SourceStringType &&value,
457+
OutputIteratorType out)
458+
{
459+
auto it = value.cbegin();
460+
return convert_encoding_into<DesiredStringType>(it, value.cend(), out);
461+
}
462+
463+
//==================================================================================================
464+
template <typename StringType>
465+
template <typename DesiredStringType, typename IteratorType, typename OutputIteratorType>
466+
bool BasicStringUnicode<StringType>::convert_encoding_into(
467+
IteratorType &it,
468+
const IteratorType &end,
469+
OutputIteratorType out)
470+
{
471+
using DesiredUnicodeType = BasicStringUnicode<DesiredStringType>;
472+
390473
while (it != end)
391474
{
392475
if (auto codepoint = decode_codepoint(it, end); codepoint)
393476
{
394-
if (auto encoded = DesiredUnicodeType::encode_codepoint(*codepoint); encoded)
395-
{
396-
result += *std::move(encoded);
397-
continue;
398-
}
477+
DesiredUnicodeType::codepoint_to_string(*codepoint, out);
478+
continue;
399479
}
400480

401-
return std::nullopt;
481+
return false;
402482
}
403483

404-
return result;
484+
return true;
405485
}
406486

407487
//==================================================================================================
@@ -426,7 +506,10 @@ std::optional<StringType> BasicStringUnicode<StringType>::encode_codepoint(codep
426506
{
427507
if (validate_codepoint(codepoint))
428508
{
429-
return codepoint_to_string(codepoint);
509+
StringType result;
510+
codepoint_to_string(codepoint, std::back_inserter(result));
511+
512+
return result;
430513
}
431514

432515
return std::nullopt;
@@ -663,64 +746,62 @@ auto BasicStringUnicode<StringType>::codepoint_from_string(
663746

664747
//==================================================================================================
665748
template <typename StringType>
666-
template <typename CharType, enable_if<size_of_type_is<CharType, 1>>>
667-
StringType BasicStringUnicode<StringType>::codepoint_to_string(codepoint_type codepoint)
749+
template <typename OutputIteratorType, typename CharType, enable_if<size_of_type_is<CharType, 1>>>
750+
void BasicStringUnicode<StringType>::codepoint_to_string(
751+
codepoint_type codepoint,
752+
OutputIteratorType out)
668753
{
669-
StringType result;
670-
671754
if (codepoint < 0x80)
672755
{
673-
result += static_cast<char_type>(codepoint);
756+
*out++ = static_cast<char_type>(codepoint);
674757
}
675758
else if (codepoint < 0x800)
676759
{
677-
result += static_cast<char_type>(0xc0 | (codepoint >> 6));
678-
result += static_cast<char_type>(0x80 | (codepoint & 0x3f));
760+
*out++ = static_cast<char_type>(0xc0 | (codepoint >> 6));
761+
*out++ = static_cast<char_type>(0x80 | (codepoint & 0x3f));
679762
}
680763
else if (codepoint < 0x10000)
681764
{
682-
result += static_cast<char_type>(0xe0 | (codepoint >> 12));
683-
result += static_cast<char_type>(0x80 | ((codepoint >> 6) & 0x3f));
684-
result += static_cast<char_type>(0x80 | (codepoint & 0x3f));
765+
*out++ = static_cast<char_type>(0xe0 | (codepoint >> 12));
766+
*out++ = static_cast<char_type>(0x80 | ((codepoint >> 6) & 0x3f));
767+
*out++ = static_cast<char_type>(0x80 | (codepoint & 0x3f));
685768
}
686769
else
687770
{
688-
result += static_cast<char_type>(0xf0 | (codepoint >> 18));
689-
result += static_cast<char_type>(0x80 | ((codepoint >> 12) & 0x3f));
690-
result += static_cast<char_type>(0x80 | ((codepoint >> 6) & 0x3f));
691-
result += static_cast<char_type>(0x80 | (codepoint & 0x3f));
771+
*out++ = static_cast<char_type>(0xf0 | (codepoint >> 18));
772+
*out++ = static_cast<char_type>(0x80 | ((codepoint >> 12) & 0x3f));
773+
*out++ = static_cast<char_type>(0x80 | ((codepoint >> 6) & 0x3f));
774+
*out++ = static_cast<char_type>(0x80 | (codepoint & 0x3f));
692775
}
693-
694-
return result;
695776
}
696777

697778
//==================================================================================================
698779
template <typename StringType>
699-
template <typename CharType, enable_if<size_of_type_is<CharType, 2>>>
700-
StringType BasicStringUnicode<StringType>::codepoint_to_string(codepoint_type codepoint)
780+
template <typename OutputIteratorType, typename CharType, enable_if<size_of_type_is<CharType, 2>>>
781+
void BasicStringUnicode<StringType>::codepoint_to_string(
782+
codepoint_type codepoint,
783+
OutputIteratorType out)
701784
{
702-
StringType result;
703-
704785
if (codepoint < 0x10000)
705786
{
706-
result += static_cast<char_type>(codepoint);
787+
*out++ = static_cast<char_type>(codepoint);
707788
}
708789
else
709790
{
710791
codepoint -= 0x10000;
711-
result += static_cast<char_type>(s_high_surrogate_min | (codepoint >> 10));
712-
result += static_cast<char_type>(s_low_surrogate_min | (codepoint & 0x3ff));
792+
*out++ = static_cast<char_type>(s_high_surrogate_min | (codepoint >> 10));
793+
*out++ = static_cast<char_type>(s_low_surrogate_min | (codepoint & 0x3ff));
713794
}
714-
715-
return result;
716795
}
717796

718797
//==================================================================================================
719798
template <typename StringType>
720-
template <typename CharType, enable_if<size_of_type_is<CharType, 4>>>
721-
StringType BasicStringUnicode<StringType>::codepoint_to_string(codepoint_type codepoint)
799+
template <typename OutputIteratorType, typename CharType, enable_if<size_of_type_is<CharType, 4>>>
800+
void BasicStringUnicode<StringType>::codepoint_to_string(
801+
codepoint_type codepoint,
802+
OutputIteratorType out)
722803
{
723-
return StringType(1, static_cast<char_type>(codepoint));
804+
*out++ = static_cast<char_type>(codepoint);
724805
}
725806

726807
//==================================================================================================

0 commit comments

Comments
 (0)