Skip to content

Commit 3b705a5

Browse files
committed
Use end of chunk in complex unicode
1 parent fbb4a9d commit 3b705a5

File tree

3 files changed

+204
-10
lines changed

3 files changed

+204
-10
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ src/libunicode/ucd.h
1313
src/libunicode/ucd_enums.h
1414
src/libunicode/ucd_fmt.h
1515
src/libunicode/ucd_ostream.h
16+
/.cache/

src/libunicode/grapheme_line_segmenter.h

+57-8
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
#include <libunicode/grapheme_segmenter.h>
1717
#include <libunicode/support.h>
1818

19-
#if defined(LIBUNICODE_TRACE)
19+
#include <fmt/core.h>
20+
21+
#if 0 || defined(LIBUNICODE_TRACE)
2022
#include <format>
2123
#include <iostream>
2224

@@ -457,9 +459,11 @@ namespace detail
457459
//
458460
// @returns a sequence of grapheme clusters up to maxWidth width.
459461
template <OptionalGraphemeSegmentationListenerConcept EventHandlerT>
460-
LIBUNICODE_INLINE auto process_only_complex_unicode(
461-
EventHandlerT& eventHandler, unicode_process_state& state, char const* start, char const* end, unsigned maxWidth) noexcept
462-
-> detail::unicode_process_result
462+
LIBUNICODE_INLINE auto process_only_complex_unicode(EventHandlerT& eventHandler,
463+
unicode_process_state& state,
464+
char const* start,
465+
char const* end,
466+
unsigned maxWidth) noexcept -> detail::unicode_process_result
463467
{
464468
if (!state.utf8DecodeNext)
465469
{
@@ -538,6 +542,10 @@ namespace detail
538542
return make_scan_result(consumedWidths, state.currentClusterStart, StopCondition::EndOfWidth);
539543
}
540544
}
545+
else
546+
{
547+
// Boundary of a grapheme not found yet.
548+
}
541549
}
542550
else if (std::holds_alternative<unicode::Invalid>(result))
543551
{
@@ -660,15 +668,22 @@ class grapheme_line_segmenter<OptionalEventListener>
660668
return { .text = {}, .width = 0, .stop_condition = StopCondition::EndOfInput };
661669

662670
// Points to the beginning of a grapheme cluster.
663-
char const* const resultStart = _complexUnicodeState.currentClusterStart;
671+
char const* const resultStart = _complexUnicodeState.currentCodepointStart;
664672
char const* const endAtMaxWidth = std::min(end(), next() + maxWidth);
665673

674+
LIBUNICODE_TRACE_SEGMENTER("resultStart: {}\n", (void*) resultStart);
666675
// Total number of widths used in the current line.
667676
unsigned processedTotalWidth = 0;
668677

669678
while (true)
670679
{
671-
switch (detail::make_state(next(), end(), processedTotalWidth, maxWidth))
680+
auto const state = detail::make_state(next(), end(), processedTotalWidth, maxWidth);
681+
LIBUNICODE_TRACE_SEGMENTER("currentClusterStart: {}, end: {} , processedTotalWidth: {}, state: {} \n",
682+
(void*) _complexUnicodeState.currentClusterStart,
683+
(void*) end(),
684+
processedTotalWidth,
685+
state);
686+
switch (state)
672687
{
673688
case State::EndOfInput:
674689
return { .text = { resultStart, _complexUnicodeState.currentClusterStart },
@@ -725,14 +740,24 @@ class grapheme_line_segmenter<OptionalEventListener>
725740
std::string_view(start, chunk.end),
726741
(long) std::distance(start, chunk.end),
727742
chunk.totalWidth,
728-
(int) chunk.stop_condition);
743+
[](auto stop) {
744+
switch (stop)
745+
{
746+
case StopCondition::UnexpectedInput: return "UnexpectedInput";
747+
case StopCondition::EndOfInput: return "EndOfInput";
748+
case StopCondition::EndOfWidth: return "EndOfWidth";
749+
}
750+
return "INVALID";
751+
}(chunk.stop_condition));
729752
processedTotalWidth += chunk.totalWidth;
730753
assert(processedTotalWidth <= maxWidth);
731754
if (chunk.stop_condition != StopCondition::UnexpectedInput)
755+
{
732756
// The most recent grapheme cluster does not fit into the current line or the input is exhausted.
733-
return { .text = std::string_view { resultStart, _complexUnicodeState.currentClusterStart },
757+
return { .text = std::string_view { resultStart, chunk.end },
734758
.width = processedTotalWidth,
735759
.stop_condition = chunk.stop_condition };
760+
}
736761
break;
737762
}
738763
}
@@ -828,4 +853,28 @@ inline std::ostream& operator<<(std::ostream& os, unicode::grapheme_segmentation
828853
<< ", stop: " << value.stop_condition << "}";
829854
}
830855
} // namespace std
856+
857+
namespace fmt
858+
{
859+
template <>
860+
struct formatter<unicode::detail::State>: formatter<std::string_view>
861+
{
862+
template <typename FormatContext>
863+
auto format(unicode::detail::State const& value, FormatContext& ctx)
864+
{
865+
std::string_view name;
866+
switch (value)
867+
{
868+
case unicode::detail::State::EndOfInput: name = "EndOfInput"; break;
869+
case unicode::detail::State::EndOfWidth: name = "EndOfWidth"; break;
870+
case unicode::detail::State::C0: name = "C0"; break;
871+
case unicode::detail::State::ASCII: name = "ASCII"; break;
872+
case unicode::detail::State::ComplexUnicode: name = "ComplexUnicode"; break;
873+
}
874+
return formatter<std::string_view>::format(name, ctx);
875+
}
876+
};
877+
878+
} // namespace fmt
879+
831880
// }}}

src/libunicode/grapheme_line_segmenter_test.cpp

+146-2
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,19 @@
2020
#include <catch2/catch_message.hpp>
2121
#include <catch2/catch_test_macros.hpp>
2222

23+
#include <iostream>
2324
#include <string_view>
2425
#include <variant>
2526

27+
#if 0 || defined(LIBUNICODE_TRACE)
28+
#include <format>
29+
#include <iostream>
30+
31+
#define TRACE(...) std::cout << std::format(__VA_ARGS__)
32+
#else
33+
#define TRACE(...) ((void) 0)
34+
#endif
35+
2636
using namespace std::string_view_literals;
2737
using namespace std::string_literals;
2838
using std::pair;
@@ -55,6 +65,31 @@ std::ostream& operator<<(std::ostream& os, expectation const& e)
5565
} // namespace std
5666
// }}}
5767

68+
namespace fmt
69+
{
70+
71+
template <>
72+
struct formatter<expectation>: formatter<std::string_view>
73+
{
74+
template <typename FormatContext>
75+
auto format(expectation const& e, FormatContext& ctx) const
76+
{
77+
return format_to(ctx.out(), "{{ offset: {}, size: {}, width: {} }}", e.offset, e.size, e.width);
78+
}
79+
};
80+
81+
template <>
82+
struct formatter<std::pair<unicode::StopCondition, unsigned>>: formatter<std::string_view>
83+
{
84+
template <typename FormatContext>
85+
auto format(std::pair<unicode::StopCondition, unsigned> const& v, FormatContext& ctx) const
86+
{
87+
return format_to(ctx.out(), "{{{}, {}}}", v.first, v.second);
88+
}
89+
};
90+
91+
} // namespace fmt
92+
5893
// {{{ helpers
5994
namespace
6095
{
@@ -120,12 +155,34 @@ struct complex_unicode_sequence
120155
return os << "{ value: \"" << e(seq.value) << "\", width: " << seq.width << " }";
121156
}
122157

158+
using Record = std::variant<invalid_sequence, ascii_sequence, complex_unicode_sequence>;
159+
123160
} // namespace
124161

125-
namespace
162+
namespace fmt
163+
{
164+
template <>
165+
struct formatter<Record>: formatter<std::string_view>
126166
{
167+
template <typename FormatContext>
168+
auto format(Record const& r, FormatContext& ctx) const
169+
{
170+
if (std::holds_alternative<invalid_sequence>(r))
171+
return fmt::format_to(ctx.out(), "invalid_sequence {{ value: \"{}\" }}", std::get<invalid_sequence>(r).value);
172+
else if (std::holds_alternative<ascii_sequence>(r))
173+
return fmt::format_to(ctx.out(), "ascii_sequence {{ value: \"{}\" }}", std::get<ascii_sequence>(r).value);
174+
else
175+
return fmt::format_to(ctx.out(),
176+
"complex_unicode_sequence {{ value: \"{}\", width: {} }}",
177+
std::get<complex_unicode_sequence>(r).value,
178+
std::get<complex_unicode_sequence>(r).width);
179+
}
180+
};
127181

128-
using Record = std::variant<invalid_sequence, ascii_sequence, complex_unicode_sequence>;
182+
} // namespace fmt
183+
184+
namespace
185+
{
129186

130187
auto constexpr FamilyEmoji = U"\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"sv;
131188
auto constexpr SmileyEmoji = U"\U0001F600"sv;
@@ -649,3 +706,90 @@ TEST_CASE("grapheme_line_segmenter.complex.sliced_calls")
649706
CHECK(result2.stop_condition == StopCondition::UnexpectedInput); // control character \033
650707
REQUIRE(e(result2.text) == e(u8(SmileyEmoji)));
651708
}
709+
710+
// Feeds a lone two-byte UTF-8 sequence (U+00F6, 'ö') to the segmenter and checks
// that the whole input is handed back as the processed text.
TEST_CASE("grapheme_utf8.0")
{
    auto constexpr text = "\xC3\xB6"sv; // 'ö'

    char const* const first = text.data();
    char const* const last = first + text.size();

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n", (void*) first, (void*) last, std::distance(first, last));
    segmenter.reset(std::string_view(first, last));

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n", result.text, result.width, [](auto condition) {
        using unicode::StopCondition;
        switch (condition)
        {
            case StopCondition::UnexpectedInput: return "UnexpectedInput";
            case StopCondition::EndOfWidth: return "EndOfWidth";
            case StopCondition::EndOfInput: return "EndOfInput";
        }
        return "Unknown";
    }(result.stop_condition));

    CHECK(result.text == text);
    // NOTE(review): a width of 0 is expected here although the full text is returned,
    // while the sibling cases count 'ö' as one column — confirm this is intended.
    CHECK(result.width == 0);
}
738+
739+
// Feeds 'ö' (U+00F6, two UTF-8 bytes) followed by a space and checks that the whole
// input is returned with a total width of two columns.
TEST_CASE("grapheme_utf8.1")
{
    auto constexpr text = "\xC3\xB6 "sv; // 'ö '

    char const* const first = text.data();
    char const* const last = first + text.size();

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n", (void*) first, (void*) last, std::distance(first, last));
    segmenter.reset(std::string_view(first, last));

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n", result.text, result.width, [](auto condition) {
        using unicode::StopCondition;
        switch (condition)
        {
            case StopCondition::UnexpectedInput: return "UnexpectedInput";
            case StopCondition::EndOfWidth: return "EndOfWidth";
            case StopCondition::EndOfInput: return "EndOfInput";
        }
        return "Unknown";
    }(result.stop_condition));

    CHECK(result.text == text);
    CHECK(result.width == 2);
}
767+
768+
// Feeds 'a', 'ö' (U+00F6, two UTF-8 bytes), 'a' and checks that the whole input is
// returned with a total width of three columns.
TEST_CASE("grapheme_utf8.2")
{
    // The literal is split after \xB6 on purpose: a hexadecimal escape consumes every
    // following hex digit, so "\xB6a" would be read as the single (out-of-range)
    // escape \xB6A instead of the byte 0xB6 followed by 'a'.
    auto constexpr text = "a\xC3\xB6" "a"sv; // 'aöa'

    const auto* input = text.data();
    const auto* const end = text.data() + text.size();

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    auto const chunk = std::string_view(input, end);
    TRACE("Processing {}...{} ({})\n", (void*) input, (void*) end, std::distance(input, end));
    segmenter.reset(chunk);

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n", result.text, result.width, [](auto val) {
        switch (val)
        {
            case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
            case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
            case unicode::StopCondition::EndOfInput: return "EndOfInput";
        }
        return "Unknown";
    }(result.stop_condition));

    CHECK(result.text == text);
    CHECK(result.width == 3);
}

0 commit comments

Comments
 (0)