Skip to content

Commit 5703821

Browse files
committed
use end of chunk for complex unicode
1 parent fbb4a9d commit 5703821

File tree

55 files changed

+48858
-10
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

55 files changed

+48858
-10
lines changed
Binary file not shown.
3.14 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
5.68 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3.15 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
764 Bytes
Binary file not shown.
304 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

src/libunicode/codepoint_properties_data.cpp

+5,026
Large diffs are not rendered by default.
+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// This file was auto-generated using /home/yaraslau/repo/libunicode/src/libunicode/tablegen.cpp.
// NOTE(review): the generator should emit the <string_view> include below as well,
// otherwise it is lost the next time this header is regenerated.
#pragma once

#include <libunicode/codepoint_properties.h>

#include <array>
#include <cstdint>
#include <string_view> // names_stage3 stores std::string_view elements

namespace unicode::precompiled
{

// Declarations only; the tables are defined in other translation units.
// NOTE(review): presumably stage1/stage2 index into `properties`, and
// names_stage1/names_stage2 index into `names_stage3` -- confirm against the generator.
extern std::array<uint8_t, 4352> const stage1;
extern std::array<uint16_t, 50688> const stage2;
extern std::array<codepoint_properties, 1363> const properties;
extern std::array<uint8_t, 4352> const names_stage1;
extern std::array<uint16_t, 57600> const names_stage2;
extern std::array<std::string_view, 39490> const names_stage3;

} // end namespace unicode::precompiled

src/libunicode/codepoint_properties_names.cpp

+43,610
Large diffs are not rendered by default.

src/libunicode/grapheme_line_segmenter.h

+57-8
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
#include <libunicode/grapheme_segmenter.h>
1717
#include <libunicode/support.h>
1818

19-
#if defined(LIBUNICODE_TRACE)
19+
#include <fmt/core.h>
20+
21+
#if 0 || defined(LIBUNICODE_TRACE)
2022
#include <format>
2123
#include <iostream>
2224

@@ -457,9 +459,11 @@ namespace detail
457459
//
458460
// @returns a sequence of grapheme clusters up to maxWidth width.
459461
template <OptionalGraphemeSegmentationListenerConcept EventHandlerT>
460-
LIBUNICODE_INLINE auto process_only_complex_unicode(
461-
EventHandlerT& eventHandler, unicode_process_state& state, char const* start, char const* end, unsigned maxWidth) noexcept
462-
-> detail::unicode_process_result
462+
LIBUNICODE_INLINE auto process_only_complex_unicode(EventHandlerT& eventHandler,
463+
unicode_process_state& state,
464+
char const* start,
465+
char const* end,
466+
unsigned maxWidth) noexcept -> detail::unicode_process_result
463467
{
464468
if (!state.utf8DecodeNext)
465469
{
@@ -538,6 +542,10 @@ namespace detail
538542
return make_scan_result(consumedWidths, state.currentClusterStart, StopCondition::EndOfWidth);
539543
}
540544
}
545+
else
546+
{
547+
// Boundary of a grapheme not found yet.
548+
}
541549
}
542550
else if (std::holds_alternative<unicode::Invalid>(result))
543551
{
@@ -660,15 +668,22 @@ class grapheme_line_segmenter<OptionalEventListener>
660668
return { .text = {}, .width = 0, .stop_condition = StopCondition::EndOfInput };
661669

662670
// Points to the beginning of a grapheme cluster.
663-
char const* const resultStart = _complexUnicodeState.currentClusterStart;
671+
char const* const resultStart = _complexUnicodeState.currentCodepointStart;
664672
char const* const endAtMaxWidth = std::min(end(), next() + maxWidth);
665673

674+
LIBUNICODE_TRACE_SEGMENTER("resultStart: {}\n", (void*) resultStart);
666675
// Total number of widths used in the current line.
667676
unsigned processedTotalWidth = 0;
668677

669678
while (true)
670679
{
671-
switch (detail::make_state(next(), end(), processedTotalWidth, maxWidth))
680+
auto const state = detail::make_state(next(), end(), processedTotalWidth, maxWidth);
681+
LIBUNICODE_TRACE_SEGMENTER("currentClusterStart: {}, end: {} , processedTotalWidth: {}, state: {} \n",
682+
(void*) _complexUnicodeState.currentClusterStart,
683+
(void*) end(),
684+
processedTotalWidth,
685+
state);
686+
switch (state)
672687
{
673688
case State::EndOfInput:
674689
return { .text = { resultStart, _complexUnicodeState.currentClusterStart },
@@ -725,14 +740,24 @@ class grapheme_line_segmenter<OptionalEventListener>
725740
std::string_view(start, chunk.end),
726741
(long) std::distance(start, chunk.end),
727742
chunk.totalWidth,
728-
(int) chunk.stop_condition);
743+
[](auto stop) {
744+
switch (stop)
745+
{
746+
case StopCondition::UnexpectedInput: return "UnexpectedInput";
747+
case StopCondition::EndOfInput: return "EndOfInput";
748+
case StopCondition::EndOfWidth: return "EndOfWidth";
749+
}
750+
return "INVALID";
751+
}(chunk.stop_condition));
729752
processedTotalWidth += chunk.totalWidth;
730753
assert(processedTotalWidth <= maxWidth);
731754
if (chunk.stop_condition != StopCondition::UnexpectedInput)
755+
{
732756
// The most recent grapheme cluster does not fit into the current line or the input is exhausted.
733-
return { .text = std::string_view { resultStart, _complexUnicodeState.currentClusterStart },
757+
return { .text = std::string_view { resultStart, chunk.end },
734758
.width = processedTotalWidth,
735759
.stop_condition = chunk.stop_condition };
760+
}
736761
break;
737762
}
738763
}
@@ -828,4 +853,28 @@ inline std::ostream& operator<<(std::ostream& os, unicode::grapheme_segmentation
828853
<< ", stop: " << value.stop_condition << "}";
829854
}
830855
} // namespace std
856+
857+
namespace fmt
858+
{
859+
template <>
860+
struct formatter<unicode::detail::State>: formatter<std::string_view>
861+
{
862+
template <typename FormatContext>
863+
auto format(unicode::detail::State const& value, FormatContext& ctx)
864+
{
865+
std::string_view name;
866+
switch (value)
867+
{
868+
case unicode::detail::State::EndOfInput: name = "EndOfInput"; break;
869+
case unicode::detail::State::EndOfWidth: name = "EndOfWidth"; break;
870+
case unicode::detail::State::C0: name = "C0"; break;
871+
case unicode::detail::State::ASCII: name = "ASCII"; break;
872+
case unicode::detail::State::ComplexUnicode: name = "ComplexUnicode"; break;
873+
}
874+
return formatter<std::string_view>::format(name, ctx);
875+
}
876+
};
877+
878+
} // namespace fmt
879+
831880
// }}}

src/libunicode/grapheme_line_segmenter_test.cpp

+146-2
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,19 @@
2020
#include <catch2/catch_message.hpp>
2121
#include <catch2/catch_test_macros.hpp>
2222

23+
#include <iostream>
2324
#include <string_view>
2425
#include <variant>
2526

27+
// Test-local tracing. Define LIBUNICODE_TRACE (or temporarily change the
// condition to `#if 1`) to get std::format-style trace output on stdout.
#if defined(LIBUNICODE_TRACE)
#include <format>
#include <iostream>

// Formats the arguments with std::format and writes the result to std::cout.
#define TRACE(...) std::cout << std::format(__VA_ARGS__)
#else
// Tracing disabled: expands to a no-op expression; the arguments are discarded
// by the preprocessor and therefore never evaluated.
#define TRACE(...) ((void) 0)
#endif
35+
2636
using namespace std::string_view_literals;
2737
using namespace std::string_literals;
2838
using std::pair;
@@ -55,6 +65,31 @@ std::ostream& operator<<(std::ostream& os, expectation const& e)
5565
} // namespace std
5666
// }}}
5767

68+
namespace fmt
69+
{
70+
71+
template <>
72+
struct formatter<expectation>: formatter<std::string_view>
73+
{
74+
template <typename FormatContext>
75+
auto format(expectation const& e, FormatContext& ctx) const
76+
{
77+
return format_to(ctx.out(), "{{ offset: {}, size: {}, width: {} }}", e.offset, e.size, e.width);
78+
}
79+
};
80+
81+
template <>
82+
struct formatter<std::pair<unicode::StopCondition, unsigned>>: formatter<std::string_view>
83+
{
84+
template <typename FormatContext>
85+
auto format(std::pair<unicode::StopCondition, unsigned> const& v, FormatContext& ctx) const
86+
{
87+
return format_to(ctx.out(), "{{{}, {}}}", v.first, v.second);
88+
}
89+
};
90+
91+
} // namespace fmt
92+
5893
// {{{ helpers
5994
namespace
6095
{
@@ -120,12 +155,34 @@ struct complex_unicode_sequence
120155
return os << "{ value: \"" << e(seq.value) << "\", width: " << seq.width << " }";
121156
}
122157

158+
using Record = std::variant<invalid_sequence, ascii_sequence, complex_unicode_sequence>;
159+
123160
} // namespace
124161

125-
namespace
162+
namespace fmt
163+
{
164+
template <>
165+
struct formatter<Record>: formatter<std::string_view>
126166
{
167+
template <typename FormatContext>
168+
auto format(Record const& r, FormatContext& ctx) const
169+
{
170+
if (std::holds_alternative<invalid_sequence>(r))
171+
return fmt::format_to(ctx.out(), "invalid_sequence {{ value: \"{}\" }}", std::get<invalid_sequence>(r).value);
172+
else if (std::holds_alternative<ascii_sequence>(r))
173+
return fmt::format_to(ctx.out(), "ascii_sequence {{ value: \"{}\" }}", std::get<ascii_sequence>(r).value);
174+
else
175+
return fmt::format_to(ctx.out(),
176+
"complex_unicode_sequence {{ value: \"{}\", width: {} }}",
177+
std::get<complex_unicode_sequence>(r).value,
178+
std::get<complex_unicode_sequence>(r).width);
179+
}
180+
};
127181

128-
using Record = std::variant<invalid_sequence, ascii_sequence, complex_unicode_sequence>;
182+
} // namespace fmt
183+
184+
namespace
185+
{
129186

130187
auto constexpr FamilyEmoji = U"\U0001F468\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466"sv;
131188
auto constexpr SmileyEmoji = U"\U0001F600"sv;
@@ -649,3 +706,90 @@ TEST_CASE("grapheme_line_segmenter.complex.sliced_calls")
649706
CHECK(result2.stop_condition == StopCondition::UnexpectedInput); // control character \033
650707
REQUIRE(e(result2.text) == e(u8(SmileyEmoji)));
651708
}
709+
710+
// Feeds a lone two-byte UTF-8 character to the segmenter and checks the
// result of a single process() call.
TEST_CASE("grapheme_utf8.0")
{
    auto constexpr text = "\xC3\xB6"sv; // 'ö'

    char const* const first = text.data();
    char const* const last = first + text.size();

    // Printable name of a stop condition; only used by the trace output.
    [[maybe_unused]] auto const stopName = [](auto cond) {
        switch (cond)
        {
            case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
            case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
            case unicode::StopCondition::EndOfInput: return "EndOfInput";
        }
        return "Unknown";
    };

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n", (void*) first, (void*) last, std::distance(first, last));
    segmenter.reset(std::string_view(first, last));

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n",
          result.text,
          result.width,
          stopName(result.stop_condition));

    CHECK(result.text == text);
    // NOTE(review): a width of 0 for a single 'ö' looks surprising next to the
    // sibling cases (2 for "ö ", 3 for "aöa") -- confirm this is the intended value.
    CHECK(result.width == 0);
}
738+
739+
// Feeds a two-byte UTF-8 character followed by a space to the segmenter and
// checks the result of a single process() call.
TEST_CASE("grapheme_utf8.1")
{
    auto constexpr text = "\xC3\xB6 "sv; // 'ö '

    char const* const first = text.data();
    char const* const last = first + text.size();

    // Printable name of a stop condition; only used by the trace output.
    [[maybe_unused]] auto const stopName = [](auto cond) {
        switch (cond)
        {
            case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
            case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
            case unicode::StopCondition::EndOfInput: return "EndOfInput";
        }
        return "Unknown";
    };

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n", (void*) first, (void*) last, std::distance(first, last));
    segmenter.reset(std::string_view(first, last));

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n",
          result.text,
          result.width,
          stopName(result.stop_condition));

    // The whole input is expected back, occupying two columns.
    CHECK(result.text == text);
    CHECK(result.width == 2);
}
767+
768+
// Feeds a two-byte UTF-8 character surrounded by ASCII letters to the
// segmenter and checks the result of a single process() call.
TEST_CASE("grapheme_utf8.2")
{
    auto constexpr text = "a\xC3\xB6a"sv; // 'aöa'

    char const* const first = text.data();
    char const* const last = first + text.size();

    // Printable name of a stop condition; only used by the trace output.
    [[maybe_unused]] auto const stopName = [](auto cond) {
        switch (cond)
        {
            case unicode::StopCondition::UnexpectedInput: return "UnexpectedInput";
            case unicode::StopCondition::EndOfWidth: return "EndOfWidth";
            case unicode::StopCondition::EndOfInput: return "EndOfInput";
        }
        return "Unknown";
    };

    auto recorder = event_recorder { "single_utf8" };
    auto segmenter = grapheme_line_segmenter { recorder, ""sv };

    TRACE("Processing {}...{} ({})\n", (void*) first, (void*) last, std::distance(first, last));
    segmenter.reset(std::string_view(first, last));

    auto const result = segmenter.process(10);
    TRACE("result: [text: \"{}\", width: {}, stop: {}]\n",
          result.text,
          result.width,
          stopName(result.stop_condition));

    // The whole input is expected back, occupying three columns.
    CHECK(result.text == text);
    CHECK(result.width == 3);
}

0 commit comments

Comments
 (0)