Skip to content

Commit f9c0def

Browse files
WIP
Signed-off-by: Christian Parpart <[email protected]>
1 parent 496a93e commit f9c0def

File tree

3 files changed

+87
-41
lines changed

3 files changed

+87
-41
lines changed

src/libunicode/grapheme_line_segmenter.cpp

+21-20
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ namespace
5555
void grapheme_line_segmenter::reset(std::string_view buffer) noexcept
5656
{
5757
_buffer = buffer;
58+
_next = buffer.data();
5859

5960
_utf8 = {};
6061
_lastCodepointHint = 0;
@@ -70,7 +71,7 @@ void grapheme_line_segmenter::move_forward_to(char const* pos) noexcept
7071
{
7172
assert(_buffer.data() <= pos && pos <= _buffer.data() + _buffer.size());
7273
auto const skippedBytesCount = static_cast<size_t>(pos - _buffer.data());
73-
_buffer.remove_prefix(skippedBytesCount);
74+
_next += skippedBytesCount;
7475
_lastCodepointHint = 0;
7576
_currentClusterWidth = 0;
7677
_utf8 = {};
@@ -83,7 +84,7 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
8384
if (_buffer.empty())
8485
return result_type { .text = _buffer.substr(0, 0), .width = 0 };
8586

86-
char const* start = _buffer.data();
87+
char const* start = _next;
8788
char const* const resultStart = _utf8.expectedLength ? start - _utf8.currentLength : start;
8889

8990
// Number of bytes used in the current line.
@@ -118,17 +119,20 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
118119
maxWidth -= count;
119120
totalWidthProcessed += count;
120121
totalByteCountProcessed += count;
121-
_buffer.remove_prefix(count);
122+
_next += count;
122123
break;
123124
}
124125
case State::ComplexUnicode: {
125126
auto const sub = process_complex_unicode(maxWidth);
126-
if (sub.width == 0)
127+
if (sub.graphemeClusterCount == 0)
128+
{
129+
_next += sub.byteCount;
127130
return makeResult();
128-
maxWidth -= sub.width;
129-
totalWidthProcessed += sub.width;
130-
totalByteCountProcessed += sub.text.size();
131-
_buffer.remove_prefix(sub.text.size());
131+
}
132+
maxWidth -= sub.graphemeClusterCount;
133+
totalWidthProcessed += sub.graphemeClusterCount;
134+
totalByteCountProcessed += sub.byteCount;
135+
_next += sub.byteCount;
132136
break;
133137
}
134138
}
@@ -137,7 +141,7 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned m
137141
return makeResult();
138142
}
139143

140-
unsigned grapheme_line_segmenter::process_ascii(unsigned maxWidth) noexcept
144+
unsigned grapheme_line_segmenter::process_ascii(unsigned maxWidth) const noexcept
141145
{
142146
auto input = _buffer.data();
143147
auto const end = _buffer.data() + std::min(static_cast<unsigned>(_buffer.size()), maxWidth);
@@ -173,20 +177,19 @@ unsigned grapheme_line_segmenter::process_ascii(unsigned maxWidth) noexcept
173177
return static_cast<unsigned>(std::distance(_buffer.data(), input));
174178
}
175179

176-
grapheme_line_segmenter::result_type grapheme_line_segmenter::process_complex_unicode(
177-
unsigned maxWidth) noexcept
180+
auto grapheme_line_segmenter::process_complex_unicode(unsigned maxWidth) noexcept -> unicode_process_result
178181
{
179182
char const* const start = _buffer.data();
180183
char const* const end = start + _buffer.size();
181184

182-
char const* input = start; // current input processing position
185+
char const* input = _next; // current input processing position
183186
char const* clusterStart = start; // start position of current grapheme cluster
184187
char const* lastCodepointStart = start; // start position of last codepoint
185188
unsigned consumedWidth = 0; // width consumed for the current line
186189
unsigned currentCodepointLength = 0; // bytes consumed for the current codepoint
187190

188191
char const* const lastClusterStart =
189-
_utf8.expectedLength ? start - _utf8.currentLength : start; // start position of last grapheme cluster
192+
_utf8.expectedLength ? input - _utf8.currentLength : input; // start position of last grapheme cluster
190193

191194
char const* lastClusterEnd = lastClusterStart; // end position of last grapheme cluster
192195

@@ -228,8 +231,7 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process_complex_un
228231
{
229232
auto const prevCodepoint = _lastCodepointHint;
230233
auto const nextCodepoint = std::get<Success>(result).value;
231-
auto const nextWidth =
232-
std::max(_currentClusterWidth, static_cast<unsigned>(unicode::width(nextCodepoint)));
234+
auto const nextWidth = std::max(_currentClusterWidth, static_cast<unsigned>(unicode::width(nextCodepoint)));
233235
_lastCodepointHint = nextCodepoint;
234236
if (grapheme_segmenter::breakable(prevCodepoint, nextCodepoint))
235237
{
@@ -293,16 +295,15 @@ grapheme_line_segmenter::result_type grapheme_line_segmenter::process_complex_un
293295

294296
_currentClusterWidth = 0;
295297

296-
// if (currentCodepointLength <= _buffer.size())
297-
// _buffer.remove_prefix(currentCodepointLength);
298-
// else
299-
// abort();
298+
_next = input;
300299

301300
assert(lastClusterStart <= lastClusterEnd);
302301

303302
auto const resultLength = static_cast<size_t>(std::distance(lastClusterStart, lastClusterEnd));
304303
printf("lastClusterEnd: %p, size: %zu\n", (void*) lastClusterEnd, resultLength);
305-
return result_type { .text = std::string_view(lastClusterStart, resultLength), .width = consumedWidth };
304+
return unicode_process_result { .graphemeClusterCount = consumedWidth, .byteCount = resultLength };
305+
// return result_type { .text = std::string_view(lastClusterStart, resultLength), .width = consumedWidth
306+
// };
306307
}
307308

308309
ConvertResult grapheme_line_segmenter::process_single_byte(uint8_t byte) noexcept

src/libunicode/grapheme_line_segmenter.h

+22-13
Original file line numberDiff line numberDiff line change
@@ -71,23 +71,16 @@ class grapheme_line_segmenter final
7171
virtual void on_grapheme_cluster(std::string_view cluster, unsigned width) = 0;
7272
};
7373

74-
class null_listener final: public event_listener
74+
class null_event_listener final: public event_listener
7575
{
7676
public:
7777
void on_invalid(std::string_view) override {}
7878
void on_ascii(std::string_view) override {}
79-
void on_grapheme_cluster(std::string_view sequence, unsigned width) override
80-
{
81-
printf("on_grapheme_cluster(%u): %s\n", width, std::string(sequence).c_str());
82-
}
83-
84-
static null_listener& instance() noexcept
85-
{
86-
static null_listener instance;
87-
return instance;
88-
}
79+
void on_grapheme_cluster(std::string_view, unsigned) override {}
8980
};
9081

82+
static inline null_event_listener null_listener {};
83+
9184
explicit grapheme_line_segmenter(event_listener& events) noexcept: _events(events) {}
9285

9386
/**
@@ -100,6 +93,8 @@ class grapheme_line_segmenter final
10093

10194
/**
10295
* Expands the internal buffer by the given number of bytes.
96+
*
97+
* @param count the number of bytes to expand the buffer by
10398
*/
10499
void expand_buffer_by(size_t count) noexcept;
105100

@@ -108,6 +103,8 @@ class grapheme_line_segmenter final
108103
*
109104
* A call to this function will also reset the internal UTF-8 decoding state,
110105
* and will reset the last codepoint hint.
106+
*
107+
* @p pos must be a pointer to a position within the current buffer
111108
*/
112109
void move_forward_to(char const* pos) noexcept;
113110

@@ -179,7 +176,16 @@ class grapheme_line_segmenter final
179176
*
180177
* @returns the number of ASCII characters processed (equal to the sum of East Asian Width for each).
181178
*/
182-
unsigned process_ascii(unsigned maxWidth) noexcept;
179+
unsigned process_ascii(unsigned maxWidth) const noexcept;
180+
181+
/// Outcome of a single process_complex_unicode() call.
///
/// Both members are value-initialized so that a default-constructed result
/// reads as "nothing consumed" instead of holding indeterminate values.
struct unicode_process_result
{
    // Amount that process() subtracts from the remaining maxWidth and adds to the processed width.
    //
    // NOTE(review): despite the name, the producer visible in this change stores the consumed
    // width (consumedWidth) here rather than a count of clusters -- confirm which one is intended.
    unsigned graphemeClusterCount {};

    // The number of bytes processed; process() advances its read position by this amount.
    size_t byteCount {};
};
183189

184190
/**
185191
* Processes up to maxWidth grapheme clusters.
@@ -194,7 +200,7 @@ class grapheme_line_segmenter final
194200
*
195201
* @see process()
196202
*/
197-
result_type process_complex_unicode(unsigned maxWidth) noexcept;
203+
auto process_complex_unicode(unsigned maxWidth) noexcept -> unicode_process_result;
198204

199205
event_listener& _events;
200206

@@ -212,6 +218,9 @@ class grapheme_line_segmenter final
212218
// The buffer to scan. Its underlying storage must be used by the subsequent calls to process().
213219
// Consuming input no longer moves the front of the buffer forward; process() and move_forward_to() advance _next instead.
214220
std::string_view _buffer;
221+
222+
// processing position within the buffer
223+
char const* _next {};
215224
// }}}
216225
};
217226

src/libunicode/grapheme_line_segmenter_test.cpp

+44-8
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,47 @@ class event_recorder final: public grapheme_line_segmenter::event_listener
7070

7171
scan_result scan_text(std::string_view text, unsigned width)
7272
{
73-
grapheme_line_segmenter segmenter { grapheme_line_segmenter::null_listener::instance() };
73+
grapheme_line_segmenter segmenter { grapheme_line_segmenter::null_listener };
7474
segmenter.reset(text);
7575
auto result = segmenter.process(width);
7676
UNSCOPED_INFO("result: " << result);
7777
return result;
7878
}
79+
80+
/// Selects how escape() renders bytes that have no dedicated escape sequence
/// and are not printable ASCII.
enum class NumericEscape
{
    Octal, ///< backslash followed by three octal digits
    Hex    ///< backslash-x followed by two lowercase hex digits
};

/// Renders one byte as printable, C-style escaped text.
///
/// @param ch            the byte to render
/// @param numericEscape notation used for bytes that are neither specially
///                      escaped nor printable ASCII
///
/// @returns a named escape for backslash, ESC, tab, CR, LF and double quote;
///          the character itself for printable ASCII (0x20..0x7E inclusive);
///          otherwise a numeric escape in the requested notation.
std::string escape(uint8_t ch, NumericEscape numericEscape = NumericEscape::Hex)
{
    switch (ch)
    {
        case '\\': return "\\\\";
        case 0x1B: return "\\e";
        case '\t': return "\\t";
        case '\r': return "\\r";
        case '\n': return "\\n";
        case '"': return "\\\"";
        default: break;
    }

    // Printable ASCII ends at '~' (0x7E) inclusive; only DEL (0x7F), control
    // bytes and non-ASCII bytes need a numeric escape.
    if (0x20 <= ch && ch <= 0x7E)
        return std::string(1, static_cast<char>(ch));

    if (numericEscape == NumericEscape::Hex)
    {
        // Two lowercase hex digits, zero padded.
        constexpr char hexDigits[] = "0123456789abcdef";
        return std::string { '\\', 'x', hexDigits[ch >> 4], hexDigits[ch & 0x0F] };
    }

    // Three octal digits, zero padded (0xFF == 0377 always fits).
    auto const octalDigit = [ch](unsigned shift) {
        return static_cast<char>('0' + ((ch >> shift) & 0x07));
    };
    return std::string { '\\', octalDigit(6), octalDigit(3), octalDigit(0) };
}
105+
106+
inline std::string e(std::string_view s, NumericEscape numericEscape = NumericEscape::Hex)
107+
{
108+
auto result = std::string {};
109+
for (char c: s)
110+
result += escape(static_cast<uint8_t>(c), numericEscape);
111+
return result;
112+
}
113+
79114
} // namespace
80115

81116
TEST_CASE("grapheme_line_segmenter.ascii.empty")
@@ -120,14 +155,14 @@ TEST_CASE("grapheme_line_segmenter.complex.grapheme_cluster.2")
120155
{
121156
auto const familyEmoji8 = u8(FamilyEmoji) + u8(FamilyEmoji);
122157
auto const result = scan_text(familyEmoji8, 80);
123-
CHECK(result == scan_result { familyEmoji8, 4 });
158+
CHECK(result == scan_result { .text = familyEmoji8, .width = 4 });
124159
}
125160

126161
TEST_CASE("grapheme_line_segmenter.complex.mixed")
127162
{
128163
auto const text = u8(FamilyEmoji) + "ABC"s + u8(FamilyEmoji);
129164
auto const result = scan_text(text, 80);
130-
CHECK(result == scan_result { text, 7 });
165+
CHECK(result == scan_result { .text = text, .width = 7 });
131166
}
132167

133168
TEST_CASE("grapheme_line_segmenter.complex.half-overflowing")
@@ -148,12 +183,13 @@ TEST_CASE("grapheme_line_segmenter.complex.half-overflowing")
148183
TEST_CASE("grapheme_line_segmenter.complex.sliced_calls")
149184
{
150185
// auto const text = u8(SmileyEmoji) + "\033\\0123456789ABCDEF"s; // U+1F600
151-
auto const text = "\xF0\x9F\x98\x80\033\\0123456789ABCDEF"sv; // U+1F600
186+
auto constexpr text = "\xF0\x9F\x98\x80\033\\0123456789ABCDEF"sv; // U+1F600
152187
auto constexpr splitOffset = 3;
153-
auto const chunkOne = std::string_view(text.data(), splitOffset);
154-
auto const chunkTwo = std::string_view(text.data() + splitOffset, text.size() - splitOffset);
188+
auto constexpr chunkOne = text.substr(0, splitOffset);
189+
auto constexpr chunkTwo = text.substr(splitOffset);
190+
REQUIRE(chunkTwo.data() == chunkOne.data() + chunkOne.size());
155191

156-
grapheme_line_segmenter segmenter { grapheme_line_segmenter::null_listener::instance() };
192+
auto segmenter = grapheme_line_segmenter { grapheme_line_segmenter::null_listener };
157193
segmenter.reset(chunkOne);
158194
auto result = segmenter.process(3);
159195

@@ -168,5 +204,5 @@ TEST_CASE("grapheme_line_segmenter.complex.sliced_calls")
168204

169205
REQUIRE(segmenter.utf8_state().expectedLength == 0);
170206
CHECK(result.width == 2);
171-
REQUIRE(result.text == u8(SmileyEmoji));
207+
REQUIRE(e(result.text) == e(u8(SmileyEmoji)));
172208
}

0 commit comments

Comments
 (0)