Skip to content

Commit 7da3728

Browse files
Introduce new API grapheme_line_segmenter to replace scan API
Signed-off-by: Christian Parpart <[email protected]>
1 parent bc9e84a commit 7da3728

File tree

4 files changed

+710
-0
lines changed

4 files changed

+710
-0
lines changed

src/libunicode/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
8686
capi.cpp
8787
codepoint_properties.cpp
8888
emoji_segmenter.cpp
89+
grapheme_line_segmenter.cpp
8990
grapheme_segmenter.cpp
9091
scan.cpp
9192
script_segmenter.cpp
@@ -103,6 +104,7 @@ set(public_headers
103104
codepoint_properties.h
104105
convert.h
105106
emoji_segmenter.h
107+
grapheme_line_segmenter.h
106108
grapheme_segmenter.h
107109
intrinsics.h
108110
multistage_table_view.h
@@ -186,6 +188,7 @@ if(LIBUNICODE_TESTING)
186188
capi_test.cpp
187189
convert_test.cpp
188190
emoji_segmenter_test.cpp
191+
grapheme_line_segmenter_test.cpp
189192
grapheme_segmenter_test.cpp
190193
run_segmenter_test.cpp
191194
scan_test.cpp
+313
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
/**
2+
* This file is part of the "libunicode" project
3+
* Copyright (c) 2023 Christian Parpart <[email protected]>
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
#include <libunicode/grapheme_line_segmenter.h>
15+
#include <libunicode/grapheme_segmenter.h>
16+
#include <libunicode/intrinsics.h>
17+
#include <libunicode/utf8.h>
18+
#include <libunicode/width.h>
19+
20+
#include <algorithm>
21+
#include <cassert>
22+
23+
namespace unicode
24+
{
25+
26+
namespace
27+
{
28+
/// Counts the consecutive zero bits of @p value, starting at the least significant bit.
///
/// @param value bit pattern to inspect. Must be non-zero: `__builtin_ctz(0)` is undefined.
///
/// @return zero-based index of the lowest set bit.
[[maybe_unused]] int countTrailingZeroBits(unsigned int value) noexcept
{
    // Select on the compiler rather than on the target OS: GCC/Clang targeting Windows
    // (e.g. MinGW) define _WIN32 as well, but provide __builtin_ctz unconditionally,
    // whereas _tzcnt_u32 needs the BMI intrinsics to be enabled there.
#if defined(_MSC_VER)
    return static_cast<int>(_tzcnt_u32(value));
#else
    return __builtin_ctz(value);
#endif
}
36+
37+
// Tests if given UTF-8 byte is a C0 control character, that is, a value in the range 0x00..0x1F.
constexpr bool is_control(char ch) noexcept
{
    // Go through unsigned char so that bytes >= 0x80 never compare as negative values.
    return static_cast<unsigned char>(ch) <= 0x1F;
}
41+
42+
// Tests if given UTF-8 byte has its most significant bit set, i.e. whether it belongs to a
// multi-byte UTF-8 sequence (a codepoint greater than U+7F) rather than being a 7-bit byte.
constexpr bool is_complex(char ch) noexcept
{
    return (static_cast<unsigned char>(ch) & 0x80u) != 0;
}
47+
48+
// Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters.
49+
constexpr bool is_ascii(char ch) noexcept
50+
{
51+
return !is_control(ch) && !is_complex(ch);
52+
}
53+
} // namespace
54+
55+
unsigned grapheme_line_segmenter::process_ascii(unsigned maxWidth) noexcept
56+
{
57+
auto input = _buffer.data();
58+
auto const end = _buffer.data() + std::min(static_cast<unsigned>(_buffer.size()), maxWidth);
59+
60+
intrinsics::m128i const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F
61+
intrinsics::m128i const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000)
62+
63+
while (input < end - sizeof(intrinsics::m128i))
64+
{
65+
intrinsics::m128i batch = intrinsics::load_unaligned((intrinsics::m128i*) input);
66+
intrinsics::m128i isControl = intrinsics::compare_less(batch, ControlCodeMax);
67+
intrinsics::m128i isComplex = intrinsics::and128(batch, Complex);
68+
// intrinsics::m128i isComplex = _mm_cmplt_epi8(batch, Complex);
69+
intrinsics::m128i testPack = intrinsics::or128(isControl, isComplex);
70+
if (int const check = intrinsics::movemask_epi8(testPack); check != 0)
71+
{
72+
int advance = countTrailingZeroBits(static_cast<unsigned>(check));
73+
input += advance;
74+
break;
75+
}
76+
input += sizeof(intrinsics::m128i);
77+
}
78+
79+
while (input != end && is_ascii(*input))
80+
++input;
81+
82+
// if (static_cast<size_t>(distance(_buffer.data(), input)))
83+
// fmt::print(
84+
// "countAsciiTextChars: {} bytes: \"{}\"\n",
85+
// static_cast<size_t>(distance(_buffer.data(), input)),
86+
// (string_view(_buffer.data(), static_cast<size_t>(distance(_buffer.data(), input)))));
87+
88+
return static_cast<unsigned>(std::distance(_buffer.data(), input));
89+
}
90+
91+
void grapheme_line_segmenter::reset(std::string_view buffer) noexcept
92+
{
93+
_buffer = buffer;
94+
95+
_utf8 = {};
96+
_lastCodepointHint = 0;
97+
_currentClusterWidth = 0;
98+
}
99+
100+
void grapheme_line_segmenter::expand_buffer_by(size_t count) noexcept
101+
{
102+
_buffer = std::string_view(_buffer.data(), _buffer.size() + count);
103+
}
104+
105+
void grapheme_line_segmenter::move_forward_to(char const* pos) noexcept
106+
{
107+
assert(_buffer.data() <= pos && pos <= _buffer.data() + _buffer.size());
108+
auto const skippedBytesCount = static_cast<size_t>(pos - _buffer.data());
109+
_buffer.remove_prefix(skippedBytesCount);
110+
_lastCodepointHint = 0;
111+
_currentClusterWidth = 0;
112+
_utf8 = {};
113+
}
114+
115+
/// Consumes text from the front of the buffer until @p maxWidth columns are used, the buffer
/// is exhausted, or a sub-scan makes no progress.
///
/// Runs of plain 7-bit text are handled by process_ascii() and reported via on_ascii();
/// everything else is handed to process_complex_unicode(). Consumed bytes are removed from
/// the buffer.
///
/// @param maxWidth number of columns available.
///
/// @return the consumed text together with the number of columns it occupies. The text starts
///         `_utf8.currentLength` bytes before the buffer if a UTF-8 sequence was still pending
///         on entry.
grapheme_line_segmenter::result_type grapheme_line_segmenter::process(unsigned maxWidth) noexcept
{
    // No debug output here: the buffer is a std::string_view and therefore not guaranteed to
    // be NUL-terminated, so it must never be passed to a "%s" conversion.

    if (_buffer.empty())
        return result_type { .text = _buffer.substr(0, 0), .width = 0 };

    char const* const start = _buffer.data();
    char const* const resultStart = _utf8.expectedLength ? start - _utf8.currentLength : start;

    // Number of bytes used in the current line.
    size_t totalByteCountProcessed = 0;

    // Number of columns used in the current line.
    unsigned totalWidthProcessed = 0;

    auto const makeResult = [&]() -> result_type {
        return result_type { .text = std::string_view(resultStart, totalByteCountProcessed),
                             .width = totalWidthProcessed };
    };

    while (maxWidth > 0 && !_buffer.empty())
    {
        // A pending multi-byte sequence, or a byte with the high bit set, needs the UTF-8 path.
        bool const needsUnicodePath = _utf8.expectedLength != 0 || is_complex(_buffer.front());

        if (!needsUnicodePath)
        {
            auto const count = process_ascii(maxWidth);
            if (count == 0)
                break; // control character ahead: nothing more to consume
            _events.on_ascii(_buffer.substr(0, count));
            maxWidth -= count; // process_ascii() never returns more than maxWidth
            totalWidthProcessed += count;
            totalByteCountProcessed += count;
            _buffer.remove_prefix(count);
        }
        else
        {
            auto const sub = process_complex_unicode(maxWidth);
            if (sub.width == 0)
                break;
            // Saturate instead of letting the unsigned budget wrap around, should the
            // sub-scan ever report more width than was available.
            maxWidth = sub.width < maxWidth ? maxWidth - sub.width : 0;
            totalWidthProcessed += sub.width;
            totalByteCountProcessed += sub.text.size();
            _buffer.remove_prefix(sub.text.size());
        }
    }

    return makeResult();
}
175+
176+
/// Decodes UTF-8 from the front of the buffer and groups the codepoints into grapheme
/// clusters until a 7-bit byte is reached, the buffer ends, or the next cluster would not
/// fit into @p maxWidth columns.
///
/// Completed clusters are reported via on_grapheme_cluster(), malformed input via
/// on_invalid() (counted as one column each). The buffer itself is not advanced here.
///
/// @param maxWidth number of columns available.
///
/// @return the scanned text, starting at the beginning of a UTF-8 sequence that was still
///         pending on entry (if any), and the number of columns it occupies.
grapheme_line_segmenter::result_type grapheme_line_segmenter::process_complex_unicode(
    unsigned maxWidth) noexcept
{
    char const* const start = _buffer.data();
    char const* const end = start + _buffer.size();

    char const* input = start;              // current input processing position
    char const* clusterStart = start;       // start position of current grapheme cluster
    char const* lastCodepointStart = start; // start position of last codepoint
    unsigned consumedWidth = 0;             // width consumed for the current line
    unsigned currentCodepointLength = 0;    // bytes consumed for the current codepoint

    // Start position of the result: reaches back to the first byte of a pending UTF-8 sequence.
    char const* const lastClusterStart = _utf8.expectedLength ? start - _utf8.currentLength : start;

    char const* lastClusterEnd = lastClusterStart; // end position of last grapheme cluster

    while (input != end && consumedWidth <= maxWidth)
    {
        // Any byte without the high bit set (control characters included) ends this scan.
        if (!is_complex(*input))
        {
            if (_utf8.expectedLength)
            {
                // Incomplete UTF-8 sequence hit. That's invalid as well.
                ++consumedWidth;
                _events.on_invalid(std::string_view(input, 1));
                _utf8 = {};
            }

            _lastCodepointHint = 0;
            lastClusterEnd = input;
            currentCodepointLength = 0;
            break;
        }

        auto const result = from_utf8(_utf8, static_cast<uint8_t>(*input++));
        ++currentCodepointLength;

        if (holds_alternative<Incomplete>(result))
            continue;

        if (holds_alternative<Success>(result))
        {
            auto const prevCodepoint = _lastCodepointHint;
            auto const nextCodepoint = std::get<Success>(result).value;
            // NOTE(review): this also folds in the width of the cluster that is about to be
            // flushed below -- confirm that is intended when a new cluster starts.
            auto const nextWidth =
                std::max(_currentClusterWidth, static_cast<unsigned>(unicode::width(nextCodepoint)));
            _lastCodepointHint = nextCodepoint;
            if (grapheme_segmenter::breakable(prevCodepoint, nextCodepoint))
            {
                // Flush out current grapheme cluster's East Asian Width.
                // Only consumedWidth is advanced; maxWidth stays untouched, because the check
                // below compares both. Shrinking maxWidth as well counted every cluster twice.
                consumedWidth += _currentClusterWidth;

                if (consumedWidth + nextWidth > maxWidth)
                {
                    // Currently scanned grapheme cluster won't fit. Break at start.
                    _currentClusterWidth = 0;
                    input -= currentCodepointLength;
                    break;
                }

                // NOTE(review): the reported length is that of the codepoint just decoded,
                // not of the cluster being flushed -- verify against the event consumers.
                _events.on_grapheme_cluster(std::string_view(clusterStart, currentCodepointLength),
                                            _currentClusterWidth);

                // And start a new grapheme cluster.
                _currentClusterWidth = nextWidth;
                clusterStart = lastCodepointStart;
                lastCodepointStart = input - currentCodepointLength;
                currentCodepointLength = 0;
                lastClusterEnd = input;
            }
            else
            {
                lastClusterEnd = input;
                // Increase width on VS16 but do not decrease on VS15.
                if (nextCodepoint == 0xFE0F) // VS16
                {
                    _currentClusterWidth = 2;
                    if (consumedWidth + _currentClusterWidth > maxWidth)
                    {
                        // Widened cluster no longer fits: rewind to the start of the cluster.
                        _currentClusterWidth = 0;
                        input = clusterStart;
                        break;
                    }
                }

                // Codepoint stays within the current grapheme cluster.
                lastCodepointStart = input - currentCodepointLength;
            }
        }
        else
        {
            assert(holds_alternative<Invalid>(result));
            // A malformed sequence occupies one column and resets all cluster tracking.
            consumedWidth++;
            _events.on_invalid(std::string_view(clusterStart, currentCodepointLength));
            _currentClusterWidth = 0;
            _lastCodepointHint = 0;
            _utf8.expectedLength = 0;
            currentCodepointLength = 0;
            clusterStart = input;
            lastClusterEnd = input;
        }
    }

    // Account for the cluster that was still open when the scan stopped.
    consumedWidth += _currentClusterWidth;
    _currentClusterWidth = 0;

    assert(lastClusterStart <= lastClusterEnd);

    auto const resultLength = static_cast<size_t>(lastClusterEnd - lastClusterStart);
    return result_type { .text = std::string_view(lastClusterStart, resultLength), .width = consumedWidth };
}
307+
308+
/// Feeds one byte into the segmenter's UTF-8 decoder state.
///
/// @param byte next input byte.
///
/// @return whatever from_utf8() reports for this byte given the bytes seen before.
ConvertResult grapheme_line_segmenter::process_single_byte(uint8_t byte) noexcept
{
    ConvertResult converted = from_utf8(_utf8, byte);
    return converted;
}
312+
313+
} // namespace unicode

0 commit comments

Comments
 (0)