Skip to content

Commit 35cd8b1

Browse files
Introduce new API grapheme_line_segmenter to replace scan API
Signed-off-by: Christian Parpart <[email protected]>
1 parent bc9e84a commit 35cd8b1

File tree

4 files changed

+662
-0
lines changed

4 files changed

+662
-0
lines changed

src/libunicode/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
8686
capi.cpp
8787
codepoint_properties.cpp
8888
emoji_segmenter.cpp
89+
grapheme_line_segmenter.cpp
8990
grapheme_segmenter.cpp
9091
scan.cpp
9192
script_segmenter.cpp
@@ -103,6 +104,7 @@ set(public_headers
103104
codepoint_properties.h
104105
convert.h
105106
emoji_segmenter.h
107+
grapheme_line_segmenter.h
106108
grapheme_segmenter.h
107109
intrinsics.h
108110
multistage_table_view.h
@@ -186,6 +188,7 @@ if(LIBUNICODE_TESTING)
186188
capi_test.cpp
187189
convert_test.cpp
188190
emoji_segmenter_test.cpp
191+
grapheme_line_segmenter_test.cpp
189192
grapheme_segmenter_test.cpp
190193
run_segmenter_test.cpp
191194
scan_test.cpp
+295
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
/**
2+
* This file is part of the "libunicode" project
3+
* Copyright (c) 2023 Christian Parpart <[email protected]>
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
#include <libunicode/grapheme_line_segmenter.h>
15+
#include <libunicode/grapheme_segmenter.h>
16+
#include <libunicode/intrinsics.h>
17+
#include <libunicode/utf8.h>
18+
#include <libunicode/width.h>
19+
20+
#include <algorithm>
21+
#include <cassert>
22+
23+
namespace unicode
24+
{
25+
26+
namespace
{
    /// Counts the consecutive zero bits of @p value, starting at the least significant bit.
    ///
    /// @param value  bit mask to inspect.
    /// @return index of the lowest set bit, or 32 if no bit is set at all.
    [[maybe_unused]] int countTrailingZeroBits(unsigned int value) noexcept
    {
        // __builtin_ctz() is undefined for 0, so answer that case explicitly
        // instead of relying on whatever the underlying instruction happens to do.
        if (value == 0)
            return 32;

#if defined(_WIN32)
        return static_cast<int>(_tzcnt_u32(value));
#else
        return __builtin_ctz(value);
#endif
    }

    // Tests if given UTF-8 byte is a C0 control character, that is, a value below U+20.
    constexpr bool is_control(char ch) noexcept
    {
        return static_cast<uint8_t>(ch) < 0x20;
    }

    // Tests if given UTF-8 byte is part of a complex Unicode codepoint, that is, a value greater than U+7F.
    // Such bytes always have their most significant bit set.
    constexpr bool is_complex(char ch) noexcept
    {
        return (static_cast<uint8_t>(ch) & 0x80) != 0;
    }

    // Tests if given UTF-8 byte is a single US-ASCII text codepoint. This excludes control characters.
    constexpr bool is_ascii(char ch) noexcept
    {
        return !is_control(ch) && !is_complex(ch);
    }
} // namespace
54+
55+
size_t grapheme_line_segmenter::process_ascii() noexcept
56+
{
57+
auto input = _buffer.data();
58+
auto const end = _buffer.data() + std::min(_buffer.size(), _maxWidth);
59+
60+
intrinsics::m128i const ControlCodeMax = intrinsics::set1_epi8(0x20); // 0..0x1F
61+
intrinsics::m128i const Complex = intrinsics::set1_epi8(-128); // equals to 0x80 (0b1000'0000)
62+
63+
while (input < end - sizeof(intrinsics::m128i))
64+
{
65+
intrinsics::m128i batch = intrinsics::load_unaligned((intrinsics::m128i*) input);
66+
intrinsics::m128i isControl = intrinsics::compare_less(batch, ControlCodeMax);
67+
intrinsics::m128i isComplex = intrinsics::and128(batch, Complex);
68+
// intrinsics::m128i isComplex = _mm_cmplt_epi8(batch, Complex);
69+
intrinsics::m128i testPack = intrinsics::or128(isControl, isComplex);
70+
if (int const check = intrinsics::movemask_epi8(testPack); check != 0)
71+
{
72+
int advance = countTrailingZeroBits(static_cast<unsigned>(check));
73+
input += advance;
74+
break;
75+
}
76+
input += sizeof(intrinsics::m128i);
77+
}
78+
79+
while (input != end && is_ascii(*input))
80+
++input;
81+
82+
// if (static_cast<size_t>(distance(_buffer.data(), input)))
83+
// fmt::print(
84+
// "countAsciiTextChars: {} bytes: \"{}\"\n",
85+
// static_cast<size_t>(distance(_buffer.data(), input)),
86+
// (string_view(_buffer.data(), static_cast<size_t>(distance(_buffer.data(), input)))));
87+
88+
return static_cast<size_t>(std::distance(_buffer.data(), input));
89+
}
90+
91+
void grapheme_line_segmenter::reset(std::string_view buffer, size_t maxWidth) noexcept
92+
{
93+
_buffer = buffer;
94+
95+
_utf8 = {};
96+
_lastCodepointHint = 0;
97+
98+
_maxWidth = maxWidth; // TODO: Not sure we want to do this here, nor need this at all.
99+
}
100+
101+
void grapheme_line_segmenter::expand_buffer_by(size_t count) noexcept
102+
{
103+
_buffer = std::string_view(_buffer.data(), _buffer.size() + count);
104+
}
105+
106+
void grapheme_line_segmenter::move_forward_to(char const* pos) noexcept
107+
{
108+
assert(_buffer.data() <= pos && pos <= _buffer.data() + _buffer.size());
109+
auto const skippedBytesCount = static_cast<size_t>(pos - _buffer.data());
110+
_buffer.remove_prefix(skippedBytesCount);
111+
}
112+
113+
/// Consumes text from the front of the buffer until @p maxWidth is used up, the buffer
/// is exhausted, or a sub-scanner makes no progress.
///
/// Runs of US-ASCII text are handled by process_ascii() and reported via on_ascii();
/// everything else is handed to process_complex_unicode(). Consumed bytes are removed
/// from the front of the buffer.
///
/// @param maxWidth  width budget for this call; stored in _maxWidth and decremented
///                  while scanning.
/// @return the text consumed by this call, starting at the buffer's position on entry,
///         and the width accumulated for it.
scan_result grapheme_line_segmenter::process(size_t maxWidth) noexcept
{
    if (_buffer.empty())
        return scan_result { .text = _buffer.substr(0, 0), .width = 0 };

    _maxWidth = maxWidth;

    char const* const start = _buffer.data();

    // Number of bytes used in the current line.
    size_t totalByteCountProcessed = 0;

    // Number of width used in the current line.
    size_t totalWidthProcessed = 0;

    auto const makeResult = [&]() -> scan_result {
        return scan_result { .text = std::string_view(start, totalByteCountProcessed),
                             .width = totalWidthProcessed };
    };

    while (_maxWidth > 0 && !_buffer.empty())
    {
        // The scanner is chosen afresh on every iteration: a pending multi-byte sequence
        // or a leading byte with its high bit set requires the UTF-8 decoder.
        bool const needsUnicodeDecoding = _utf8.expectedLength != 0 || is_complex(_buffer.front());

        if (needsUnicodeDecoding)
        {
            // process_complex_unicode() adjusts _maxWidth on its own.
            auto const sub = process_complex_unicode();
            if (sub.width == 0)
                return makeResult();
            totalWidthProcessed += sub.width;
            totalByteCountProcessed += sub.text.size();
            _buffer.remove_prefix(sub.text.size());
        }
        else
        {
            auto const count = process_ascii();
            if (count == 0)
                return makeResult(); // e.g. a control character is at the front
            _events.on_ascii(_buffer.substr(0, count));
            // Each byte of this run is counted as one unit of width.
            _maxWidth -= count;
            totalWidthProcessed += count;
            totalByteCountProcessed += count;
            _buffer.remove_prefix(count);
        }
    }

    return makeResult();
}
175+
176+
// Decodes UTF-8 from the front of the buffer one byte at a time, groups the resulting
// codepoints into grapheme clusters and sums up their width.
//
// Scanning stops at the end of the buffer, once the accumulated width exceeds _maxWidth,
// at the first control or US-ASCII byte, or when the cluster being scanned does not fit.
// Findings are reported through _events (on_grapheme_cluster, on_invalid). The decoder
// state (_utf8, _lastCodepointHint) is updated and _maxWidth is reduced; _buffer itself
// is not modified here.
//
// Returns the byte range [resultStart, resultEnd) as text along with the accumulated width.
scan_result grapheme_line_segmenter::process_complex_unicode() noexcept
177+
{
178+
size_t count = 0;
179+
180+
char const* start = _buffer.data();
181+
char const* end = start + _buffer.size();
182+
char const* input = start;
183+
char const* clusterStart = start;
184+
char const* lastCodepointStart = start;
185+
186+
unsigned byteCount = 0; // bytes consumed for the current codepoint
187+
188+
// TODO: move currentClusterWidth to object's state?
189+
size_t currentClusterWidth = 0; // current grapheme cluster's East Asian Width
190+
191+
// If a UTF-8 sequence is still pending, the result starts _utf8.currentLength bytes
// in front of the buffer.
// NOTE(review): presumably those bytes were fed in by an earlier call and are still
// addressable in front of _buffer.data() - confirm.
char const* resultStart = _utf8.expectedLength ? start - _utf8.currentLength : start;
192+
char const* resultEnd = resultStart;
193+
194+
while (input != end && count <= _maxWidth)
195+
{
196+
// is_control() only matches bytes below 0x20, which never have the high bit set,
// so this condition is effectively !is_complex(*input).
if (is_control(*input) || !is_complex(*input))
197+
{
198+
// ASCII control character or single US-ASCII text codepoint.
199+
200+
if (_utf8.expectedLength)
201+
{
202+
// Incomplete UTF-8 sequence hit. That's invalid as well.
203+
++count;
204+
_events.on_invalid(std::string_view(input, input + 1));
205+
_utf8 = {};
206+
}
207+
208+
_lastCodepointHint = 0;
209+
resultEnd = input;
210+
break;
211+
}
212+
213+
auto const result = from_utf8(_utf8, static_cast<uint8_t>(*input++));
214+
++byteCount;
215+
216+
// More bytes are needed before a codepoint is available.
if (holds_alternative<Incomplete>(result))
217+
continue;
218+
219+
if (holds_alternative<Success>(result))
220+
{
221+
auto const prevCodepoint = _lastCodepointHint;
222+
auto const nextCodepoint = std::get<Success>(result).value;
223+
auto const nextWidth = std::max(currentClusterWidth, static_cast<size_t>(width(nextCodepoint)));
224+
_lastCodepointHint = nextCodepoint;
225+
if (grapheme_segmenter::breakable(prevCodepoint, nextCodepoint))
226+
{
227+
// Flush out current grapheme cluster's East Asian Width.
// NOTE(review): the width is added to count AND subtracted from _maxWidth before
// the two are compared below - looks like each flushed cluster is charged twice; confirm.
228+
count += currentClusterWidth;
229+
_maxWidth -= currentClusterWidth;
230+
231+
if (count + nextWidth > _maxWidth)
232+
{
233+
// Currently scanned grapheme cluster won't fit. Break at start.
234+
currentClusterWidth = 0;
235+
input -= byteCount;
236+
break;
237+
}
238+
239+
// NOTE(review): byteCount counts the bytes decoded since it was last reset, including
// the codepoint that just triggered the break - confirm the reported range is intended.
_events.on_grapheme_cluster(std::string_view(clusterStart, byteCount), currentClusterWidth);
240+
241+
// And start a new grapheme cluster.
242+
currentClusterWidth = nextWidth;
243+
clusterStart = lastCodepointStart;
244+
lastCodepointStart = input - byteCount;
245+
byteCount = 0;
246+
resultEnd = input;
247+
}
248+
else
249+
{
// The codepoint extends the current cluster.
250+
resultEnd = input;
251+
// Increase width on VS16 but do not decrease on VS15.
252+
if (nextCodepoint == 0xFE0F) // VS16
253+
{
254+
currentClusterWidth = 2;
255+
if (count + currentClusterWidth > _maxWidth)
256+
{
257+
// Rewinding by {byteCount} bytes (overflow due to VS16).
// NOTE(review): resultEnd was already moved past this codepoint above and is not
// rewound together with input - confirm.
258+
currentClusterWidth = 0;
259+
input = clusterStart;
260+
break;
261+
}
262+
}
263+
264+
// Consumed {byteCount} bytes for grapheme cluster.
265+
lastCodepointStart = input - byteCount;
266+
}
267+
}
268+
else
269+
{
270+
// Invalid UTF-8: count it as one unit of width, report it and restart clustering
// right after the offending bytes.
assert(holds_alternative<Invalid>(result));
271+
count++;
272+
_events.on_invalid(std::string_view(clusterStart, byteCount));
273+
currentClusterWidth = 0;
274+
_lastCodepointHint = 0;
275+
_utf8.expectedLength = 0;
276+
byteCount = 0;
277+
clusterStart = input;
278+
resultEnd = input;
279+
}
280+
}
281+
// Account for the cluster that was still open when the loop ended
// (zero if the loop was left through one of the rewinding breaks).
count += currentClusterWidth;
282+
283+
assert(resultStart <= resultEnd);
284+
285+
return scan_result { .text = std::string_view(resultStart,
286+
static_cast<size_t>(std::distance(resultStart, resultEnd))),
287+
.width = count };
288+
}
289+
290+
/// Feeds one more byte into the segmenter's UTF-8 decoder state.
///
/// @param byte  the next input byte.
/// @return whatever from_utf8() reports for the decoder state after this byte.
ConvertResult grapheme_line_segmenter::process_single_byte(uint8_t byte) noexcept
{
    ConvertResult decoded = from_utf8(_utf8, byte);
    return decoded;
}
294+
295+
} // namespace unicode

0 commit comments

Comments
 (0)