Skip to content

Commit fbb4a9d

Browse files
Introduce new and more flexible API grapheme_line_segmenter, replacing scan API
Signed-off-by: Christian Parpart <[email protected]>
1 parent 1b88442 commit fbb4a9d

16 files changed

+1644
-882
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
/_ucd/
55
/.clangd/
66
/compile_commands.json
7+
/.vs/
78
/.vscode/
89
/sandbox/
910
/target/

CMakeLists.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ if(("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") OR ("${CMAKE_CXX_COMPILER_ID}" MAT
3232
endif()
3333
elseif(DEFINED MSVC)
3434
add_definitions(-DNOMINMAX)
35-
add_compile_options(/utf-8)
35+
add_compile_options(/nologo)
36+
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /nologo")
3637
endif()
3738

3839
include(EnableCcache)
@@ -47,7 +48,7 @@ option(LIBUNICODE_BENCHMARK "libunicode: Enables building of benchmark for libun
4748
option(LIBUNICODE_TOOLS "libunicode: Builds CLI tools [default: ${MASTER_PROJECT}]" ${MASTER_PROJECT})
4849
option(LIBUNICODE_BUILD_STATIC "libunicode: provide static library instead of dynamic [default: ${LIBUNICODE_BUILD_STATIC_DEFAULT}]" ${LIBUNICODE_BUILD_STATIC_DEFAULT})
4950
option(LIBUNICODE_USE_INTRINSICS "libunicode: Use SIMD extenstion during text read [default: ON]" ON)
50-
option(LIBUNICODE_USE_STD_SIMD "libunicode: Use std::simd as SIMD extenstion during text read (takes precedence over own intrinsics) [default: ON]" ${LIBUNICODE_USE_INTRINSICS})
51+
option(LIBUNICODE_USE_STD_SIMD "libunicode: Use std::simd as SIMD extenstion during text read (takes precedence over own intrinsics) [default: ON]" ON)
5152
option(LIBUNICODE_TABLEGEN_FASTBUILD "libunicode: Use fast table generation (takes more memory in final tables) [default: OFF]" OFF)
5253

5354
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Enable testing of the benchmark library." FORCE)

cmake/presets/common.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"version": 6,
33
"configurePresets": [
4-
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "LIBUNICODE_TABLEGEN_FASTBUILD": "ON" } },
4+
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "LIBUNICODE_TABLEGEN_FASTBUILD": "ON", "LIBUNICODE_TRACE": "ON" } },
55
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
66
{ "name": "arch-native", "hidden": true, "cacheVariables": { "CMAKE_CXX_FLAGS": "-march=native" } },
77
{ "name": "clang", "hidden": true, "cacheVariables": { "CMAKE_CXX_COMPILER": "clang++" } },

cmake/presets/os-windows.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
"cacheVariables": {
1717
"VCPKG_TARGET_TRIPLET": "x64-windows",
1818
"CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
19-
"CMAKE_VERBOSE_MAKEFILE": "ON",
20-
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/../vcpkg/scripts/buildsystems/vcpkg.cmake"
19+
"CMAKE_TOOLCHAIN_FILE": "$env{VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake",
20+
"CMAKE_VERBOSE_MAKEFILE": "ON"
2121
}
2222
},
2323
{ "name": "windows-cl-debug", "inherits": ["windows-common", "debug"], "displayName": "Windows (MSVC) Debug", "description": "Using MSVC compiler (64-bit)" },

src/libunicode/CMakeLists.txt

+18-11
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
include(GNUInstallDirs)
22

3+
option(LIBUNICODE_TRACE "Enable trace logging" OFF)
4+
35
function(ExtractZipArchive ZIP_FILE OUTPUT_DIR)
46
if(CMAKE_VERSION VERSION_LESS 3.18)
57
# Use the older method for versions prior to CMake 3.18
@@ -84,6 +86,9 @@ add_custom_command(
8486

8587
add_library(unicode_loader ${LIBUNICODE_LIB_MODE} codepoint_properties_loader.h codepoint_properties_loader.cpp)
8688
add_library(unicode::loader ALIAS unicode_loader)
89+
if(DEFINED MSVC)
90+
target_compile_options(unicode_loader PUBLIC /EHsc) # We currently `throw` in the loader, so we need this.
91+
endif()
8792
if(LIBUNICODE_TABLEGEN_FASTBUILD)
8893
target_compile_definitions(unicode_loader PRIVATE LIBUNICODE_TABLEGEN_FASTBUILD)
8994
endif()
@@ -102,7 +107,6 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
102107
codepoint_properties.cpp
103108
emoji_segmenter.cpp
104109
grapheme_segmenter.cpp
105-
scan.cpp
106110
script_segmenter.cpp
107111
utf8.cpp
108112
width.cpp
@@ -114,22 +118,22 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
114118
)
115119

116120
if(LIBUNICODE_USE_STD_SIMD)
117-
target_compile_definitions(unicode PRIVATE LIBUNICODE_USE_STD_SIMD)
121+
target_compile_definitions(unicode PUBLIC LIBUNICODE_USE_STD_SIMD)
118122
endif()
119123
if(LIBUNICODE_USE_INTRINSICS)
120-
target_compile_definitions(unicode PRIVATE USE_INTRINSICS)
124+
target_compile_definitions(unicode PUBLIC LIBUNICODE_USE_INTRINSICS)
121125
endif()
122126

123127
set(public_headers
124128
capi.h
125129
codepoint_properties.h
126130
convert.h
127131
emoji_segmenter.h
132+
grapheme_line_segmenter.h
128133
grapheme_segmenter.h
129134
intrinsics.h
130135
multistage_table_view.h
131136
run_segmenter.h
132-
scan.h
133137
script_segmenter.h
134138
support.h
135139
utf8.h
@@ -150,6 +154,10 @@ set_target_properties(unicode PROPERTIES
150154
SOVERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}"
151155
)
152156

157+
if(LIBUNICODE_TRACE)
158+
target_compile_definitions(unicode PUBLIC LIBUNICODE_TRACE)
159+
endif()
160+
153161
add_library(unicode::unicode ALIAS unicode)
154162
add_library(unicode::core ALIAS unicode)
155163
target_include_directories(unicode PUBLIC $<BUILD_INTERFACE:${${PROJECT_NAME}_SOURCE_DIR}/src>
@@ -161,7 +169,6 @@ add_executable(unicode_tablegen tablegen.cpp)
161169
set_target_properties(unicode_tablegen PROPERTIES CMAKE_BUILD_TYPE Release)
162170
target_link_libraries(unicode_tablegen PRIVATE unicode::loader)
163171

164-
165172
# {{{ installation
166173
set(LIBUNICODE_CMAKE_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/libunicode" CACHE PATH "Installation directory for cmake files, a relative path that will be joined with ${CMAKE_INSTALL_PREFIX} or an absolute path.")
167174
set(LIBUNICODE_INSTALL_CMAKE_FILES ${MASTER_PROJECT} CACHE BOOL "Decides whether or not to install CMake config and -version files.")
@@ -220,35 +227,35 @@ if(LIBUNICODE_TESTING)
220227
capi_test.cpp
221228
convert_test.cpp
222229
emoji_segmenter_test.cpp
230+
grapheme_line_segmenter_test.cpp
223231
grapheme_segmenter_test.cpp
224232
run_segmenter_test.cpp
225-
scan_test.cpp
226233
script_segmenter_test.cpp
227234
test_main.cpp
228-
unicode_test.cpp
229235
utf8_grapheme_segmenter_test.cpp
230236
utf8_test.cpp
231237
width_test.cpp
232238
word_segmenter_test.cpp
233239
)
234240

241+
if(DEFINED MSVC)
242+
target_compile_options(unicode_test PRIVATE /utf-8)
243+
endif()
244+
235245
if(NOT Catch2_FOUND)
236246
# suppress conversion warnings for Catch2
237247
# https://github.com/catchorg/Catch2/issues/2583
238248
# https://github.com/SFML/SFML/blob/e45628e2ebc5843baa3739781276fa85a54d4653/test/CMakeLists.txt#L18-L22
239249
set_target_properties(Catch2 PROPERTIES COMPILE_OPTIONS "" EXPORT_COMPILE_COMMANDS OFF)
240-
set_target_properties(Catch2WithMain PROPERTIES EXPORT_COMPILE_COMMANDS OFF)
241250
get_target_property(CATCH2_INCLUDE_DIRS Catch2 INTERFACE_INCLUDE_DIRECTORIES)
242251
target_include_directories(Catch2 SYSTEM INTERFACE ${CATCH2_INCLUDE_DIRS})
243252
endif()
244253

245-
target_link_libraries(unicode_test unicode Catch2::Catch2WithMain fmt::fmt-header-only)
254+
target_link_libraries(unicode_test unicode Catch2::Catch2 fmt::fmt-header-only)
246255
add_test(unicode_test unicode_test)
247256
endif()
248257
# }}}
249258

250-
251-
252259
# {{{ benchmark
253260
if(LIBUNICODE_BENCHMARK)
254261
if(NOT benchmark_FOUND)

src/libunicode/benchmark.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#include <libunicode/convert.h>
2-
#include <libunicode/scan.h>
2+
#include <libunicode/grapheme_line_segmenter.h>
33
#include <libunicode/utf8.h>
44

55
#include <string_view>
@@ -14,7 +14,7 @@ static void benchmarkWithLength(benchmark::State& benchmarkState)
1414
auto TestText = std::string(L, 'a') + "\u00A9";
1515
for (auto _: benchmarkState)
1616
{
17-
benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10));
17+
benchmark::DoNotOptimize(unicode::detail::process_only_ascii(std::string_view(TestText).substr(0, L + 10)));
1818
}
1919
}
2020

@@ -24,7 +24,9 @@ static void benchmarkWithOffset(benchmark::State& benchmarkState)
2424
auto TestText = std::string(L, 'a') + "\U0001F600" + std::string(1000, 'a');
2525
for (auto _: benchmarkState)
2626
{
27-
benchmark::DoNotOptimize(unicode::detail::scan_for_text_ascii(TestText, L + 10));
27+
auto state = unicode::detail::unicode_process_state {};
28+
auto eventHandler = unicode::detail::EventHandler{};
29+
benchmark::DoNotOptimize(unicode::detail::process_only_complex_unicode(eventHandler, state, TestText, L + 10));
2830
}
2931
}
3032

0 commit comments

Comments
 (0)