Skip to content

Commit 169ad17

Browse files
committed
Use non-regex approach for VS16 adjustments
1 parent 82b17bd commit 169ad17

File tree

2 files changed

+99
-47
lines changed

2 files changed

+99
-47
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# CHANGELOG
22

3+
## 3.1.1 (unreleased)
4+
5+
- Performance improvements
6+
37
## 3.1.0
48

59
**Improve Emoji support:**

lib/unicode/display_width.rb

Lines changed: 95 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,17 @@
88

99
module Unicode
1010
class DisplayWidth
11-
DEFAULT_AMBIGUOUS = 1
1211
INITIAL_DEPTH = 0x10000
12+
# Looks up the width stored for a single codepoint in a nested index.
#
# codepoint - Integer codepoint to look up
# index     - nested Array; the top level is addressed by codepoint / INITIAL_DEPTH,
#             every further level by the remainder divided by a 16 times smaller step
#
# Returns the value found at the first non-Array entry, or 1 when that entry is nil
def self.width_in_index(codepoint, index)
  depth = INITIAL_DEPTH
  rest = codepoint
  node = index[rest / depth]

  # Descend one level per iteration until we leave the nested Arrays
  while node.instance_of?(Array)
    rest %= depth
    depth /= 16
    node = node[rest / depth]
  end

  node || 1
end
20+
21+
DEFAULT_AMBIGUOUS = 1
1322
ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
1423
ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
1524
ASCII_BACKSPACE = "\b"
@@ -25,11 +34,19 @@ class DisplayWidth
2534
WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
2635
WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
2736
}
37+
VS16_TEXT_CODEPOINTS = {
38+
WIDTH_ONE: Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT,
39+
WIDTH_TWO: (Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT).reject{ |codepoint|
40+
width_in_index(codepoint, INDEX[:WIDTH_TWO]) == 2
41+
},
42+
}
2843
EMOJI_SEQUENCES_REGEX_MAPPING = {
2944
rgi: :REGEX_INCLUDE_MQE_UQE,
3045
rgi_at: :REGEX_INCLUDE_MQE_UQE,
3146
possible: :REGEX_WELL_FORMED,
3247
}
48+
EMOJI_NON_VS16_OPTIONS = [:all_no_vs16, :rgi_at, :none, false]
49+
VS16 = 0xFE0F
3350
REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
3451
REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
3552
REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
@@ -61,14 +78,25 @@ def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **option
6178
# # #
6279

6380
if !options[:overwrite].empty?
64-
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
65-
width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
81+
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
82+
width_all_features(
83+
string,
84+
index_full,
85+
index_low,
86+
first_ambiguous,
87+
options[:overwrite],
88+
EMOJI_NON_VS16_OPTIONS.include?(options[:emoji]) ? nil : vs16_text_codepoints
89+
)
6690
end
6791
end
6892

6993
if !string.ascii_only?
70-
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
71-
width_no_overwrite(string, index_full, index_low, first_ambiguous)
94+
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
95+
if EMOJI_NON_VS16_OPTIONS.include?(options[:emoji])
96+
width_no_overwrite(string, index_full, index_low, first_ambiguous)
97+
else
98+
width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
99+
end
72100
end
73101
end
74102

@@ -102,7 +130,13 @@ def self.width_frame(string, options)
102130
ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
103131

104132
# Get general width
105-
res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
133+
res += yield(
134+
string,
135+
INDEX[ambiguous_index_name],
136+
FIRST_4096[ambiguous_index_name],
137+
FIRST_AMBIGUOUS[ambiguous_index_name],
138+
VS16_TEXT_CODEPOINTS[ambiguous_index_name]
139+
)
106140

107141
# Return result + prevent negative lengths
108142
res < 0 ? 0 : res
@@ -139,26 +173,73 @@ def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ =
139173
res
140174
end
141175

176+
# Sums up the width of all codepoints of a string and adds one extra column
# for every VS16 (U+FE0F) that directly follows a codepoint contained in
# vs16_text_codepoints.
#
# string               - String to measure; converted to UTF-8 if it has another encoding
# index_full           - nested Array index, walked from INITIAL_DEPTH in steps of 16
# index_low            - flat table consulted for codepoints below 0x1001
# first_ambiguous      - codepoints above 15 and below this value always count as 1
# vs16_text_codepoints - collection responding to #include? with the codepoints
#                        that get widened by a following VS16
#
# Returns the accumulated width (Integer)
def self.width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
  res = 0

  # Make sure we have UTF-8
  # (Encoding#name is "UTF-8", so comparing it with "utf-8" never matched and
  # UTF-8 strings were needlessly copied - compare the Encoding objects instead)
  string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8

  # Track last codepoint and apply VS16 adjustment if necessary
  last_codepoint = nil

  # Process the string in batches of up to 80 characters, so that
  # ASCII-only stretches can be counted without looking at each codepoint
  string.scan(/.{,80}/m){ |batch|
    if batch.ascii_only?
      res += batch.size

      # Forget what preceded this batch - otherwise a VS16 opening the next
      # batch would be paired with a codepoint it does not actually follow
      last_codepoint = nil
    else
      batch.each_codepoint{ |codepoint|
        if codepoint > 15 && codepoint < first_ambiguous
          res += 1
        elsif codepoint < 0x1001
          res += index_low[codepoint] || 1
        elsif codepoint == VS16 && vs16_text_codepoints.include?(last_codepoint)
          res += 1
        else
          # Walk the nested index on a copy, codepoint itself is still needed below
          d = INITIAL_DEPTH
          c = codepoint
          w = index_full[c / d]
          while w.instance_of? Array
            w = w[(c %= d) / (d /= 16)]
          end

          res += w || 1
        end

        last_codepoint = codepoint
      }
    end
  }

  res
end
214+
142215
# Same as .width_no_overwrite - but with applying overwrites for each char
143-
def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
216+
def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite, vs16_text_codepoints)
144217
res = 0
145218

219+
# Track last codepoint and apply VS16 adjustment if necessary
220+
last_codepoint = nil
221+
146222
string.each_codepoint{ |codepoint|
147223
if overwrite[codepoint]
148224
res += overwrite[codepoint]
149225
elsif codepoint > 15 && codepoint < first_ambiguous
150226
res += 1
151227
elsif codepoint < 0x1001
152228
res += index_low[codepoint] || 1
229+
elsif codepoint == VS16 && vs16_text_codepoints && vs16_text_codepoints.include?(last_codepoint)
230+
res += 1
153231
else
154232
d = INITIAL_DEPTH
155-
w = index_full[codepoint / d]
233+
c = codepoint
234+
w = index_full[c / d]
156235
while w.instance_of? Array
157-
w = w[(codepoint %= d) / (d /= 16)]
236+
w = w[(c %= d) / (d /= 16)]
158237
end
159238

160239
res += w || 1
161240
end
241+
242+
last_codepoint = codepoint
162243
}
163244

164245
res
@@ -177,35 +258,13 @@ def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
177258
mode == :rgi_at,
178259
ambiguous,
179260
)
180-
elsif mode == :all_no_vs16
261+
elsif mode == :all_no_vs16 || mode == :all
181262
emoji_width_all(string)
182-
elsif mode == :vs16
183-
emoji_width_basic(string)
184-
elsif mode == :all
185-
res_all, string = emoji_width_all(string)
186-
res_basic, string = emoji_width_basic(string)
187-
[res_all + res_basic, string]
188263
else
189264
[0, string]
190265
end
191266
end
192267

193-
# Ensure all explicit VS16 sequences have width 2
194-
def self.emoji_width_basic(string)
195-
res = 0
196-
197-
no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
198-
if basic_emoji.size >= 2 # VS16 present
199-
res += 2
200-
""
201-
else
202-
basic_emoji
203-
end
204-
}
205-
206-
[res, no_emoji_string]
207-
end
208-
209268
# Use simplistic ZWJ/modifier/keycap sequence matching
210269
def self.emoji_width_all(string)
211270
res = 0
@@ -226,31 +285,20 @@ def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, a
226285
no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
227286
# Skip notorious false positives
228287
if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
229-
emoji_candidate
288+
res += 1
289+
""
230290

231291
# Check if we have a combined Emoji with width 2 (or EAW on Apple Terminal)
232292
elsif emoji_candidate == emoji_candidate[emoji_set_regex]
233293
if strict_eaw
234-
res += self.of(emoji_candidate[0], ambiguous, emoji: false)
294+
res += self.width_in_index(emoji_candidate[0].ord, INDEX[AMBIGUOUS_MAP[ambiguous]])
235295
else
236296
res += 2
237297
end
238298
""
239299

240-
# We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
300+
# Use other counting mechanisms
241301
else
242-
if !strict_eaw
243-
# Ensure all explicit VS16 sequences have width 2
244-
emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
245-
if basic_emoji.size == 2 # VS16 present
246-
res += 2
247-
""
248-
else
249-
basic_emoji
250-
end
251-
}
252-
end
253-
254302
emoji_candidate
255303
end
256304
}

0 commit comments

Comments
 (0)