8
8
9
9
module Unicode
10
10
class DisplayWidth
11
- DEFAULT_AMBIGUOUS = 1
12
11
INITIAL_DEPTH = 0x10000
12
# Looks up the width stored for a single codepoint in a nested index.
#
# codepoint - Integer codepoint to look up
# index     - Array indexed by codepoint / INITIAL_DEPTH; an entry is either
#             a final value or another Array, which is descended into with a
#             16 times smaller divisor on every level
#
# Returns the value found in the index, or 1 if the index has no entry
def self.width_in_index(codepoint, index)
  depth = INITIAL_DEPTH
  remainder = codepoint
  node = index[remainder / depth]

  # Descend until we reach something that is not a nested Array
  while node.instance_of?(Array)
    remainder %= depth
    depth /= 16
    node = node[remainder / depth]
  end

  node || 1
end
20
+
21
+ DEFAULT_AMBIGUOUS = 1
13
22
ASCII_NON_ZERO_REGEX = /[\0 \x05 \a \b \n \v \f \r \x0E \x0F ]/
14
23
ASCII_NON_ZERO_STRING = "\0 \x05 \a \b \n \v \f \r \x0E \x0F "
15
24
ASCII_BACKSPACE = "\b "
@@ -25,11 +34,19 @@ class DisplayWidth
25
34
WIDTH_ONE : decompress_index ( INDEX [ :WIDTH_ONE ] [ 0 ] [ 0 ] , 1 ) ,
26
35
WIDTH_TWO : decompress_index ( INDEX [ :WIDTH_TWO ] [ 0 ] [ 0 ] , 1 ) ,
27
36
}
37
+ VS16_TEXT_CODEPOINTS = {
38
+ WIDTH_ONE : Unicode ::Emoji ::TEXT_PRESENTATION - Unicode ::Emoji ::EMOJI_COMPONENT ,
39
+ WIDTH_TWO : ( Unicode ::Emoji ::TEXT_PRESENTATION - Unicode ::Emoji ::EMOJI_COMPONENT ) . reject { |codepoint |
40
+ width_in_index ( codepoint , INDEX [ :WIDTH_TWO ] ) == 2
41
+ } ,
42
+ }
28
43
EMOJI_SEQUENCES_REGEX_MAPPING = {
29
44
rgi : :REGEX_INCLUDE_MQE_UQE ,
30
45
rgi_at : :REGEX_INCLUDE_MQE_UQE ,
31
46
possible : :REGEX_WELL_FORMED ,
32
47
}
48
+ EMOJI_NON_VS16_OPTIONS = [ :all_no_vs16 , :rgi_at , :none , false ]
49
+ VS16 = 0xFE0F
33
50
REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp . union ( Unicode ::Emoji ::REGEX_BASIC , Unicode ::Emoji ::REGEX_EMOJI_KEYCAP )
34
51
REGEX_EMOJI_ALL_SEQUENCES = Regexp . union ( /.[\u{1F3FB} -\u{1F3FF} \u{FE0F} ]?(\u{200D} .[\u{1F3FB} -\u{1F3FF} \u{FE0F} ]?)+/ , Unicode ::Emoji ::REGEX_EMOJI_KEYCAP )
35
52
REGEX_EMOJI_NOT_POSSIBLE = /\A [#*0-9]\z /
@@ -61,14 +78,25 @@ def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **option
61
78
# # #
62
79
63
80
if !options [ :overwrite ] . empty?
64
- return width_frame ( string , options ) do |string , index_full , index_low , first_ambiguous |
65
- width_all_features ( string , index_full , index_low , first_ambiguous , options [ :overwrite ] )
81
+ return width_frame ( string , options ) do |string , index_full , index_low , first_ambiguous , vs16_text_codepoints |
82
+ width_all_features (
83
+ string ,
84
+ index_full ,
85
+ index_low ,
86
+ first_ambiguous ,
87
+ options [ :overwrite ] ,
88
+ EMOJI_NON_VS16_OPTIONS . include? ( options [ :emoji ] ) ? nil : vs16_text_codepoints
89
+ )
66
90
end
67
91
end
68
92
69
93
if !string . ascii_only?
70
- return width_frame ( string , options ) do |string , index_full , index_low , first_ambiguous |
71
- width_no_overwrite ( string , index_full , index_low , first_ambiguous )
94
+ return width_frame ( string , options ) do |string , index_full , index_low , first_ambiguous , vs16_text_codepoints |
95
+ if EMOJI_NON_VS16_OPTIONS . include? ( options [ :emoji ] )
96
+ width_no_overwrite ( string , index_full , index_low , first_ambiguous )
97
+ else
98
+ width_no_overwrite_with_vs16 ( string , index_full , index_low , first_ambiguous , vs16_text_codepoints )
99
+ end
72
100
end
73
101
end
74
102
@@ -102,7 +130,13 @@ def self.width_frame(string, options)
102
130
ambiguous_index_name = AMBIGUOUS_MAP [ options [ :ambiguous ] ]
103
131
104
132
# Get general width
105
- res += yield ( string , INDEX [ ambiguous_index_name ] , FIRST_4096 [ ambiguous_index_name ] , FIRST_AMBIGUOUS [ ambiguous_index_name ] )
133
+ res += yield (
134
+ string ,
135
+ INDEX [ ambiguous_index_name ] ,
136
+ FIRST_4096 [ ambiguous_index_name ] ,
137
+ FIRST_AMBIGUOUS [ ambiguous_index_name ] ,
138
+ VS16_TEXT_CODEPOINTS [ ambiguous_index_name ]
139
+ )
106
140
107
141
# Return result + prevent negative lengths
108
142
res < 0 ? 0 : res
@@ -139,26 +173,73 @@ def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ =
139
173
res
140
174
end
141
175
176
# Sums up the width of all codepoints in the string, counting a VS16 (U+FE0F)
# as 1 when the codepoint directly before it is listed in vs16_text_codepoints.
#
# string               - String to measure (gets converted to UTF-8 if it is not already)
# index_full           - nested Array index, entered with codepoint / INITIAL_DEPTH and
#                        descended with a 16 times smaller divisor per level
# index_low            - direct lookup (by codepoint) used for codepoints below 0x1001
# first_ambiguous      - codepoints greater than 15 and below this value always count as 1
# vs16_text_codepoints - collection responding to include?, containing the codepoints
#                        after which a VS16 adds 1
#
# Returns the accumulated width as Integer
def self.width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
  res = 0

  # Make sure we have UTF-8
  # (compare Encoding objects: Encoding#name is "UTF-8", it never equals "utf-8")
  string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8

  # Track last codepoint and apply VS16 adjustment if necessary
  last_codepoint = nil

  string.scan(/.{,80}/m) { |batch|
    if batch.ascii_only?
      res += batch.size

      # Keep tracking through ASCII-only batches, otherwise a VS16 opening the
      # next batch would be paired with a codepoint from an earlier batch.
      # The scan also yields a trailing empty match, which must not reset it.
      last_codepoint = batch.getbyte(-1) unless batch.empty?
    else
      batch.each_codepoint { |codepoint|
        if codepoint > 15 && codepoint < first_ambiguous
          res += 1
        elsif codepoint < 0x1001
          res += index_low[codepoint] || 1
        elsif codepoint == VS16 && vs16_text_codepoints.include?(last_codepoint)
          res += 1
        else
          # Walk the nested index on a copy, codepoint itself is still needed below
          d = INITIAL_DEPTH
          c = codepoint
          w = index_full[c / d]
          while w.instance_of? Array
            w = w[(c %= d) / (d /= 16)]
          end

          res += w || 1
        end

        last_codepoint = codepoint
      }
    end
  }

  res
end
214
+
142
215
# Same as .width_no_overwrite - but with applying overwrites for each char
143
- def self . width_all_features ( string , index_full , index_low , first_ambiguous , overwrite )
216
+ def self . width_all_features ( string , index_full , index_low , first_ambiguous , overwrite , vs16_text_codepoints )
144
217
res = 0
145
218
219
+ # Track last codepoint and apply VS16 adjustment if necessary
220
+ last_codepoint = nil
221
+
146
222
string . each_codepoint { |codepoint |
147
223
if overwrite [ codepoint ]
148
224
res += overwrite [ codepoint ]
149
225
elsif codepoint > 15 && codepoint < first_ambiguous
150
226
res += 1
151
227
elsif codepoint < 0x1001
152
228
res += index_low [ codepoint ] || 1
229
+ elsif codepoint == VS16 && vs16_text_codepoints && vs16_text_codepoints . include? ( last_codepoint )
230
+ res += 1
153
231
else
154
232
d = INITIAL_DEPTH
155
- w = index_full [ codepoint / d ]
233
+ c = codepoint
234
+ w = index_full [ c / d ]
156
235
while w . instance_of? Array
157
- w = w [ ( codepoint %= d ) / ( d /= 16 ) ]
236
+ w = w [ ( c %= d ) / ( d /= 16 ) ]
158
237
end
159
238
160
239
res += w || 1
161
240
end
241
+
242
+ last_codepoint = codepoint
162
243
}
163
244
164
245
res
@@ -177,35 +258,13 @@ def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
177
258
mode == :rgi_at ,
178
259
ambiguous ,
179
260
)
180
- elsif mode == :all_no_vs16
261
+ elsif mode == :all_no_vs16 || mode == :all
181
262
emoji_width_all ( string )
182
- elsif mode == :vs16
183
- emoji_width_basic ( string )
184
- elsif mode == :all
185
- res_all , string = emoji_width_all ( string )
186
- res_basic , string = emoji_width_basic ( string )
187
- [ res_all + res_basic , string ]
188
263
else
189
264
[ 0 , string ]
190
265
end
191
266
end
192
267
193
- # Ensure all explicit VS16 sequences have width 2
194
- def self . emoji_width_basic ( string )
195
- res = 0
196
-
197
- no_emoji_string = string . gsub ( REGEX_EMOJI_BASIC_OR_KEYCAP ) { |basic_emoji |
198
- if basic_emoji . size >= 2 # VS16 present
199
- res += 2
200
- ""
201
- else
202
- basic_emoji
203
- end
204
- }
205
-
206
- [ res , no_emoji_string ]
207
- end
208
-
209
268
# Use simplistic ZWJ/modifier/keycap sequence matching
210
269
def self . emoji_width_all ( string )
211
270
res = 0
@@ -226,31 +285,20 @@ def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, a
226
285
no_emoji_string = string . gsub ( Unicode ::Emoji ::REGEX_POSSIBLE ) { |emoji_candidate |
227
286
# Skip notorious false positives
228
287
if REGEX_EMOJI_NOT_POSSIBLE . match? ( emoji_candidate )
229
- emoji_candidate
288
+ res += 1
289
+ ""
230
290
231
291
# Check if we have a combined Emoji with width 2 (or EAW on Apple Terminal)
232
292
elsif emoji_candidate == emoji_candidate [ emoji_set_regex ]
233
293
if strict_eaw
234
- res += self . of ( emoji_candidate [ 0 ] , ambiguous , emoji : false )
294
+ res += self . width_in_index ( emoji_candidate [ 0 ] . ord , INDEX [ AMBIGUOUS_MAP [ ambiguous ] ] )
235
295
else
236
296
res += 2
237
297
end
238
298
""
239
299
240
- # We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
300
+ # Use other counting mechanisms
241
301
else
242
- if !strict_eaw
243
- # Ensure all explicit VS16 sequences have width 2
244
- emoji_candidate . gsub! ( Unicode ::Emoji ::REGEX_BASIC ) { |basic_emoji |
245
- if basic_emoji . size == 2 # VS16 present
246
- res += 2
247
- ""
248
- else
249
- basic_emoji
250
- end
251
- }
252
- end
253
-
254
302
emoji_candidate
255
303
end
256
304
}
0 commit comments