Skip to content

Commit 32c00d8

Browse files
Add Unicode support
1 parent f284cf1 commit 32c00d8

File tree

2 files changed

+31
-18
lines changed

2 files changed

+31
-18
lines changed

spec/std/file/match_spec.cr

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ describe File do
3636
refute_file_matches "a*b?c*x", "abxbbxdbxebxczzy"
3737
end
3838

39-
pending "multibyte" do
39+
describe "multibyte" do
4040
it "single-character match" do
4141
assert_file_matches "a?b", "a☺b"
4242
refute_file_matches "a???b", "a☺b"
@@ -94,7 +94,7 @@ describe File do
9494
end
9595
end
9696

97-
pending "invalid byte sequences" do
97+
describe "invalid byte sequences" do
9898
it "single-character with invalid path" do
9999
assert_file_matches "?.txt", "\xC3.txt" # Invalid byte sequence
100100
refute_file_matches "?.txt", "\xC3\x28.txt" # Invalid byte sequence
@@ -174,10 +174,10 @@ describe File do
174174
refute_file_matches "ab[^c]", "abc"
175175
refute_file_matches "ab[^b-d]", "abc"
176176
assert_file_matches "ab[^e-g]", "abc"
177-
refute_file_matches "a[^a]b", "a☺b" # multibyte pending
178-
assert_file_matches "a[^a][^a][^a]b", "a☺b" # multibyte pending
177+
assert_file_matches "a[^a]b", "a☺b"
178+
refute_file_matches "a[^a][^a][^a]b", "a☺b"
179179
assert_file_matches "[a-ζ]*", "α"
180-
refute_file_matches "*[a-ζ]", "A" # multibyte pending
180+
refute_file_matches "*[a-ζ]", "A"
181181
end
182182

183183
it "escape" do

src/file/match.cr

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ class File < IO::FileDescriptor
276276
elsif '?' === g && @path_index < path.size
277277
if !separators.includes?(path[@path_index].unsafe_chr)
278278
@glob_index += 1
279-
@path_index += 1
279+
_, @path_index = consume_unicode_character(path_str, @path_index)
280280
next
281281
end
282282
elsif '[' === g && @path_index < path.size
@@ -293,25 +293,17 @@ class File < IO::FileDescriptor
293293
first = true
294294
is_match = false
295295

296-
c = path[@path_index]
296+
c, new_path_index = consume_unicode_glob_character(path_str, @path_index)
297297

298298
while @glob_index < glob.size && (first || !(']' === glob[@glob_index]))
299-
low = glob[@glob_index]
300-
if !unescape(pointerof(low), glob, pointerof(@glob_index))
301-
raise File::BadPatternError.new("Invalid pattern")
302-
end
303-
@glob_index += 1
299+
low, @glob_index = consume_unicode_glob_character(glob_str, @glob_index)
304300

305301
# If there is a - and the following character is not ], read the range end character.
306302
if @glob_index + 1 < glob.size &&
307303
glob[@glob_index] === '-' &&
308304
!(glob[@glob_index + 1] === ']')
309305
@glob_index += 1
310-
high = glob[@glob_index]
311-
if !unescape(pointerof(high), glob, pointerof(@glob_index))
312-
raise File::BadPatternError.new("Invalid pattern")
313-
end
314-
@glob_index += 1
306+
high, @glob_index = consume_unicode_glob_character(glob_str, @glob_index)
315307
else
316308
high = low
317309
end
@@ -327,7 +319,7 @@ class File < IO::FileDescriptor
327319
end
328320
@glob_index += 1
329321
if is_match != negated_class
330-
@path_index += 1
322+
@path_index = new_path_index
331323
next
332324
end
333325
elsif g === '{'
@@ -402,3 +394,24 @@ private def unescape(c, glob, glob_index) : Bool
402394

403395
true
404396
end
397+
398+
@[AlwaysInline]
399+
private def consume_unicode_character(string, index) : {Char, UInt64}
400+
reader = Char::Reader.new(string, index)
401+
{reader.current_char, index + reader.current_char_width}
402+
end
403+
404+
@[AlwaysInline]
405+
private def consume_unicode_glob_character(string, index) : {Char, UInt64}
406+
c = string.to_unsafe[index]
407+
408+
if !unescape(pointerof(c), string.to_slice, pointerof(index))
409+
raise File::BadPatternError.new("Invalid pattern")
410+
end
411+
412+
if c < 0x80
413+
{c.unsafe_chr, index + 1}
414+
else
415+
consume_unicode_character(string, index)
416+
end
417+
end

0 commit comments

Comments
 (0)