Skip to content

Commit bff5068

Browse files
authored
Add truncate_at_null parameter to String.new(Bytes) and .from_utf16 (#15887)
It is common to find C strings in `StaticArray`s or `Slice`s that are null-terminated, but may also take up the entire slice if its size is equal to the slice's size. In this case, `String.new(UInt8*)` is unsafe, whereas `String.new(Bytes)` followed by `rstrip('\0')` is not very efficient. This PR allows the `Bytes` constructor to stop as soon as it encounters a null character. `LibC::SockaddrUn#sun_path` is not necessarily null-terminated in practice, according to https://man7.org/linux/man-pages/man7/unix.7.html. `LibC::Dirent#d_name` is always null-terminated (in fact, it is declared as a variable-length array on Solaris). There are probably more places where this is applicable to Win32 code than the ones included here.
1 parent dd9d7ef commit bff5068

File tree

7 files changed

+47
-12
lines changed

7 files changed

+47
-12
lines changed

spec/std/string/utf16_spec.cr

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,5 +78,11 @@ describe "String UTF16" do
7878
string, pointer = String.from_utf16(pointer)
7979
string.should eq("hello\u{d7ff}")
8080
end
81+
82+
it "allows creating from a null-terminated slice" do
83+
String.from_utf16(Slice(UInt16).empty, truncate_at_null: true).should eq("")
84+
String.from_utf16(UInt16.slice(102, 111, 111, 98, 97, 114), truncate_at_null: true).should eq("foobar")
85+
String.from_utf16(UInt16.slice(102, 111, 111, 0, 98, 97, 114), truncate_at_null: true).should eq("foo")
86+
end
8187
end
8288
end

spec/std/string_spec.cr

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,6 +2234,16 @@ describe "String" do
22342234
it "allows creating from an empty slice" do
22352235
String.new(Bytes.empty).should eq("")
22362236
end
2237+
2238+
it "allows creating from a non-empty slice" do
2239+
String.new(UInt8.slice(102, 111, 111, 0, 98, 97, 114)).should eq("foo\0bar")
2240+
end
2241+
2242+
it "allows creating from a null-terminated slice" do
2243+
String.new(Bytes.empty, truncate_at_null: true).should eq("")
2244+
String.new(UInt8.slice(102, 111, 111, 98, 97, 114), truncate_at_null: true).should eq("foobar")
2245+
String.new(UInt8.slice(102, 111, 111, 0, 98, 97, 114), truncate_at_null: true).should eq("foo")
2246+
end
22372247
end
22382248

22392249
describe "tr" do

src/crystal/pe.cr

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,11 @@ module Crystal
5252
if nt_header.name[0] === '/'
5353
# section name is longer than 8 bytes; look up the COFF string table
5454
name_buf = nt_header.name.to_slice + 1
55-
string_offset = String.new(name_buf.to_unsafe, name_buf.index(0) || name_buf.size).to_i
55+
string_offset = String.new(name_buf, truncate_at_null: true).to_i
5656
io.seek(@string_table_base + string_offset)
5757
name = io.gets('\0', chomp: true).not_nil!
5858
else
59-
name = String.new(nt_header.name.to_unsafe, nt_header.name.index(0) || nt_header.name.size)
59+
name = String.new(nt_header.name.to_slice, truncate_at_null: true)
6060
end
6161

6262
SectionHeader.new(name: name, virtual_offset: nt_header.virtualAddress, offset: nt_header.pointerToRawData, size: nt_header.virtualSize)
@@ -84,7 +84,7 @@ module Crystal
8484
io.seek(@string_table_base + sym.n.name.long)
8585
name = io.gets('\0', chomp: true).not_nil!
8686
else
87-
name = String.new(sym.n.shortName.to_slice).rstrip('\0')
87+
name = String.new(sym.n.shortName.to_slice, truncate_at_null: true)
8888
end
8989

9090
# `@coff_symbols` uses zero-based indices

src/crystal/system/win32/library_archive.cr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ module Crystal::System::LibraryArchive
112112
section_header = uninitialized LibC::IMAGE_SECTION_HEADER
113113
return unless io.read_fully?(pointerof(section_header).to_slice(1).to_unsafe_bytes)
114114

115-
name = String.new(section_header.name.to_unsafe, section_header.name.index(0) || section_header.name.size)
115+
name = String.new(section_header.name.to_slice, truncate_at_null: true)
116116
next unless name == (msvc? ? ".idata$6" : ".idata$7")
117117

118118
if msvc? ? section_header.characteristics.bits_set?(LibC::IMAGE_SCN_CNT_INITIALIZED_DATA) : section_header.pointerToRelocations == 0

src/socket/address.cr

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -916,7 +916,7 @@ class Socket
916916
{% unless flag?(:wasm32) %}
917917
protected def initialize(sockaddr : LibC::SockaddrUn*, size)
918918
@family = Family::UNIX
919-
@path = String.new(sockaddr.value.sun_path.to_unsafe)
919+
@path = String.new(sockaddr.value.sun_path.to_slice, truncate_at_null: true)
920920
@size = size || sizeof(LibC::SockaddrUn)
921921
end
922922
{% end %}
@@ -933,7 +933,7 @@ class Socket
933933
{% else %}
934934
sockaddr = Pointer(LibC::SockaddrUn).malloc
935935
sockaddr.value.sun_family = family
936-
sockaddr.value.sun_path.to_unsafe.copy_from(@path.to_unsafe, @path.bytesize + 1)
936+
sockaddr.value.sun_path.to_unsafe.copy_from(@path.to_unsafe, {@path.bytesize + 1, sockaddr.value.sun_path.size}.min)
937937
sockaddr.as(LibC::Sockaddr*)
938938
{% end %}
939939
end

src/string.cr

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,12 +159,24 @@ class String
159159
# This method is always safe to call, and the resulting string will have
160160
# the contents and size of the slice.
161161
#
162+
# If *truncate_at_null* is true, only the characters up to and not including
163+
# the first null character are copied.
164+
#
162165
# ```
163166
# slice = Slice.new(4) { |i| ('a'.ord + i).to_u8 }
164167
# String.new(slice) # => "abcd"
168+
#
169+
# slice = UInt8.slice(102, 111, 111, 0, 98, 97, 114)
170+
# String.new(slice, truncate_at_null: true) # => "foo"
165171
# ```
166-
def self.new(slice : Bytes)
167-
new(slice.to_unsafe, slice.size)
172+
def self.new(slice : Bytes, *, truncate_at_null : Bool = false)
173+
bytesize = slice.size
174+
if truncate_at_null
175+
if index = slice.index(0)
176+
bytesize = index
177+
end
178+
end
179+
new(slice.to_unsafe, bytesize)
168180
end
169181

170182
# Creates a new `String` from the given *bytes*, which are encoded in the given *encoding*.

src/string/utf16.cr

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,21 +48,27 @@ class String
4848
# Invalid values are encoded using the unicode replacement char with
4949
# codepoint `0xfffd`.
5050
#
51+
# If *truncate_at_null* is true, only the characters up to and not including
52+
# the first null character are copied.
53+
#
5154
# ```
5255
# slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
5356
# String.from_utf16(slice) # => "hi 𐂥"
57+
#
58+
# slice = UInt16.slice(102, 111, 111, 0, 98, 97, 114)
59+
# String.from_utf16(slice, truncate_at_null: true) # => "foo"
5460
# ```
55-
def self.from_utf16(slice : Slice(UInt16)) : String
61+
def self.from_utf16(slice : Slice(UInt16), *, truncate_at_null : Bool = false) : String
5662
bytesize = 0
5763
size = 0
5864

59-
each_utf16_char(slice) do |char|
65+
each_utf16_char(slice, truncate_at_null: truncate_at_null) do |char|
6066
bytesize += char.bytesize
6167
size += 1
6268
end
6369

6470
String.new(bytesize) do |buffer|
65-
each_utf16_char(slice) do |char|
71+
each_utf16_char(slice, truncate_at_null: truncate_at_null) do |char|
6672
char.each_byte do |byte|
6773
buffer.value = byte
6874
buffer += 1
@@ -112,10 +118,11 @@ class String
112118
# :nodoc:
113119
#
114120
# Yields each decoded char in the given slice.
115-
def self.each_utf16_char(slice : Slice(UInt16), &)
121+
def self.each_utf16_char(slice : Slice(UInt16), *, truncate_at_null : Bool = false, &)
116122
i = 0
117123
while i < slice.size
118124
byte = slice[i].to_i
125+
break if truncate_at_null && byte == 0
119126
if byte < 0xd800 || byte >= 0xe000
120127
# One byte
121128
codepoint = byte

0 commit comments

Comments
 (0)