Skip to content

Commit e1954da

Browse files
committed
Replace old logic for decoding 9- and 10-byte FlexUInts
1 parent 04d267d commit e1954da

File tree

2 files changed

+36
-202
lines changed

2 files changed

+36
-202
lines changed

src/lazy/encoder/binary/v1_1/flex_int.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ impl FlexInt {
4848
pub fn read(input: &[u8], offset: usize) -> IonResult<FlexInt> {
4949
// A FlexInt has the same structure as a FlexUInt. We can read a FlexUInt and then re-interpret
5050
// its unsigned bytes as two's complement bytes.
51-
let flex_uint =
52-
FlexUInt::read_flex_primitive_as_uint(input, offset, "reading a FlexInt", true)?;
51+
let flex_uint = FlexUInt::read_flex_primitive_as_uint(input, offset, "reading a FlexInt")?;
5352
let unsigned_value = flex_uint.value();
5453

5554
// If the encoded FlexInt required `N` bytes to encode where `N` is fewer than 8, then its

src/lazy/encoder/binary/v1_1/flex_uint.rs

+35-200
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ use crate::{IonResult, UInt};
33
use bumpalo::collections::Vec as BumpVec;
44
use ice_code::ice as cold_path;
55
use std::io::Write;
6-
use std::mem;
76

87
const BITS_PER_U128: usize = 128;
98
const BITS_PER_ENCODED_BYTE: usize = 7;
@@ -59,83 +58,36 @@ impl FlexUInt {
5958
// By branching on particular values, we make the value of `num_encoded_bytes` in their
6059
// corresponding arm `const`. This allows us to use `read_n_bytes` to optimize for those
6160
// sizes.
62-
let flex_uint = match num_encoded_bytes {
63-
1 => Self::read_n_bytes::<1>(input),
64-
2 => Self::read_n_bytes::<2>(input),
65-
3 => Self::read_n_bytes::<3>(input),
66-
4 => Self::read_n_bytes::<4>(input),
61+
let mut buffer = [0u8; size_of::<u64>()];
62+
match num_encoded_bytes {
63+
1 => Self::read_n_bytes::<1>(input, &mut buffer),
64+
2 => Self::read_n_bytes::<2>(input, &mut buffer),
65+
3 => Self::read_n_bytes::<3>(input, &mut buffer),
66+
4 => Self::read_n_bytes::<4>(input, &mut buffer),
67+
// If the number of encoded bytes isn't 1-4, fall back to the general-purpose
68+
// reading logic.
6769
_ => break 'common_case,
6870
};
71+
let value = u64::from_le_bytes(buffer).wrapping_shr(num_encoded_bytes as u32);
72+
let flex_uint = FlexUInt::new(num_encoded_bytes, value);
6973
return Ok(flex_uint);
7074
}
7175
}
72-
// Calling `read_flex_primitive_as_uint_no_inline` keeps this method small enough that
73-
// the code for the common case can be inlined.
74-
Self::read_flex_primitive_as_uint_no_inline(input, offset, "reading a FlexUInt", false)
76+
// General-purpose FlexUInt reading logic. Checks for empty input and supports FlexUInts
77+
// up to U64::MAX.
78+
Self::read_flex_primitive_as_uint(input, offset, "reading a FlexUInt")
7579
}
7680

7781
#[inline]
78-
pub fn read_n_bytes<const NUM_BYTES: usize>(bytes: &[u8]) -> FlexUInt {
82+
pub fn read_n_bytes<const NUM_BYTES: usize>(bytes: &[u8], buffer: &mut [u8; size_of::<u64>()]) {
7983
let input: [u8; NUM_BYTES] = *(bytes.first_chunk::<NUM_BYTES>().unwrap());
80-
let mut buffer = [0u8; size_of::<u64>()];
8184
*buffer.first_chunk_mut::<NUM_BYTES>().unwrap() = input;
82-
let value = u64::from_le_bytes(buffer)
83-
.checked_shr(NUM_BYTES as u32)
84-
.unwrap_or(0);
85-
FlexUInt::new(NUM_BYTES, value)
8685
}
8786

88-
/// Helper method that reads a [`FlexUInt`] with 7 or fewer bytes of magnitude from the buffer.
89-
// Caller must confirm that `bytes` has at least 8 bytes.
90-
#[inline(never)]
91-
fn read_small_flex_uint(bytes: &[u8]) -> FlexUInt {
92-
debug_assert!(bytes.len() >= 8);
93-
let num_encoded_bytes = bytes[0].trailing_zeros() as usize + 1;
94-
let num_encoded_bits = 8 * num_encoded_bytes;
95-
// Get a mask with the low 'n' bits set
96-
// TODO: Should this be a const cache of num_encoded_bits -> mask?
97-
let mask = 1u64
98-
.checked_shl(num_encoded_bits as u32)
99-
.map(|v| v - 1)
100-
.unwrap_or(u64::MAX);
101-
// Convert our longer-than-8-bytes slice to a fixed sized 8-byte array that we can convert
102-
// to a u64 directly.
103-
let fixed_size_input: [u8; 8] = bytes[..8].try_into().unwrap();
104-
// This step will often read unrelated bytes from beyond the FlexUInt, but they are
105-
// discarded in the shift operation that follows.
106-
let encoded_value = u64::from_le_bytes(fixed_size_input);
107-
// Note that `num_encoded_bytes` is also the number of continuation flags that we need
108-
// to discard via right shifting.
109-
let value = (encoded_value & mask) >> num_encoded_bytes;
110-
FlexUInt::new(num_encoded_bytes, value)
111-
}
112-
113-
#[inline(never)]
114-
pub(crate) fn read_flex_primitive_as_uint_no_inline(
115-
input: &[u8],
116-
offset: usize,
117-
label: &'static str,
118-
support_sign_extension: bool,
119-
) -> IonResult<FlexUInt> {
120-
Self::read_flex_primitive_as_uint(input, offset, label, support_sign_extension)
121-
}
122-
123-
/// Helper method that reads a flex-encoded primitive from the buffer, returning it as a `FlexUInt`.
124-
/// If an error occurs while reading, its description will include the supplied `label`.
125-
///
126-
/// The current implementation supports flex primitives with up to 64 bits of representation
127-
/// beyond the leading header bits. Flex primitives requiring 10 bytes to encode have 70 magnitude
128-
/// bits. If this value is unsigned (`support_sign_extension=false`), the six bits beyond the
129-
/// supported 64 must all be `0`. If this value will later be re-interpreted as a signed value,
130-
/// (`support_sign_extension=true`), then the six bits beyond the supported 64 must all be the
131-
/// same as the 64th (highest supported) bit. This will allow encodings of up to 70 bits
132-
/// to be correctly interpreted as positive, negative, or beyond the bounds of the 64 bit
133-
/// limitation.
13487
pub(crate) fn read_flex_primitive_as_uint(
13588
input: &[u8],
13689
offset: usize,
13790
label: &'static str,
138-
support_sign_extension: bool,
13991
) -> IonResult<FlexUInt> {
14092
// A closure that generates an incomplete data result at the current offset. This can be invoked
14193
// in a variety of early-return cases in this method.
@@ -146,147 +98,30 @@ impl FlexUInt {
14698
return incomplete();
14799
}
148100

149-
// The `from_le_bytes` method we use to interpret data requires at least 8 bytes to be available.
150-
// There can be 1-2 bytes of header for a u64, leading to a maximum size of 10 bytes. If the input
151-
// buffer doesn't have at least 10 bytes, copy its contents into a temporary buffer that's
152-
// padded with 0 bytes. We round the size of the temp buffer to 16 as it produces slightly
153-
// nicer assembly than 10.
154-
let mut buffer = [0u8; 16];
155-
let bytes = if bytes_available >= 10 {
156-
input
157-
} else {
158-
buffer[0..bytes_available].copy_from_slice(input);
159-
&buffer[..]
101+
let num_encoded_bytes = match input[0] {
102+
// If the first byte is zero, we're not done reading the length bits yet.
103+
// Confirm that we have more than just one byte remaining in input.
104+
0 if input.len() == 1 => return incomplete(),
105+
// The number of trailing zeros in the second byte plus the 8 trailing
106+
// zeros from the first byte.
107+
0 => (input[1].trailing_zeros() as usize + 1) + 8,
108+
// Otherwise, use the number of trailing zeros from the first byte.
109+
first_byte => first_byte.trailing_zeros() as usize + 1,
160110
};
161111

162-
let first_byte = bytes[0];
163-
// If the first byte is not zero, the FlexUInt is 7 or fewer bytes.
164-
if first_byte != 0 {
165-
let num_encoded_bytes = first_byte.trailing_zeros() as usize + 1;
166-
// Note that `bytes_available` is the number of bytes in the original unpadded input.
167-
// Our buffer may be 16 bytes long but only `bytes_available` of those are meaningful.
168-
if bytes_available < num_encoded_bytes {
169-
return incomplete();
170-
}
171-
// At this point, we know the original input contained all of the FlexUInt's bytes.
172-
// We can call `read_small_flex_uint` with the now-padded version of the buffer.
173-
// It will discard any bytes that are not part of the FlexUInt.
174-
let flex_uint = Self::read_small_flex_uint(bytes);
175-
return Ok(flex_uint);
112+
if num_encoded_bytes > 10 {
113+
return IonResult::decoding_error(
114+
"maximum supported serialized FlexUInt size is 10 bytes",
115+
);
176116
}
177-
178-
cold_path! {{
179-
// If we reach this point, the first byte was a zero. The FlexUInt is at least 9 bytes in size.
180-
// We need to inspect the second byte to see how many more prefix bits there are.
181-
if bytes_available < 2 {
182-
return incomplete();
183-
}
184-
let second_byte = bytes[1];
185-
186-
if second_byte & 0b11 == 0b00 {
187-
// The flag bits in the second byte indicate at least two more bytes, meaning the total
188-
// length is more than 10 bytes. We're not equipped to handle this.
189-
return IonResult::decoding_error(
190-
"found a >10 byte Flex(U)Int too large to fit in 64 bits",
191-
);
192-
}
193-
194-
if second_byte & 0b11 == 0b10 {
195-
// The lowest bit of the second byte is empty, the next lowest is not. The encoding
196-
// is 10 bytes.
197-
198-
if bytes_available < 10 {
199-
return incomplete();
200-
}
201-
202-
let flex_uint = Self::read_10_byte_flex_primitive_as_uint(
203-
support_sign_extension,
204-
bytes,
205-
second_byte,
206-
)?;
207-
return Ok(flex_uint);
208-
}
209-
210-
// The lowest bit of the second byte is set. The encoding is 9 bytes.
211-
if bytes_available < 9 {
212-
return incomplete();
213-
}
214-
// There are 57-63 bits of magnitude. We can decode the remaining bytes in a u64.
215-
let remaining_data = &bytes[1..9];
216-
// We know that the slice is 8 bytes long, so we can unwrap() the conversion to [u8; 8]
217-
// Lop off the lowest bit to discard the `end` flag.
218-
let value = u64::from_le_bytes(remaining_data[..8].try_into().unwrap()) >> 1;
219-
let flex_uint = FlexUInt::new(9, value);
220-
Ok(flex_uint)
221-
}}
222-
}
223-
224-
/// Helper method to handle flex primitives whose encoding requires 10 bytes. This case is
225-
/// complex because it requires evaluating data beyond the supported 64 bits of representation
226-
/// to detect overflow and support signed re-interpretation.
227-
fn read_10_byte_flex_primitive_as_uint(
228-
support_sign_extension: bool,
229-
input: &[u8],
230-
second_byte: u8,
231-
) -> IonResult<FlexUInt> {
232-
// There are 10 prefix (continuation) bits, 64 bits of magnitude, and 6 bits of sign
233-
// extension (if enabled). We cannot store the highest 6 bits, so this method just checks
234-
// to make sure that they do not modify the meaning of the value in the lower 64 bits.
235-
// For signed values, this means the 6 extra bits must all be the same as the 64th bit.
236-
// For unsigned values, this means that the 6 extra bits must all be `0`.
237-
//
238-
// Little Endian byte diagram:
239-
//
240-
// b0 b1 b2 b3
241-
// PPPPPPPP MMMMMMPP MMMMMMMM MMMMMMMM
242-
// b4 b5 b6 b7
243-
// MMMMMMMM MMMMMMMM MMMMMMMM MMMMMMMM
244-
// b8 b9
245-
// MMMMMMMM XXXXXXMM
246-
//
247-
// P = Prefix bit
248-
// M = Magnitude bit
249-
// X = An 'extra' bit; if `support_sign_extension` is true, these are sign bits.
250-
251-
// We've already processed the first byte, and we've looked at the lowest two bits of
252-
// the second byte. Isolate the highest six bits of the second byte (b1) which represent
253-
// the lowest six bits of the magnitude.
254-
let magnitude_low_six = second_byte >> 2;
255-
// Load the remaining 8 bytes into a u64 that we can easily shift/mask.
256-
let remaining_data = &input[2..10];
257-
// We know the slice is 8 bytes long, so we can `unwrap()` the conversion to [u8; 8]
258-
let remaining_magnitude = u64::from_le_bytes(remaining_data.try_into().unwrap());
259-
260-
let sign_extension_bits = (remaining_magnitude & (0b111111 << 58)) >> 58;
261-
if support_sign_extension {
262-
// Something downstream intends to use this as a signed value; we need to make sure
263-
// that bits 65-70 match bit 64. `remaining_magnitude` is storing 58 bits of data,
264-
// so bit 64 of the value (bit index=63) is bit 58 (bit index=57) in `remaining_magnitude`.
265-
let high_bit_is_set = remaining_magnitude & (1 << 57) != 0;
266-
if (high_bit_is_set && sign_extension_bits != 0b111111)
267-
|| (!high_bit_is_set && sign_extension_bits != 0)
268-
{
269-
// If the sign extension bits don't agree with the top bit, this value required
270-
// more than 64 bits to encode.
271-
return IonResult::decoding_error(
272-
"found a 10-byte FlexInt too large to fit in a i64",
273-
);
274-
}
275-
} else {
276-
// This is an unsigned value; if any of the highest six bits are set, then this
277-
// value is beyond the magnitude we can store in a u64.
278-
if sign_extension_bits != 0 {
279-
return IonResult::decoding_error(
280-
"found a 10-byte FlexUInt too large to fit in a u64",
281-
);
282-
}
117+
if num_encoded_bytes > input.len() {
118+
return incomplete();
283119
}
284-
285-
// Shift the magnitude from the last 8 bytes over and combine it with the six bits we
286-
// carried over from the second byte.
287-
let value = (remaining_magnitude << 6) | magnitude_low_six as u64;
288-
let flex_uint = FlexUInt::new(10, value);
289-
Ok(flex_uint)
120+
let mut buffer = [0u8; size_of::<u128>()];
121+
(&mut buffer[..num_encoded_bytes]).copy_from_slice(&input[..num_encoded_bytes]);
122+
let big_value = u128::from_le_bytes(buffer).wrapping_shr(num_encoded_bytes as u32);
123+
let value = big_value as u64;
124+
Ok(FlexUInt::new(num_encoded_bytes, value))
290125
}
291126

292127
#[inline]
@@ -310,7 +145,7 @@ impl FlexUInt {
310145
// can be encoded entirely within a u128, which offers native shifting and masking operations.
311146
// FlexUInts are used to represent symbol/macro table addresses and byte lengths, so 112 bits of
312147
// magnitude should be sufficient for all but the most extreme use cases.
313-
const MAX_FLEX_UINT_ENCODED_SIZE_IN_BYTES: usize = mem::size_of::<u128>();
148+
const MAX_FLEX_UINT_ENCODED_SIZE_IN_BYTES: usize = size_of::<u128>();
314149

315150
#[inline]
316151
pub fn write<W: Write>(output: &mut W, value: impl Into<UInt>) -> IonResult<usize> {

0 commit comments

Comments
 (0)