Skip to content

Commit 5029ff3

Browse files
authored
Add VarZeroCow (#5809)
Fixes #5561 We can add more methods as needed, right now I think it covers most use cases.
1 parent ce8bdf8 commit 5029ff3

File tree

2 files changed

+353
-1
lines changed

2 files changed

+353
-1
lines changed

utils/zerovec/src/cow.rs

+351
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
// This file is part of ICU4X. For terms of use, please see the file
2+
// called LICENSE at the top level of the ICU4X source tree
3+
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4+
5+
use crate::ule::{EncodeAsVarULE, UleError, VarULE};
6+
use alloc::boxed::Box;
7+
use core::fmt;
8+
use core::marker::PhantomData;
9+
use core::mem::ManuallyDrop;
10+
use core::ops::Deref;
11+
use core::ptr::NonNull;
12+
use zerofrom::ZeroFrom;
13+
14+
/// A copy-on-write container that keeps a [`VarULE`] value in its flat bytestream form.
///
/// [`VarULE`] types exist primarily so that complex variable-length data structures can be
/// stored inside variable-length collections such as [`crate::VarZeroVec`]. What makes that
/// possible is that every [`VarULE`] value can be efficiently represented as a flat bytestream.
///
/// In zero-copy code it is sometimes desirable to use that bytestream representation
/// unconditionally, for instance to reduce stack size: a struct holding five `Cow<'a, str>`s
/// is less stack-efficient than a single `Cow` holding the bytestream of, say,
/// `Tuple5VarULE<str, str, str, str, str>`.
///
/// `VarZeroCow` covers that case. Logically it is a `Cow<'a, V>` (with some optimizations)
/// that is guaranteed to serialize as a byte stream in machine-readable scenarios.
///
/// For human-readable serialization it falls back to the serde impls on `V`, which ought to
/// provide a human-readable variant.
pub struct VarZeroCow<'a, V: ?Sized> {
    /// Pointer to the underlying bytes.
    ///
    /// # Safety Invariants
    ///
    /// 1. This slice must always be valid as a byte slice
    /// 2. This slice must represent a valid `V`
    /// 3. If `owned` is true, this slice can be freed.
    ///
    /// The slice may NOT have the lifetime of `'a`.
    buf: NonNull<[u8]>,
    /// When true, `buf` is a `Box<[u8]>`
    owned: bool,
    /// Ties the type to both the borrowed (`&'a V`) and the owned (`Box<V>`) form of `V`.
    _phantom: PhantomData<(&'a V, Box<V>)>,
}
46+
47+
// SAFETY: This is mostly just a `Cow<[u8]>` (a pointer to bytes plus an ownership flag and a
// marker), so it is safe to implement Send and Sync on.
// NOTE(review): this relies on the buffer only ever being read through shared access, as in
// `as_bytes()` — confirm whenever a mutating API is added.
48+
unsafe impl<'a, V: ?Sized> Send for VarZeroCow<'a, V> {}
49+
unsafe impl<'a, V: ?Sized> Sync for VarZeroCow<'a, V> {}
50+
51+
impl<'a, V: ?Sized> Clone for VarZeroCow<'a, V> {
52+
fn clone(&self) -> Self {
53+
if self.is_owned() {
54+
// This clones the box
55+
let b: Box<[u8]> = self.as_bytes().into();
56+
let b = ManuallyDrop::new(b);
57+
let buf: NonNull<[u8]> = (&**b).into();
58+
Self {
59+
// Invariants upheld:
60+
// 1 & 2: The bytes came from `self` so they're a valid value and byte slice
61+
// 3: This is owned (we cloned it), so we set owned to true.
62+
buf,
63+
owned: true,
64+
_phantom: PhantomData,
65+
}
66+
} else {
67+
// Unfortunately we can't just use `new_borrowed(self.deref())` since the lifetime is shorter
68+
Self {
69+
// Invariants upheld:
70+
// 1 & 2: The bytes came from `self` so they're a valid value and byte slice
71+
// 3: This is borrowed (we're sharing a borrow), so we set owned to false.
72+
buf: self.buf,
73+
owned: false,
74+
_phantom: PhantomData,
75+
}
76+
}
77+
}
78+
}
79+
80+
impl<'a, V: ?Sized> Drop for VarZeroCow<'a, V> {
81+
fn drop(&mut self) {
82+
if self.owned {
83+
unsafe {
84+
// Safety: (Invariant 3 on buf)
85+
// since owned is true, this is a valid Box<[u8]> and can be cleaned up
86+
let _ = Box::<[u8]>::from_raw(self.buf.as_ptr());
87+
}
88+
}
89+
}
90+
}
91+
92+
impl<'a, V: VarULE + ?Sized> VarZeroCow<'a, V> {
93+
/// Construct from a slice. Errors if the slice doesn't represent a valid `V`
94+
pub fn parse_byte_slice(bytes: &'a [u8]) -> Result<Self, UleError> {
95+
let val = V::parse_byte_slice(bytes)?;
96+
Ok(Self::new_borrowed(val))
97+
}
98+
99+
/// Construct from an owned slice. Errors if the slice doesn't represent a valid `V`
100+
pub fn parse_owned_byte_slice(bytes: Box<[u8]>) -> Result<Self, UleError> {
101+
V::validate_byte_slice(&bytes)?;
102+
let bytes = ManuallyDrop::new(bytes);
103+
let buf: NonNull<[u8]> = (&**bytes).into();
104+
Ok(Self {
105+
// Invariants upheld:
106+
// 1 & 2: The bytes came from `val` so they're a valid value and byte slice
107+
// 3: This is owned, so we set owned to true.
108+
buf,
109+
owned: true,
110+
_phantom: PhantomData,
111+
})
112+
}
113+
114+
/// Construct from a slice that is known to represent a valid `V`
115+
///
116+
/// # Safety
117+
///
118+
/// `bytes` must be a valid `V`, i.e. it must successfully pass through
119+
/// `V::parse_byte_slice()` or `V::validate_byte_slice()`.
120+
pub const unsafe fn from_byte_slice_unchecked(bytes: &'a [u8]) -> Self {
121+
unsafe {
122+
// Safety: bytes is an &T which is always non-null
123+
let buf: NonNull<[u8]> = NonNull::new_unchecked(bytes as *const [u8] as *mut [u8]);
124+
Self {
125+
// Invariants upheld:
126+
// 1 & 2: Passed upstream to caller
127+
// 3: This is borrowed, so we set owned to false.
128+
buf,
129+
owned: false,
130+
_phantom: PhantomData,
131+
}
132+
}
133+
}
134+
135+
/// Construct this from an [`EncodeAsVarULE`] version of the contained type
136+
///
137+
/// Will always construct an owned version
138+
pub fn from_encodeable<E: EncodeAsVarULE<V>>(encodeable: &E) -> Self {
139+
let b = crate::ule::encode_varule_to_box(encodeable);
140+
Self::new_owned(b)
141+
}
142+
143+
/// Construct a new borrowed version of this
144+
pub fn new_borrowed(val: &'a V) -> Self {
145+
unsafe {
146+
// Safety: val is a valid V, by type
147+
Self::from_byte_slice_unchecked(val.as_byte_slice())
148+
}
149+
}
150+
151+
/// Construct a new borrowed version of this
152+
pub fn new_owned(val: Box<V>) -> Self {
153+
let val = ManuallyDrop::new(val);
154+
let buf: NonNull<[u8]> = val.as_byte_slice().into();
155+
Self {
156+
// Invariants upheld:
157+
// 1 & 2: The bytes came from `val` so they're a valid value and byte slice
158+
// 3: This is owned, so we set owned to true.
159+
buf,
160+
owned: true,
161+
_phantom: PhantomData,
162+
}
163+
}
164+
}
165+
166+
impl<'a, V: ?Sized> VarZeroCow<'a, V> {
167+
/// Whether or not this is owned
168+
pub fn is_owned(&self) -> bool {
169+
self.owned
170+
}
171+
172+
/// Get the byte representation of this type
173+
///
174+
/// Is also always a valid `V` and can be passed to
175+
/// `V::from_byte_slice_unchecked()`
176+
pub fn as_bytes(&self) -> &[u8] {
177+
// Safety: Invariant 1 on self.buf
178+
// The valid V invariant comes from Invariant 2
179+
unsafe { self.buf.as_ref() }
180+
}
181+
}
182+
183+
impl<'a, V: VarULE + ?Sized> Deref for VarZeroCow<'a, V> {
    type Target = V;

    /// Reinterprets the stored bytes as a `V`.
    fn deref(&self) -> &V {
        let bytes = self.as_bytes();
        // Safety: invariant 2 on self.buf says these bytes represent a valid `V`
        unsafe { V::from_byte_slice_unchecked(bytes) }
    }
}
190+
191+
impl<'a, V: VarULE + ?Sized> From<&'a V> for VarZeroCow<'a, V> {
192+
fn from(other: &'a V) -> Self {
193+
Self::new_borrowed(other)
194+
}
195+
}
196+
197+
impl<'a, V: VarULE + ?Sized> From<Box<V>> for VarZeroCow<'a, V> {
198+
fn from(other: Box<V>) -> Self {
199+
Self::new_owned(other)
200+
}
201+
}
202+
203+
impl<'a, V: VarULE + ?Sized + fmt::Debug> fmt::Debug for VarZeroCow<'a, V> {
    /// Formats exactly like the contained `V`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let inner: &V = self;
        fmt::Debug::fmt(inner, f)
    }
}
208+
209+
// We need manual impls since `#[derive()]` is disallowed on packed types
210+
impl<'a, V: VarULE + ?Sized + PartialEq> PartialEq for VarZeroCow<'a, V> {
211+
fn eq(&self, other: &Self) -> bool {
212+
self.deref().eq(other.deref())
213+
}
214+
}
215+
216+
impl<'a, V: VarULE + ?Sized + Eq> Eq for VarZeroCow<'a, V> {}
217+
218+
impl<'a, V: VarULE + ?Sized + PartialOrd> PartialOrd for VarZeroCow<'a, V> {
219+
fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
220+
self.deref().partial_cmp(other.deref())
221+
}
222+
}
223+
224+
impl<'a, V: VarULE + ?Sized + Ord> Ord for VarZeroCow<'a, V> {
225+
fn cmp(&self, other: &Self) -> core::cmp::Ordering {
226+
self.deref().cmp(other.deref())
227+
}
228+
}
229+
230+
// # Safety
231+
//
232+
// encode_var_ule_len: Produces the length of the contained bytes, which are known to be a valid V by invariant
233+
//
234+
// encode_var_ule_write: Writes the contained bytes, which are known to be a valid V by invariant
235+
unsafe impl<'a, V: VarULE + ?Sized> EncodeAsVarULE<V> for VarZeroCow<'a, V> {
236+
fn encode_var_ule_as_slices<R>(&self, _: impl FnOnce(&[&[u8]]) -> R) -> R {
237+
// unnecessary if the other two are implemented
238+
unreachable!()
239+
}
240+
241+
#[inline]
242+
fn encode_var_ule_len(&self) -> usize {
243+
self.as_bytes().len()
244+
}
245+
246+
#[inline]
247+
fn encode_var_ule_write(&self, dst: &mut [u8]) {
248+
dst.copy_from_slice(self.as_bytes())
249+
}
250+
}
251+
252+
#[cfg(feature = "serde")]
impl<'a, V: VarULE + ?Sized + serde::Serialize> serde::Serialize for VarZeroCow<'a, V> {
    /// Serializes as raw bytes for machine-readable formats, and through `V`'s own
    /// `Serialize` impl for human-readable ones.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if !serializer.is_human_readable() {
            return serializer.serialize_bytes(self.as_bytes());
        }
        let value: &V = self;
        <V as serde::Serialize>::serialize(value, serializer)
    }
}
265+
266+
#[cfg(feature = "serde")]
impl<'a, 'de: 'a, V: VarULE + ?Sized> serde::Deserialize<'de> for VarZeroCow<'a, V>
where
    Box<V>: serde::Deserialize<'de>,
{
    /// Machine-readable formats yield a borrowed value parsed from the input bytes;
    /// human-readable formats go through `Box<V>` and yield an owned value.
    fn deserialize<Des>(deserializer: Des) -> Result<Self, Des::Error>
    where
        Des: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            let bytes = <&[u8]>::deserialize(deserializer)?;
            return Self::parse_byte_slice(bytes).map_err(serde::de::Error::custom);
        }
        let boxed = Box::<V>::deserialize(deserializer)?;
        Ok(Self::new_owned(boxed))
    }
}
284+
285+
#[cfg(feature = "databake")]
impl<'a, V: VarULE + ?Sized> databake::Bake for VarZeroCow<'a, V> {
    /// Emits an expression that rebuilds this value from its baked byte slice.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        env.insert("zerovec");
        let byte_tokens = self.as_bytes().bake(env);
        databake::quote! {
            // Safety: self.as_bytes() is always a valid V, so the baked bytes are too
            unsafe {
                zerovec::VarZeroCow::from_byte_slice_unchecked(#byte_tokens)
            }
        }
    }
}
298+
299+
#[cfg(feature = "databake")]
impl<'a, V: VarULE + ?Sized> databake::BakeSize for VarZeroCow<'a, V> {
    /// Reports the length in bytes of the contained buffer.
    fn borrows_size(&self) -> usize {
        let bytes = self.as_bytes();
        bytes.len()
    }
}
305+
306+
impl<'a, V: VarULE + ?Sized> ZeroFrom<'a, V> for VarZeroCow<'a, V> {
307+
#[inline]
308+
fn zero_from(other: &'a V) -> Self {
309+
Self::new_borrowed(other)
310+
}
311+
}
312+
313+
impl<'a, 'b, V: VarULE + ?Sized> ZeroFrom<'a, VarZeroCow<'b, V>> for VarZeroCow<'a, V> {
    /// Borrows the `V` that `other` derefs to; the result is always borrowed.
    #[inline]
    fn zero_from(other: &'a VarZeroCow<'b, V>) -> Self {
        let inner: &'a V = other;
        Self::new_borrowed(inner)
    }
}
319+
320+
#[cfg(test)]
mod tests {
    use super::VarZeroCow;
    use crate::ule::tuplevar::Tuple3VarULE;
    use crate::vecs::VarZeroSlice;

    /// Builds an owned `VarZeroCow` from a tuple, checks field access, and (with `serde`)
    /// checks that it roundtrips through bincode and JSON.
    #[test]
    fn test_cow_roundtrip() {
        type Messy = Tuple3VarULE<str, [u8], VarZeroSlice<str>>;

        let words = vec!["one", "two", "three"];
        let source = ("hello", &b"g\xFF\xFFdbye"[..], words);
        let messy: VarZeroCow<Messy> = VarZeroCow::from_encodeable(&source);

        assert_eq!(messy.a(), "hello");
        assert_eq!(messy.b(), b"g\xFF\xFFdbye");
        assert_eq!(&messy.c()[1], "two");

        #[cfg(feature = "serde")]
        {
            // Machine-readable: the deserialized value borrows from the encoded buffer.
            let encoded = bincode::serialize(&messy).unwrap();
            let from_bincode: VarZeroCow<Messy> = bincode::deserialize(&encoded).unwrap();
            assert_eq!(
                messy, from_bincode,
                "Single element roundtrips with bincode"
            );
            assert!(!from_bincode.is_owned());

            // Human-readable: goes through the serde impls of the contained type.
            let text = serde_json::to_string(&messy).unwrap();
            let from_json: VarZeroCow<Messy> = serde_json::from_str(&text).unwrap();
            assert_eq!(messy, from_json, "Single element roundtrips with serde");
        }
    }
}

utils/zerovec/src/lib.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@
213213

214214
extern crate alloc;
215215

216+
mod cow;
216217
#[cfg(feature = "hashmap")]
217218
pub mod hashmap;
218219
mod map;
@@ -225,11 +226,11 @@ mod zerovec;
225226
// This must be after `mod zerovec` for some impls on `ZeroSlice<RawBytesULE>`
226227
// to show up in the right spot in the docs
227228
pub mod ule;
228-
229229
#[cfg(feature = "yoke")]
230230
mod yoke_impls;
231231
mod zerofrom_impls;
232232

233+
pub use crate::cow::VarZeroCow;
233234
#[cfg(feature = "hashmap")]
234235
pub use crate::hashmap::ZeroHashMap;
235236
pub use crate::map::map::ZeroMap;

0 commit comments

Comments
 (0)