Skip to content

Commit 01c734a

Browse files
committed
Split yaml.rs into sizeable files.
1 parent 3c9d762 commit 01c734a

File tree

5 files changed

+532
-520
lines changed

5 files changed

+532
-520
lines changed

examples/dump_yaml.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use saphyr::yaml;
1+
use saphyr::{Yaml, YamlLoader};
22
use std::env;
33
use std::fs::File;
44
use std::io::prelude::*;
@@ -9,14 +9,14 @@ fn print_indent(indent: usize) {
99
}
1010
}
1111

12-
fn dump_node(doc: &yaml::Yaml, indent: usize) {
12+
fn dump_node(doc: &Yaml, indent: usize) {
1313
match *doc {
14-
yaml::Yaml::Array(ref v) => {
14+
Yaml::Array(ref v) => {
1515
for x in v {
1616
dump_node(x, indent + 1);
1717
}
1818
}
19-
yaml::Yaml::Hash(ref h) => {
19+
Yaml::Hash(ref h) => {
2020
for (k, v) in h {
2121
print_indent(indent);
2222
println!("{k:?}:");
@@ -36,7 +36,7 @@ fn main() {
3636
let mut s = String::new();
3737
f.read_to_string(&mut s).unwrap();
3838

39-
let docs = yaml::YamlLoader::load_from_str(&s).unwrap();
39+
let docs = YamlLoader::load_from_str(&s).unwrap();
4040
for doc in &docs {
4141
println!("---");
4242
dump_node(doc, 0);

src/encoding.rs

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
//! Encoding utilities. Available only with the `encoding` feature.
2+
3+
use std::{borrow::Cow, ops::ControlFlow};
4+
5+
use encoding_rs::{Decoder, DecoderResult, Encoding};
6+
7+
use crate::{loader::LoadError, Yaml, YamlLoader};
8+
9+
/// The signature of the function to call when using [`YAMLDecodingTrap::Call`].
10+
///
11+
/// The arguments are as follows:
12+
/// * `malformation_length`: The length of the sequence the decoder failed to decode.
13+
/// * `bytes_read_after_malformation`: The number of lookahead bytes the decoder consumed after
14+
/// the malformation.
15+
/// * `input_at_malformation`: The input buffer, starting at the malformation.
16+
/// The first `malformation_length` bytes are the problematic sequence. The following
17+
/// `bytes_read_after_malformation` bytes are already stored in the decoder and will not be
18+
/// re-fed.
19+
/// * `output`: The output string.
20+
///
21+
/// The function must modify `output` as it sees fit. For instance, one could recreate the
22+
/// behavior of [`YAMLDecodingTrap::Ignore`] with an empty function, [`YAMLDecodingTrap::Replace`]
23+
/// by pushing a `\u{FFFD}` into `output` and [`YAMLDecodingTrap::Strict`] by returning
24+
/// [`ControlFlow::Break`].
25+
///
26+
/// # Returns
27+
/// The function must return [`ControlFlow::Continue`] if decoding may continue or
28+
/// [`ControlFlow::Break`] if decoding must be aborted. An optional error string may be supplied.
29+
pub type YAMLDecodingTrapFn = fn(
30+
malformation_length: u8,
31+
bytes_read_after_malformation: u8,
32+
input_at_malformation: &[u8],
33+
output: &mut String,
34+
) -> ControlFlow<Cow<'static, str>>;
35+
36+
/// The behavior [`YamlDecoder`] must have when a decoding error occurs.
37+
#[derive(Copy, Clone, PartialEq, Eq)]
38+
pub enum YAMLDecodingTrap {
39+
/// Ignore the offending bytes, leaving them out of the output.
40+
Ignore,
41+
/// Abort decoding and return an error describing the offending byte sequence.
42+
Strict,
43+
/// Replace the offending bytes with the Unicode REPLACEMENT CHARACTER (`U+FFFD`).
44+
Replace,
45+
/// Call the user-supplied function upon decoding malformation.
46+
Call(YAMLDecodingTrapFn),
47+
}
48+
49+
/// `YamlDecoder` is a `YamlLoader` builder that allows you to supply your own encoding error trap.
50+
/// For example, to read a YAML file while ignoring Unicode decoding errors you can set the
51+
/// `encoding_trap` to [`YAMLDecodingTrap::Ignore`].
52+
/// ```rust
53+
/// use saphyr::{YamlDecoder, YAMLDecodingTrap};
54+
///
55+
/// let string = b"---
56+
/// a\xa9: 1
57+
/// b: 2.2
58+
/// c: [1, 2]
59+
/// ";
60+
/// let out = YamlDecoder::read(string as &[u8])
61+
/// .encoding_trap(YAMLDecodingTrap::Ignore)
62+
/// .decode()
63+
/// .unwrap();
64+
/// ```
65+
pub struct YamlDecoder<T: std::io::Read> {
66+
/// The input stream.
67+
source: T,
68+
/// The behavior to adopt when encountering a malformed encoding.
69+
trap: YAMLDecodingTrap,
70+
}
71+
72+
impl<T: std::io::Read> YamlDecoder<T> {
73+
/// Create a `YamlDecoder` decoding the given source.
74+
pub fn read(source: T) -> YamlDecoder<T> {
75+
YamlDecoder {
76+
source,
77+
trap: YAMLDecodingTrap::Strict,
78+
}
79+
}
80+
81+
/// Set the behavior of the decoder when the encoding is invalid.
82+
pub fn encoding_trap(&mut self, trap: YAMLDecodingTrap) -> &mut Self {
83+
self.trap = trap;
84+
self
85+
}
86+
87+
/// Run the decode operation with the source and trap the `YamlDecoder` was built with.
88+
///
89+
/// # Errors
90+
/// Returns `LoadError` when decoding fails.
91+
pub fn decode(&mut self) -> Result<Vec<Yaml>, LoadError> {
92+
let mut buffer = Vec::new();
93+
self.source.read_to_end(&mut buffer)?;
94+
95+
// Check if the `encoding` library can detect encoding from the BOM, otherwise use
96+
// `detect_utf16_endianness`.
97+
let (encoding, _) =
98+
Encoding::for_bom(&buffer).unwrap_or_else(|| (detect_utf16_endianness(&buffer), 2));
99+
let mut decoder = encoding.new_decoder();
100+
let mut output = String::new();
101+
102+
// Decode the input buffer.
103+
decode_loop(&buffer, &mut output, &mut decoder, self.trap)?;
104+
105+
YamlLoader::load_from_str(&output).map_err(LoadError::Scan)
106+
}
107+
}
108+
109+
/// Perform a loop of [`Decoder::decode_to_string`], reallocating `output` if needed.
110+
fn decode_loop(
111+
input: &[u8],
112+
output: &mut String,
113+
decoder: &mut Decoder,
114+
trap: YAMLDecodingTrap,
115+
) -> Result<(), LoadError> {
116+
use crate::loader::LoadError;
117+
118+
output.reserve(input.len());
119+
let mut total_bytes_read = 0;
120+
121+
loop {
122+
match decoder.decode_to_string_without_replacement(&input[total_bytes_read..], output, true)
123+
{
124+
// If the input is empty, we processed the whole input.
125+
(DecoderResult::InputEmpty, _) => break Ok(()),
126+
// If the output is full, we must reallocate.
127+
(DecoderResult::OutputFull, bytes_read) => {
128+
total_bytes_read += bytes_read;
129+
// The output is already reserved to the size of the input. We slowly resize. Here,
130+
// we're expecting that 10% of bytes will double in size when converting to UTF-8.
131+
output.reserve(input.len() / 10);
132+
}
133+
(DecoderResult::Malformed(malformed_len, bytes_after_malformed), bytes_read) => {
134+
total_bytes_read += bytes_read;
135+
match trap {
136+
// Ignore (skip over) malformed character.
137+
YAMLDecodingTrap::Ignore => {}
138+
// Replace them with the Unicode REPLACEMENT CHARACTER.
139+
YAMLDecodingTrap::Replace => {
140+
output.push('\u{FFFD}');
141+
}
142+
// Otherwise error, getting as much context as possible.
143+
YAMLDecodingTrap::Strict => {
144+
let malformed_len = malformed_len as usize;
145+
let bytes_after_malformed = bytes_after_malformed as usize;
146+
let byte_idx = total_bytes_read - (malformed_len + bytes_after_malformed);
147+
let malformed_sequence = &input[byte_idx..byte_idx + malformed_len];
148+
149+
break Err(LoadError::Decode(Cow::Owned(format!(
150+
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
151+
))));
152+
}
153+
YAMLDecodingTrap::Call(callback) => {
154+
let byte_idx =
155+
total_bytes_read - ((malformed_len + bytes_after_malformed) as usize);
156+
let malformed_sequence =
157+
&input[byte_idx..byte_idx + malformed_len as usize];
158+
if let ControlFlow::Break(error) = callback(
159+
malformed_len,
160+
bytes_after_malformed,
161+
&input[byte_idx..],
162+
output,
163+
) {
164+
if error.is_empty() {
165+
break Err(LoadError::Decode(Cow::Owned(format!(
166+
"Invalid character sequence at {byte_idx}: {malformed_sequence:?}",
167+
))));
168+
}
169+
break Err(LoadError::Decode(error));
170+
}
171+
}
172+
}
173+
}
174+
}
175+
}
176+
}
177+
178+
/// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the
179+
/// bytestream starts with BOM codepoint.
180+
/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since
181+
/// in the general case the bytestream could start with a codepoint that uses both bytes.
182+
///
183+
/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
184+
/// This allows the encoding to be deduced by the pattern of null (#x00) characters.
185+
//
186+
/// See spec at <https://yaml.org/spec/1.2/spec.html#id2771184>
187+
fn detect_utf16_endianness(b: &[u8]) -> &'static Encoding {
188+
if b.len() > 1 && (b[0] != b[1]) {
189+
if b[0] == 0 {
190+
return encoding_rs::UTF_16BE;
191+
} else if b[1] == 0 {
192+
return encoding_rs::UTF_16LE;
193+
}
194+
}
195+
encoding_rs::UTF_8
196+
}
197+
198+
// Only build the tests (and their imports) for test runs.
#[cfg(test)]
mod test {
    use super::{YAMLDecodingTrap, Yaml, YamlDecoder};

    /// A UTF-8 BOM must be recognized and stripped before parsing.
    #[test]
    fn test_read_bom() {
        let s = b"\xef\xbb\xbf---
a: 1
b: 2.2
c: [1, 2]
";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-16LE input announced by its BOM.
    #[test]
    fn test_read_utf16le() {
        let s = b"\xff\xfe-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        // Compare the absolute difference: without `.abs()` any value below 2.2 would pass.
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-16BE input announced by its BOM.
    #[test]
    fn test_read_utf16be() {
        let s = b"\xfe\xff\x00-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// UTF-16LE input without a BOM, detected from the position of the null bytes.
    #[test]
    fn test_read_utf16le_nobom() {
        let s = b"-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
        let out = YamlDecoder::read(s as &[u8]).decode().unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    /// An invalid UTF-8 byte is dropped when the `Ignore` trap is selected.
    #[test]
    fn test_read_trap() {
        let s = b"---
a\xa9: 1
b: 2.2
c: [1, 2]
";
        let out = YamlDecoder::read(s as &[u8])
            .encoding_trap(YAMLDecodingTrap::Ignore)
            .decode()
            .unwrap();
        let doc = &out[0];
        println!("GOT: {doc:?}");
        assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
        assert!((doc["b"].as_f64().unwrap() - 2.2f64).abs() <= f64::EPSILON);
        assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
        assert!(doc["d"][0].is_badvalue());
    }

    #[test]
    fn test_or() {
        assert_eq!(Yaml::Null.or(Yaml::Integer(3)), Yaml::Integer(3));
        assert_eq!(Yaml::Integer(3).or(Yaml::Integer(7)), Yaml::Integer(3));
    }
}

src/lib.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,20 @@
4343
4444
#![warn(missing_docs, clippy::pedantic)]
4545

46-
pub(crate) mod char_traits;
47-
pub mod emitter;
48-
pub mod yaml;
46+
mod char_traits;
47+
mod emitter;
48+
mod loader;
49+
mod yaml;
4950

5051
// Re-export main components.
5152
pub use crate::emitter::YamlEmitter;
52-
pub use crate::yaml::{Array, Hash, Yaml, YamlLoader};
53+
pub use crate::loader::YamlLoader;
54+
pub use crate::yaml::{Array, Hash, Yaml};
5355

5456
#[cfg(feature = "encoding")]
55-
pub use crate::yaml::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder};
57+
mod encoding;
58+
#[cfg(feature = "encoding")]
59+
pub use crate::encoding::{YAMLDecodingTrap, YAMLDecodingTrapFn, YamlDecoder};
5660

5761
// Re-export `ScanError` as it is used as part of our public API and we want consumers to be able
5862
// to inspect it (e.g. perform a `match`). They wouldn't be able without it.

0 commit comments

Comments
 (0)