Skip to content

Commit 3b784a4

Browse files
authored
fix(embedded): Handle more parsing corner cases (#15187)
### What does this PR try to resolve?

This is part of #12207. I found these corner cases while implementing frontmatter support within rustc.

I'll likely do another pass when I finish rustc support to:
- Unify tests between cargo and rustc
- Improve error messages

### How should we test and review this PR?

### Additional information
2 parents 0a4aff2 + fd2000b commit 3b784a4

File tree

1 file changed

+118
-33
lines changed

1 file changed

+118
-33
lines changed

src/cargo/util/toml/embedded.rs

+118-33
Original file line numberDiff line numberDiff line change
@@ -140,44 +140,28 @@ impl<'s> ScriptSource<'s> {
140140
content: input,
141141
};
142142

143-
// See rust-lang/rust's compiler/rustc_lexer/src/lib.rs's `strip_shebang`
144-
// Shebang must start with `#!` literally, without any preceding whitespace.
145-
// For simplicity we consider any line starting with `#!` a shebang,
146-
// regardless of restrictions put on shebangs by specific platforms.
147-
if let Some(rest) = source.content.strip_prefix("#!") {
148-
// Ok, this is a shebang but if the next non-whitespace token is `[`,
149-
// then it may be valid Rust code, so consider it Rust code.
150-
//
151-
// NOTE: rustc considers line and block comments to be whitespace but to avoid
152-
// any more awareness of Rust grammar, we are excluding it.
153-
if rest.trim_start().starts_with('[') {
154-
return Ok(source);
155-
}
156-
157-
// No other choice than to consider this a shebang.
158-
let newline_end = source
159-
.content
160-
.find('\n')
161-
.map(|pos| pos + 1)
162-
.unwrap_or(source.content.len());
163-
let (shebang, content) = source.content.split_at(newline_end);
143+
if let Some(shebang_end) = strip_shebang(source.content) {
144+
let (shebang, content) = source.content.split_at(shebang_end);
164145
source.shebang = Some(shebang);
165146
source.content = content;
166147
}
167148

168149
const FENCE_CHAR: char = '-';
169150

170-
let mut trimmed_content = source.content;
171-
while !trimmed_content.is_empty() {
172-
let c = trimmed_content;
173-
let c = c.trim_start_matches([' ', '\t']);
174-
let c = c.trim_start_matches(['\r', '\n']);
175-
if c == trimmed_content {
151+
let mut rest = source.content;
152+
while !rest.is_empty() {
153+
let without_spaces = rest.trim_start_matches([' ', '\t']);
154+
let without_nl = without_spaces.trim_start_matches(['\r', '\n']);
155+
if without_nl == rest {
156+
// nothing trimmed
176157
break;
158+
} else if without_nl == without_spaces {
159+
// frontmatter must come after a newline
160+
return Ok(source);
177161
}
178-
trimmed_content = c;
162+
rest = without_nl;
179163
}
180-
let fence_end = trimmed_content
164+
let fence_end = rest
181165
.char_indices()
182166
.find_map(|(i, c)| (c != FENCE_CHAR).then_some(i))
183167
.unwrap_or(source.content.len());
@@ -190,20 +174,21 @@ impl<'s> ScriptSource<'s> {
190174
"found {fence_end} `{FENCE_CHAR}` in rust frontmatter, expected at least 3"
191175
)
192176
}
193-
_ => trimmed_content.split_at(fence_end),
177+
_ => rest.split_at(fence_end),
194178
};
179+
let nl_fence_pattern = format!("\n{fence_pattern}");
195180
let (info, content) = rest.split_once("\n").unwrap_or((rest, ""));
196181
let info = info.trim();
197182
if !info.is_empty() {
198183
source.info = Some(info);
199184
}
200185
source.content = content;
201186

202-
let Some((frontmatter, content)) = source.content.split_once(fence_pattern) else {
187+
let Some(frontmatter_nl) = source.content.find(&nl_fence_pattern) else {
203188
anyhow::bail!("no closing `{fence_pattern}` found for frontmatter");
204189
};
205-
source.frontmatter = Some(frontmatter);
206-
source.content = content;
190+
source.frontmatter = Some(&source.content[..frontmatter_nl + 1]);
191+
source.content = &source.content[frontmatter_nl + nl_fence_pattern.len()..];
207192

208193
let (line, content) = source
209194
.content
@@ -235,6 +220,26 @@ impl<'s> ScriptSource<'s> {
235220
}
236221
}
237222

223+
/// Locates a leading shebang line in `input`.
///
/// Returns `Some(end)`, where `end` is the byte offset just past the first `\n`
/// (or `input.len()` when there is no newline), so that `input.split_at(end)`
/// yields `(shebang_line, remaining_source)`.
///
/// Returns `None` when `input` does not start with `#!`, or when the first
/// non-whitespace character after `#!` is `[`.
fn strip_shebang(input: &str) -> Option<usize> {
    // See rust-lang/rust's compiler/rustc_lexer/src/lib.rs's `strip_shebang`
    // Shebang must start with `#!` literally, without any preceding whitespace.
    // For simplicity we consider any line starting with `#!` a shebang,
    // regardless of restrictions put on shebangs by specific platforms.
    let rest = input.strip_prefix("#!")?;

    // Ok, this is a shebang but if the next non-whitespace token is `[`,
    // then it may be valid Rust code, so consider it Rust code.
    //
    // NOTE: rustc considers line and block comments to be whitespace but to avoid
    // any more awareness of Rust grammar, we are excluding it.
    if rest.trim_start().starts_with('[') {
        return None;
    }

    // No other choice than to consider this a shebang.
    // The shebang line includes its terminating newline when one is present.
    let newline_end = input.find('\n').map_or(input.len(), |pos| pos + 1);
    Some(newline_end)
}
242+
238243
#[cfg(test)]
239244
mod test_expand {
240245
use snapbox::assert_data_eq;
@@ -466,6 +471,86 @@ fn main() {}
466471
);
467472
}
468473

474+
#[test]
475+
fn split_indent() {
476+
assert_source(
477+
r#"#!/usr/bin/env cargo
478+
---
479+
[dependencies]
480+
time="0.1.25"
481+
----
482+
483+
fn main() {}
484+
"#,
485+
str![[r##"
486+
shebang: "#!/usr/bin/env cargo\n"
487+
info: None
488+
frontmatter: None
489+
content: " ---\n [dependencies]\n time=\"0.1.25\"\n ----\n\nfn main() {}\n"
490+
491+
"##]],
492+
);
493+
}
494+
495+
#[test]
496+
fn split_escaped() {
497+
assert_source(
498+
r#"#!/usr/bin/env cargo
499+
-----
500+
---
501+
---
502+
-----
503+
504+
fn main() {}
505+
"#,
506+
str![[r##"
507+
shebang: "#!/usr/bin/env cargo\n"
508+
info: None
509+
frontmatter: "---\n---\n"
510+
content: "\nfn main() {}\n"
511+
512+
"##]],
513+
);
514+
}
515+
516+
#[test]
517+
fn split_invalid_escaped() {
518+
assert_err(
519+
ScriptSource::parse(
520+
r#"#!/usr/bin/env cargo
521+
---
522+
-----
523+
-----
524+
---
525+
526+
fn main() {}
527+
"#,
528+
),
529+
str!["unexpected trailing content on closing fence: `--`"],
530+
);
531+
}
532+
533+
#[test]
534+
fn split_dashes_in_body() {
535+
assert_source(
536+
r#"#!/usr/bin/env cargo
537+
---
538+
Hello---
539+
World
540+
---
541+
542+
fn main() {}
543+
"#,
544+
str![[r##"
545+
shebang: "#!/usr/bin/env cargo\n"
546+
info: None
547+
frontmatter: "Hello---\nWorld\n"
548+
content: "\nfn main() {}\n"
549+
550+
"##]],
551+
);
552+
}
553+
469554
#[test]
470555
fn split_mismatched_dashes() {
471556
assert_err(

0 commit comments

Comments
 (0)