Commit c7abab2

perf: remove heap allocation in parse_query (#1020)
* perf: remove heap allocation in parse_query
* faster by 26ns
* update comment
* 3ns slower, but reduce overboard repetition
* Add bench for with a fragment
* Improve parse_fragment performance too
* rename bench to just fragment
* move ascii_tab_or_newline check into macro
1 parent ffca1ef commit c7abab2

File tree: 2 files changed, +146 -46 lines changed

  url/benches/parse_url.rs
  url/src/parser.rs
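Taken together with the diff below, the change is: parse_query previously copied the query into a temporary String one character at a time and only then percent-encoded it, whereas the new code walks the input as &str slices between skipped characters and percent-encodes each slice straight into the serialization buffer, so no intermediate String is allocated. A minimal standalone sketch of that chunking idea, using only std and simplified names (no percent-encoding, none of the crate's types):

// Sketch only: copy `input` into `out`, skipping ASCII tab/newline characters,
// by appending whole clean chunks instead of pushing one char at a time.
fn copy_skipping_tab_or_newline(input: &str, out: &mut String) {
    let mut rest = input;
    loop {
        match rest.find(|c| matches!(c, '\t' | '\n' | '\r')) {
            Some(i) => {
                out.push_str(&rest[..i]); // the whole clean chunk at once
                rest = &rest[i + 1..];    // skip the single one-byte ignored char
            }
            None => {
                out.push_str(rest); // no more ignored chars: copy the tail
                return;
            }
        }
    }
}

fn main() {
    let mut out = String::new();
    copy_skipping_tab_or_newline("tre=es&\tst=uff\n", &mut out);
    assert_eq!(out, "tre=es&st=uff");
}

Per the commit bullets, parse_fragment gets the same treatment further down.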

url/benches/parse_url.rs

Lines changed: 8 additions & 0 deletions
@@ -19,6 +19,13 @@ fn long(bench: &mut Bencher) {
     bench.iter(|| black_box(url).parse::<Url>().unwrap());
 }
 
+fn fragment(bench: &mut Bencher) {
+    let url = "https://example.com/parkbench?tre=es&st=uff#fragment";
+
+    bench.bytes = url.len() as u64;
+    bench.iter(|| black_box(url).parse::<Url>().unwrap());
+}
+
 fn plain(bench: &mut Bencher) {
     let url = "https://example.com/";
 
@@ -86,6 +93,7 @@ benchmark_group!(
     benches,
     short,
     long,
+    fragment,
     plain,
     hyphen,
     leading_digit,
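For context, the benchmark added above follows the bencher-style harness already visible in this file (Bencher, black_box, benchmark_group!). A self-contained sketch of a complete benchmark file in that style, assuming the `bencher` dev-dependency and a dependency on `url`; the group name and URL here are only illustrative:

// Sketch of a stand-alone bencher-style benchmark file, mirroring the pattern
// used in parse_url.rs above. Assumes the `bencher` and `url` crates.
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
use url::Url;

fn fragment(bench: &mut Bencher) {
    let url = "https://example.com/parkbench?tre=es&st=uff#fragment";

    // Setting `bytes` makes the harness report throughput alongside ns/iter.
    bench.bytes = url.len() as u64;
    bench.iter(|| black_box(url).parse::<Url>().unwrap());
}

benchmark_group!(benches, fragment);
benchmark_main!(benches);

Benchmarks like this are typically run with `cargo bench` (or `cargo bench --bench parse_url` to select just this file).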

url/src/parser.rs

Lines changed: 138 additions & 46 deletions
@@ -73,6 +73,12 @@ macro_rules! simple_enum_error {
     }
 }
 
+macro_rules! ascii_tab_or_new_line_pattern {
+    () => {
+        '\t' | '\n' | '\r'
+    };
+}
+
 #[cfg(feature = "std")]
 impl std::error::Error for ParseError {}

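The new macro expands to a pattern rather than an expression, which is what lets the repeated '\t' | '\n' | '\r' alternatives be written once and reused both in match arms and inside matches! in the later hunks. A tiny standalone illustration of a macro used in pattern position:

// Standalone illustration: a macro_rules! macro whose expansion is a pattern
// can be invoked directly in match arms and inside matches!().
macro_rules! ascii_tab_or_new_line_pattern {
    () => {
        '\t' | '\n' | '\r'
    };
}

fn is_ignored(c: char) -> bool {
    match c {
        ascii_tab_or_new_line_pattern!() => true,
        _ => false,
    }
}

fn main() {
    assert!(is_ignored('\n'));
    assert!(!is_ignored('a'));
    assert!(matches!('\t', ascii_tab_or_new_line_pattern!()));
}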
@@ -207,7 +213,7 @@ impl<'i> Input<'i> {
             if input.len() < original_input.len() {
                 vfn(SyntaxViolation::C0SpaceIgnored)
             }
-            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
+            if input.chars().any(ascii_tab_or_new_line) {
                 vfn(SyntaxViolation::TabOrNewlineIgnored)
             }
         }
@@ -225,7 +231,7 @@ impl<'i> Input<'i> {
             if input.len() < original_input.len() {
                 vfn(SyntaxViolation::C0SpaceIgnored)
             }
-            if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
+            if input.chars().any(ascii_tab_or_new_line) {
                 vfn(SyntaxViolation::TabOrNewlineIgnored)
             }
         }
@@ -281,7 +287,7 @@ impl<'i> Input<'i> {
             let utf8 = self.chars.as_str();
             match self.chars.next() {
                 Some(c) => {
-                    if !matches!(c, '\t' | '\n' | '\r') {
+                    if !ascii_tab_or_new_line(c) {
                         return Some((c, &utf8[..c.len_utf8()]));
                     }
                 }
@@ -321,9 +327,7 @@ impl<F: FnMut(char) -> bool> Pattern for F {
 impl Iterator for Input<'_> {
     type Item = char;
     fn next(&mut self) -> Option<char> {
-        self.chars
-            .by_ref()
-            .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
+        self.chars.by_ref().find(|&c| !ascii_tab_or_new_line(c))
     }
 }
 
@@ -995,7 +999,7 @@ impl<'a> Parser<'a> {
                 ':' if !inside_square_brackets => break,
                 '\\' if scheme_type.is_special() => break,
                 '/' | '?' | '#' => break,
-                '\t' | '\n' | '\r' => {
+                ascii_tab_or_new_line_pattern!() => {
                     has_ignored_chars = true;
                 }
                 '[' => {
@@ -1077,7 +1081,7 @@ impl<'a> Parser<'a> {
         for c in input_str.chars() {
             match c {
                 '/' | '\\' | '?' | '#' => break,
-                '\t' | '\n' | '\r' => has_ignored_chars = true,
+                ascii_tab_or_new_line_pattern!() => has_ignored_chars = true,
                 _ => non_ignored_chars += 1,
             }
             bytes += c.len_utf8();
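The Iterator impl for Input above now funnels through the same helper: find() on a by_ref()'d Chars advances the underlying iterator past any skipped characters and yields the first character that is not an ASCII tab or newline. A small standalone check of that behaviour:

// Standalone check (std only) of the skip-on-next behaviour used by Input's
// Iterator impl: find() consumes the skipped characters and the returned one.
fn ascii_tab_or_new_line(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r')
}

fn main() {
    let mut chars = "\t\na=b".chars();
    let first = chars.by_ref().find(|&c| !ascii_tab_or_new_line(c));
    assert_eq!(first, Some('a'));
    assert_eq!(chars.as_str(), "=b"); // '\t', '\n' and 'a' have been consumed
}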
@@ -1473,37 +1477,81 @@ impl<'a> Parser<'a> {
         &mut self,
         scheme_type: SchemeType,
         scheme_end: u32,
-        mut input: Input<'i>,
+        input: Input<'i>,
     ) -> Option<Input<'i>> {
-        let len = input.chars.as_str().len();
-        let mut query = String::with_capacity(len); // FIXME: use a streaming decoder instead
-        let mut remaining = None;
-        while let Some(c) = input.next() {
-            if c == '#' && self.context == Context::UrlParser {
-                remaining = Some(input);
-                break;
-            } else {
-                self.check_url_code_point(c, &input);
-                query.push(c);
+        struct QueryPartIter<'i, 'p> {
+            is_url_parser: bool,
+            input: Input<'i>,
+            violation_fn: Option<&'p dyn Fn(SyntaxViolation)>,
+        }
+
+        impl<'i> Iterator for QueryPartIter<'i, '_> {
+            type Item = (&'i str, bool);
+
+            fn next(&mut self) -> Option<Self::Item> {
+                let start = self.input.chars.as_str();
+                // bypass self.input.next() in order to get string slices
+                // which are faster to operate on
+                while let Some(c) = self.input.chars.next() {
+                    match c {
+                        ascii_tab_or_new_line_pattern!() => {
+                            return Some((
+                                &start[..start.len() - self.input.chars.as_str().len() - 1],
+                                false,
+                            ));
+                        }
+                        '#' if self.is_url_parser => {
+                            return Some((
+                                &start[..start.len() - self.input.chars.as_str().len() - 1],
+                                true,
+                            ));
+                        }
+                        c => {
+                            if let Some(vfn) = &self.violation_fn {
+                                check_url_code_point(vfn, c, &self.input);
+                            }
+                        }
+                    }
+                }
+                if start.is_empty() {
+                    None
+                } else {
+                    Some((start, false))
+                }
             }
         }
 
-        let encoding = match &self.serialization[..scheme_end as usize] {
-            "http" | "https" | "file" | "ftp" => self.query_encoding_override,
-            _ => None,
-        };
-        let query_bytes = if let Some(o) = encoding {
-            o(&query)
-        } else {
-            query.as_bytes().into()
+        let mut part_iter = QueryPartIter {
+            is_url_parser: self.context == Context::UrlParser,
+            input,
+            violation_fn: self.violation_fn,
         };
         let set = if scheme_type.is_special() {
             SPECIAL_QUERY
         } else {
             QUERY
         };
-        self.serialization.extend(percent_encode(&query_bytes, set));
-        remaining
+        let query_encoding_override = self.query_encoding_override.filter(|_| {
+            matches!(
+                &self.serialization[..scheme_end as usize],
+                "http" | "https" | "file" | "ftp"
+            )
+        });
+
+        while let Some((part, is_finished)) = part_iter.next() {
+            match query_encoding_override {
+                // slightly faster to be repetitive and not convert text to Cow
+                Some(o) => self.serialization.extend(percent_encode(&o(part), set)),
+                None => self
+                    .serialization
+                    .extend(percent_encode(part.as_bytes(), set)),
+            }
+            if is_finished {
+                return Some(part_iter.input);
+            }
+        }
+
+        None
     }
 
     fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
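The only non-obvious part of QueryPartIter is the slice arithmetic: `start` is the remaining input captured at the top of next(), and once chars.next() has consumed a delimiter, `start.len() - self.input.chars.as_str().len() - 1` is the byte length of the clean chunk, the trailing `- 1` accounting for the one-byte '\t', '\n', '\r' or '#' just consumed. A standalone check of that arithmetic on a plain std Chars iterator:

// Standalone check of the length arithmetic used above: the chunk before the
// consumed one-byte delimiter is &start[..start.len() - remaining_after.len() - 1].
fn main() {
    let start = "tre=es&st=uff#fragment";
    let mut chars = start.chars();
    while let Some(c) = chars.next() {
        if c == '#' {
            let part = &start[..start.len() - chars.as_str().len() - 1];
            assert_eq!(part, "tre=es&st=uff");      // everything before the '#'
            assert_eq!(chars.as_str(), "fragment"); // what is left to parse
            break;
        }
    }
}

FragmentPartIter in the next hunk relies on the same arithmetic.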
@@ -1526,31 +1574,75 @@ impl<'a> Parser<'a> {
         })
     }
 
-    pub fn parse_fragment(&mut self, mut input: Input<'_>) {
-        while let Some((c, utf8_c)) = input.next_utf8() {
-            if c == '\0' {
-                self.log_violation(SyntaxViolation::NullInFragment)
-            } else {
-                self.check_url_code_point(c, &input);
+    pub fn parse_fragment(&mut self, input: Input<'_>) {
+        struct FragmentPartIter<'i, 'p> {
+            input: Input<'i>,
+            violation_fn: Option<&'p dyn Fn(SyntaxViolation)>,
+        }
+
+        impl<'i> Iterator for FragmentPartIter<'i, '_> {
+            type Item = &'i str;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                let start = self.input.chars.as_str();
+                // bypass self.input.next() in order to get string slices
+                // which are faster to operate on
+                while let Some(c) = self.input.chars.next() {
+                    match c {
+                        ascii_tab_or_new_line_pattern!() => {
+                            return Some(
+                                &start[..start.len() - self.input.chars.as_str().len() - 1],
+                            );
+                        }
+                        '\0' => {
+                            if let Some(vfn) = &self.violation_fn {
+                                vfn(SyntaxViolation::NullInFragment);
+                            }
+                        }
+                        c => {
+                            if let Some(vfn) = &self.violation_fn {
+                                check_url_code_point(vfn, c, &self.input);
+                            }
+                        }
+                    }
+                }
+                if start.is_empty() {
+                    None
+                } else {
+                    Some(start)
+                }
             }
+        }
+
+        let part_iter = FragmentPartIter {
+            input,
+            violation_fn: self.violation_fn,
+        };
+
+        for part in part_iter {
             self.serialization
-                .extend(utf8_percent_encode(utf8_c, FRAGMENT));
+                .extend(utf8_percent_encode(part, FRAGMENT));
         }
     }
 
+    #[inline]
     fn check_url_code_point(&self, c: char, input: &Input<'_>) {
         if let Some(vfn) = self.violation_fn {
-            if c == '%' {
-                let mut input = input.clone();
-                if !matches!((input.next(), input.next()), (Some(a), Some(b))
+            check_url_code_point(vfn, c, input)
+        }
+    }
+}
+
+fn check_url_code_point(vfn: &dyn Fn(SyntaxViolation), c: char, input: &Input<'_>) {
+    if c == '%' {
+        let mut input = input.clone();
+        if !matches!((input.next(), input.next()), (Some(a), Some(b))
             if a.is_ascii_hexdigit() && b.is_ascii_hexdigit())
-                {
-                    vfn(SyntaxViolation::PercentDecode)
-                }
-            } else if !is_url_code_point(c) {
-                vfn(SyntaxViolation::NonUrlCodePoint)
-            }
+        {
+            vfn(SyntaxViolation::PercentDecode)
         }
+    } else if !is_url_code_point(c) {
+        vfn(SyntaxViolation::NonUrlCodePoint)
     }
 }

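The check_url_code_point refactor above keeps the old &self method as a thin inlined wrapper and moves the actual checks into a free function that takes the violation callback directly; presumably this is what lets QueryPartIter and FragmentPartIter, which only hold an Option<&dyn Fn(SyntaxViolation)>, report violations without needing a borrow of the whole parser. A simplified standalone sketch of that shape (the names and the is_control check are illustrative, not the crate's logic):

// Sketch only: validation logic as a free function over the callback, so a
// helper iterator that stores just the callback can call it directly.
#[derive(Debug)]
enum Violation {
    NonUrlCodePoint,
}

fn check_code_point(vfn: &dyn Fn(Violation), c: char) {
    if c.is_control() {
        vfn(Violation::NonUrlCodePoint); // illustrative check, not the URL spec's
    }
}

struct PartIter<'i, 'p> {
    chars: std::str::Chars<'i>,
    violation_fn: Option<&'p dyn Fn(Violation)>,
}

impl<'i> Iterator for PartIter<'i, '_> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        let c = self.chars.next()?;
        if let Some(vfn) = self.violation_fn {
            check_code_point(vfn, c);
        }
        Some(c)
    }
}

fn main() {
    let log: &dyn Fn(Violation) = &|v| eprintln!("violation: {:?}", v);
    let parts = PartIter {
        chars: "a\u{7}b".chars(),
        violation_fn: Some(log),
    };
    assert_eq!(parts.collect::<String>(), "a\u{7}b");
}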
@@ -1589,7 +1681,7 @@ fn c0_control_or_space(ch: char) -> bool {
 /// https://infra.spec.whatwg.org/#ascii-tab-or-newline
 #[inline]
 fn ascii_tab_or_new_line(ch: char) -> bool {
-    matches!(ch, '\t' | '\r' | '\n')
+    matches!(ch, ascii_tab_or_new_line_pattern!())
 }
 
 /// https://url.spec.whatwg.org/#ascii-alpha