Skip to content

Commit 1c19619

Browse files
committed
syntax: fix literal extraction for 'ab??'
Previously, 'ab??' returned [Complete(ab), Complete(a)], but the order matters here because of greediness. The correct result is [Complete(a), Complete(ab)]. Instead of trying to actually fix literal extraction (which is a mess), we just rewrite 'ab?' (and 'ab??') as 'ab*'. 'ab*' still produces literals in the incorrect order, i.e., [Cut(ab), Complete(a)], but since one is cut we are guaranteed that the regex engine will be called to confirm the match. In so doing, it will correctly report 'a' as a match for 'ab??' in 'ab'. Fixes #862
1 parent 88a2a62 commit 1c19619

File tree

3 files changed

+24
-14
lines changed

3 files changed

+24
-14
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ The below are changes for the next release, which is to be determined.
66
Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
77
* [BUG #859](https://github.com/rust-lang/regex/issues/859):
88
Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`.
9+
* [BUG #862](https://github.com/rust-lang/regex/issues/862):
10+
Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'.
911

1012

1113
1.5.5 (2022-03-08)

regex-syntax/src/hir/literal/mod.rs

+19-14
Original file line numberDiff line numberDiff line change
@@ -735,18 +735,18 @@ fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
735735
lits: &mut Literals,
736736
mut f: F,
737737
) {
738-
let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
739-
lits3.set_limit_size(lits.limit_size() / 2);
740-
f(e, &mut lits3);
741-
742-
if lits3.is_empty() || !lits2.cross_product(&lits3) {
743-
lits.cut();
744-
return;
745-
}
746-
lits2.add(Literal::empty());
747-
if !lits.union(lits2) {
748-
lits.cut();
749-
}
738+
f(
739+
&Hir::repetition(hir::Repetition {
740+
kind: hir::RepetitionKind::ZeroOrMore,
741+
// FIXME: Our literal extraction doesn't care about greediness.
742+
// Which is partially why we're treating 'e?' as 'e*'. Namely,
743+
// 'ab??' yields [Complete(ab), Complete(a)], but it should yield
744+
// [Complete(a), Complete(ab)] because of the non-greediness.
745+
greedy: true,
746+
hir: Box::new(e.clone()),
747+
}),
748+
lits,
749+
);
750750
}
751751

752752
fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
@@ -1141,6 +1141,11 @@ mod tests {
11411141
test_lit!(pfx_group1, prefixes, "(a)", M("a"));
11421142
test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
11431143
test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1144+
test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1145+
// FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1146+
// repetition. As a work-around, we rewrite ab?? as ab*, and thus we get
1147+
// a cut literal.
1148+
test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
11441149
test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
11451150
test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
11461151
test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
@@ -1249,8 +1254,8 @@ mod tests {
12491254
pfx_crazy1,
12501255
prefixes,
12511256
r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1252-
C("Mo\\'am"),
1253-
C("Mu\\'am"),
1257+
C("Mo\\'"),
1258+
C("Mu\\'"),
12541259
C("Moam"),
12551260
C("Muam")
12561261
);

tests/regression.rs

+3
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,6 @@ matiter!(
217217
// https://en.wikipedia.org/wiki/Je_(Cyrillic)
218218
ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
219219
matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
220+
221+
// See: https://github.com/rust-lang/regex/issues/862
222+
mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));

0 commit comments

Comments
 (0)