-
-
Notifications
You must be signed in to change notification settings - Fork 36
Partial match support #13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,9 +7,8 @@ use std::sync::Arc; | |
use log::debug; | ||
use pcre2_sys::{ | ||
PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, | ||
PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, | ||
PCRE2_NEWLINE_ANYCRLF, | ||
}; | ||
PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, PCRE2_NEWLINE_ANYCRLF, | ||
PCRE2_PARTIAL_HARD}; | ||
use thread_local::CachedThreadLocal; | ||
|
||
use crate::error::Error; | ||
|
@@ -76,6 +75,8 @@ struct Config { | |
utf_check: bool, | ||
/// use pcre2_jit_compile | ||
jit: JITChoice, | ||
/// use JIT for partial matching | ||
jit_partial_matching: bool, | ||
/// Match-time specific configuration knobs. | ||
match_config: MatchConfig, | ||
} | ||
|
@@ -102,6 +103,7 @@ impl Default for Config { | |
utf: false, | ||
utf_check: true, | ||
jit: JITChoice::Never, | ||
jit_partial_matching: false, | ||
match_config: MatchConfig::default(), | ||
} | ||
} | ||
|
@@ -156,10 +158,10 @@ impl RegexBuilder { | |
match self.config.jit { | ||
JITChoice::Never => {} // fallthrough | ||
JITChoice::Always => { | ||
code.jit_compile()?; | ||
code.jit_compile(self.config.jit_partial_matching)?; | ||
} | ||
JITChoice::Attempt => { | ||
if let Err(err) = code.jit_compile() { | ||
if let Err(err) = code.jit_compile(self.config.jit_partial_matching) { | ||
debug!("JIT compilation failed: {}", err); | ||
} | ||
} | ||
|
@@ -315,6 +317,9 @@ impl RegexBuilder { | |
/// This generally speeds up matching quite a bit. The downside is that it | ||
/// can increase the time it takes to compile a pattern. | ||
/// | ||
/// This option enables JIT only for complete matching. | ||
/// To enable JIT additionally for partial matching, enable `jit_partial_matching`. | ||
/// | ||
/// If the JIT isn't available or if JIT compilation returns an error, | ||
/// then a debug message with the error will be emitted and the regex will | ||
/// otherwise silently fall back to non-JIT matching. | ||
|
@@ -329,6 +334,13 @@ impl RegexBuilder { | |
self | ||
} | ||
|
||
/// Additionally enable PCRE2's JIT for partial matching. | ||
/// This has an effect only when JIT compilation is also enabled. | ||
pub fn jit_partial_matching(&mut self, yes: bool) -> &mut RegexBuilder { | ||
self.config.jit_partial_matching = yes; | ||
self | ||
} | ||
|
||
/// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is | ||
/// not enabled, then this has no effect. | ||
/// | ||
|
@@ -427,6 +439,27 @@ impl Regex { | |
self.is_match_at(subject, 0) | ||
} | ||
|
||
/// Returns true if and only if the regex fully or partially matches the | ||
/// subject string given. A partial match occurs when there is a match | ||
/// up to the end of a subject string, but more characters are needed to | ||
/// match the entire pattern. | ||
/// | ||
/// # Example | ||
/// | ||
/// Test whether the given string can be the beginning of a valid telephone number: | ||
/// | ||
/// ```rust | ||
/// # fn example() -> Result<(), ::pcre2::Error> { | ||
/// use pcre2::bytes::Regex; | ||
/// | ||
/// let text = b"123-456-"; | ||
/// assert!(Regex::new(r"^\d{3}-\d{3}-\d{3}")?.is_partial_match(text)?); | ||
/// # Ok(()) }; example().unwrap() | ||
/// ``` | ||
pub fn is_partial_match(&self, subject: &[u8]) -> Result<bool, Error> { | ||
self.is_partial_match_at(subject, 0) | ||
} | ||
|
||
/// Returns the start and end byte range of the leftmost-first match in | ||
/// `subject`. If no match exists, then `None` is returned. | ||
/// | ||
|
@@ -596,16 +629,19 @@ impl Regex { | |
|
||
/// Advanced or "lower level" search methods. | ||
impl Regex { | ||
|
||
/// Returns the same as is_match, but starts the search at the given | ||
/// offset. | ||
/// | ||
/// The significance of the starting point is that it takes the surrounding | ||
/// context into consideration. For example, the `\A` anchor can only | ||
/// match when `start == 0`. | ||
pub fn is_match_at( | ||
/// | ||
fn is_match_at_imp( | ||
&self, | ||
subject: &[u8], | ||
start: usize, | ||
partial: bool, | ||
) -> Result<bool, Error> { | ||
assert!( | ||
start <= subject.len(), | ||
|
@@ -618,6 +654,9 @@ impl Regex { | |
if !self.config.utf_check { | ||
options |= PCRE2_NO_UTF_CHECK; | ||
} | ||
if partial { | ||
options |= PCRE2_PARTIAL_HARD; | ||
} | ||
|
||
let match_data = self.match_data(); | ||
let mut match_data = match_data.borrow_mut(); | ||
|
@@ -628,6 +667,34 @@ impl Regex { | |
Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) | ||
} | ||
|
||
/// Returns the same as is_match, but starts the search at the given | ||
/// offset. | ||
/// | ||
/// The significance of the starting point is that it takes the surrounding | ||
/// context into consideration. For example, the `\A` anchor can only | ||
/// match when `start == 0`. | ||
pub fn is_match_at( | ||
&self, | ||
subject: &[u8], | ||
start: usize, | ||
) -> Result<bool, Error> { | ||
self.is_match_at_imp(subject, start, false) | ||
} | ||
|
||
/// Returns the same as is_partial_match, but starts the search at the given | ||
/// offset. | ||
/// | ||
/// The significance of the starting point is that it takes the surrounding | ||
/// context into consideration. For example, the `\A` anchor can only | ||
/// match when `start == 0`. | ||
pub fn is_partial_match_at( | ||
&self, | ||
subject: &[u8], | ||
start: usize, | ||
) -> Result<bool, Error> { | ||
self.is_match_at_imp(subject, start, true) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It looks to me like this code might benefit from a slight refactor. Namely, […] So I'd say, create a private […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok, will do. |
||
|
||
/// Returns the same as find, but starts the search at the given | ||
/// offset. | ||
/// | ||
|
@@ -1150,6 +1217,18 @@ mod tests { | |
assert!(re.is_match(b("Β")).unwrap()); | ||
} | ||
|
||
#[test] | ||
fn partial() { | ||
let re = RegexBuilder::new() | ||
.build("ab$") | ||
.unwrap(); | ||
|
||
assert!(re.is_partial_match(b("a")).unwrap()); | ||
assert!(re.is_partial_match(b("ab")).unwrap()); | ||
assert!(!re.is_partial_match(b("abc")).unwrap()); | ||
assert!(!re.is_partial_match(b("b")).unwrap()); | ||
} | ||
|
||
#[test] | ||
fn crlf() { | ||
let re = RegexBuilder::new() | ||
|
@@ -1247,6 +1326,19 @@ mod tests { | |
} | ||
} | ||
|
||
#[test] | ||
fn jit_partial_matching() { | ||
if is_jit_available() { | ||
let re = RegexBuilder::new() | ||
.jit(true) | ||
.jit_partial_matching(true) | ||
.build(r"[0-9][0-9][0-9]") | ||
.unwrap(); | ||
assert!(!re.is_match(b("12")).unwrap()); | ||
assert!(re.is_partial_match(b("12")).unwrap()); | ||
} | ||
} | ||
|
||
// Unlike jit4lyfe, this tests that everything works when requesting the | ||
// JIT only if it's available. In jit4lyfe, we require the JIT or fail. | ||
// If the JIT isn't available, then in this test, we simply don't use it. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,11 +89,18 @@ impl Code { | |
|
||
/// JIT compile this code object. | ||
/// | ||
/// If `partial` is set, the PCRE2_JIT_PARTIAL_HARD option flag is added | ||
/// so that code is also generated for partial matching. | ||
/// | ||
/// If there was a problem performing JIT compilation, then this returns | ||
/// an error. | ||
pub fn jit_compile(&mut self) -> Result<(), Error> { | ||
pub fn jit_compile(&mut self, partial: bool) -> Result<(), Error> { | ||
let mut options = PCRE2_JIT_COMPLETE; | ||
if partial { | ||
options |= PCRE2_JIT_PARTIAL_HARD; | ||
} | ||
let error_code = unsafe { | ||
pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE) | ||
pcre2_jit_compile_8(self.code, options) | ||
}; | ||
if error_code == 0 { | ||
self.compiled_jit = true; | ||
|
@@ -390,6 +397,10 @@ impl MatchData { | |
/// | ||
/// This returns false if no match occurred. | ||
/// | ||
/// If a partial match was requested via the PCRE2_PARTIAL_HARD or | ||
/// PCRE2_PARTIAL_SOFT option, this returns true if either a partial match | ||
/// or a complete match occurred. | ||
/// | ||
/// Match offsets can be extracted via `ovector`. | ||
/// | ||
/// # Safety | ||
|
@@ -427,6 +438,9 @@ impl MatchData { | |
); | ||
if rc == PCRE2_ERROR_NOMATCH { | ||
Ok(false) | ||
} else if rc == PCRE2_ERROR_PARTIAL && | ||
options & (PCRE2_PARTIAL_HARD | PCRE2_PARTIAL_SOFT) != 0 { | ||
Ok(true) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suppose this behavior should be documented in this function's contract. |
||
} else if rc > 0 { | ||
Ok(true) | ||
} else { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a blank line before the code block.