helix-editor
diff --git a/‎helix-core/src/shellwords.rs
Lines changed: 163 additions & 160 deletions b/‎helix-core/src/shellwords.rs
Lines changed: 163 additions & 160 deletions
@@ -27,181 +27,172 @@ enum State {
     DquoteEscaped,
 }
 
-/// Get the vec of escaped / quoted / doublequoted filenames from the input str
-pub fn shellwords(input: &str) -> Vec<Cow<'_, str>> {
-    use State::*;
+pub struct Shellwords<'a> {
+    state: State,
+    /// Shellwords where whitespace and escapes has been resolved.
+    words: Vec<Cow<'a, str>>,
+    /// The parts of the input that are divided into shellwords. This can be
+    /// used to retrieve the original text for a given word by looking up the
+    /// same index in the Vec as the word in `words`.
+    parts: Vec<&'a str>,
+}
 
-    let mut state = Unquoted;
-    let mut args: Vec<Cow<str>> = Vec::new();
-    let mut escaped = String::with_capacity(input.len());
+impl<'a> From<&'a str> for Shellwords<'a> {
+    fn from(input: &'a str) -> Self {
+        use State::*;
 
-    let mut start = 0;
-    let mut end = 0;
+        let mut state = Unquoted;
+        let mut words = Vec::new();
+        let mut parts = Vec::new();
+        let mut escaped = String::with_capacity(input.len());
 
-    for (i, c) in input.char_indices() {
-        state = match state {
-            OnWhitespace => match c {
-                '"' => {
-                    end = i;
-                    Dquoted
-                }
-                '\'' => {
-                    end = i;
-                    Quoted
-                }
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        UnquotedEscaped
-                    } else {
+        let mut part_start = 0;
+        let mut unescaped_start = 0;
+        let mut end = 0;
+
+        for (i, c) in input.char_indices() {
+            state = match state {
+                OnWhitespace => match c {
+                    '"' => {
+                        end = i;
+                        Dquoted
+                    }
+                    '\'' => {
+                        end = i;
+                        Quoted
+                    }
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            UnquotedEscaped
+                        } else {
+                            OnWhitespace
+                        }
+                    }
+                    c if c.is_ascii_whitespace() => {
+                        end = i;
                         OnWhitespace
                     }
-                }
-                c if c.is_ascii_whitespace() => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Unquoted,
-            },
-            Unquoted => match c {
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        UnquotedEscaped
-                    } else {
-                        Unquoted
+                    _ => Unquoted,
+                },
+                Unquoted => match c {
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            UnquotedEscaped
+                        } else {
+                            Unquoted
+                        }
                     }
-                }
-                c if c.is_ascii_whitespace() => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Unquoted,
-            },
-            UnquotedEscaped => Unquoted,
-            Quoted => match c {
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        QuoteEscaped
-                    } else {
-                        Quoted
+                    c if c.is_ascii_whitespace() => {
+                        end = i;
+                        OnWhitespace
                     }
-                }
-                '\'' => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Quoted,
-            },
-            QuoteEscaped => Quoted,
-            Dquoted => match c {
-                '\\' => {
-                    if cfg!(unix) {
-                        escaped.push_str(&input[start..i]);
-                        start = i + 1;
-                        DquoteEscaped
-                    } else {
-                        Dquoted
+                    _ => Unquoted,
+                },
+                UnquotedEscaped => Unquoted,
+                Quoted => match c {
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            QuoteEscaped
+                        } else {
+                            Quoted
+                        }
                     }
-                }
-                '"' => {
-                    end = i;
-                    OnWhitespace
-                }
-                _ => Dquoted,
-            },
-            DquoteEscaped => Dquoted,
-        };
+                    '\'' => {
+                        end = i;
+                        OnWhitespace
+                    }
+                    _ => Quoted,
+                },
+                QuoteEscaped => Quoted,
+                Dquoted => match c {
+                    '\\' => {
+                        if cfg!(unix) {
+                            escaped.push_str(&input[unescaped_start..i]);
+                            unescaped_start = i + 1;
+                            DquoteEscaped
+                        } else {
+                            Dquoted
+                        }
+                    }
+                    '"' => {
+                        end = i;
+                        OnWhitespace
+                    }
+                    _ => Dquoted,
+                },
+                DquoteEscaped => Dquoted,
+            };
 
-        if i >= input.len() - 1 && end == 0 {
-            end = i + 1;
-        }
+            if i >= input.len() - 1 && end == 0 {
+                end = i + 1;
+            }
 
-        if end > 0 {
-            let esc_trim = escaped.trim();
-            let inp = &input[start..end];
+            if end > 0 {
+                let esc_trim = escaped.trim();
+                let inp = &input[unescaped_start..end];
 
-            if !(esc_trim.is_empty() && inp.trim().is_empty()) {
-                if esc_trim.is_empty() {
-                    args.push(inp.into());
-                } else {
-                    args.push([escaped, inp.into()].concat().into());
-                    escaped = "".to_string();
+                if !(esc_trim.is_empty() && inp.trim().is_empty()) {
+                    if esc_trim.is_empty() {
+                        words.push(inp.into());
+                        parts.push(inp);
+                    } else {
+                        words.push([escaped, inp.into()].concat().into());
+                        parts.push(&input[part_start..end]);
+                        escaped = "".to_string();
+                    }
                 }
+                unescaped_start = i + 1;
+                part_start = i + 1;
+                end = 0;
             }
-            start = i + 1;
-            end = 0;
         }
-    }
-    args
-}
 
-/// Checks that the input ends with an ascii whitespace character which is
-/// not escaped.
-///
-/// # Examples
-///
-/// ```rust
-/// use helix_core::shellwords::ends_with_whitespace;
-/// assert_eq!(ends_with_whitespace(" "), true);
-/// assert_eq!(ends_with_whitespace(":open "), true);
-/// assert_eq!(ends_with_whitespace(":open foo.txt "), true);
-/// assert_eq!(ends_with_whitespace(":open"), false);
-/// #[cfg(unix)]
-/// assert_eq!(ends_with_whitespace(":open a\\ "), false);
-/// #[cfg(unix)]
-/// assert_eq!(ends_with_whitespace(":open a\\ b.txt"), false);
-/// ```
-pub fn ends_with_whitespace(input: &str) -> bool {
-    use State::*;
+        debug_assert!(words.len() == parts.len());
 
-    // Fast-lane: the input must end with a whitespace character
-    // regardless of quoting.
-    if !input.ends_with(|c: char| c.is_ascii_whitespace()) {
-        return false;
+        Self {
+            state,
+            words,
+            parts,
+        }
     }
+}
 
-    let mut state = Unquoted;
+impl<'a> Shellwords<'a> {
+    /// Checks that the input ends with a whitespace character which is not escaped.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use helix_core::shellwords::Shellwords;
+    /// assert_eq!(Shellwords::from(" ").ends_with_whitespace(), true);
+    /// assert_eq!(Shellwords::from(":open ").ends_with_whitespace(), true);
+    /// assert_eq!(Shellwords::from(":open foo.txt ").ends_with_whitespace(), true);
+    /// assert_eq!(Shellwords::from(":open").ends_with_whitespace(), false);
+    /// #[cfg(unix)]
+    /// assert_eq!(Shellwords::from(":open a\\ ").ends_with_whitespace(), false);
+    /// #[cfg(unix)]
+    /// assert_eq!(Shellwords::from(":open a\\ b.txt").ends_with_whitespace(), false);
+    /// ```
+    pub fn ends_with_whitespace(&self) -> bool {
+        matches!(self.state, State::OnWhitespace)
+    }
 
-    for c in input.chars() {
-        state = match state {
-            OnWhitespace => match c {
-                '"' => Dquoted,
-                '\'' => Quoted,
-                '\\' if cfg!(unix) => UnquotedEscaped,
-                '\\' => OnWhitespace,
-                c if c.is_ascii_whitespace() => OnWhitespace,
-                _ => Unquoted,
-            },
-            Unquoted => match c {
-                '\\' if cfg!(unix) => UnquotedEscaped,
-                '\\' => Unquoted,
-                c if c.is_ascii_whitespace() => OnWhitespace,
-                _ => Unquoted,
-            },
-            UnquotedEscaped => Unquoted,
-            Quoted => match c {
-                '\\' if cfg!(unix) => QuoteEscaped,
-                '\\' => Quoted,
-                '\'' => OnWhitespace,
-                _ => Quoted,
-            },
-            QuoteEscaped => Quoted,
-            Dquoted => match c {
-                '\\' if cfg!(unix) => DquoteEscaped,
-                '\\' => Dquoted,
-                '"' => OnWhitespace,
-                _ => Dquoted,
-            },
-            DquoteEscaped => Dquoted,
-        }
+    /// Returns the list of shellwords calculated from the input string.
+    pub fn words(&self) -> &[Cow<'a, str>] {
+        &self.words
     }
 
-    matches!(state, OnWhitespace)
+    /// Returns a list of strings which correspond to [words] but represent the original
+    /// text in the input string - including escape characters - without separating whitespace.
+    pub fn parts(&self) -> &[&'a str] {
+        &self.parts
+    }
 }
 
 #[cfg(test)]
@@ -212,7 +203,8 @@ mod test {
     #[cfg(windows)]
     fn test_normal() {
         let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#;
-        let result = shellwords(input);
+        let shellwords = Shellwords::from(input);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -230,7 +222,8 @@ mod test {
     #[cfg(unix)]
     fn test_normal() {
         let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#;
-        let result = shellwords(input);
+        let shellwords = Shellwords::from(input);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -247,7 +240,8 @@ mod test {
     fn test_quoted() {
         let quoted =
             r#":o 'single_word' 'twó wörds' '' ' ''\three\' \"with\ escaping\\' 'quote incomplete"#;
-        let result = shellwords(quoted);
+        let shellwords = Shellwords::from(quoted);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -262,7 +256,8 @@ mod test {
     #[cfg(unix)]
     fn test_dquoted() {
         let dquoted = r#":o "single_word" "twó wörds" "" "  ""\three\' \"with\ escaping\\" "dquote incomplete"#;
-        let result = shellwords(dquoted);
+        let shellwords = Shellwords::from(dquoted);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -277,7 +272,8 @@ mod test {
     #[cfg(unix)]
     fn test_mixed() {
         let dquoted = r#":o single_word 'twó wörds' "\three\' \"with\ escaping\\""no space before"'and after' $#%^@ "%^&(%^" ')(*&^%''a\\\\\b' '"#;
-        let result = shellwords(dquoted);
+        let shellwords = Shellwords::from(dquoted);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":o"),
             Cow::from("single_word"),
@@ -298,7 +294,8 @@ mod test {
     fn test_lists() {
         let input =
             r#":set statusline.center ["file-type","file-encoding"] '["list", "in", "qoutes"]'"#;
-        let result = shellwords(input);
+        let shellwords = Shellwords::from(input);
+        let result = shellwords.words().to_vec();
         let expected = vec![
             Cow::from(":set"),
             Cow::from("statusline.center"),
@@ -322,4 +319,10 @@ mod test {
         assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar"));
         assert_eq!(escape("foo bar".into()), Cow::Borrowed("\"foo bar\""));
     }
+
+    #[test]
+    fn test_parts() {
+        assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]);
+        assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\ "]);
+    }
 }