shell_words/
lib.rs

1// Copyright 2018 Tomasz Miąsko
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE>
4// or the MIT license <LICENSE-MIT>, at your option.
5//
//! Process command lines according to the parsing rules of the Unix shell, as
//! specified in [Shell Command Language in POSIX.1-2008][posix-shell].
8//!
9//! [posix-shell]: http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html
10
11#![cfg_attr(not(feature = "std"), no_std)]
12#![forbid(unsafe_code)]
13
14#[cfg(feature = "std")]
15extern crate core;
16
17use core::fmt;
18use core::mem;
19
20#[cfg(not(feature = "std"))]
21#[macro_use]
22extern crate alloc;
23
24#[cfg(not(feature = "std"))]
25use alloc::string::String;
26#[cfg(not(feature = "std"))]
27use alloc::vec::Vec;
28
29#[cfg(not(feature = "std"))]
30use alloc::borrow::Cow;
31#[cfg(feature = "std")]
32use std::borrow::Cow;
33
/// Error reported by `split` when the input cannot be tokenized.
///
/// The type carries no payload; its `Display` text is always
/// `missing closing quote`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseError;

impl fmt::Display for ParseError {
    /// Writes the fixed, human readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // A literal without arguments is written verbatim, so width and
        // alignment flags of the formatter are not applied.
        write!(f, "missing closing quote")
    }
}

// The `Error` trait lives in `std`, hence it is implemented only when the
// `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for ParseError {}
46
/// Tokenizer state used by `split` while it scans the input one character at
/// a time.
enum State {
    /// Between words: delimiters are being skipped and no word has been
    /// started yet.
    Delimiter,
    /// Right after a backslash that was seen between words, before any
    /// character of the next word has been consumed.
    Backslash,
    /// Inside a word, outside of any quotes.
    Unquoted,
    /// Right after a backslash seen inside an unquoted word.
    UnquotedBackslash,
    /// Inside a single quoted section of a word.
    SingleQuoted,
    /// Inside a double quoted section of a word.
    DoubleQuoted,
    /// Right after a backslash seen inside a double quoted section.
    DoubleQuotedBackslash,
    /// Inside a `#` comment, which extends up to the next newline.
    Comment,
}
65
66/// Splits command line into separate arguments, in much the same way Unix shell
67/// would, but without many of expansion the shell would perform.
68///
69/// The split functionality is compatible with behaviour of Unix shell, but with
70/// word expansions limited to quote removal, and without special token
71/// recognition rules for operators.
72///
73/// The result is exactly the same as one obtained from Unix shell as long as
74/// those unsupported features are not present in input: no operators, no
75/// variable assignments, no tilde expansion, no parameter expansion, no command
76/// substitution, no arithmetic expansion, no pathname expansion.
77///
78/// In case those unsupported shell features are present, the syntax that
79/// introduce them is interpreted literally.
80///
81/// # Errors
82///
83/// When input contains unmatched quote, an error is returned.
84///
85/// # Compatibility with other implementations
86///
87/// It should be fully compatible with g_shell_parse_argv from GLib, except that
88/// in GLib it is an error not to have any words after tokenization.
89///
90/// It is also very close to shlex.split available in Python standard library,
91/// when used in POSIX mode with support for comments. Though, shlex
92/// implementation diverges from POSIX, and from implementation contained herein
93/// in three aspects. First, it doesn't support line continuations.
94/// Second, inside double quotes, the backslash characters retains its special
95/// meaning as an escape character only when followed by \\ or \", whereas POSIX
96/// specifies that it should retain its special meaning when followed by: $, \`,
97/// \", \\, or a newline. Third, it treats carriage return as one of delimiters.
98///
99/// # Examples
100///
101/// Building an executable using compiler obtained from CC environment variable
102/// and compiler flags from both CFLAGS and CPPFLAGS. Similar to default build
103/// rule for C used in GNU Make:
104///
105/// ```rust,no_run
106/// use std::env::var;
107/// use std::process::Command;
108///
109/// let cc = var("CC").unwrap_or_else(|_| "cc".to_owned());
110///
111/// let cflags_str = var("CFLAGS").unwrap_or_else(|_| String::new());
112/// let cflags = shell_words::split(&cflags_str).expect("failed to parse CFLAGS");
113///
114/// let cppflags_str = var("CPPFLAGS").unwrap_or_else(|_| String::new());
115/// let cppflags = shell_words::split(&cppflags_str).expect("failed to parse CPPFLAGS");
116///
117/// Command::new(cc)
118///     .args(cflags)
119///     .args(cppflags)
120///     .args(&["-c", "a.c", "-o", "a.out"])
121///     .spawn()
122///     .expect("failed to start subprocess")
123///     .wait()
124///     .expect("failed to wait for subprocess");
125/// ```
126pub fn split(s: &str) -> Result<Vec<String>, ParseError> {
127    use State::*;
128
129    let mut words = Vec::new();
130    let mut word = String::new();
131    let mut chars = s.chars();
132    let mut state = Delimiter;
133
134    loop {
135        let c = chars.next();
136        state = match state {
137            Delimiter => match c {
138                None => break,
139                Some('\'') => SingleQuoted,
140                Some('\"') => DoubleQuoted,
141                Some('\\') => Backslash,
142                Some('\t') | Some(' ') | Some('\n') => Delimiter,
143                Some('#') => Comment,
144                Some(c) => {
145                    word.push(c);
146                    Unquoted
147                }
148            },
149            Backslash => match c {
150                None => {
151                    word.push('\\');
152                    words.push(mem::replace(&mut word, String::new()));
153                    break;
154                }
155                Some('\n') => Delimiter,
156                Some(c) => {
157                    word.push(c);
158                    Unquoted
159                }
160            },
161            Unquoted => match c {
162                None => {
163                    words.push(mem::replace(&mut word, String::new()));
164                    break;
165                }
166                Some('\'') => SingleQuoted,
167                Some('\"') => DoubleQuoted,
168                Some('\\') => UnquotedBackslash,
169                Some('\t') | Some(' ') | Some('\n') => {
170                    words.push(mem::replace(&mut word, String::new()));
171                    Delimiter
172                }
173                Some(c) => {
174                    word.push(c);
175                    Unquoted
176                }
177            },
178            UnquotedBackslash => match c {
179                None => {
180                    word.push('\\');
181                    words.push(mem::replace(&mut word, String::new()));
182                    break;
183                }
184                Some('\n') => Unquoted,
185                Some(c) => {
186                    word.push(c);
187                    Unquoted
188                }
189            },
190            SingleQuoted => match c {
191                None => return Err(ParseError),
192                Some('\'') => Unquoted,
193                Some(c) => {
194                    word.push(c);
195                    SingleQuoted
196                }
197            },
198            DoubleQuoted => match c {
199                None => return Err(ParseError),
200                Some('\"') => Unquoted,
201                Some('\\') => DoubleQuotedBackslash,
202                Some(c) => {
203                    word.push(c);
204                    DoubleQuoted
205                }
206            },
207            DoubleQuotedBackslash => match c {
208                None => return Err(ParseError),
209                Some('\n') => DoubleQuoted,
210                Some(c @ '$') | Some(c @ '`') | Some(c @ '"') | Some(c @ '\\') => {
211                    word.push(c);
212                    DoubleQuoted
213                }
214                Some(c) => {
215                    word.push('\\');
216                    word.push(c);
217                    DoubleQuoted
218                }
219            },
220            Comment => match c {
221                None => break,
222                Some('\n') => Delimiter,
223                Some(_) => Comment,
224            },
225        }
226    }
227
228    Ok(words)
229}
230
/// Quoting strategy selected by `escape_style` for a single string.
enum EscapeStyle {
    /// No escaping; the string can be used verbatim.
    None,
    /// Wrap the whole string in one pair of single quotes.
    SingleQuoted,
    /// Wrap in single quotes and splice every embedded single quote in as
    /// `'\''` (close quote, backslash-escaped quote, reopen quote).
    Mixed,
}

/// Determines the escaping style to use for `s`.
///
/// * The empty string must be quoted, otherwise it would vanish from the
///   command line.
/// * A string containing a single quote needs [`EscapeStyle::Mixed`], because
///   a single quote cannot appear inside single quotes.
/// * A string containing any other character from the POSIX list of characters
///   that need quoting is wrapped in single quotes.
/// * Everything else is left untouched.
fn escape_style(s: &str) -> EscapeStyle {
    if s.is_empty() {
        return EscapeStyle::SingleQuoted;
    }

    let mut special = false;

    for c in s.chars() {
        match c {
            // A single quote decides the style on its own, no need to look
            // any further.
            '\'' => return EscapeStyle::Mixed,
            // Note the ASCII tilde `~`: it triggers tilde expansion in the
            // shell. (It is easy to confuse with U+02DC SMALL TILDE `˜`,
            // which has no meaning to the shell.)
            '\n' | '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | ' '
            | '\t' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => {
                special = true;
            }
            _ => {}
        }
    }

    if special {
        EscapeStyle::SingleQuoted
    } else {
        EscapeStyle::None
    }
}

/// Escapes special characters in a string, so that it will retain its literal
/// meaning when used as a part of command in Unix shell.
///
/// It tries to avoid introducing any unnecessary quotes or escape characters,
/// but specifics regarding quoting style are left unspecified.
///
/// The input is returned borrowed when no quoting is necessary; otherwise a
/// newly allocated, quoted string is returned.
pub fn quote(s: &str) -> Cow<str> {
    // We are going somewhat out of the way to provide
    // minimal amount of quoting in typical cases.
    match escape_style(s) {
        EscapeStyle::None => s.into(),
        EscapeStyle::SingleQuoted => format!("'{}'", s).into(),
        EscapeStyle::Mixed => {
            // Two bytes for the surrounding quotes; embedded quotes will
            // grow the buffer further as needed.
            let mut quoted = String::with_capacity(s.len() + 2);
            quoted.push('\'');
            for c in s.chars() {
                if c == '\'' {
                    // Leave the quoted section, emit an escaped quote, and
                    // enter a new quoted section.
                    quoted.push_str("'\\''");
                } else {
                    quoted.push(c);
                }
            }
            quoted.push('\'');
            quoted.into()
        }
    }
}
303
304/// Joins arguments into a single command line suitable for execution in Unix
305/// shell.
306///
307/// Each argument is quoted using [`quote`] to preserve its literal meaning when
308/// parsed by Unix shell.
309///
310/// Note: This function is essentially an inverse of [`split`].
311///
312/// # Examples
313///
314/// Logging executed commands in format that can be easily copied and pasted
315/// into an actual shell:
316///
317/// ```rust,no_run
318/// fn execute(args: &[&str]) {
319///     use std::process::Command;
320///     println!("Executing: {}", shell_words::join(args));
321///     Command::new(&args[0])
322///         .args(&args[1..])
323///         .spawn()
324///         .expect("failed to start subprocess")
325///         .wait()
326///         .expect("failed to wait for subprocess");
327/// }
328///
329/// execute(&["python", "-c", "print('Hello world!')"]);
330/// ```
331///
332/// [`quote`]: fn.quote.html
333/// [`split`]: fn.split.html
334pub fn join<I, S>(words: I) -> String
335where
336    I: IntoIterator<Item = S>,
337    S: AsRef<str>,
338{
339    let mut line = words.into_iter().fold(String::new(), |mut line, word| {
340        let quoted = quote(word.as_ref());
341        line.push_str(quoted.as_ref());
342        line.push(' ');
343        line
344    });
345    line.pop();
346    line
347}
348
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(input, expected)` pair splits successfully into
    /// exactly the expected words, reporting the offending input on failure.
    fn split_ok(cases: &[(&str, &[&str])]) {
        for &(input, expected) in cases {
            match split(input) {
                Err(actual) => {
                    panic!(
                        "After split({:?})\nexpected: Ok({:?})\n  actual: Err({:?})\n",
                        input, expected, actual
                    );
                }
                Ok(actual) => {
                    assert!(
                        expected == actual.as_slice(),
                        "After split({:?}).unwrap()\nexpected: {:?}\n  actual: {:?}\n",
                        input,
                        expected,
                        actual
                    );
                }
            }
        }
    }

    #[test]
    fn split_empty() {
        split_ok(&[("", &[])]);
    }

    #[test]
    fn split_initial_whitespace_is_removed() {
        split_ok(&[
            ("     a", &["a"]),
            ("\t\t\t\tbar", &["bar"]),
            ("\t \nc", &["c"]),
        ]);
    }

    #[test]
    fn split_trailing_whitespace_is_removed() {
        split_ok(&[
            ("a  ", &["a"]),
            ("b\t", &["b"]),
            ("c\t \n \n \n", &["c"]),
            ("d\n\n", &["d"]),
        ]);
    }

    #[test]
    fn split_carriage_return_is_not_special() {
        split_ok(&[("c\ra\r'\r'\r", &["c\ra\r\r\r"])]);
    }

    #[test]
    fn split_single_quotes() {
        split_ok(&[
            (r#"''"#, &[r#""#]),
            (r#"'a'"#, &[r#"a"#]),
            (r#"'\'"#, &[r#"\"#]),
            (r#"' \ '"#, &[r#" \ "#]),
            (r#"'#'"#, &[r#"#"#]),
        ]);
    }

    #[test]
    fn split_double_quotes() {
        split_ok(&[
            (r#""""#, &[""]),
            (r#""""""#, &[""]),
            (r#""a b c' d""#, &["a b c' d"]),
            (r#""\a""#, &["\\a"]),
            (r#""$""#, &["$"]),
            (r#""\$""#, &["$"]),
            (r#""`""#, &["`"]),
            (r#""\`""#, &["`"]),
            (r#""\"""#, &["\""]),
            (r#""\\""#, &["\\"]),
            ("\"\n\"", &["\n"]),
            ("\"\\\n\"", &[""]),
        ]);
    }

    #[test]
    fn split_unquoted() {
        split_ok(&[
            (r#"\|\&\;"#, &[r#"|&;"#]),
            (r#"\<\>"#, &[r#"<>"#]),
            (r#"\(\)"#, &[r#"()"#]),
            (r#"\$"#, &[r#"$"#]),
            (r#"\`"#, &[r#"`"#]),
            (r#"\""#, &[r#"""#]),
            (r#"\'"#, &[r#"'"#]),
            ("\\\n", &[]),
            (" \\\n \n", &[]),
            ("a\nb\nc", &["a", "b", "c"]),
            ("a\\\nb\\\nc", &["abc"]),
            ("foo bar baz", &["foo", "bar", "baz"]),
            (r#"\🦉"#, &[r"🦉"]),
        ]);
    }

    #[test]
    fn split_trailing_backslash() {
        split_ok(&[("\\", &["\\"]), (" \\", &["\\"]), ("a\\", &["a\\"])]);
    }

    #[test]
    fn split_errors() {
        // Unterminated single quote.
        assert_eq!(split("'abc"), Err(ParseError));
        // Unterminated double quote.
        assert_eq!(split("\""), Err(ParseError));
        // A backslash is not an escape inside single quotes, so the quote is
        // still open at the end of input.
        assert_eq!(split("'\\"), Err(ParseError));
        // Input ends right after a backslash inside double quotes.
        assert_eq!(split("\"\\"), Err(ParseError));
    }

    #[test]
    fn split_comments() {
        split_ok(&[
            (r#" x # comment "#, &["x"]),
            (r#" w1#w2 "#, &["w1#w2"]),
            (r#"'not really a # comment'"#, &["not really a # comment"]),
            (" a # very long comment \n b # another comment", &["a", "b"]),
        ]);
    }

    #[test]
    fn test_quote() {
        assert_eq!(quote(""), "''");
        assert_eq!(quote("'"), "''\\'''");
        assert_eq!(quote("abc"), "abc");
        assert_eq!(quote("a \n  b"), "'a \n  b'");
        assert_eq!(quote("X'\nY"), "'X'\\''\nY'");
    }

    #[test]
    fn test_join() {
        assert_eq!(join(&["a", "b", "c"]), "a b c");
        assert_eq!(join(&[" ", "$", "\n"]), "' ' '$' '\n'");
        // No arguments produce an empty command line.
        assert_eq!(join(Vec::<&str>::new()), "");
    }

    #[test]
    fn join_followed_by_split_is_identity() {
        let cases: Vec<&[&str]> = vec![
            &["a"],
            &["python", "-c", "print('Hello world!')"],
            &["echo", " arg with spaces ", "arg \' with \" quotes"],
            &["even newlines are quoted correctly\n", "\n", "\n\n\t "],
            &["$", "`test`"],
            &["cat", "~user/log*"],
            &["test", "'a \"b", "\"X'"],
            &["empty", "", "", ""],
        ];
        for argv in cases {
            let args = join(argv);
            assert_eq!(split(&args).unwrap(), argv);
        }
    }
}