prevent some panics when parsing invalid Unicode on Windows

oconnor663 · oconnor663 · commit 703caf37c495 · 2020-05-05T16:09:06.000-04:00
Previously, OsStr::starts_with() would panic on Windows if the OsStr contained any invalid Unicode. Because this method is in the critical path for parsing command line arguments, Clap up to v2.33.0 on Windows panics on non-Unicode filenames, even when the application specifically requests OsString to account for this case. As a workaround, this commit adds a custom implementation of OsStr::starts_with() for Windows that doesn't rely on OsStr::as_bytes(). With this change, examples like the following can parse successfully. Here "X" represents invalid Unicode: clap.exe X clap.exe --some-arg X However, examples like these will still panic: clap.exe --some-arg=X clap.exe -sX These still panic, because they require string splitting operations like OsStr::split_at_byte() and OsStr::trim_left_matches(). Fixing these would require either breaking open the in-memory representation of OsStr, which is not stable, or changing all these APIs to allow for an allocated OsString/Cow<OsStr>. This fix is aiming to be minimal and also not wildly unsafe, so we leave these bugs in place. Hopefully the majority of invalid Unicode in the wild occurs in filepaths given as standalone arguments, and many of the rest can be converted from flags with `=` to flags with a space. Fixes #1905.
diff --git a/src/osstringext.rs b/src/osstringext.rs
@@ -20,6 +20,73 @@ pub trait OsStrExt2 {
     fn split(&self, b: u8) -> OsSplit;
 }
 
+// A starts-with implementation that does not panic when the OsStr contains
+// invalid Unicode.
+//
+// A Windows OsStr is usually UTF-16. If `prefix` is valid UTF-8, we can
+// re-encode it as UTF-16, and ask whether `osstr` starts with the same series
+// of u16 code units. If `prefix` is not valid UTF-8, then this comparison
+// isn't meaningful, and we just return false.
+#[cfg(target_os = "windows")]
+fn windows_osstr_starts_with(osstr: &OsStr, prefix: &[u8]) -> bool {
+    use std::os::windows::ffi::OsStrExt;
+    let prefix_str = if let Ok(s) = std::str::from_utf8(prefix) {
+        s
+    } else {
+        return false;
+    };
+    let mut osstr_units = osstr.encode_wide();
+    let mut prefix_units = prefix_str.encode_utf16();
+    loop {
+        match (osstr_units.next(), prefix_units.next()) {
+            // These code units match. Keep looping.
+            (Some(o), Some(p)) if o == p => continue,
+            // We've reached the end of the prefix. It's a match.
+            (_, None) => return true,
+            // Otherwise, it's not a match.
+            _ => return false,
+        }
+    }
+}
+
+#[test]
+#[cfg(target_os = "windows")]
+fn test_windows_osstr_starts_with() {
+    use std::ffi::OsString;
+    use std::os::windows::ffi::OsStringExt;
+
+    fn from_ascii(ascii: &[u8]) -> OsString {
+        let u16_vec: Vec<u16> = ascii.iter().map(|&c| c as u16).collect();
+        OsString::from_wide(&u16_vec)
+    }
+
+    // Test all the basic cases.
+    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abc"));
+    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abcdef"));
+    assert!(!windows_osstr_starts_with(&from_ascii(b"abcdef"), b"def"));
+    assert!(!windows_osstr_starts_with(&from_ascii(b"abc"), b"abcd"));
+
+    // Test the case where the candidate prefix is not valid UTF-8. Note that a
+    // standalone \xff byte is valid ASCII but not valid UTF-8. Thus although
+    // these strings look identical, they do not match.
+    assert!(!windows_osstr_starts_with(&from_ascii(b"\xff"), b"\xff"));
+
+    // Test the case where the OsString is not valid UTF-16. It should still be
+    // possible to match the valid characters at the front.
+    //
+    // UTF-16 surrogate characters are only valid in pairs. Including one on
+    // the end by itself makes this invalid UTF-16.
+    let surrogate_char: u16 = 0xDC00;
+    let mut invalid_unicode =
+        OsString::from_wide(&['a' as u16, 'b' as u16, 'c' as u16, surrogate_char]);
+    assert!(
+        invalid_unicode.to_str().is_none(),
+        "This string is invalid Unicode, and conversion to &str should fail.",
+    );
+    assert!(windows_osstr_starts_with(&invalid_unicode, b"abc"));
+    assert!(!windows_osstr_starts_with(&invalid_unicode, b"abcd"));
+}
+
 #[cfg(any(target_os = "windows", target_arch = "wasm32"))]
 impl OsStrExt3 for OsStr {
     fn from_bytes(b: &[u8]) -> &Self {
@@ -33,6 +100,20 @@ impl OsStrExt3 for OsStr {
 
 impl OsStrExt2 for OsStr {
     fn starts_with(&self, s: &[u8]) -> bool {
+        #[cfg(target_os = "windows")]
+        {
+            // On Windows, the as_bytes() method will panic if the OsStr
+            // contains invalid Unicode. To avoid this, we use a
+            // Windows-specific starts-with function that doesn't rely on
+            // as_bytes(). This is necessary for Windows command line
+            // applications to handle non-Unicode arguments successfully. This
+            // allows common cases like `clap.exe [invalid]` to succeed, though
+            // cases that require string splitting will still fail, like
+            // `clap.exe --arg=[invalid]`. Note that this entire module is
+            // replaced in Clap 3.x, so this workaround is specific to the 2.x
+            // branch.
+            return windows_osstr_starts_with(self, s);
+        }
         self.as_bytes().starts_with(s)
     }
 
diff --git a/tests/utf16.rs b/tests/utf16.rs
@@ -0,0 +1,123 @@
+//! These Windows-only tests are ported from the non-Windows tests in
+//! tests/utf8.rs. Two categories of tests are omitted, however:
+//!
+//! 1. We don't include the tests that use StrictUtf8 mode, since that's
+//!    eplicitly Unix-only.
+//! 2. We don't include tests that require splitting invalid Unicode strings.
+//!
+//! Elaborating on that second point: We have fixed invalid Unicode handling in
+//! the OsStr::starts_with() method. That means that examples like these can
+//! should parse successfully ("X" represents invalid Unicode):
+//!
+//!     clap.exe X
+//!     clap.exe --some-arg X
+//!
+//! However, other string handling methods like OsStr::split_at_byte() and
+//! OsStr::trim_left_matches() still panic in the face of invalid Unicode. That
+//! means examples like these *don't* work:
+//!
+//!     clap.exe --some-arg=X
+//!     clap.exe -sX
+//!
+//! Thus this file omits tests for those cases. All of this OsStr handling is
+//! being rewritten for the 3.0 release in any case, so this limitation is
+//! specific to the 2.x series.
+
+#![cfg(windows)]
+
+extern crate clap;
+
+use clap::{App, Arg};
+use std::ffi::OsString;
+use std::os::windows::ffi::OsStringExt;
+
+fn bad_utf16_string() -> OsString {
+    // UTF-16 surrogate characters are only valid in pairs. Including one on
+    // the end by itself makes this invalid UTF-16.
+    let surrogate_char: u16 = 0xDC00;
+    let os = OsString::from_wide(&[surrogate_char]);
+    assert!(os.to_str().is_none(), "should be invalid Unicode");
+    os
+}
+
+#[test]
+fn invalid_utf16_lossy_positional() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("<arg> 'some arg'"))
+        .get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_lossy_option_short_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("-a"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_lossy_option_long_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("--arg"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_positional() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("<arg> 'some arg'"))
+        .get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}
+
+#[test]
+fn invalid_utf16_option_short_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("-a"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}
+
+#[test]
+fn invalid_utf16_option_long_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("--arg"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}